diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 90afb1ce29684..0000000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,137 +0,0 @@ -version: 2.1 - -jobs: - test-arm: - machine: - image: ubuntu-2004:2022.04.1 - resource_class: arm.large - environment: - ENV_FILE: ci/deps/circle-310-arm64.yaml - PYTEST_WORKERS: auto - PATTERN: "not single_cpu and not slow and not network and not clipboard and not arm_slow and not db" - PYTEST_TARGET: "pandas" - PANDAS_CI: "1" - steps: - - checkout - - run: .circleci/setup_env.sh - - run: > - PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH - LD_PRELOAD=$HOME/miniconda3/envs/pandas-dev/lib/libgomp.so.1:$LD_PRELOAD - ci/run_tests.sh - linux-musl: - docker: - - image: quay.io/pypa/musllinux_1_1_aarch64 - resource_class: arm.large - steps: - # Install pkgs first to have git in the image - # (needed for checkout) - - run: | - apk update - apk add git - apk add musl-locales - - checkout - - run: | - /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev - . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 - python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" - python -m pip list --no-cache-dir - - run: | - . ~/virtualenvs/pandas-dev/bin/activate - export PANDAS_CI=1 - python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml - build-aarch64: - parameters: - cibw-build: - type: string - machine: - image: ubuntu-2004:2022.04.1 - resource_class: arm.large - environment: - TRIGGER_SOURCE: << pipeline.trigger_source >> - steps: - - checkout - - run: - name: Check if build is necessary - command: | - # Check if tag is defined or TRIGGER_SOURCE is scheduled - if [[ -n "$CIRCLE_TAG" ]]; then - echo 'export IS_PUSH="true"' >> "$BASH_ENV" - elif [[ $TRIGGER_SOURCE == "scheduled_pipeline" ]]; then - echo 'export IS_SCHEDULE_DISPATCH="true"' >> "$BASH_ENV" - # Look for the build label/[wheel build] in commit - # grep takes a regex, so need to escape brackets - elif (git log --format=oneline -n 1 $CIRCLE_SHA1) | grep -q '\[wheel build\]'; then - : # Do nothing - elif ! 
(curl https://api.github.com/repos/pandas-dev/pandas/issues/$CIRCLE_PR_NUMBER | jq '.labels' | grep -q 'Build'); then - circleci-agent step halt - fi - - run: - name: Build aarch64 wheels - no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that - command: | - pip3 install cibuildwheel==2.15.0 - # When this is a nightly wheel build, allow picking up NumPy 2.0 dev wheels: - if [[ "$IS_SCHEDULE_DISPATCH" == "true" || "$IS_PUSH" != 'true' ]]; then - export CIBW_ENVIRONMENT="PIP_EXTRA_INDEX_URL=https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" - fi - cibuildwheel --prerelease-pythons --output-dir wheelhouse - - environment: - CIBW_BUILD: << parameters.cibw-build >> - - - run: - name: Install Anaconda Client & Upload Wheels - command: | - echo "Install Mambaforge" - MAMBA_URL="https://github.com/conda-forge/miniforge/releases/download/23.1.0-0/Mambaforge-23.1.0-0-Linux-aarch64.sh" - echo "Downloading $MAMBA_URL" - wget -q $MAMBA_URL -O minimamba.sh - chmod +x minimamba.sh - - MAMBA_DIR="$HOME/miniconda3" - rm -rf $MAMBA_DIR - ./minimamba.sh -b -p $MAMBA_DIR - - export PATH=$MAMBA_DIR/bin:$PATH - - mamba install -y -c conda-forge anaconda-client - - source ci/upload_wheels.sh - set_upload_vars - upload_wheels - - store_artifacts: - path: wheelhouse/ - -workflows: - test: - # Don't run trigger this one when scheduled pipeline runs - when: - not: - equal: [ scheduled_pipeline, << pipeline.trigger_source >> ] - jobs: - - test-arm - test-musl: - # Don't run trigger this one when scheduled pipeline runs - when: - not: - equal: [ scheduled_pipeline, << pipeline.trigger_source >> ] - jobs: - - linux-musl - build-wheels: - jobs: - - build-aarch64: - filters: - tags: - only: /^v.*/ - matrix: - parameters: - cibw-build: ["cp39-manylinux_aarch64", - "cp310-manylinux_aarch64", - "cp311-manylinux_aarch64", - "cp312-manylinux_aarch64", - "cp39-musllinux_aarch64", - "cp310-musllinux_aarch64", - "cp311-musllinux_aarch64", - "cp312-musllinux_aarch64",] diff --git a/.gitattributes b/.gitattributes index 19c6fd2fd1d47..bc7dec642df0f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -61,14 +61,13 @@ pandas/_version.py export-subst *.pxi export-ignore # Ignoring stuff from the top level -.circleci export-ignore .github export-ignore asv_bench export-ignore ci export-ignore doc export-ignore gitpod export-ignore MANIFEST.in export-ignore -scripts export-ignore +scripts/** export-ignore typings export-ignore web export-ignore CITATION.cff export-ignore @@ -82,3 +81,6 @@ setup.py export-ignore # csv_dir_path fixture checks the existence of the directory # exclude the whole directory to avoid running related tests in sdist pandas/tests/io/parser/data export-ignore + +# Include cibw script in sdist since it's needed for building wheels +scripts/cibw_before_build.sh -export-ignore diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml index b4778b74df335..e4b209d83913d 100644 --- a/.github/actions/run-tests/action.yml +++ b/.github/actions/run-tests/action.yml @@ -1,20 +1,13 @@ name: Run tests and report results -inputs: - preload: - description: Preload arguments for sanitizer - required: false - asan_options: - description: Arguments for Address Sanitizer (ASAN) - required: false runs: using: composite steps: - name: Test - run: ${{ inputs.asan_options }} ${{ inputs.preload }} ci/run_tests.sh + run: ci/run_tests.sh shell: bash -el {0} - name: Publish test results - uses: actions/upload-artifact@v3 + uses: 
actions/upload-artifact@v4 with: name: Test results path: test-data.xml diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml index ceeebfcd1c90c..3eb68bdd2a15c 100644 --- a/.github/actions/setup-conda/action.yml +++ b/.github/actions/setup-conda/action.yml @@ -14,3 +14,9 @@ runs: condarc-file: ci/.condarc cache-environment: true cache-downloads: true + + - name: Uninstall pyarrow + if: ${{ env.REMOVE_PYARROW == '1' }} + run: | + micromamba remove -y pyarrow + shell: bash -el {0} diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index b49b9a67c4743..dacf740e5d4d8 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.1.x + - 2.3.x pull_request: branches: - main - - 2.1.x + - 2.3.x env: ENV_FILE: environment.yml @@ -51,6 +51,11 @@ jobs: # TODO: The doctests have to be run first right now, since the Cython doctests only work # with pandas installed in non-editable mode # This can be removed once pytest-cython doesn't require C extensions to be installed inplace + + - name: Extra installs + # https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd + run: sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 + - name: Run doctests run: cd ci && ./code_checks.sh doctests if: ${{ steps.build.outcome == 'success' && always() }} diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml index da232404e6ff5..3abe9c92bcefa 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -4,13 +4,13 @@ on: push: branches: - main - - 2.1.x + - 2.3.x tags: - '*' pull_request: branches: - main - - 2.1.x + - 2.3.x env: ENV_FILE: environment.yml @@ -46,6 +46,10 @@ jobs: - name: Build Pandas uses: ./.github/actions/build_pandas + - name: Extra installs + # https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd + run: sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 + - name: Test website run: python -m pytest web/ diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index 04d8b8e006985..485a890e26abd 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.1.x + - 2.3.x pull_request: branches: - main - - 2.1.x + - 2.3.x types: [ labeled, opened, synchronize, reopened ] permissions: @@ -24,7 +24,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - extra: ["test", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "consortium-standard", "all"] + extra: ["test", "pyarrow", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "consortium-standard", "all"] fail-fast: false name: Install Extras - ${{ matrix.extra }} concurrency: @@ -53,7 +53,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - python-version: ['3.9', '3.10', '3.11'] + python-version: ['3.10', '3.11'] fail-fast: false name: Test Conda Forge Recipe - Python ${{ matrix.python-version }} concurrency: diff --git 
a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 6ca4d19196874..321b633bbb6bb 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.1.x + - 2.3.x pull_request: branches: - main - - 2.1.x + - 2.3.x paths-ignore: - "doc/**" - "web/**" @@ -22,21 +22,25 @@ defaults: jobs: ubuntu: - runs-on: ubuntu-22.04 + runs-on: ${{ matrix.platform }} timeout-minutes: 90 strategy: matrix: - env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml] + platform: [ubuntu-22.04, ubuntu-24.04-arm] + env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml] # Prevent the include jobs from overriding other jobs pattern: [""] + pandas_future_infer_string: ["0"] include: - name: "Downstream Compat" env_file: actions-311-downstream_compat.yaml pattern: "not slow and not network and not single_cpu" pytest_target: "pandas/tests/test_downstream.py" + platform: ubuntu-22.04 - name: "Minimum Versions" - env_file: actions-39-minimum_versions.yaml + env_file: actions-310-minimum_versions.yaml pattern: "not slow and not network and not single_cpu" + platform: ubuntu-22.04 - name: "Locale: it_IT" env_file: actions-311.yaml pattern: "not slow and not network and not single_cpu" @@ -47,6 +51,7 @@ jobs: # Also install it_IT (its encoding is ISO8859-1) but do not activate it. # It will be temporarily activated during tests with locale.setlocale extra_loc: "it_IT" + platform: ubuntu-22.04 - name: "Locale: zh_CN" env_file: actions-311.yaml pattern: "not slow and not network and not single_cpu" @@ -57,70 +62,76 @@ jobs: # Also install zh_CN (its encoding is gb2312) but do not activate it. # It will be temporarily activated during tests with locale.setlocale extra_loc: "zh_CN" - - name: "Copy-on-Write 3.9" - env_file: actions-39.yaml - pattern: "not slow and not network and not single_cpu" - pandas_copy_on_write: "1" + platform: ubuntu-22.04 - name: "Copy-on-Write 3.10" env_file: actions-310.yaml pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "1" + platform: ubuntu-22.04 - name: "Copy-on-Write 3.11" env_file: actions-311.yaml pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "1" + platform: ubuntu-22.04 - name: "Copy-on-Write 3.12" env_file: actions-312.yaml pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "1" + platform: ubuntu-22.04 - name: "Copy-on-Write 3.11 (warnings)" env_file: actions-311.yaml pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "warn" + platform: ubuntu-22.04 - name: "Copy-on-Write 3.10 (warnings)" env_file: actions-310.yaml pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "warn" - - name: "Copy-on-Write 3.9 (warnings)" - env_file: actions-39.yaml - pattern: "not slow and not network and not single_cpu" - pandas_copy_on_write: "warn" + platform: ubuntu-22.04 + - name: "Future infer strings" + env_file: actions-312.yaml + pandas_future_infer_string: "1" + pandas_copy_on_write: "1" + platform: ubuntu-22.04 + - name: "Future infer strings (without pyarrow)" + env_file: actions-311.yaml + pandas_future_infer_string: "1" + pandas_copy_on_write: "1" + platform: ubuntu-22.04 - name: "Pypy" env_file: actions-pypy-39.yaml pattern: "not slow and not network and not single_cpu" test_args: "--max-worker-restart 0" + platform: ubuntu-22.04 - name: "Numpy Dev" env_file: actions-311-numpydev.yaml pattern: "not slow and not network and not single_cpu" - 
test_args: "-W error::FutureWarning" + test_args: "-W error::DeprecationWarning -W error::FutureWarning" + platform: ubuntu-22.04 - name: "Pyarrow Nightly" env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" - - name: "ASAN / UBSAN" - env_file: actions-311-sanitizers.yaml - pattern: "not slow and not network and not single_cpu and not skip_ubsan" - asan_options: "ASAN_OPTIONS=detect_leaks=0" - preload: LD_PRELOAD=$(gcc -print-file-name=libasan.so) - meson_args: --config-settings=setup-args="-Db_sanitize=address,undefined" - cflags_adds: -fno-sanitize-recover=all - pytest_workers: -1 # disable pytest-xdist as it swallows stderr from ASAN + pandas_future_infer_string: "1" + pandas_copy_on_write: "1" + platform: ubuntu-22.04 fail-fast: false - name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }} + name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }}-${{ matrix.platform }} env: PATTERN: ${{ matrix.pattern }} LANG: ${{ matrix.lang || 'C.UTF-8' }} LC_ALL: ${{ matrix.lc_all || '' }} PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }} - PANDAS_CI: ${{ matrix.pandas_ci || '1' }} + PANDAS_CI: '1' + PANDAS_FUTURE_INFER_STRING: ${{ matrix.pandas_future_infer_string || '0' }} TEST_ARGS: ${{ matrix.test_args || '' }} PYTEST_WORKERS: ${{ matrix.pytest_workers || 'auto' }} PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} - NPY_PROMOTION_STATE: ${{ matrix.env_file == 'actions-311-numpydev.yaml' && 'weak' || 'legacy' }} # Clipboard tests QT_QPA_PLATFORM: offscreen + REMOVE_PYARROW: ${{ matrix.name == 'Future infer strings (without pyarrow)' && '1' || '0' }} concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_copy_on_write || '' }} + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_copy_on_write || '' }}-${{ matrix.pandas_future_infer_string }}-${{ matrix.platform }} cancel-in-progress: true services: @@ -167,8 +178,8 @@ jobs: fetch-depth: 0 - name: Extra installs - run: sudo apt-get update && sudo apt-get install -y ${{ matrix.extra_apt }} - if: ${{ matrix.extra_apt }} + # https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd + run: sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 ${{ matrix.extra_apt || ''}} - name: Generate extra locales # These extra locales will be available for locale.setlocale() calls in tests @@ -190,18 +201,12 @@ jobs: - name: Test (not single_cpu) uses: ./.github/actions/run-tests if: ${{ matrix.name != 'Pypy' }} - with: - preload: ${{ matrix.preload }} - asan_options: ${{ matrix.asan_options }} env: # Set pattern to not single_cpu if not already set PATTERN: ${{ env.PATTERN == '' && 'not single_cpu' || matrix.pattern }} - name: Test (single_cpu) uses: ./.github/actions/run-tests - with: - preload: ${{ matrix.preload }} - asan_options: ${{ matrix.asan_options }} env: PATTERN: 'single_cpu' PYTEST_WORKERS: 0 @@ -211,8 +216,9 @@ jobs: timeout-minutes: 90 strategy: matrix: - os: [macos-latest, windows-latest] - env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml] + # Note: Don't use macOS latest since macos 14 appears to be arm64 only + os: [macos-13, 
macos-14, windows-latest] + env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml] fail-fast: false runs-on: ${{ matrix.os }} name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }} @@ -224,8 +230,7 @@ jobs: PANDAS_CI: 1 PYTEST_TARGET: pandas PATTERN: "not slow and not db and not network and not single_cpu" - # GH 47443: PYTEST_WORKERS > 0 crashes Windows builds with memory related errors - PYTEST_WORKERS: ${{ matrix.os == 'macos-latest' && 'auto' || '0' }} + PYTEST_WORKERS: 'auto' steps: - name: Checkout @@ -266,12 +271,14 @@ jobs: fi - name: Build environment and Run Tests # https://github.com/numpy/numpy/issues/24703#issuecomment-1722379388 + # Note: Pinned to Cython 3.0.10 to avoid numerical instability in 32-bit environments + # https://github.com/pandas-dev/pandas/pull/61423 run: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true" - python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] cython==3.0.10 python-dateutil "pytz<2024.2" pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" python -m pip list --no-cache-dir export PANDAS_CI=1 @@ -309,7 +316,7 @@ jobs: /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil "pytz<2024.2" pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" python -m pip list --no-cache-dir @@ -343,7 +350,7 @@ jobs: # To freeze this file, uncomment out the ``if: false`` condition, and migrate the jobs # to the corresponding posix/windows-macos/sdist etc. workflows. # Feel free to modify this comment as necessary. 
- if: false # Uncomment this to freeze the workflow, comment it to unfreeze + # if: false # Uncomment this to freeze the workflow, comment it to unfreeze defaults: run: shell: bash -eou pipefail {0} @@ -351,7 +358,8 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-22.04, macOS-latest, windows-latest] + # Separate out macOS 13 and 14, since macOS 14 is arm64 only + os: [ubuntu-22.04, macOS-13, macOS-14, windows-latest] timeout-minutes: 90 @@ -374,7 +382,7 @@ jobs: - name: Set up Python Dev Version uses: actions/setup-python@v5 with: - python-version: '3.12-dev' + python-version: '3.13-dev' - name: Build Environment run: | @@ -382,7 +390,7 @@ jobs: python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov + python -m pip install python-dateutil "pytz<2024.2" tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov python -m pip install -ve . --no-build-isolation --no-index --no-deps --config-settings=setup-args="--werror" python -m pip list diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 841559c8e9799..e5d13307973e0 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -94,10 +94,23 @@ jobs: buildplat: - [ubuntu-22.04, manylinux_x86_64] - [ubuntu-22.04, musllinux_x86_64] - - [macos-12, macosx_*] + - [ubuntu-24.04-arm, manylinux_aarch64] + - [macos-13, macosx_x86_64] + # Note: M1 images on Github Actions start from macOS 14 + - [macos-14, macosx_arm64] - [windows-2022, win_amd64] # TODO: support PyPy? 
- python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]] + python: [["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"], ["cp313", "3.13"], ["cp313t", "3.13"]] + include: + # TODO: Remove this plus installing build deps in cibw_before_build.sh + # after pandas can be built with a released NumPy/Cython + - python: ["cp313t", "3.13"] + cibw_build_frontend: 'pip; args: --no-build-isolation' + # TODO: Build free-threaded wheels for Windows + exclude: + - buildplat: [windows-2022, win_amd64] + python: ["cp313t", "3.13"] + env: IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} @@ -128,7 +141,7 @@ jobs: # Python version used to build sdist doesn't matter # wheel will be built from sdist with the correct version - name: Unzip sdist (macOS) - if: ${{ matrix.buildplat[1] == 'macosx_*' }} + if: ${{ startsWith(matrix.buildplat[1], 'macosx') }} run: | tar -xzf ./dist/${{ env.sdist_name }} -C ./dist @@ -137,26 +150,14 @@ jobs: shell: bash -el {0} run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - - name: Build normal wheels - if: ${{ (env.IS_SCHEDULE_DISPATCH != 'true' || env.IS_PUSH == 'true') }} - uses: pypa/cibuildwheel@v2.16.2 - with: - package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} - env: - CIBW_PRERELEASE_PYTHONS: True - CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} - - - name: Build nightly wheels (with NumPy pre-release) - if: ${{ (env.IS_SCHEDULE_DISPATCH == 'true' && env.IS_PUSH != 'true') }} - uses: pypa/cibuildwheel@v2.16.2 + - name: Build wheels + uses: pypa/cibuildwheel@v2.21.3 with: - package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} + package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: - # The nightly wheels should be build witht he NumPy 2.0 pre-releases - # which requires the additional URL. 
- CIBW_ENVIRONMENT: PIP_EXTRA_INDEX_URL=https://pypi.anaconda.org/scientific-python-nightly-wheels/simple CIBW_PRERELEASE_PYTHONS: True CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} + CIBW_BUILD_FRONTEND: ${{ matrix.cibw_build_frontend || 'pip' }} - name: Set up Python uses: mamba-org/setup-micromamba@v1 @@ -181,17 +182,17 @@ jobs: shell: pwsh run: | $TST_CMD = @" - python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0; + python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytz<2024.2; python -m pip install `$(Get-Item pandas\wheelhouse\*.whl); - python -c `'import pandas as pd; pd.test(extra_args=[\"`\"--no-strict-data-files`\"\", \"`\"-m not clipboard and not single_cpu and not slow and not network and not db`\"\"])`'; + python -c `'import pandas as pd; pd.test(extra_args=[`\"--no-strict-data-files`\", `\"-m not clipboard and not single_cpu and not slow and not network and not db`\"])`'; "@ # add rc to the end of the image name if the Python version is unreleased - docker pull python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} - docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} powershell -Command $TST_CMD + docker pull python:${{ matrix.python[1] == '3.13' && '3.13-rc' || format('{0}-windowsservercore', matrix.python[1]) }} + docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] == '3.13' && '3.13-rc' || format('{0}-windowsservercore', matrix.python[1]) }} powershell -Command $TST_CMD - uses: actions/upload-artifact@v4 with: - name: ${{ matrix.python[0] }}-${{ startsWith(matrix.buildplat[1], 'macosx') && 'macosx' || matrix.buildplat[1] }} + name: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} path: ./wheelhouse/*.whl - name: Upload wheels & sdist diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2a070e9a49b97..9b3a9827e67e2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,6 +32,8 @@ repos: # TODO: remove autofixe-only rules when they are checked by ruff name: ruff-selected-autofixes alias: ruff-selected-autofixes + files: ^pandas + exclude: ^pandas/tests args: [--select, "ANN001,ANN2", --fix-only, --exit-non-zero-on-fix] - repo: https://github.com/jendrikseipp/vulture rev: 'v2.10' @@ -272,13 +274,6 @@ repos: language: python types: [rst] files: ^doc/source/(development|reference)/ - - id: unwanted-patterns-bare-pytest-raises - name: Check for use of bare pytest raises - language: python - entry: python scripts/validate_unwanted_patterns.py --validation-type="bare_pytest_raises" - types: [python] - files: ^pandas/tests/ - exclude: ^pandas/tests/extension/ - id: unwanted-patterns-private-function-across-module name: Check for use of private functions across modules language: python @@ -356,18 +351,6 @@ repos: files: ^pandas/ exclude: ^(pandas/_libs/|pandas/tests/|pandas/errors/__init__.py$|pandas/_version.py) types: [python] - - id: future-annotations - name: import annotations from __future__ - entry: 'from __future__ import annotations' - language: pygrep - args: [--negate] - files: ^pandas/ - types: [python] - exclude: | - (?x) - /(__init__\.py)|(api\.py)|(_version\.py)|(testing\.py)|(conftest\.py)$ - |/tests/ - |/_testing/ - id: check-test-naming name: check that test names start with 'test' entry: python -m scripts.check_test_naming diff --git a/MANIFEST.in b/MANIFEST.in index 
9894381ed6252..a7d7d7eb4e062 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -62,3 +62,6 @@ prune pandas/tests/io/parser/data # Selectively re-add *.cxx files that were excluded above graft pandas/_libs/src graft pandas/_libs/include + +# Include cibw script in sdist since it's needed for building wheels +include scripts/cibw_before_build.sh diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 9ac83db4f85b9..dae6107db4d92 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -408,6 +408,9 @@ def time_read_stringcsv(self, engine): def time_read_bytescsv(self, engine): read_csv(self.data(self.BytesIO_input), engine=engine) + def peakmem_read_csv(self, engine): + read_csv(self.data(self.BytesIO_input), engine=engine) + class ReadCSVCategorical(BaseIO): fname = "__test__.csv" diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml similarity index 93% rename from ci/deps/actions-39-minimum_versions.yaml rename to ci/deps/actions-310-minimum_versions.yaml index fd71315d2e7ac..cac0814acfbd5 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -4,12 +4,12 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.9 + - python=3.10 # build dependencies - - versioneer[toml] + - versioneer - cython>=0.29.33 - - meson[ninja]=1.2.1 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies @@ -22,7 +22,7 @@ dependencies: # required dependencies - python-dateutil=2.8.2 - - numpy=1.22.4, <2 + - numpy=1.22.4 - pytz=2020.1 # optional dependencies diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 4b62ecc79e4ef..0572091d88cd9 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -5,23 +5,23 @@ dependencies: - python=3.10 # build dependencies - - versioneer[toml] + - versioneer - cython>=0.29.33 - - meson[ninja]=1.2.1 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - pytest-qt>=4.2.0 - boto3 # required dependencies - python-dateutil - - numpy<2 - - pytz + - numpy + # pytz 2024.2 timezones cause wrong results + - pytz<2024.2 # optional dependencies - beautifulsoup4>=4.11.2 @@ -52,7 +52,7 @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <=2024.9.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 @@ -61,3 +61,4 @@ dependencies: - adbc-driver-postgresql>=0.8.0 - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 + - pytest-localserver>=0.7.1 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 95c0319d6f5b8..f7e9ad045ed04 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -6,9 +6,9 @@ dependencies: - python=3.11 # build dependencies - - versioneer[toml] + - versioneer - cython>=0.29.33 - - meson[ninja]=1.2.1 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies @@ -21,8 +21,9 @@ dependencies: # required dependencies - python-dateutil - - numpy<2 - - pytz + - numpy + # pytz 2024.2 timezones cause wrong results + - pytz<2024.2 # optional dependencies - beautifulsoup4>=4.11.2 @@ -53,7 +54,7 @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <=2024.9.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml 
index b62e8630f2059..d714e99c765e7 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -5,8 +5,8 @@ dependencies: - python=3.11 # build dependencies - - versioneer[toml] - - meson[ninja]=1.2.1 + - versioneer + - meson=1.2.1 - meson-python=0.13.1 - cython>=0.29.33 @@ -21,7 +21,8 @@ dependencies: # pandas dependencies - python-dateutil - - pytz + # pytz 2024.2 timezones cause wrong results + - pytz<2024.2 - pip - pip: diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 5455b9b84b034..40b936472d409 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -5,8 +5,8 @@ dependencies: - python=3.11 # build dependencies - - versioneer[toml] - - meson[ninja]=1.2.1 + - versioneer + - meson=1.2.1 - cython>=0.29.33 - meson-python=0.13.1 @@ -18,13 +18,14 @@ dependencies: # required dependencies - python-dateutil - - numpy<2 - - pytz + - numpy + # pytz 2024.2 timezones cause wrong results + - pytz<2024.2 - pip - pip: - "tzdata>=2022.7" - - "--extra-index-url https://pypi.fury.io/arrow-nightlies/" + - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" - "--prefer-binary" - "--pre" - "pyarrow" diff --git a/ci/deps/actions-311-sanitizers.yaml b/ci/deps/actions-311-sanitizers.yaml deleted file mode 100644 index dcd381066b0ea..0000000000000 --- a/ci/deps/actions-311-sanitizers.yaml +++ /dev/null @@ -1,32 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.11 - - # build dependencies - - versioneer[toml] - - cython>=0.29.33 - - meson[ninja]=1.2.1 - - meson-python=0.13.1 - - # test dependencies - - pytest>=7.3.2 - - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - - pytest-qt>=4.2.0 - - boto3 - - hypothesis>=6.46.1 - - pyqt>=5.15.9 - - # required dependencies - - python-dateutil - - numpy<2 - - pytz - - # pandas dependencies - - pip - - - pip: - - "tzdata>=2022.7" diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 52074ae00ea18..db89be7780bf0 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -5,23 +5,23 @@ dependencies: - python=3.11 # build dependencies - - versioneer[toml] + - versioneer - cython>=0.29.33 - - meson[ninja]=1.2.1 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - pytest-qt>=4.2.0 - boto3 # required dependencies - python-dateutil - - numpy<2 - - pytz + - numpy + # pytz 2024.2 timezones cause wrong results + - pytz<2024.2 # optional dependencies - beautifulsoup4>=4.11.2 @@ -52,7 +52,7 @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <=2024.9.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 @@ -60,4 +60,4 @@ dependencies: - pip: - adbc-driver-postgresql>=0.8.0 - adbc-driver-sqlite>=0.8.0 - - tzdata>=2022.7 + - pytest-localserver>=0.7.1 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index 4c51e9e6029e3..4d690501571a7 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -5,23 +5,23 @@ dependencies: - python=3.12 # build dependencies - - versioneer[toml] + - versioneer - cython>=0.29.33 - - meson[ninja]=1.2.1 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - pytest-qt>=4.2.0 - boto3 # required dependencies - python-dateutil - - numpy<2 - - pytz + - numpy + # 
pytz 2024.2 timezones cause wrong results + - pytz<2024.2 # optional dependencies - beautifulsoup4>=4.11.2 @@ -52,7 +52,7 @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <=2024.9.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 @@ -61,3 +61,4 @@ dependencies: - adbc-driver-postgresql>=0.8.0 - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 + - pytest-localserver>=0.7.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml deleted file mode 100644 index cbe8f77c15730..0000000000000 --- a/ci/deps/actions-39.yaml +++ /dev/null @@ -1,63 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.9 - - # build dependencies - - versioneer[toml] - - cython>=0.29.33 - - meson[ninja]=1.2.1 - - meson-python=0.13.1 - - # test dependencies - - pytest>=7.3.2 - - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - - pytest-qt>=4.2.0 - - boto3 - - # required dependencies - - python-dateutil - - numpy<2 - - pytz - - # optional dependencies - - beautifulsoup4>=4.11.2 - - blosc>=1.21.3 - - bottleneck>=1.3.6 - - fastparquet>=2022.12.0 - - fsspec>=2022.11.0 - - html5lib>=1.1 - - hypothesis>=6.46.1 - - gcsfs>=2022.11.0 - - jinja2>=3.1.2 - - lxml>=4.9.2 - - matplotlib>=3.6.3 - - numba>=0.56.4 - - numexpr>=2.8.4 - - odfpy>=1.4.1 - - qtpy>=2.3.0 - - openpyxl>=3.1.0 - - psycopg2>=2.9.6 - - pyarrow>=10.0.1 - - pymysql>=1.0.2 - - pyqt>=5.15.9 - - pyreadstat>=1.2.0 - - pytables>=3.8.0 - - python-calamine>=0.1.7 - - pyxlsb>=1.0.10 - - s3fs>=2022.11.0 - - scipy>=1.10.0 - - sqlalchemy>=2.0.0 - - tabulate>=0.9.0 - - xarray>=2022.12.0 - - xlrd>=2.0.1 - - xlsxwriter>=3.0.5 - - zstandard>=0.19.0 - - - pip: - - adbc-driver-postgresql>=0.8.0 - - adbc-driver-sqlite>=0.8.0 - - tzdata>=2022.7 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index 5a5a01f7aec72..ba518312df24c 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -8,9 +8,9 @@ dependencies: - python=3.9[build=*_pypy] # build dependencies - - versioneer[toml] + - versioneer - cython>=0.29.33 - - meson[ninja]=1.2.1 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies @@ -20,8 +20,9 @@ dependencies: - hypothesis>=6.46.1 # required - - numpy<2 + - numpy - python-dateutil + # pytz 2024.2 timezones cause wrong results - pytz - pip: - tzdata>=2022.7 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml deleted file mode 100644 index 8e106445cd4e0..0000000000000 --- a/ci/deps/circle-310-arm64.yaml +++ /dev/null @@ -1,61 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.10 - - # build dependencies - - versioneer[toml] - - cython>=0.29.33 - - meson[ninja]=1.2.1 - - meson-python=0.13.1 - - # test dependencies - - pytest>=7.3.2 - - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - - pytest-qt>=4.2.0 - - boto3 - - # required dependencies - - python-dateutil - - numpy<2 - - pytz - - # optional dependencies - - beautifulsoup4>=4.11.2 - - blosc>=1.21.3 - - bottleneck>=1.3.6 - - fastparquet>=2022.12.0 - - fsspec>=2022.11.0 - - html5lib>=1.1 - - hypothesis>=6.46.1 - - gcsfs>=2022.11.0 - - jinja2>=3.1.2 - - lxml>=4.9.2 - - matplotlib>=3.6.3 - - numba>=0.56.4 - - numexpr>=2.8.4 - - odfpy>=1.4.1 - - qtpy>=2.3.0 - - openpyxl>=3.1.0 - - psycopg2>=2.9.6 - - pyarrow>=10.0.1 - - pymysql>=1.0.2 - - pyqt>=5.15.9 - - pyreadstat>=1.2.0 - - pytables>=3.8.0 - - python-calamine>=0.1.7 - - pyxlsb>=1.0.10 - - s3fs>=2022.11.0 - - scipy>=1.10.0 - - sqlalchemy>=2.0.0 
- - tabulate>=0.9.0 - - xarray>=2022.12.0 - - xlrd>=2.0.1 - - xlsxwriter>=3.0.5 - - zstandard>=0.19.0 - - pip: - - adbc-driver-postgresql>=0.8.0 - - adbc-driver-sqlite>=0.8.0 diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 48ef21686a26f..39ab0890a32d1 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -10,7 +10,7 @@ echo PYTHONHASHSEED=$PYTHONHASHSEED COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.toml" -PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" +PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fE -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" if [[ "$PATTERN" ]]; then PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" diff --git a/doc/source/conf.py b/doc/source/conf.py index be6150d4e54ba..3f3241f81af59 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -254,7 +254,9 @@ "json_url": "https://pandas.pydata.org/versions.json", "version_match": switcher_version, }, - "show_version_warning_banner": True, + # This shows a warning for patch releases since the + # patch version doesn't compare as equal (e.g. 2.2.1 != 2.2.0 but it should be) + "show_version_warning_banner": False, "icon_links": [ { "name": "Mastodon", diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 1d7eca5223544..b9f7d64d4b2f8 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -21,7 +21,7 @@ Instructions for installing :ref:`from source `, Python version support ---------------------- -Officially Python 3.9, 3.10 and 3.11. +Officially Python 3.9, 3.10, 3.11 and 3.12. Installing pandas ----------------- diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index fefb02dd916cd..1d9019ff22c23 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -49,6 +49,7 @@ Conversion DataFrame.infer_objects DataFrame.copy DataFrame.bool + DataFrame.to_numpy Indexing, iteration ~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index af262f9e6c336..d40f6e559b8fa 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -177,6 +177,7 @@ Reindexing / selection / label manipulation :toctree: api/ Series.align + Series.case_when Series.drop Series.droplevel Series.drop_duplicates @@ -341,7 +342,6 @@ Datetime properties Series.dt.tz Series.dt.freq Series.dt.unit - Series.dt.normalize Datetime methods ^^^^^^^^^^^^^^^^ diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index f7d89110e6c8f..2ed446324f6b9 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -160,11 +160,10 @@ Here is a sample (using 100 column x 100,000 row ``DataFrames``): .. csv-table:: :header: "Operation", "0.11.0 (ms)", "Prior Version (ms)", "Ratio to Prior" :widths: 25, 25, 25, 25 - :delim: ; - ``df1 > df2``; 13.32; 125.35; 0.1063 - ``df1 * df2``; 21.71; 36.63; 0.5928 - ``df1 + df2``; 22.04; 36.50; 0.6039 + ``df1 > df2``, 13.32, 125.35, 0.1063 + ``df1 * df2``, 21.71, 36.63, 0.5928 + ``df1 + df2``, 22.04, 36.50, 0.6039 You are highly encouraged to install both libraries. See the section :ref:`Recommended Dependencies ` for more installation info. 
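For readers unfamiliar with the :meth:`Series.case_when` entry added to series.rst above: a minimal sketch of the call, assuming pandas >= 2.2 (where ``case_when`` was introduced) and toy data made up for illustration.

.. code-block:: python

    import pandas as pd

    ser = pd.Series([6, 7, 8, 9], name="a")

    # caselist is a list of (condition, replacement) tuples. Conditions are
    # evaluated in order; the first True condition for an element wins, and
    # elements that match no condition keep their original value.
    ser.case_when(
        caselist=[
            (ser > 8, "high"),    # only 9 matches here
            (ser > 6, "medium"),  # 7 and 8 fall through to this case
        ]
    )

Mixing string replacements into an integer Series, as here, yields an object-dtype result; 6 matches no case and is left unchanged.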
diff --git a/doc/source/user_guide/copy_on_write.rst b/doc/source/user_guide/copy_on_write.rst index 050c3901c3420..a083297925007 100644 --- a/doc/source/user_guide/copy_on_write.rst +++ b/doc/source/user_guide/copy_on_write.rst @@ -317,7 +317,7 @@ you are modifying one object inplace. .. ipython:: python df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - df2 = df.reset_index() + df2 = df.reset_index(drop=True) df2.iloc[0, 0] = 100 This creates two objects that share data and thus the setitem operation will trigger a @@ -328,7 +328,7 @@ held by the object. .. ipython:: python df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - df = df.reset_index() + df = df.reset_index(drop=True) df.iloc[0, 0] = 100 No copy is necessary in this example. diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index 99c85ac66623d..26eb656357bf6 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -315,19 +315,8 @@ Why not make NumPy like R? Many people have suggested that NumPy should simply emulate the ``NA`` support present in the more domain-specific statistical programming language `R -`__. Part of the reason is the NumPy type hierarchy: - -.. csv-table:: - :header: "Typeclass","Dtypes" - :widths: 30,70 - :delim: | - - ``numpy.floating`` | ``float16, float32, float64, float128`` - ``numpy.integer`` | ``int8, int16, int32, int64`` - ``numpy.unsignedinteger`` | ``uint8, uint16, uint32, uint64`` - ``numpy.object_`` | ``object_`` - ``numpy.bool_`` | ``bool_`` - ``numpy.character`` | ``bytes_, str_`` +`__. Part of the reason is the +`NumPy type hierarchy `__. The R language, by contrast, only has a handful of built-in data types: ``integer``, ``numeric`` (floating-point), ``character``, and diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 11863f8aead31..ea08ffe061244 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -509,29 +509,28 @@ listed below, those with a ``*`` do *not* have an efficient, GroupBy-specific, i .. 
csv-table:: :header: "Method", "Description" :widths: 20, 80 - :delim: ; - - :meth:`~.DataFrameGroupBy.any`;Compute whether any of the values in the groups are truthy - :meth:`~.DataFrameGroupBy.all`;Compute whether all of the values in the groups are truthy - :meth:`~.DataFrameGroupBy.count`;Compute the number of non-NA values in the groups - :meth:`~.DataFrameGroupBy.cov` * ;Compute the covariance of the groups - :meth:`~.DataFrameGroupBy.first`;Compute the first occurring value in each group - :meth:`~.DataFrameGroupBy.idxmax`;Compute the index of the maximum value in each group - :meth:`~.DataFrameGroupBy.idxmin`;Compute the index of the minimum value in each group - :meth:`~.DataFrameGroupBy.last`;Compute the last occurring value in each group - :meth:`~.DataFrameGroupBy.max`;Compute the maximum value in each group - :meth:`~.DataFrameGroupBy.mean`;Compute the mean of each group - :meth:`~.DataFrameGroupBy.median`;Compute the median of each group - :meth:`~.DataFrameGroupBy.min`;Compute the minimum value in each group - :meth:`~.DataFrameGroupBy.nunique`;Compute the number of unique values in each group - :meth:`~.DataFrameGroupBy.prod`;Compute the product of the values in each group - :meth:`~.DataFrameGroupBy.quantile`;Compute a given quantile of the values in each group - :meth:`~.DataFrameGroupBy.sem`;Compute the standard error of the mean of the values in each group - :meth:`~.DataFrameGroupBy.size`;Compute the number of values in each group - :meth:`~.DataFrameGroupBy.skew` *;Compute the skew of the values in each group - :meth:`~.DataFrameGroupBy.std`;Compute the standard deviation of the values in each group - :meth:`~.DataFrameGroupBy.sum`;Compute the sum of the values in each group - :meth:`~.DataFrameGroupBy.var`;Compute the variance of the values in each group + + :meth:`~.DataFrameGroupBy.any`,Compute whether any of the values in the groups are truthy + :meth:`~.DataFrameGroupBy.all`,Compute whether all of the values in the groups are truthy + :meth:`~.DataFrameGroupBy.count`,Compute the number of non-NA values in the groups + :meth:`~.DataFrameGroupBy.cov` * ,Compute the covariance of the groups + :meth:`~.DataFrameGroupBy.first`,Compute the first occurring value in each group + :meth:`~.DataFrameGroupBy.idxmax`,Compute the index of the maximum value in each group + :meth:`~.DataFrameGroupBy.idxmin`,Compute the index of the minimum value in each group + :meth:`~.DataFrameGroupBy.last`,Compute the last occurring value in each group + :meth:`~.DataFrameGroupBy.max`,Compute the maximum value in each group + :meth:`~.DataFrameGroupBy.mean`,Compute the mean of each group + :meth:`~.DataFrameGroupBy.median`,Compute the median of each group + :meth:`~.DataFrameGroupBy.min`,Compute the minimum value in each group + :meth:`~.DataFrameGroupBy.nunique`,Compute the number of unique values in each group + :meth:`~.DataFrameGroupBy.prod`,Compute the product of the values in each group + :meth:`~.DataFrameGroupBy.quantile`,Compute a given quantile of the values in each group + :meth:`~.DataFrameGroupBy.sem`,Compute the standard error of the mean of the values in each group + :meth:`~.DataFrameGroupBy.size`,Compute the number of values in each group + :meth:`~.DataFrameGroupBy.skew` * ,Compute the skew of the values in each group + :meth:`~.DataFrameGroupBy.std`,Compute the standard deviation of the values in each group + :meth:`~.DataFrameGroupBy.sum`,Compute the sum of the values in each group + :meth:`~.DataFrameGroupBy.var`,Compute the variance of the values in each group Some 
examples: @@ -835,19 +834,18 @@ The following methods on GroupBy act as transformations. .. csv-table:: :header: "Method", "Description" :widths: 20, 80 - :delim: ; - - :meth:`~.DataFrameGroupBy.bfill`;Back fill NA values within each group - :meth:`~.DataFrameGroupBy.cumcount`;Compute the cumulative count within each group - :meth:`~.DataFrameGroupBy.cummax`;Compute the cumulative max within each group - :meth:`~.DataFrameGroupBy.cummin`;Compute the cumulative min within each group - :meth:`~.DataFrameGroupBy.cumprod`;Compute the cumulative product within each group - :meth:`~.DataFrameGroupBy.cumsum`;Compute the cumulative sum within each group - :meth:`~.DataFrameGroupBy.diff`;Compute the difference between adjacent values within each group - :meth:`~.DataFrameGroupBy.ffill`;Forward fill NA values within each group - :meth:`~.DataFrameGroupBy.pct_change`;Compute the percent change between adjacent values within each group - :meth:`~.DataFrameGroupBy.rank`;Compute the rank of each value within each group - :meth:`~.DataFrameGroupBy.shift`;Shift values up or down within each group + + :meth:`~.DataFrameGroupBy.bfill`,Back fill NA values within each group + :meth:`~.DataFrameGroupBy.cumcount`,Compute the cumulative count within each group + :meth:`~.DataFrameGroupBy.cummax`,Compute the cumulative max within each group + :meth:`~.DataFrameGroupBy.cummin`,Compute the cumulative min within each group + :meth:`~.DataFrameGroupBy.cumprod`,Compute the cumulative product within each group + :meth:`~.DataFrameGroupBy.cumsum`,Compute the cumulative sum within each group + :meth:`~.DataFrameGroupBy.diff`,Compute the difference between adjacent values within each group + :meth:`~.DataFrameGroupBy.ffill`,Forward fill NA values within each group + :meth:`~.DataFrameGroupBy.pct_change`,Compute the percent change between adjacent values within each group + :meth:`~.DataFrameGroupBy.rank`,Compute the rank of each value within each group + :meth:`~.DataFrameGroupBy.shift`,Shift values up or down within each group In addition, passing any built-in aggregation method as a string to :meth:`~.DataFrameGroupBy.transform` (see the next section) will broadcast the result @@ -1095,11 +1093,10 @@ efficient, GroupBy-specific, implementation. .. csv-table:: :header: "Method", "Description" :widths: 20, 80 - :delim: ; - :meth:`~.DataFrameGroupBy.head`;Select the top row(s) of each group - :meth:`~.DataFrameGroupBy.nth`;Select the nth row(s) of each group - :meth:`~.DataFrameGroupBy.tail`;Select the bottom row(s) of each group + :meth:`~.DataFrameGroupBy.head`,Select the top row(s) of each group + :meth:`~.DataFrameGroupBy.nth`,Select the nth row(s) of each group + :meth:`~.DataFrameGroupBy.tail`,Select the bottom row(s) of each group Users can also use transformations along with Boolean indexing to construct complex filtrations within groups. For example, suppose we are given groups of products and diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 4954ee1538697..6c7aa15bfb75d 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -101,13 +101,14 @@ well). Any of the axes accessors may be the null slice ``:``. Axes left out of the specification are assumed to be ``:``, e.g. ``p.loc['a']`` is equivalent to ``p.loc['a', :]``. -.. csv-table:: - :header: "Object Type", "Indexers" - :widths: 30, 50 - :delim: ; - Series; ``s.loc[indexer]`` - DataFrame; ``df.loc[row_indexer,column_indexer]`` +.. 
ipython:: python + + ser = pd.Series(range(5), index=list("abcde")) + ser.loc[["a", "c", "e"]] + + df = pd.DataFrame(np.arange(25).reshape(5, 5), index=list("abcde"), columns=list("abcde")) + df.loc[["a", "c", "e"], ["b", "d"]] .. _indexing.basics: @@ -123,10 +124,9 @@ indexing pandas objects with ``[]``: .. csv-table:: :header: "Object Type", "Selection", "Return Value Type" :widths: 30, 30, 60 - :delim: ; - Series; ``series[label]``; scalar value - DataFrame; ``frame[colname]``; ``Series`` corresponding to colname + Series, ``series[label]``, scalar value + DataFrame, ``frame[colname]``, ``Series`` corresponding to colname Here we construct a simple time series data set to use for illustrating the indexing functionality: @@ -1730,7 +1730,7 @@ Returning a view versus a copy .. warning:: :ref:`Copy-on-Write ` - will become the new default in pandas 3.0. This means than chained indexing will + will become the new default in pandas 3.0. This means that chained indexing will never work. As a consequence, the ``SettingWithCopyWarning`` won't be necessary anymore. See :ref:`this section ` diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 6148086452d54..64777eb920d5a 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -16,27 +16,26 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like .. csv-table:: :header: "Format Type", "Data Description", "Reader", "Writer" :widths: 30, 100, 60, 60 - :delim: ; - - text;`CSV `__;:ref:`read_csv`;:ref:`to_csv` - text;Fixed-Width Text File;:ref:`read_fwf` - text;`JSON `__;:ref:`read_json`;:ref:`to_json` - text;`HTML `__;:ref:`read_html`;:ref:`to_html` - text;`LaTeX `__;;:ref:`Styler.to_latex` - text;`XML `__;:ref:`read_xml`;:ref:`to_xml` - text; Local clipboard;:ref:`read_clipboard`;:ref:`to_clipboard` - binary;`MS Excel `__;:ref:`read_excel`;:ref:`to_excel` - binary;`OpenDocument `__;:ref:`read_excel`; - binary;`HDF5 Format `__;:ref:`read_hdf`;:ref:`to_hdf` - binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather` - binary;`Parquet Format `__;:ref:`read_parquet`;:ref:`to_parquet` - binary;`ORC Format `__;:ref:`read_orc`;:ref:`to_orc` - binary;`Stata `__;:ref:`read_stata`;:ref:`to_stata` - binary;`SAS `__;:ref:`read_sas`; - binary;`SPSS `__;:ref:`read_spss`; - binary;`Python Pickle Format `__;:ref:`read_pickle`;:ref:`to_pickle` - SQL;`SQL `__;:ref:`read_sql`;:ref:`to_sql` - SQL;`Google BigQuery `__;:ref:`read_gbq`;:ref:`to_gbq` + + text,`CSV `__, :ref:`read_csv`, :ref:`to_csv` + text,Fixed-Width Text File, :ref:`read_fwf` , NA + text,`JSON `__, :ref:`read_json`, :ref:`to_json` + text,`HTML `__, :ref:`read_html`, :ref:`to_html` + text,`LaTeX `__, :ref:`Styler.to_latex` , NA + text,`XML `__, :ref:`read_xml`, :ref:`to_xml` + text, Local clipboard, :ref:`read_clipboard`, :ref:`to_clipboard` + binary,`MS Excel `__ , :ref:`read_excel`, :ref:`to_excel` + binary,`OpenDocument `__, :ref:`read_excel`, NA + binary,`HDF5 Format `__, :ref:`read_hdf`, :ref:`to_hdf` + binary,`Feather Format `__, :ref:`read_feather`, :ref:`to_feather` + binary,`Parquet Format `__, :ref:`read_parquet`, :ref:`to_parquet` + binary,`ORC Format `__, :ref:`read_orc`, :ref:`to_orc` + binary,`Stata `__, :ref:`read_stata`, :ref:`to_stata` + binary,`SAS `__, :ref:`read_sas` , NA + binary,`SPSS `__, :ref:`read_spss` , NA + binary,`Python Pickle Format `__, :ref:`read_pickle`, :ref:`to_pickle` + SQL,`SQL `__, :ref:`read_sql`, :ref:`to_sql` + SQL,`Google BigQuery `__, :ref:`read_gbq`, :ref:`to_gbq` :ref:`Here ` is an
informal performance comparison for some of these IO methods. @@ -1838,14 +1837,13 @@ with optional parameters: .. csv-table:: :widths: 20, 150 - :delim: ; - ``split``; dict like {index -> [index], columns -> [columns], data -> [values]} - ``records``; list like [{column -> value}, ... , {column -> value}] - ``index``; dict like {index -> {column -> value}} - ``columns``; dict like {column -> {index -> value}} - ``values``; just the values array - ``table``; adhering to the JSON `Table Schema`_ + ``split``, dict like {index -> [index]; columns -> [columns]; data -> [values]} + ``records``, list like [{column -> value}; ... ] + ``index``, dict like {index -> {column -> value}} + ``columns``, dict like {column -> {index -> value}} + ``values``, just the values array + ``table``, adhering to the JSON `Table Schema`_ * ``date_format`` : string, type of date conversion, 'epoch' for timestamp, 'iso' for ISO8601. * ``double_precision`` : The number of decimal places to use when encoding floating point values, default 10. @@ -2033,14 +2031,13 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series`` .. csv-table:: :widths: 20, 150 - :delim: ; - ``split``; dict like {index -> [index], columns -> [columns], data -> [values]} - ``records``; list like [{column -> value}, ... , {column -> value}] - ``index``; dict like {index -> {column -> value}} - ``columns``; dict like {column -> {index -> value}} - ``values``; just the values array - ``table``; adhering to the JSON `Table Schema`_ + ``split``, dict like {index -> [index]; columns -> [columns]; data -> [values]} + ``records``, list like [{column -> value} ...] + ``index``, dict like {index -> {column -> value}} + ``columns``, dict like {column -> {index -> value}} + ``values``, just the values array + ``table``, adhering to the JSON `Table Schema`_ * ``dtype`` : if True, infer dtypes, if a dict of column to dtype, then use those, if ``False``, then don't infer dtypes at all, default is True, apply only to the data. @@ -3471,20 +3468,15 @@ saving a ``DataFrame`` to Excel. Generally the semantics are similar to working with :ref:`csv` data. See the :ref:`cookbook` for some advanced strategies. -.. warning:: - - The `xlrd `__ package is now only for reading - old-style ``.xls`` files. +.. note:: - Before pandas 1.3.0, the default argument ``engine=None`` to :func:`~pandas.read_excel` - would result in using the ``xlrd`` engine in many cases, including new - Excel 2007+ (``.xlsx``) files. pandas will now default to using the - `openpyxl `__ engine. + When ``engine=None``, the following logic will be used to determine the engine: - It is strongly encouraged to install ``openpyxl`` to read Excel 2007+ - (``.xlsx``) files. - **Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.** - This is no longer supported, switch to using ``openpyxl`` instead. + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), + then `odf `_ will be used. + - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used. + - Otherwise if ``path_or_buffer`` is in xlsb format, ``pyxlsb`` will be used. + - Otherwise ``openpyxl`` will be used. .. _io.excel_reader: diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index b262de5d71439..29df2994fbc35 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -156,7 +156,7 @@ fits in memory, you can work with datasets that are much larger than memory. 
Chunking works well when the operation you're performing requires zero or minimal coordination between chunks. For more complicated workflows, you're better off - :ref:`using another library `. + :ref:`using other libraries `. Suppose we have an even larger "logical dataset" on disk that's a directory of parquet files. Each file in the directory represents a different year of the entire dataset. @@ -219,160 +219,10 @@ different library that implements these out-of-core algorithms for you. .. _scale.other_libraries: -Use Dask --------- +Use Other Libraries +------------------- -pandas is just one library offering a DataFrame API. Because of its popularity, -pandas' API has become something of a standard that other libraries implement. -The pandas documentation maintains a list of libraries implementing a DataFrame API -in `the ecosystem page `_. - -For example, `Dask`_, a parallel computing library, has `dask.dataframe`_, a -pandas-like API for working with larger than memory datasets in parallel. Dask -can use multiple threads or processes on a single machine, or a cluster of -machines to process data in parallel. - - -We'll import ``dask.dataframe`` and notice that the API feels similar to pandas. -We can use Dask's ``read_parquet`` function, but provide a globstring of files to read in. - -.. ipython:: python - :okwarning: - - import dask.dataframe as dd - - ddf = dd.read_parquet("data/timeseries/ts*.parquet", engine="pyarrow") - ddf - -Inspecting the ``ddf`` object, we see a few things - -* There are familiar attributes like ``.columns`` and ``.dtypes`` -* There are familiar methods like ``.groupby``, ``.sum``, etc. -* There are new attributes like ``.npartitions`` and ``.divisions`` - -The partitions and divisions are how Dask parallelizes computation. A **Dask** -DataFrame is made up of many pandas :class:`pandas.DataFrame`. A single method call on a -Dask DataFrame ends up making many pandas method calls, and Dask knows how to -coordinate everything to get the result. - -.. ipython:: python - - ddf.columns - ddf.dtypes - ddf.npartitions - -One major difference: the ``dask.dataframe`` API is *lazy*. If you look at the -repr above, you'll notice that the values aren't actually printed out; just the -column names and dtypes. That's because Dask hasn't actually read the data yet. -Rather than executing immediately, doing operations build up a **task graph**. - -.. ipython:: python - :okwarning: - - ddf - ddf["name"] - ddf["name"].value_counts() - -Each of these calls is instant because the result isn't being computed yet. -We're just building up a list of computation to do when someone needs the -result. Dask knows that the return type of a :class:`pandas.Series.value_counts` -is a pandas :class:`pandas.Series` with a certain dtype and a certain name. So the Dask version -returns a Dask Series with the same dtype and the same name. - -To get the actual result you can call ``.compute()``. - -.. ipython:: python - :okwarning: - - %time ddf["name"].value_counts().compute() - -At that point, you get back the same thing you'd get with pandas, in this case -a concrete pandas :class:`pandas.Series` with the count of each ``name``. - -Calling ``.compute`` causes the full task graph to be executed. This includes -reading the data, selecting the columns, and doing the ``value_counts``. The -execution is done *in parallel* where possible, and Dask tries to keep the -overall memory footprint small. 
You can work with datasets that are much larger -than memory, as long as each partition (a regular pandas :class:`pandas.DataFrame`) fits in memory. - -By default, ``dask.dataframe`` operations use a threadpool to do operations in -parallel. We can also connect to a cluster to distribute the work on many -machines. In this case we'll connect to a local "cluster" made up of several -processes on this single machine. - -.. code-block:: python - - >>> from dask.distributed import Client, LocalCluster - - >>> cluster = LocalCluster() - >>> client = Client(cluster) - >>> client - - -Once this ``client`` is created, all of Dask's computation will take place on -the cluster (which is just processes in this case). - -Dask implements the most used parts of the pandas API. For example, we can do -a familiar groupby aggregation. - -.. ipython:: python - :okwarning: - - %time ddf.groupby("name")[["x", "y"]].mean().compute().head() - -The grouping and aggregation is done out-of-core and in parallel. - -When Dask knows the ``divisions`` of a dataset, certain optimizations are -possible. When reading parquet datasets written by dask, the divisions will be -known automatically. In this case, since we created the parquet files manually, -we need to supply the divisions manually. - -.. ipython:: python - :okwarning: - - N = 12 - starts = [f"20{i:>02d}-01-01" for i in range(N)] - ends = [f"20{i:>02d}-12-13" for i in range(N)] - - divisions = tuple(pd.to_datetime(starts)) + (pd.Timestamp(ends[-1]),) - ddf.divisions = divisions - ddf - -Now we can do things like fast random access with ``.loc``. - -.. ipython:: python - :okwarning: - - ddf.loc["2002-01-01 12:01":"2002-01-01 12:05"].compute() - -Dask knows to just look in the 3rd partition for selecting values in 2002. It -doesn't need to look at any other data. - -Many workflows involve a large amount of data and processing it in a way that -reduces the size to something that fits in memory. In this case, we'll resample -to daily frequency and take the mean. Once we've taken the mean, we know the -results will fit in memory, so we can safely call ``compute`` without running -out of memory. At that point it's just a regular pandas object. - -.. ipython:: python - :okwarning: - - @savefig dask_resample.png - ddf[["x", "y"]].resample("1D").mean().cumsum().compute().plot() - -.. ipython:: python - :suppress: - - import shutil - - shutil.rmtree("data/timeseries") - -These Dask examples have all be done using multiple processes on a single -machine. Dask can be `deployed on a cluster -`_ to scale up to even larger -datasets. - -You see more dask examples at https://examples.dask.org. - -.. _Dask: https://dask.org -.. _dask.dataframe: https://docs.dask.org/en/latest/dataframe.html +There are other libraries that provide APIs similar to pandas and work nicely with pandas DataFrames, +and that can scale large dataset processing and analytics +via parallel runtimes, distributed memory, clustering, etc. You can find more information +on `the ecosystem page `_. diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index cf27fc8385223..ad2690ae395be 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -726,57 +726,56 @@ Method summary ..
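To ground the chunking discussion above, here is a minimal sketch of the per-file pattern the guide describes. The ``data/timeseries/ts*.parquet`` layout and the ``name`` column are assumptions carried over from the surrounding examples:

.. code-block:: python

    import pathlib

    import pandas as pd

    # Process one file (one "year" of the logical dataset) at a time, so only
    # a single chunk is ever in memory; combine the small per-chunk results.
    counts = pd.Series(dtype="int64")
    for path in sorted(pathlib.Path("data/timeseries").glob("ts*.parquet")):
        chunk = pd.read_parquet(path, columns=["name"])
        counts = counts.add(chunk["name"].value_counts(), fill_value=0)

    counts.astype("int64").nlargest(10)

Because each intermediate result (a ``value_counts`` Series) is much smaller than the chunk it came from, the peak memory footprint stays close to the size of a single file.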
csv-table:: :header: "Method", "Description" :widths: 20, 80 - :delim: ; - - :meth:`~Series.str.cat`;Concatenate strings - :meth:`~Series.str.split`;Split strings on delimiter - :meth:`~Series.str.rsplit`;Split strings on delimiter working from the end of the string - :meth:`~Series.str.get`;Index into each element (retrieve i-th element) - :meth:`~Series.str.join`;Join strings in each element of the Series with passed separator - :meth:`~Series.str.get_dummies`;Split strings on the delimiter returning DataFrame of dummy variables - :meth:`~Series.str.contains`;Return boolean array if each string contains pattern/regex - :meth:`~Series.str.replace`;Replace occurrences of pattern/regex/string with some other string or the return value of a callable given the occurrence - :meth:`~Series.str.removeprefix`;Remove prefix from string, i.e. only remove if string starts with prefix. - :meth:`~Series.str.removesuffix`;Remove suffix from string, i.e. only remove if string ends with suffix. - :meth:`~Series.str.repeat`;Duplicate values (``s.str.repeat(3)`` equivalent to ``x * 3``) - :meth:`~Series.str.pad`;"Add whitespace to left, right, or both sides of strings" - :meth:`~Series.str.center`;Equivalent to ``str.center`` - :meth:`~Series.str.ljust`;Equivalent to ``str.ljust`` - :meth:`~Series.str.rjust`;Equivalent to ``str.rjust`` - :meth:`~Series.str.zfill`;Equivalent to ``str.zfill`` - :meth:`~Series.str.wrap`;Split long strings into lines with length less than a given width - :meth:`~Series.str.slice`;Slice each string in the Series - :meth:`~Series.str.slice_replace`;Replace slice in each string with passed value - :meth:`~Series.str.count`;Count occurrences of pattern - :meth:`~Series.str.startswith`;Equivalent to ``str.startswith(pat)`` for each element - :meth:`~Series.str.endswith`;Equivalent to ``str.endswith(pat)`` for each element - :meth:`~Series.str.findall`;Compute list of all occurrences of pattern/regex for each string - :meth:`~Series.str.match`;"Call ``re.match`` on each element, returning matched groups as list" - :meth:`~Series.str.extract`;"Call ``re.search`` on each element, returning DataFrame with one row for each element and one column for each regex capture group" - :meth:`~Series.str.extractall`;"Call ``re.findall`` on each element, returning DataFrame with one row for each match and one column for each regex capture group" - :meth:`~Series.str.len`;Compute string lengths - :meth:`~Series.str.strip`;Equivalent to ``str.strip`` - :meth:`~Series.str.rstrip`;Equivalent to ``str.rstrip`` - :meth:`~Series.str.lstrip`;Equivalent to ``str.lstrip`` - :meth:`~Series.str.partition`;Equivalent to ``str.partition`` - :meth:`~Series.str.rpartition`;Equivalent to ``str.rpartition`` - :meth:`~Series.str.lower`;Equivalent to ``str.lower`` - :meth:`~Series.str.casefold`;Equivalent to ``str.casefold`` - :meth:`~Series.str.upper`;Equivalent to ``str.upper`` - :meth:`~Series.str.find`;Equivalent to ``str.find`` - :meth:`~Series.str.rfind`;Equivalent to ``str.rfind`` - :meth:`~Series.str.index`;Equivalent to ``str.index`` - :meth:`~Series.str.rindex`;Equivalent to ``str.rindex`` - :meth:`~Series.str.capitalize`;Equivalent to ``str.capitalize`` - :meth:`~Series.str.swapcase`;Equivalent to ``str.swapcase`` - :meth:`~Series.str.normalize`;Return Unicode normal form. 
Equivalent to ``unicodedata.normalize`` - :meth:`~Series.str.translate`;Equivalent to ``str.translate`` - :meth:`~Series.str.isalnum`;Equivalent to ``str.isalnum`` - :meth:`~Series.str.isalpha`;Equivalent to ``str.isalpha`` - :meth:`~Series.str.isdigit`;Equivalent to ``str.isdigit`` - :meth:`~Series.str.isspace`;Equivalent to ``str.isspace`` - :meth:`~Series.str.islower`;Equivalent to ``str.islower`` - :meth:`~Series.str.isupper`;Equivalent to ``str.isupper`` - :meth:`~Series.str.istitle`;Equivalent to ``str.istitle`` - :meth:`~Series.str.isnumeric`;Equivalent to ``str.isnumeric`` - :meth:`~Series.str.isdecimal`;Equivalent to ``str.isdecimal`` + + :meth:`~Series.str.cat`,Concatenate strings + :meth:`~Series.str.split`,Split strings on delimiter + :meth:`~Series.str.rsplit`,Split strings on delimiter working from the end of the string + :meth:`~Series.str.get`,Index into each element (retrieve i-th element) + :meth:`~Series.str.join`,Join strings in each element of the Series with passed separator + :meth:`~Series.str.get_dummies`,Split strings on the delimiter returning DataFrame of dummy variables + :meth:`~Series.str.contains`,Return boolean array if each string contains pattern/regex + :meth:`~Series.str.replace`,Replace occurrences of pattern/regex/string with some other string or the return value of a callable given the occurrence + :meth:`~Series.str.removeprefix`,Remove prefix from string i.e. only remove if string starts with prefix. + :meth:`~Series.str.removesuffix`,Remove suffix from string i.e. only remove if string ends with suffix. + :meth:`~Series.str.repeat`,Duplicate values (``s.str.repeat(3)`` equivalent to ``x * 3``) + :meth:`~Series.str.pad`,Add whitespace to the sides of strings + :meth:`~Series.str.center`,Equivalent to ``str.center`` + :meth:`~Series.str.ljust`,Equivalent to ``str.ljust`` + :meth:`~Series.str.rjust`,Equivalent to ``str.rjust`` + :meth:`~Series.str.zfill`,Equivalent to ``str.zfill`` + :meth:`~Series.str.wrap`,Split long strings into lines with length less than a given width + :meth:`~Series.str.slice`,Slice each string in the Series + :meth:`~Series.str.slice_replace`,Replace slice in each string with passed value + :meth:`~Series.str.count`,Count occurrences of pattern + :meth:`~Series.str.startswith`,Equivalent to ``str.startswith(pat)`` for each element + :meth:`~Series.str.endswith`,Equivalent to ``str.endswith(pat)`` for each element + :meth:`~Series.str.findall`,Compute list of all occurrences of pattern/regex for each string + :meth:`~Series.str.match`,Call ``re.match`` on each element returning matched groups as list + :meth:`~Series.str.extract`,Call ``re.search`` on each element returning DataFrame with one row for each element and one column for each regex capture group + :meth:`~Series.str.extractall`,Call ``re.findall`` on each element returning DataFrame with one row for each match and one column for each regex capture group + :meth:`~Series.str.len`,Compute string lengths + :meth:`~Series.str.strip`,Equivalent to ``str.strip`` + :meth:`~Series.str.rstrip`,Equivalent to ``str.rstrip`` + :meth:`~Series.str.lstrip`,Equivalent to ``str.lstrip`` + :meth:`~Series.str.partition`,Equivalent to ``str.partition`` + :meth:`~Series.str.rpartition`,Equivalent to ``str.rpartition`` + :meth:`~Series.str.lower`,Equivalent to ``str.lower`` + :meth:`~Series.str.casefold`,Equivalent to ``str.casefold`` + :meth:`~Series.str.upper`,Equivalent to ``str.upper`` + :meth:`~Series.str.find`,Equivalent to ``str.find`` + :meth:`~Series.str.rfind`,Equivalent to 
``str.rfind`` + :meth:`~Series.str.index`,Equivalent to ``str.index`` + :meth:`~Series.str.rindex`,Equivalent to ``str.rindex`` + :meth:`~Series.str.capitalize`,Equivalent to ``str.capitalize`` + :meth:`~Series.str.swapcase`,Equivalent to ``str.swapcase`` + :meth:`~Series.str.normalize`,Return Unicode normal form. Equivalent to ``unicodedata.normalize`` + :meth:`~Series.str.translate`,Equivalent to ``str.translate`` + :meth:`~Series.str.isalnum`,Equivalent to ``str.isalnum`` + :meth:`~Series.str.isalpha`,Equivalent to ``str.isalpha`` + :meth:`~Series.str.isdigit`,Equivalent to ``str.isdigit`` + :meth:`~Series.str.isspace`,Equivalent to ``str.isspace`` + :meth:`~Series.str.islower`,Equivalent to ``str.islower`` + :meth:`~Series.str.isupper`,Equivalent to ``str.isupper`` + :meth:`~Series.str.istitle`,Equivalent to ``str.istitle`` + :meth:`~Series.str.isnumeric`,Equivalent to ``str.isnumeric`` + :meth:`~Series.str.isdecimal`,Equivalent to ``str.isdecimal`` diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index ec024f36d78b1..ae96d0f8296f2 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -10,12 +10,23 @@ This is the list of changes to pandas between each release. For full details, see the `commit logs `_. For install and upgrade instructions, see :ref:`install`. +Version 2.3 +----------- + +.. toctree:: + :maxdepth: 2 + + v2.3.0 + Version 2.2 ----------- .. toctree:: :maxdepth: 2 + v2.2.3 + v2.2.2 + v2.2.1 v2.2.0 Version 2.1 diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 51b4c4f297b07..d4eb5742ef928 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -432,7 +432,7 @@ In a future version, these will raise an error and you should cast to a common d In [3]: ser[0] = 'not an int64' FutureWarning: - Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. + Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'not an int64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first. In [4]: ser diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 57b83a294963b..73b1103c1bd37 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -42,4 +42,4 @@ Bug fixes Contributors ~~~~~~~~~~~~ -.. contributors:: v2.1.3..v2.1.4|HEAD +.. contributors:: v2.1.3..v2.1.4 diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index d1481639ca5a0..e015afb17dce5 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_220: -What's new in 2.2.0 (Month XX, 2024) ------------------------------------- +What's new in 2.2.0 (January 19, 2024) +-------------------------------------- These are the changes in pandas 2.2.0. See :ref:`release` for a full changelog including other versions of pandas. @@ -123,7 +123,7 @@ nullability handling. with pg_dbapi.connect(uri) as conn: df.to_sql("pandas_table", conn, index=False) - # for roundtripping + # for round-tripping with pg_dbapi.connect(uri) as conn: df2 = pd.read_sql("pandas_table", conn) @@ -176,7 +176,7 @@ leverage the ``dtype_backend="pyarrow"`` argument of :func:`~pandas.read_sql` .. 
code-block:: ipython - # for roundtripping + # for round-tripping with pg_dbapi.connect(uri) as conn: df2 = pd.read_sql("pandas_table", conn, dtype_backend="pyarrow") @@ -188,6 +188,26 @@ For a full list of ADBC drivers and their development status, see the `ADBC Driver Implementation Status `_ documentation. +.. _whatsnew_220.enhancements.case_when: + +Create a pandas Series based on one or more conditions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :meth:`Series.case_when` function has been added to create a Series object based on one or more conditions (:issue:`39154`) + +.. ipython:: python + + import pandas as pd + + df = pd.DataFrame(dict(a=[1, 2, 3], b=[4, 5, 6])) + default = pd.Series('default', index=df.index) + default.case_when( + caselist=[ + (df.a == 1, 'first'), # condition, replacement + (df.a.gt(1) & df.b.eq(5), 'second'), # condition, replacement + ], + ) + .. _whatsnew_220.enhancements.to_numpy_ea: ``to_numpy`` for NumPy nullable and Arrow types converts to suitable NumPy dtype @@ -251,6 +271,14 @@ DataFrame. (:issue:`54938`) ) series.struct.explode() +Use :meth:`Series.struct.field` to index into a (possibly nested) +struct field. + + +.. ipython:: python + + series.struct.field("project") + .. _whatsnew_220.enhancements.list_accessor: Series.list accessor for PyArrow list data @@ -306,22 +334,23 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`~DataFrame.to_sql` with method parameter set to ``multi`` works with Oracle on the backend - :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`). - :func:`get_dummies` now returning extension dtypes ``boolean`` or ``bool[pyarrow]`` that are compatible with the input dtype (:issue:`56273`) -- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`) +- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"`` (:issue:`54480`) - :func:`read_sas` returns ``datetime64`` dtypes with resolutions better matching those stored natively in SAS, and avoids returning object-dtype in cases that cannot be stored with ``datetime64[ns]`` dtype (:issue:`56127`) -- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`.
(:issue:`54264`) +- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs` (:issue:`54264`) - :func:`tseries.api.guess_datetime_format` is now part of the public API (:issue:`54727`) +- :meth:`DataFrame.apply` now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) - :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`) -- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area`` (:issue:`56492`) +- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area``; 3rd party :class:`.ExtensionArray` authors need to add this argument to the method ``_pad_or_backfill`` (:issue:`56492`) - Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`) -- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) +- Implement :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` for :class:`ArrowDtype` and masked dtypes (:issue:`56267`) - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) +- Implemented :meth:`Series.dt` methods and attributes for :class:`ArrowDtype` with ``pyarrow.duration`` type (:issue:`52284`) - Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`) -- Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as "BMS" (:issue:`56243`) -- Improved error message when constructing :class:`Period` with invalid offsets such as "QS" (:issue:`55785`) +- Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as ``"BMS"`` (:issue:`56243`) +- Improved error message when constructing :class:`Period` with invalid offsets such as ``"QS"`` (:issue:`55785`) - The dtypes ``string[pyarrow]`` and ``string[pyarrow_numpy]`` now both utilize the ``large_string`` type from PyArrow to avoid overflow for long columns (:issue:`56259`) - .. --------------------------------------------------------------------------- .. _whatsnew_220.notable_bug_fixes: @@ -386,6 +415,8 @@ index levels when joining on two indexes with different levels (:issue:`34133`). left = pd.DataFrame({"left": 1}, index=pd.MultiIndex.from_tuples([("x", 1), ("x", 2)], names=["A", "B"])) right = pd.DataFrame({"right": 2}, index=pd.MultiIndex.from_tuples([(1, 1), (2, 2)], names=["B", "C"])) + left + right result = left.join(right) *Old Behavior* @@ -405,36 +436,67 @@ index levels when joining on two indexes with different levels (:issue:`34133`). result -.. --------------------------------------------------------------------------- -.. _whatsnew_220.api_breaking: - -Backwards incompatible API changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - .. _whatsnew_220.api_breaking.deps: Increased minimum versions for dependencies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Some minimum supported versions of dependencies were updated. 
-If installed, we now require: - -+-----------------+-----------------+----------+---------+ -| Package | Minimum Version | Required | Changed | -+=================+=================+==========+=========+ -| | | X | X | -+-----------------+-----------------+----------+---------+ - -For `optional libraries `_ the general recommendation is to use the latest version. -The following table lists the lowest version per library that is currently being tested throughout the development of pandas. -Optional libraries below the lowest tested version may still work, but are not considered supported. - -+-----------------+-----------------+---------+ -| Package | Minimum Version | Changed | -+=================+=================+=========+ -| mypy (dev) | 1.7.1 | X | -+-----------------+-----------------+---------+ -| | | X | -+-----------------+-----------------+---------+ +For `optional dependencies `_ the general recommendation is to use the latest version. +Optional dependencies below the lowest tested version may still work but are not considered supported. +The following table lists the optional dependencies that have had their minimum tested version increased. + ++-----------------+---------------------+ +| Package | New Minimum Version | ++=================+=====================+ +| beautifulsoup4 | 4.11.2 | ++-----------------+---------------------+ +| blosc | 1.21.3 | ++-----------------+---------------------+ +| bottleneck | 1.3.6 | ++-----------------+---------------------+ +| fastparquet | 2022.12.0 | ++-----------------+---------------------+ +| fsspec | 2022.11.0 | ++-----------------+---------------------+ +| gcsfs | 2022.11.0 | ++-----------------+---------------------+ +| lxml | 4.9.2 | ++-----------------+---------------------+ +| matplotlib | 3.6.3 | ++-----------------+---------------------+ +| numba | 0.56.4 | ++-----------------+---------------------+ +| numexpr | 2.8.4 | ++-----------------+---------------------+ +| qtpy | 2.3.0 | ++-----------------+---------------------+ +| openpyxl | 3.1.0 | ++-----------------+---------------------+ +| psycopg2 | 2.9.6 | ++-----------------+---------------------+ +| pyreadstat | 1.2.0 | ++-----------------+---------------------+ +| pytables | 3.8.0 | ++-----------------+---------------------+ +| pyxlsb | 1.0.10 | ++-----------------+---------------------+ +| s3fs | 2022.11.0 | ++-----------------+---------------------+ +| scipy | 1.10.0 | ++-----------------+---------------------+ +| sqlalchemy | 2.0.0 | ++-----------------+---------------------+ +| tabulate | 0.9.0 | ++-----------------+---------------------+ +| xarray | 2022.12.0 | ++-----------------+---------------------+ +| xlsxwriter | 3.0.5 | ++-----------------+---------------------+ +| zstandard | 0.19.0 | ++-----------------+---------------------+ +| pyqt5 | 5.15.8 | ++-----------------+---------------------+ +| tzdata | 2022.7 | ++-----------------+---------------------+ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. 
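One practical note on the version table above: you can audit which dependency versions are actually installed in your environment with the long-standing ``pd.show_versions()`` helper (output is environment-specific):

.. code-block:: python

    import pandas as pd

    # Prints the versions of Python, pandas, its required dependencies, and
    # any detected optional dependencies (matplotlib, numba, sqlalchemy, ...).
    pd.show_versions()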
@@ -594,32 +656,33 @@ Other Deprecations - Changed :meth:`Timedelta.resolution_string` to return ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns`` instead of ``H``, ``T``, ``S``, ``L``, ``U``, and ``N``, for compatibility with respective deprecations in frequency aliases (:issue:`52536`) - Deprecated :attr:`offsets.Day.delta`, :attr:`offsets.Hour.delta`, :attr:`offsets.Minute.delta`, :attr:`offsets.Second.delta`, :attr:`offsets.Milli.delta`, :attr:`offsets.Micro.delta`, :attr:`offsets.Nano.delta`, use ``pd.Timedelta(obj)`` instead (:issue:`55498`) - Deprecated :func:`pandas.api.types.is_interval` and :func:`pandas.api.types.is_period`, use ``isinstance(obj, pd.Interval)`` and ``isinstance(obj, pd.Period)`` instead (:issue:`55264`) -- Deprecated :func:`pd.core.internals.api.make_block`, use public APIs instead (:issue:`40226`) - Deprecated :func:`read_gbq` and :meth:`DataFrame.to_gbq`. Use ``pandas_gbq.read_gbq`` and ``pandas_gbq.to_gbq`` instead https://pandas-gbq.readthedocs.io/en/latest/api.html (:issue:`55525`) - Deprecated :meth:`.DataFrameGroupBy.fillna` and :meth:`.SeriesGroupBy.fillna`; use :meth:`.DataFrameGroupBy.ffill`, :meth:`.DataFrameGroupBy.bfill` for forward and backward filling or :meth:`.DataFrame.fillna` to fill with a single value (or the Series equivalents) (:issue:`55718`) +- Deprecated :meth:`DateOffset.is_anchored`, use ``obj.n == 1`` for non-Tick subclasses (for Tick this was always False) (:issue:`55388`) - Deprecated :meth:`DatetimeArray.__init__` and :meth:`TimedeltaArray.__init__`, use :func:`array` instead (:issue:`55623`) - Deprecated :meth:`Index.format`, use ``index.astype(str)`` or ``index.map(formatter)`` instead (:issue:`55413`) - Deprecated :meth:`Series.ravel`, the underlying array is already 1D, so ravel is not necessary (:issue:`52511`) - Deprecated :meth:`Series.resample` and :meth:`DataFrame.resample` with a :class:`PeriodIndex` (and the 'convention' keyword), convert to :class:`DatetimeIndex` (with ``.to_timestamp()``) before resampling instead (:issue:`53481`) - Deprecated :meth:`Series.view`, use :meth:`Series.astype` instead to change the dtype (:issue:`20251`) +- Deprecated :meth:`offsets.Tick.is_anchored`, use ``False`` instead (:issue:`55388`) - Deprecated ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock``, use public APIs instead (:issue:`55139`) - Deprecated ``year``, ``month``, ``quarter``, ``day``, ``hour``, ``minute``, and ``second`` keywords in the :class:`PeriodIndex` constructor, use :meth:`PeriodIndex.from_fields` instead (:issue:`55960`) - Deprecated accepting a type as an argument in :meth:`Index.view`, call without any arguments instead (:issue:`55709`) - Deprecated allowing non-integer ``periods`` argument in :func:`date_range`, :func:`timedelta_range`, :func:`period_range`, and :func:`interval_range` (:issue:`56036`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard`. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_dict`. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_excel` except ``excel_writer``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_gbq` except ``destination_table``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_hdf` except ``path_or_buf``. 
(:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_html` except ``buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_json` except ``path_or_buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_latex` except ``buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_markdown` except ``buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_parquet` except ``path``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_pickle` except ``path``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_xml` except ``path_or_buffer``. (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_dict` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_excel` except ``excel_writer`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_gbq` except ``destination_table`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_hdf` except ``path_or_buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_html` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_json` except ``path_or_buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_latex` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_markdown` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_parquet` except ``path`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_pickle` except ``path`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_xml` except ``path_or_buffer`` (:issue:`54229`) - Deprecated allowing passing :class:`BlockManager` objects to :class:`DataFrame` or :class:`SingleBlockManager` objects to :class:`Series` (:issue:`52419`) - Deprecated behavior of :meth:`Index.insert` with an object-dtype index silently performing type inference on the result, explicitly call ``result.infer_objects(copy=False)`` for the old behavior instead (:issue:`51363`) - Deprecated casting non-datetimelike values (mainly strings) in :meth:`Series.isin` and :meth:`Index.isin` with ``datetime64``, ``timedelta64``, and :class:`PeriodDtype` dtypes (:issue:`53111`) @@ -652,6 +715,7 @@ Other Deprecations - Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`) - Deprecated the option ``mode.data_manager`` and the ``ArrayManager``; only the ``BlockManager`` will be available in future versions (:issue:`55043`) - Deprecated the previous implementation of :class:`DataFrame.stack`; specify ``future_stack=True`` to adopt the future version (:issue:`53515`) +- .. 
--------------------------------------------------------------------------- .. _whatsnew_220.performance: @@ -692,37 +756,38 @@ Bug fixes Categorical ^^^^^^^^^^^ - :meth:`Categorical.isin` raising ``InvalidIndexError`` for categorical containing overlapping :class:`Interval` values (:issue:`34974`) -- Bug in :meth:`CategoricalDtype.__eq__` returning false for unordered categorical data with mixed types (:issue:`55468`) -- +- Bug in :meth:`CategoricalDtype.__eq__` returning ``False`` for unordered categorical data with mixed types (:issue:`55468`) +- Bug when casting ``pa.dictionary`` to :class:`CategoricalDtype` using a ``pa.DictionaryArray`` as categories (:issue:`56672`) Datetimelike ^^^^^^^^^^^^ - Bug in :class:`DatetimeIndex` construction when passing both a ``tz`` and either ``dayfirst`` or ``yearfirst`` ignoring dayfirst/yearfirst (:issue:`55813`) - Bug in :class:`DatetimeIndex` when passing an object-dtype ndarray of float objects and a ``tz`` incorrectly localizing the result (:issue:`55780`) - Bug in :func:`Series.isin` with :class:`DatetimeTZDtype` dtype and comparison values that are all ``NaT`` incorrectly returning all-``False`` even if the series contains ``NaT`` entries (:issue:`56427`) -- Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame. (:issue:`52093`) +- Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame (:issue:`52093`) - Bug in :func:`testing.assert_extension_array_equal` that could use the wrong unit when comparing resolutions (:issue:`55730`) - Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing a list of mixed-string-and-numeric types incorrectly raising (:issue:`55780`) - Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing mixed-type objects with a mix of timezones or mix of timezone-awareness failing to raise ``ValueError`` (:issue:`55693`) +- Bug in :meth:`.Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in :meth:`DatetimeIndex.shift` with non-nanosecond resolution incorrectly returning with nanosecond resolution (:issue:`56117`) - Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`) - Bug in :meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` always caching :meth:`Index.is_unique` as ``True`` when first value in index is ``NaT`` (:issue:`55755`) - Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`) - Bug in :meth:`Series.dt.round` with non-nanosecond resolution and ``NaT`` entries incorrectly raising ``OverflowError`` (:issue:`56158`) - Bug in :meth:`Series.fillna` with non-nanosecond resolution dtypes and higher-resolution vector values returning incorrect (internally-corrupted) results (:issue:`56410`) -- Bug in :meth:`Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in :meth:`Timestamp.unit` being inferred incorrectly from an ISO8601 format string with minute or hour resolution and a timezone offset (:issue:`56208`) -- Bug in ``.astype`` converting from a higher-resolution ``datetime64`` dtype to a lower-resolution ``datetime64`` dtype (e.g. 
``datetime64[us]->datetim64[ms]``) silently overflowing with values near the lower implementation bound (:issue:`55979`) +- Bug in ``.astype`` converting from a higher-resolution ``datetime64`` dtype to a lower-resolution ``datetime64`` dtype (e.g. ``datetime64[us]->datetime64[ms]``) silently overflowing with values near the lower implementation bound (:issue:`55979`) - Bug in adding or subtracting a :class:`Week` offset to a ``datetime64`` :class:`Series`, :class:`Index`, or :class:`DataFrame` column with non-nanosecond resolution returning incorrect results (:issue:`55583`) - Bug in addition or subtraction of :class:`BusinessDay` offset with ``offset`` attribute to non-nanosecond :class:`Index`, :class:`Series`, or :class:`DataFrame` column giving incorrect results (:issue:`55608`) - Bug in addition or subtraction of :class:`DateOffset` objects with microsecond components to ``datetime64`` :class:`Index`, :class:`Series`, or :class:`DataFrame` columns with non-nanosecond resolution (:issue:`55595`) -- Bug in addition or subtraction of very large :class:`Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) +- Bug in addition or subtraction of very large :class:`.Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond :class:`DatetimeTZDtype` and inputs that would be out of bounds with nanosecond resolution incorrectly raising ``OutOfBoundsDatetime`` (:issue:`54620`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` (or :class:`DatetimeTZDtype`) from mixed-numeric inputs treating those as nanoseconds instead of as multiples of the dtype's unit (which would happen with non-mixed numeric inputs) (:issue:`56004`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`) - Bug in parsing datetime strings with nanosecond resolution with non-ISO8601 formats incorrectly truncating sub-microsecond components (:issue:`56051`) - Bug in parsing datetime strings with sub-second resolution and trailing zeros incorrectly inferring second or millisecond resolution (:issue:`55737`) - Bug in the results of :func:`to_datetime` with an floating-dtype argument with ``unit`` not matching the pointwise results of :class:`Timestamp` (:issue:`56037`) +- Fixed regression where :func:`concat` would raise an error when concatenating ``datetime64`` columns with differing resolutions (:issue:`53641`) Timedelta ^^^^^^^^^ @@ -738,15 +803,18 @@ Timezones Numeric ^^^^^^^ - Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`) +- Bug in :meth:`Series.__floordiv__` and :meth:`Series.__truediv__` for :class:`ArrowDtype` with integral dtypes raising for large divisors (:issue:`56706`) +- Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`) - Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`) -- +- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` matching float ``0.0`` with ``False`` and vice versa (:issue:`55398`) +- Bug in :meth:`Series.round` 
raising for nullable boolean dtype (:issue:`55936`) Conversion ^^^^^^^^^^ - Bug in :meth:`DataFrame.astype` when called with ``str`` on unpickled array - the array might change in-place (:issue:`54654`) - Bug in :meth:`DataFrame.astype` where ``errors="ignore"`` had no effect for extension types (:issue:`54654`) - Bug in :meth:`Series.convert_dtypes` not converting all NA column to ``null[pyarrow]`` (:issue:`55346`) -- +- Bug in :meth:`DataFrame.loc` was not throwing "incompatible dtype warning" (see `PDEP6 `_) when assigning a ``Series`` with a different dtype using a full column setter (e.g. ``df.loc[:, 'a'] = incompatible_value``) (:issue:`39584`) Strings ^^^^^^^ @@ -756,6 +824,7 @@ Strings - Bug in :meth:`Index.str.cat` always casting result to object dtype (:issue:`56157`) - Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`) - Bug in :meth:`Series.str.find` when ``start < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56411`) +- Bug in :meth:`Series.str.fullmatch` when ``dtype=pandas.ArrowDtype(pyarrow.string())`` allows partial matches when the regex ends in a literal ``$`` (:issue:`56652`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for :class:`ArrowDtype` with ``pyarrow.string`` dtype (:issue:`56579`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`) @@ -763,16 +832,17 @@ Strings Interval ^^^^^^^^ -- Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown. (:issue:`55015`) +- Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds.
Additionally the hour, minute and second components will now be shown (:issue:`55015`) - Bug in :meth:`IntervalIndex.factorize` and :meth:`Series.factorize` with :class:`IntervalDtype` with datetime64 or timedelta64 intervals not preserving non-nanosecond units (:issue:`56099`) - Bug in :meth:`IntervalIndex.from_arrays` when passed ``datetime64`` or ``timedelta64`` arrays with mismatched resolutions constructing an invalid ``IntervalArray`` object (:issue:`55714`) +- Bug in :meth:`IntervalIndex.from_tuples` raising if subtype is a nullable extension dtype (:issue:`56765`) - Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`) - Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`) - Bug in setting values on a :class:`Series` with an :class:`IntervalIndex` using a slice incorrectly raising (:issue:`54722`) -- Indexing ^^^^^^^^ +- Bug in :meth:`DataFrame.loc` mutating a boolean indexer when :class:`DataFrame` has a :class:`MultiIndex` (:issue:`56635`) - Bug in :meth:`DataFrame.loc` when setting :class:`Series` with extension dtype into NumPy dtype (:issue:`55604`) - Bug in :meth:`Index.difference` not returning a unique set of values when ``other`` is empty or ``other`` is considered non-comparable (:issue:`55113`) - Bug in setting :class:`Categorical` values into a :class:`DataFrame` with numpy dtypes raising ``RecursionError`` (:issue:`52927`) @@ -781,25 +851,24 @@ Indexing Missing ^^^^^^^ - Bug in :meth:`DataFrame.update` wasn't updating in-place for tz-aware datetime64 dtypes (:issue:`56227`) -- MultiIndex ^^^^^^^^^^ - Bug in :meth:`MultiIndex.get_indexer` not raising ``ValueError`` when ``method`` provided and index is non-monotonic (:issue:`53452`) -- I/O ^^^ -- Bug in :func:`read_csv` where ``engine="python"`` did not respect ``chunksize`` arg when ``skiprows`` was specified. (:issue:`56323`) -- Bug in :func:`read_csv` where ``engine="python"`` was causing a ``TypeError`` when a callable ``skiprows`` and a chunk size was specified. (:issue:`55677`) -- Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. 
This now yields a :class:`.errors.ParserWarning` (:issue:`54296`) +- Bug in :func:`read_csv` where ``engine="python"`` did not respect ``chunksize`` arg when ``skiprows`` was specified (:issue:`56323`) +- Bug in :func:`read_csv` where ``engine="python"`` was causing a ``TypeError`` when a callable ``skiprows`` and a chunk size was specified (:issue:`55677`) +- Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raising a Python warning; this now yields a :class:`.errors.ParserWarning` (:issue:`54296`) - Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``quotechar`` was ignored (:issue:`52266`) -- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`) -- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) +- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a CSV with no headers (:issue:`54459`) +- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when the file contains ``NaN`` or ``Inf`` (:issue:`54564`) - Bug in :func:`read_json` not handling dtype conversion properly if ``infer_string`` is set (:issue:`56195`) -- Bug in :meth:`DataFrame.to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`) +- Bug in :meth:`DataFrame.to_excel`, with ``OdsWriter`` (``ods`` files) writing Boolean/string value (:issue:`54994`) - Bug in :meth:`DataFrame.to_hdf` and :func:`read_hdf` with ``datetime64`` dtypes with non-nanosecond resolution failing to round-trip correctly (:issue:`55622`) -- Bug in :meth:`~pandas.read_excel` with ``engine="odf"`` (``ods`` files) when string contains annotation (:issue:`55200`) +- Bug in :meth:`DataFrame.to_stata` raising for extension dtypes (:issue:`54671`) +- Bug in :meth:`~pandas.read_excel` with ``engine="odf"`` (``ods`` files) when a string cell contains an annotation (:issue:`55200`) - Bug in :meth:`~pandas.read_excel` with an ODS file without cached formatted cell for float values (:issue:`55219`) - Bug where :meth:`DataFrame.to_json` would raise an ``OverflowError`` instead of a ``TypeError`` with unsupported NumPy types (:issue:`55403`) @@ -808,29 +877,30 @@ Period - Bug in :class:`PeriodIndex` construction when more than one of ``data``, ``ordinal`` and ``**fields`` are passed failing to raise ``ValueError`` (:issue:`55961`) - Bug in :class:`Period` addition silently wrapping around instead of raising ``OverflowError`` (:issue:`55503`) - Bug in casting from :class:`PeriodDtype` with ``astype`` to ``datetime64`` or :class:`DatetimeTZDtype` with non-nanosecond unit incorrectly returning with nanosecond unit (:issue:`55958`) -- Plotting ^^^^^^^^ -- Bug in :meth:`DataFrame.plot.box` with ``vert=False`` and a matplotlib ``Axes`` created with ``sharey=True`` (:issue:`54941`) -- Bug in :meth:`DataFrame.plot.scatter` discaring string columns (:issue:`56142`) +- Bug in :meth:`DataFrame.plot.box` with ``vert=False`` and a Matplotlib ``Axes`` created with ``sharey=True`` (:issue:`54941`) +- Bug in :meth:`DataFrame.plot.scatter` discarding string columns (:issue:`56142`) - Bug in :meth:`Series.plot` when reusing an ``ax`` object failing to raise when a ``how`` keyword is passed (:issue:`55953`) Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug in :class:`.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and 
``closed='neither'`` (:issue:`20712`) - Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, and :meth:`.SeriesGroupBy.idxmax` would not retain :class:`.Categorical` dtype when the index was a :class:`.CategoricalIndex` that contained NA values (:issue:`54234`) - Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` when ``observed=False`` and ``f="idxmin"`` or ``f="idxmax"`` would incorrectly raise on unobserved categories (:issue:`54234`) -- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_count` could result in incorrect sorting if the columns of the DataFrame or name of the Series are integers (:issue:`55951`) -- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_count` would not respect ``sort=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`55951`) -- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_count` would sort by proportions rather than frequencies when ``sort=True`` and ``normalize=True`` (:issue:`55951`) +- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` could result in incorrect sorting if the columns of the DataFrame or name of the Series are integers (:issue:`55951`) +- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` would not respect ``sort=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`55951`) +- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` would sort by proportions rather than frequencies when ``sort=True`` and ``normalize=True`` (:issue:`55951`) - Bug in :meth:`DataFrame.asfreq` and :meth:`Series.asfreq` with a :class:`DatetimeIndex` with non-nanosecond resolution incorrectly converting to nanosecond resolution (:issue:`55958`) - Bug in :meth:`DataFrame.ewm` when passed ``times`` with non-nanosecond ``datetime64`` or :class:`DatetimeTZDtype` dtype (:issue:`56262`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` where grouping by a combination of ``Decimal`` and NA values would fail when ``sort=True`` (:issue:`54847`) +- Bug in :meth:`DataFrame.groupby` for DataFrame subclasses when selecting a subset of columns to apply the function to (:issue:`56761`) - Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Bug in :meth:`DataFrame.resample` when resampling on a :class:`ArrowDtype` of ``pyarrow.timestamp`` or ``pyarrow.duration`` type (:issue:`55989`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) +- Bug in :meth:`DataFrame.rolling` and :meth:`Series.rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) +- Bug in :meth:`DataFrame.rolling` and :meth:`Series.rolling` where either the ``index`` or ``on`` column was :class:`ArrowDtype` with ``pyarrow.timestamp`` type (:issue:`55849`) Reshaping ^^^^^^^^^ @@ -839,50 +909,41 @@ Reshaping - Bug in :func:`merge_asof` raising ``TypeError`` when ``by`` dtype is not ``object``, ``int64``, or ``uint64`` (:issue:`22794`) - Bug in :func:`merge_asof` raising incorrect error for 
string dtype (:issue:`56444`) - Bug in :func:`merge_asof` when using a :class:`Timedelta` tolerance on a :class:`ArrowDtype` column (:issue:`56486`) +- Bug in :func:`merge` not raising when merging datetime columns with timedelta columns (:issue:`56455`) - Bug in :func:`merge` not raising when merging string columns with numeric columns (:issue:`56441`) +- Bug in :func:`merge` not sorting for new string dtype (:issue:`56442`) - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) - Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`) - Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) - Bug in :meth:`DataFrame.pivot_table` where the row margin is incorrect when the columns have numeric names (:issue:`26568`) - Bug in :meth:`DataFrame.pivot` with numeric columns and extension dtype for data (:issue:`56528`) -- Bug in :meth:`DataFrame.stack` and :meth:`Series.stack` with ``future_stack=True`` would not preserve NA values in the index (:issue:`56573`) +- Bug in :meth:`DataFrame.stack` with ``future_stack=True`` would not preserve NA values in the index (:issue:`56573`) Sparse ^^^^^^ -- Bug in :meth:`SparseArray.take` when using a different fill value than the array's fill value (:issue:`55181`) -- - -ExtensionArray -^^^^^^^^^^^^^^ -- -- - -Styler -^^^^^^ -- -- +- Bug in :meth:`arrays.SparseArray.take` when using a different fill value than the array's fill value (:issue:`55181`) Other ^^^^^ +- :meth:`DataFrame.__dataframe__` did not support pyarrow large strings (:issue:`56702`) - Bug in :func:`DataFrame.describe` when formatting percentiles in the resulting percentile 99.999% is rounded to 100% (:issue:`55765`) +- Bug in :func:`api.interchange.from_dataframe` where it raised ``NotImplementedError`` when handling empty string columns (:issue:`56703`) - Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`) - Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`) - Bug in :func:`infer_freq` and :meth:`DatetimeIndex.inferred_freq` with weekly frequencies and non-nanosecond resolutions (:issue:`55609`) - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) - Bug in :meth:`DataFrame.from_dict` which would always sort the rows of the created :class:`DataFrame`. (:issue:`55683`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` raising a ``ValueError`` (:issue:`56478`) -- Bug in rendering ``inf`` values inside a a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`) +- Bug in rendering ``inf`` values inside a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`) - Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`) - Bug in the error message when assigning an empty :class:`DataFrame` to a column (:issue:`55956`) - Bug when time-like strings were being cast to :class:`ArrowDtype` with ``pyarrow.time64`` type (:issue:`56463`) - -.. 
***DO NOT USE THIS SECTION*** - -- -- +- Fixed a spurious deprecation warning from ``numba`` >= 0.58.0 when passing a numpy ufunc in :class:`core.window.Rolling.apply` with ``engine="numba"`` (:issue:`55247`) .. --------------------------------------------------------------------------- .. _whatsnew_220.contributors: Contributors ~~~~~~~~~~~~ + +.. contributors:: v2.1.4..v2.2.0 diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst new file mode 100644 index 0000000000000..4db0069ec4b95 --- /dev/null +++ b/doc/source/whatsnew/v2.2.1.rst @@ -0,0 +1,90 @@ +.. _whatsnew_221: + +What's new in 2.2.1 (February 22, 2024) +--------------------------------------- + +These are the changes in pandas 2.2.1. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_221.enhancements: + +Enhancements +~~~~~~~~~~~~ +- Added ``pyarrow`` pip extra so users can install pandas and pyarrow with pip with ``pip install pandas[pyarrow]`` (:issue:`54466`) + +.. _whatsnew_221.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- Fixed memory leak in :func:`read_csv` (:issue:`57039`) +- Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) +- Fixed regression causing overflow for near-minimum timestamps (:issue:`57150`) +- Fixed regression in :func:`concat` changing long-standing behavior that always sorted the non-concatenation axis when the axis was a :class:`DatetimeIndex` (:issue:`57006`) +- Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) +- Fixed regression in :func:`pandas.testing.assert_series_equal` defaulting to ``check_exact=True`` when checking the :class:`Index` (:issue:`57067`) +- Fixed regression in :func:`read_json` where an :class:`Index` would be returned instead of a :class:`RangeIndex` (:issue:`57429`) +- Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) +- Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) +- Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) +- Fixed regression in :meth:`CategoricalIndex.difference` raising ``KeyError`` when other contains null values other than NaN (:issue:`57318`) +- Fixed regression in :meth:`DataFrame.groupby` raising ``ValueError`` when grouping by a :class:`Series` in some cases (:issue:`57276`) +- Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) +- Fixed regression in :meth:`DataFrame.loc` which was unnecessarily throwing "incompatible dtype warning" when expanding with partial row indexer and multiple columns (see `PDEP6 `_) (:issue:`56503`) +- Fixed regression in :meth:`DataFrame.map` with ``na_action="ignore"`` not being respected for NumPy nullable and :class:`ArrowDtypes` (:issue:`57316`) +- Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`) +- Fixed regression in :meth:`DataFrame.query` with all 
``NaT`` column with object dtype (:issue:`57068`) +- Fixed regression in :meth:`DataFrame.shift` raising ``AssertionError`` for ``axis=1`` and empty :class:`DataFrame` (:issue:`57301`) +- Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for an index with duplicates (:issue:`57151`) +- Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) +- Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`) +- Fixed regression in :meth:`DataFrame.to_sql` when ``method="multi"`` is passed and the dialect type is not Oracle (:issue:`57310`) +- Fixed regression in :meth:`DataFrame.transpose` with nullable extension dtypes not having F-contiguous data potentially causing exceptions when used (:issue:`57315`) +- Fixed regression in :meth:`DataFrame.update` emitting incorrect warnings about downcasting (:issue:`57124`) +- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) +- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) +- Fixed regression in :meth:`ExtensionArray.to_numpy` raising for non-numeric masked dtypes (:issue:`56991`) +- Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) +- Fixed regression in :meth:`Series.astype` introducing decimals when converting from integer with missing values to string dtype (:issue:`57418`) +- Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) +- Fixed regression in :meth:`Series.to_numpy` when dtype is given as float and the data contains NaNs (:issue:`57121`) +- Fixed regression in addition or subtraction of :class:`DateOffset` objects with millisecond components to ``datetime64`` :class:`Index`, :class:`Series`, or :class:`DataFrame` (:issue:`57529`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_221.bug_fixes: + +Bug fixes +~~~~~~~~~ +- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for nullable integers (:issue:`55069`) +- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for empty inputs (:issue:`56700`) +- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which wasn't converting column names to strings (:issue:`55069`) +- Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`) +- Fixed bug in :meth:`PeriodIndex.asfreq` which was silently converting frequencies which are not supported as period frequencies instead of raising an error (:issue:`56945`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_221.other: + +Other +~~~~~ + +.. note:: + + The ``DeprecationWarning`` that was raised when pandas was imported without PyArrow being + installed has been removed. This decision was made because the warning was too noisy for too + many users and a lot of feedback was collected about the decision to make PyArrow a required + dependency.
Whether or not PyArrow should be added + as a hard dependency in pandas 3.0 is still under consideration. Interested users can follow the discussion + `here <https://github.com/pandas-dev/pandas/issues/57073>`_. + +- Added the argument ``skipna`` to :meth:`DataFrameGroupBy.first`, :meth:`DataFrameGroupBy.last`, :meth:`SeriesGroupBy.first`, and :meth:`SeriesGroupBy.last`; achieving ``skipna=False`` used to be available via :meth:`DataFrameGroupBy.nth`, but the behavior was changed in pandas 2.0.0 (:issue:`57019`) +- Added the argument ``skipna`` to :meth:`Resampler.first` and :meth:`Resampler.last` (:issue:`57019`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_221.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v2.2.0..v2.2.1 diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst new file mode 100644 index 0000000000000..fbe5e9b4febb5 --- /dev/null +++ b/doc/source/whatsnew/v2.2.2.rst @@ -0,0 +1,59 @@ +.. _whatsnew_222: + +What's new in 2.2.2 (April 10, 2024) +------------------------------------ + +These are the changes in pandas 2.2.2. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_222.np2_compat: + +Pandas 2.2.2 is now compatible with numpy 2.0 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Pandas 2.2.2 is the first version of pandas that is generally compatible with the upcoming +numpy 2.0 release, and wheels for pandas 2.2.2 will work with both numpy 1.x and 2.x. + +One major caveat is that arrays created with numpy 2.0's new ``StringDtype`` will convert +to ``object``-dtyped arrays upon :class:`Series`/:class:`DataFrame` creation. +Full support for numpy 2.0's StringDtype is expected to land in pandas 3.0. + +As usual, please report any bugs discovered to our `issue tracker <https://github.com/pandas-dev/pandas/issues>`_. + +.. _whatsnew_222.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's dtype was a pandas nullable dtype with missing values (:issue:`56702`) +- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's dtype was a pyarrow nullable dtype with missing values (:issue:`57664`) +- Avoid issuing a spurious ``DeprecationWarning`` when a custom :class:`DataFrame` or :class:`Series` subclass method is called (:issue:`57553`) +- Fixed regression in the precision of :func:`to_datetime` with string and ``unit`` input (:issue:`57051`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_222.bug_fixes: + +Bug fixes +~~~~~~~~~ +- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's dtype was nullable boolean (:issue:`55332`) +- :meth:`DataFrame.__dataframe__` was showing a bytemask instead of a bitmask for the ``'string[pyarrow]'`` validity buffer (:issue:`57762`) +- :meth:`DataFrame.__dataframe__` was showing a non-null validity buffer (instead of ``None``) for ``'string[pyarrow]'`` columns without missing values (:issue:`57761`) +- :meth:`DataFrame.to_sql` was failing to find the right table when using the ``schema`` argument (:issue:`57539`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_222.other: + +Other +~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_222.contributors: + +Contributors +~~~~~~~~~~~~ + +.. 
contributors:: v2.2.1..v2.2.2 diff --git a/doc/source/whatsnew/v2.2.3.rst b/doc/source/whatsnew/v2.2.3.rst new file mode 100644 index 0000000000000..1696a7b6449af --- /dev/null +++ b/doc/source/whatsnew/v2.2.3.rst @@ -0,0 +1,45 @@ +.. _whatsnew_223: + +What's new in 2.2.3 (September 20, 2024) +---------------------------------------- + +These are the changes in pandas 2.2.3. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_223.py13_compat: + +Pandas 2.2.3 is now compatible with Python 3.13 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Pandas 2.2.3 is the first version of pandas that is generally compatible with the upcoming +Python 3.13, and wheels for both the free-threaded and standard Python 3.13 builds will be uploaded for +this release. + +As usual, please report any bugs discovered to our `issue tracker <https://github.com/pandas-dev/pandas/issues>`_. + +.. --------------------------------------------------------------------------- +.. _whatsnew_223.bug_fixes: + +Bug fixes +~~~~~~~~~ +- Bug in :func:`eval` where division (``/``) involving :class:`complex` values discarded the imaginary part (:issue:`21374`) +- Minor fixes for numpy 2.1 compatibility (:issue:`59444`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_223.other: + +Other +~~~~~ +- Missing licenses for 3rd-party dependencies were added back into the wheels (:issue:`58632`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_223.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v2.2.2..v2.2.3|HEAD diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst new file mode 100644 index 0000000000000..fc60789801ce7 --- /dev/null +++ b/doc/source/whatsnew/v2.3.0.rst @@ -0,0 +1,196 @@ +.. _whatsnew_230: + +What's new in 2.3.0 (Month XX, 2024) +------------------------------------ + +These are the changes in pandas 2.3.0. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_230.upcoming_changes: + +Upcoming changes in pandas 3.0 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +.. _whatsnew_230.enhancements: + +Enhancements +~~~~~~~~~~~~ + +.. _whatsnew_230.enhancements.enhancement1: + +enhancement1 +^^^^^^^^^^^^ + + +.. _whatsnew_230.enhancements.other: + +Other enhancements +^^^^^^^^^^^^^^^^^^ + +- The semantics of the ``copy`` keyword in ``__array__`` methods (i.e. called + when using ``np.array()`` or ``np.asarray()`` on pandas objects) have been + updated to raise a ``FutureWarning`` with NumPy >= 2 (:issue:`60340`) +- :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is ``True`` (:issue:`60709`) +- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`) +- Improved ``repr`` of :class:`.NumpyExtensionArray` to account for NEP 51 (:issue:`61085`) +- :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`) +- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`) +- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) + +.. --------------------------------------------------------------------------- +.. 
_whatsnew_230.notable_bug_fixes: + +Notable bug fixes +~~~~~~~~~~~~~~~~~ + +These are bug fixes that might have notable behavior changes. + +.. _whatsnew_230.notable_bug_fixes.notable_bug_fix1: + +notable_bug_fix1 +^^^^^^^^^^^^^^^^ + + +Increased minimum version for Python +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +pandas 2.3.0 supports Python 3.10 and higher. + +.. --------------------------------------------------------------------------- +.. _whatsnew_230.deprecations: + +Deprecations +~~~~~~~~~~~~ +- Deprecated allowing non-``bool`` values for ``na`` in :meth:`.str.contains`, :meth:`.str.startswith`, and :meth:`.str.endswith` for dtypes that do not already disallow these (:issue:`59615`) +- Deprecated the ``"pyarrow_numpy"`` storage option for :class:`StringDtype` (:issue:`60152`) +- The deprecation of setting the argument ``include_groups`` to ``True`` in :meth:`DataFrameGroupBy.apply` has been promoted from a ``DeprecationWarning`` to ``FutureWarning``; only ``False`` will be allowed (:issue:`7155`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_230.performance: + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_230.bug_fixes: + +Bug fixes +~~~~~~~~~ + +Categorical +^^^^^^^^^^^ +- +- + +Datetimelike +^^^^^^^^^^^^ +- +- + +Timedelta +^^^^^^^^^ +- +- + +Timezones +^^^^^^^^^ +- +- + +Numeric +^^^^^^^ +- Enabled :class:`Series.mode` and :class:`DataFrame.mode` with ``dropna=False`` to sort the result for all dtypes in the presence of NA values; previously only certain dtypes would sort (:issue:`60702`) +- + +Conversion +^^^^^^^^^^ +- +- + +Strings +^^^^^^^ +- Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` did not raise for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`) +- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`) +- Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`) +- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) +- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`) +- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) + +Interval +^^^^^^^^ +- +- + +Indexing +^^^^^^^^ +- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`) +- + +Missing +^^^^^^^ +- +- + +MultiIndex +^^^^^^^^^^ +- +- + +I/O +^^^ +- +- + +Period +^^^^^^ +- +- + +Plotting +^^^^^^^^ +- +- + +Groupby/resample/rolling +^^^^^^^^^^^^^^^^^^^^^^^^ +- +- + +Reshaping +^^^^^^^^^ +- +- + +Sparse +^^^^^^ +- +- + +ExtensionArray +^^^^^^^^^^^^^^ +- +- + +Styler +^^^^^^ +- +- + +Other +^^^^^ +- Fixed usage of ``inspect`` when the optional dependencies ``pyarrow`` or ``jinja2`` + are not installed (:issue:`60196`) +- + +.. --------------------------------------------------------------------------- +.. 
_whatsnew_230.contributors: + +Contributors +~~~~~~~~~~~~ diff --git a/environment.yml b/environment.yml index 74317d47e2e53..d5d6c329dae8a 100644 --- a/environment.yml +++ b/environment.yml @@ -7,9 +7,9 @@ dependencies: - pip # build dependencies - - versioneer[toml] + - versioneer - cython=3.0.5 - - meson[ninja]=1.2.1 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies @@ -54,7 +54,7 @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <=2024.9.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 @@ -76,7 +76,7 @@ dependencies: # code checks - flake8=6.1.0 # run in subprocess over docstring examples - - mypy=1.7.1 # pre-commit uses locally installed mypy + - mypy=1.8.0 # pre-commit uses locally installed mypy - tokenize-rt # scripts/check_for_inconsistent_pandas_namespace.py - pre-commit>=3.6.0 diff --git a/pandas/__init__.py b/pandas/__init__.py index 7fab662ed2de4..ca2eba2043292 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -202,8 +202,8 @@ FutureWarning, stacklevel=2, ) -# Don't allow users to use pandas.os or pandas.warnings -del os, warnings + +del warnings, os # module level doc-string __doc__ = """ diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py index 97784c924dab4..838b6affd2836 100644 --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py @@ -52,6 +52,6 @@ def using_nullable_dtypes() -> bool: return _mode_options["nullable_dtypes"] -def using_pyarrow_string_dtype() -> bool: +def using_string_dtype() -> bool: _mode_options = _global_config["future"] return _mode_options["infer_string"] diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 88c3abba506a3..385727fad3c50 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -184,6 +184,17 @@ def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, fv = fill_value + {{if c_type_in == c_type_out != "object"}} + with nogil: + for i in range(n): + for j in range(k): + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + {{else}} for i in range(n): for j in range(k): idx = indexer[j] @@ -195,6 +206,7 @@ def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, {{else}} out[i, j] = values[i, idx] {{endif}} + {{endif}} @cython.wraparound(False) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 9889436a542c1..2932f3ff56396 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -67,6 +67,10 @@ cdef class NDArrayBacked: """ Construct a new ExtensionArray `new_array` with `arr` as its _ndarray. + The returned array has the same dtype as self. + + Caller is responsible for ensuring `values.dtype == self._ndarray.dtype`. + This should round-trip: self == self._from_backing_data(self._ndarray) """ diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 135828a23648a..a494b61fa7e3d 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -136,6 +136,7 @@ def group_last( result_mask: npt.NDArray[np.bool_] | None = ..., min_count: int = ..., # Py_ssize_t is_datetimelike: bool = ..., + skipna: bool = ..., ) -> None: ... def group_nth( out: np.ndarray, # rank_t[:, ::1] @@ -147,6 +148,7 @@ def group_nth( min_count: int = ..., # int64_t rank: int = ..., # int64_t is_datetimelike: bool = ..., + skipna: bool = ..., ) -> None: ... 
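For context, the new ``skipna`` parameter threaded through these ``group_last``/``group_nth`` stubs is the Cython plumbing behind the ``skipna`` keyword on :meth:`DataFrameGroupBy.first`/:meth:`DataFrameGroupBy.last` described in the v2.2.1 notes above. A minimal sketch of the user-facing behavior (assuming pandas >= 2.2.1, where the keyword landed; the example frame is hypothetical)::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "val": [np.nan, 1.0, 2.0]})

    # Default skipna=True: the first non-missing value per group is taken,
    # so group "a" yields 1.0.
    print(df.groupby("key")["val"].first())

    # skipna=False: the literal first row per group is taken,
    # so group "a" yields NaN.
    print(df.groupby("key")["val"].first(skipna=False))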
def group_rank( out: np.ndarray, # float64_t[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 19d71b0a6fde3..b855d64d0be18 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1424,6 +1424,7 @@ def group_last( uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=-1, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -1458,14 +1459,19 @@ def group_last( for j in range(K): val = values[i, j] - if uses_mask: - isna_entry = mask[i, j] - else: - isna_entry = _treat_as_na(val, is_datetimelike) + if skipna: + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, is_datetimelike) + if isna_entry: + continue - if not isna_entry: - nobs[lab, j] += 1 - resx[lab, j] = val + nobs[lab, j] += 1 + resx[lab, j] = val + + if uses_mask and not skipna: + result_mask[lab, j] = mask[i, j] _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx @@ -1486,6 +1492,7 @@ def group_nth( int64_t min_count=-1, int64_t rank=1, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -1520,15 +1527,19 @@ def group_nth( for j in range(K): val = values[i, j] - if uses_mask: - isna_entry = mask[i, j] - else: - isna_entry = _treat_as_na(val, is_datetimelike) + if skipna: + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, is_datetimelike) + if isna_entry: + continue - if not isna_entry: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + if uses_mask and not skipna: + result_mask[lab, j] = mask[i, j] _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx @@ -1767,6 +1778,7 @@ def group_idxmin_idxmax( Py_ssize_t i, j, N, K, lab numeric_object_t val numeric_object_t[:, ::1] group_min_or_max + uint8_t[:, ::1] seen bint uses_mask = mask is not None bint isna_entry bint compute_max = name == "idxmax" @@ -1780,13 +1792,10 @@ def group_idxmin_idxmax( if numeric_object_t is object: group_min_or_max = np.empty((out).shape, dtype=object) + seen = np.zeros((out).shape, dtype=np.uint8) else: group_min_or_max = np.empty_like(out, dtype=values.dtype) - if N > 0 and K > 0: - # When N or K is zero, we never use group_min_or_max - group_min_or_max[:] = _get_min_or_max( - values[0, 0], compute_max, is_datetimelike - ) + seen = np.zeros_like(out, dtype=np.uint8) # When using transform, we need a valid value for take in the case # a category is not observed; these values will be dropped @@ -1802,6 +1811,7 @@ def group_idxmin_idxmax( if not skipna and out[lab, j] == -1: # Once we've hit NA there is no going back continue + val = values[i, j] if uses_mask: @@ -1810,10 +1820,14 @@ def group_idxmin_idxmax( isna_entry = _treat_as_na(val, is_datetimelike) if isna_entry: - if not skipna: + if not skipna or not seen[lab, j]: out[lab, j] = -1 else: - if compute_max: + if not seen[lab, j]: + seen[lab, j] = True + group_min_or_max[lab, j] = val + out[lab, j] = i + elif compute_max: if val > group_min_or_max[lab, j]: group_min_or_max[lab, j] = val out[lab, j] = i diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index ccac3d0b50d45..127b0b845d219 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -33,7 +33,10 @@ from pandas._libs.khash cimport ( kh_python_hash_func, khiter_t, ) -from pandas._libs.missing cimport checknull +from pandas._libs.missing cimport ( + checknull, + 
is_matching_na, +) def get_hashtable_trace_domain(): diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c0723392496c1..c42bccb7f38f7 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -1121,11 +1121,13 @@ cdef class StringHashTable(HashTable): const char **vecs khiter_t k bint use_na_value + bint non_null_na_value if return_inverse: labels = np.zeros(n, dtype=np.intp) uindexer = np.empty(n, dtype=np.int64) use_na_value = na_value is not None + non_null_na_value = not checknull(na_value) # assign pointers and pre-filter out missing (if ignore_na) vecs = malloc(n * sizeof(char *)) @@ -1134,7 +1136,12 @@ cdef class StringHashTable(HashTable): if (ignore_na and (not isinstance(val, str) - or (use_na_value and val == na_value))): + or (use_na_value and ( + (non_null_na_value and val == na_value) or + (not non_null_na_value and is_matching_na(val, na_value))) + ) + ) + ): # if missing values do not count as unique values (i.e. if # ignore_na is True), we can skip the actual value, and # replace the label with na_sentinel directly @@ -1400,10 +1407,11 @@ cdef class PyObjectHashTable(HashTable): object val khiter_t k bint use_na_value - + bint non_null_na_value if return_inverse: labels = np.empty(n, dtype=np.intp) use_na_value = na_value is not None + non_null_na_value = not checknull(na_value) for i in range(n): val = values[i] @@ -1411,7 +1419,11 @@ cdef class PyObjectHashTable(HashTable): if ignore_na and ( checknull(val) - or (use_na_value and val == na_value) + or (use_na_value and ( + (non_null_na_value and val == na_value) or + (not non_null_na_value and is_matching_na(val, na_value)) + ) + ) ): # if missing values do not count as unique values (i.e. if # ignore_na is True), skip the hashtable entry for them, and diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 75db47bf3160e..9c3791a642768 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -68,6 +68,9 @@ class MaskedUInt16Engine(MaskedIndexEngine): ... class MaskedUInt8Engine(MaskedIndexEngine): ... class MaskedBoolEngine(MaskedUInt8Engine): ... +class StringObjectEngine(ObjectEngine): + def __init__(self, values: object, na_value) -> None: ... + class BaseMultiIndexCodesEngine: levels: list[np.ndarray] offsets: np.ndarray # ndarray[uint64_t, ndim=1] diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 0dc139781f58d..8bb839dee436d 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -96,6 +96,20 @@ cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None): return indexer.view(bool) +cdef _maybe_resize_array(ndarray values, Py_ssize_t loc, Py_ssize_t max_length): + """ + Resize array if loc is out of bounds. 
+ """ + cdef: + Py_ssize_t n = len(values) + + if loc >= n: + while loc >= n: + n *= 2 + values = np.resize(values, min(n, max_length)) + return values + + # Don't populate hash tables in monotonic indexes larger than this _SIZE_CUTOFF = 1_000_000 @@ -281,7 +295,7 @@ cdef class IndexEngine: values = self.values self.monotonic_inc, self.monotonic_dec, is_strict_monotonic = \ self._call_monotonic(values) - except TypeError: + except (TypeError, ValueError): self.monotonic_inc = 0 self.monotonic_dec = 0 is_strict_monotonic = 0 @@ -450,27 +464,18 @@ cdef class IndexEngine: # found if val in d: key = val - + result = _maybe_resize_array( + result, + count + len(d[key]) - 1, + max_alloc + ) for j in d[key]: - - # realloc if needed - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result = np.resize(result, n_alloc) - result[count] = j count += 1 # value not found else: - - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result = np.resize(result, n_alloc) + result = _maybe_resize_array(result, count, max_alloc) result[count] = -1 count += 1 missing[count_missing] = i @@ -527,6 +532,24 @@ cdef class ObjectEngine(IndexEngine): return loc +cdef class StringObjectEngine(ObjectEngine): + + cdef: + object na_value + + def __init__(self, ndarray values, na_value): + super().__init__(values) + self.na_value = na_value + + cdef _check_type(self, object val): + if isinstance(val, str): + return val + elif checknull(val): + return self.na_value + else: + raise KeyError(val) + + cdef class DatetimeEngine(Int64Engine): cdef: @@ -1193,13 +1216,12 @@ cdef class MaskedIndexEngine(IndexEngine): if PySequence_GetItem(target_mask, i): if na_pos: + result = _maybe_resize_array( + result, + count + len(na_pos) - 1, + max_alloc, + ) for na_idx in na_pos: - # realloc if needed - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result[count] = na_idx count += 1 continue @@ -1207,23 +1229,18 @@ cdef class MaskedIndexEngine(IndexEngine): elif val in d: # found key = val - + result = _maybe_resize_array( + result, + count + len(d[key]) - 1, + max_alloc, + ) for j in d[key]: - - # realloc if needed - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result[count] = j count += 1 continue # value not found - if count >= n_alloc: - n_alloc += 10_000 - result = np.resize(result, n_alloc) + result = _maybe_resize_array(result, count, max_alloc) result[count] = -1 count += 1 missing[count_missing] = i diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index b9fd970e68f5b..71a4d3ae2575f 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -86,6 +86,7 @@ def maybe_convert_objects( safe: bool = ..., convert_numeric: bool = ..., convert_non_numeric: Literal[False] = ..., + convert_string: Literal[False] = ..., convert_to_nullable_dtype: Literal[False] = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> npt.NDArray[np.object_ | np.number]: ... @@ -97,6 +98,7 @@ def maybe_convert_objects( safe: bool = ..., convert_numeric: bool = ..., convert_non_numeric: bool = ..., + convert_string: bool = ..., convert_to_nullable_dtype: Literal[True] = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @@ -108,6 +110,7 @@ def maybe_convert_objects( safe: bool = ..., convert_numeric: bool = ..., convert_non_numeric: bool = ..., + convert_string: bool = ..., convert_to_nullable_dtype: bool = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... 
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c483f35513a40..87cbadaa811f7 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -37,7 +37,7 @@ from cython cimport ( floating, ) -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs.missing import check_na_tuples_nonequal @@ -736,7 +736,9 @@ cpdef ndarray[object] ensure_string_array( convert_na_value : bool, default True If False, existing na values will be used unchanged in the new array. copy : bool, default True - Whether to ensure that a new array is returned. + Whether to ensure that a new array is returned. When True, a new array + is always returned. When False, a new array is only returned when needed + to avoid mutating the input array. skipna : bool, default True Whether or not to coerce nulls to their stringified form (e.g. if False, NaN becomes 'nan'). @@ -753,22 +755,36 @@ cpdef ndarray[object] ensure_string_array( if hasattr(arr, "to_numpy"): - if hasattr(arr, "dtype") and arr.dtype.kind in "mM": + if ( + hasattr(arr, "dtype") + and arr.dtype.kind in "mM" + # TODO: we should add a custom ArrowExtensionArray.astype implementation + # that handles astype(str) specifically, avoiding ending up here and + # then we can remove the below check for `_pa_array` (for ArrowEA) + and not hasattr(arr, "_pa_array") + ): # dtype check to exclude DataFrame # GH#41409 TODO: not a great place for this out = arr.astype(str).astype(object) out[arr.isna()] = na_value return out - arr = arr.to_numpy() + arr = arr.to_numpy(dtype=object) elif not util.is_array(arr): arr = np.array(arr, dtype="object") result = np.asarray(arr, dtype="object") - if copy and (result is arr or np.shares_memory(arr, result)): - # GH#54654 - result = result.copy() - elif not copy and result is arr: + if result is arr or np.may_share_memory(arr, result): + # if np.asarray(..) 
did not make a copy of the input arr, we still need + # to do that to avoid mutating the input array + # GH#54654: share_memory check is needed for rare cases where np.asarray + # returns a new object without making a copy of the actual data + if copy: + result = result.copy() + else: + already_copied = False + elif not copy and not result.flags.writeable: + # Weird edge case where result is a view already_copied = False if issubclass(arr.dtype.type, np.str_): @@ -1830,7 +1846,7 @@ cdef class BoolValidator(Validator): cpdef bint is_bool_array(ndarray values, bint skipna=False): cdef: - BoolValidator validator = BoolValidator(len(values), + BoolValidator validator = BoolValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1848,7 +1864,7 @@ cdef class IntegerValidator(Validator): # Note: only python-exposed for tests cpdef bint is_integer_array(ndarray values, bint skipna=True): cdef: - IntegerValidator validator = IntegerValidator(len(values), + IntegerValidator validator = IntegerValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1863,7 +1879,7 @@ cdef class IntegerNaValidator(Validator): cdef bint is_integer_na_array(ndarray values, bint skipna=True): cdef: - IntegerNaValidator validator = IntegerNaValidator(len(values), + IntegerNaValidator validator = IntegerNaValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1879,7 +1895,7 @@ cdef class IntegerFloatValidator(Validator): cdef bint is_integer_float_array(ndarray values, bint skipna=True): cdef: - IntegerFloatValidator validator = IntegerFloatValidator(len(values), + IntegerFloatValidator validator = IntegerFloatValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1897,7 +1913,7 @@ cdef class FloatValidator(Validator): # Note: only python-exposed for tests cpdef bint is_float_array(ndarray values): cdef: - FloatValidator validator = FloatValidator(len(values), values.dtype) + FloatValidator validator = FloatValidator(values.size, values.dtype) return validator.validate(values) @@ -1915,7 +1931,7 @@ cdef class ComplexValidator(Validator): cdef bint is_complex_array(ndarray values): cdef: - ComplexValidator validator = ComplexValidator(len(values), values.dtype) + ComplexValidator validator = ComplexValidator(values.size, values.dtype) return validator.validate(values) @@ -1928,7 +1944,7 @@ cdef class DecimalValidator(Validator): cdef bint is_decimal_array(ndarray values, bint skipna=False): cdef: DecimalValidator validator = DecimalValidator( - len(values), values.dtype, skipna=skipna + values.size, values.dtype, skipna=skipna ) return validator.validate(values) @@ -1944,7 +1960,7 @@ cdef class StringValidator(Validator): cpdef bint is_string_array(ndarray values, bint skipna=False): cdef: - StringValidator validator = StringValidator(len(values), + StringValidator validator = StringValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1961,7 +1977,7 @@ cdef class BytesValidator(Validator): cdef bint is_bytes_array(ndarray values, bint skipna=False): cdef: - BytesValidator validator = BytesValidator(len(values), values.dtype, + BytesValidator validator = BytesValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -2012,7 +2028,7 @@ cdef class DatetimeValidator(TemporalValidator): cpdef bint is_datetime_array(ndarray values, bint skipna=True): cdef: - DatetimeValidator validator = DatetimeValidator(len(values), + 
DatetimeValidator validator = DatetimeValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2026,7 +2042,7 @@ cdef class Datetime64Validator(DatetimeValidator): # Note: only python-exposed for tests cpdef bint is_datetime64_array(ndarray values, bint skipna=True): cdef: - Datetime64Validator validator = Datetime64Validator(len(values), + Datetime64Validator validator = Datetime64Validator(values.size, skipna=skipna) return validator.validate(values) @@ -2041,7 +2057,7 @@ cdef class AnyDatetimeValidator(DatetimeValidator): cdef bint is_datetime_or_datetime64_array(ndarray values, bint skipna=True): cdef: - AnyDatetimeValidator validator = AnyDatetimeValidator(len(values), + AnyDatetimeValidator validator = AnyDatetimeValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2053,7 +2069,7 @@ def is_datetime_with_singletz_array(values: ndarray) -> bool: Doesn't check values are datetime-like types. """ cdef: - Py_ssize_t i = 0, j, n = len(values) + Py_ssize_t i = 0, j, n = values.size object base_val, base_tz, val, tz if n == 0: @@ -2101,7 +2117,7 @@ cpdef bint is_timedelta_or_timedelta64_array(ndarray values, bint skipna=True): Infer with timedeltas and/or nat/none. """ cdef: - AnyTimedeltaValidator validator = AnyTimedeltaValidator(len(values), + AnyTimedeltaValidator validator = AnyTimedeltaValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2115,7 +2131,7 @@ cdef class DateValidator(Validator): # Note: only python-exposed for tests cpdef bint is_date_array(ndarray values, bint skipna=False): cdef: - DateValidator validator = DateValidator(len(values), skipna=skipna) + DateValidator validator = DateValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2128,7 +2144,7 @@ cdef class TimeValidator(Validator): # Note: only python-exposed for tests cpdef bint is_time_array(ndarray values, bint skipna=False): cdef: - TimeValidator validator = TimeValidator(len(values), skipna=skipna) + TimeValidator validator = TimeValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2179,14 +2195,14 @@ cpdef bint is_interval_array(ndarray values): Is this an ndarray of Interval (or np.nan) with a single dtype? """ cdef: - Py_ssize_t i, n = len(values) + Py_ssize_t i, n = values.size str closed = None bint numeric = False bint dt64 = False bint td64 = False object val - if len(values) == 0: + if n == 0: return False for i in range(n): @@ -2482,6 +2498,7 @@ def maybe_convert_objects(ndarray[object] objects, bint convert_numeric=True, # NB: different default! 
bint convert_to_nullable_dtype=False, bint convert_non_numeric=False, + bint convert_string=True, object dtype_if_all_nat=None) -> "ArrayLike": """ Type inference function-- convert object array to proper dtype @@ -2725,10 +2742,20 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True): + if convert_to_nullable_dtype and is_string_array(objects, skipna=True): + from pandas.core.arrays.string_ import StringDtype + + dtype = StringDtype() + return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) + + elif ( + convert_string + and using_string_dtype() + and is_string_array(objects, skipna=True) + ): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(na_value=np.nan) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index 9154e836b3477..567bfc02a2950 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -29,7 +29,7 @@ from pandas._libs.util cimport is_nan @cython.wraparound(False) @cython.boundscheck(False) -def scalar_compare(object[:] values, object val, object op) -> ndarray: +def scalar_compare(ndarray[object] values, object val, object op) -> ndarray: """ Compare each element of `values` array with the scalar `val`, with the comparison operation described by `op`. diff --git a/pandas/_libs/src/datetime/pd_datetime.c b/pandas/_libs/src/datetime/pd_datetime.c index 19de51be6e1b2..4c1969f6d9f57 100644 --- a/pandas/_libs/src/datetime/pd_datetime.c +++ b/pandas/_libs/src/datetime/pd_datetime.c @@ -20,6 +20,9 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt #include #include "datetime.h" +/* Need to import_array for np_datetime.c (for NumPy 1.x support only) */ +#define PY_ARRAY_UNIQUE_SYMBOL PANDAS_DATETIME_NUMPY +#include "numpy/ndarrayobject.h" #include "pandas/datetime/pd_datetime.h" #include "pandas/portable.h" @@ -255,5 +258,6 @@ static struct PyModuleDef pandas_datetimemodule = { PyMODINIT_FUNC PyInit_pandas_datetime(void) { PyDateTime_IMPORT; + import_array(); return PyModuleDef_Init(&pandas_datetimemodule); } diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 0e4188bea4dc7..c9f7a796a9b1c 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -109,6 +109,14 @@ void parser_set_default_options(parser_t *self) { parser_t *parser_new(void) { return (parser_t *)calloc(1, sizeof(parser_t)); } +static void parser_clear_data_buffers(parser_t *self) { + free_if_not_null((void *)&self->stream); + free_if_not_null((void *)&self->words); + free_if_not_null((void *)&self->word_starts); + free_if_not_null((void *)&self->line_start); + free_if_not_null((void *)&self->line_fields); +} + static void parser_cleanup(parser_t *self) { // XXX where to put this free_if_not_null((void *)&self->error_msg); @@ -119,6 +127,7 @@ static void parser_cleanup(parser_t *self) { self->skipset = NULL; } + parser_clear_data_buffers(self); if (self->cb_cleanup != NULL) { self->cb_cleanup(self->source); self->cb_cleanup = NULL; diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index 06e3251db8315..934c54fafb634 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -16,8 +16,6 @@ 
This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt // Licence at LICENSES/NUMPY_LICENSE -#define NO_IMPORT - #ifndef NPY_NO_DEPRECATED_API #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #endif // NPY_NO_DEPRECATED_API @@ -25,7 +23,10 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt #include #include "pandas/vendored/numpy/datetime/np_datetime.h" -#include + +#define NO_IMPORT_ARRAY +#define PY_ARRAY_UNIQUE_SYMBOL PANDAS_DATETIME_NUMPY +#include #include #if defined(_WIN32) @@ -482,10 +483,20 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, if (base == NPY_FR_ns) { int64_t nanoseconds; - PD_CHECK_OVERFLOW( - scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds)); - PD_CHECK_OVERFLOW( - checked_int64_add(nanoseconds, dts->ps / 1000, &nanoseconds)); + + // Minimum valid timestamp in nanoseconds (1677-09-21 00:12:43.145224193). + const int64_t min_nanoseconds = NPY_MIN_INT64 + 1; + if (microseconds == min_nanoseconds / 1000 - 1) { + // For values within one microsecond of min_nanoseconds, use it as base + // and offset it with nanosecond delta to avoid overflow during scaling. + PD_CHECK_OVERFLOW(checked_int64_add( + min_nanoseconds, (dts->ps - _NS_MIN_DTS.ps) / 1000, &nanoseconds)); + } else { + PD_CHECK_OVERFLOW( + scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds)); + PD_CHECK_OVERFLOW( + checked_int64_add(nanoseconds, dts->ps / 1000, &nanoseconds)); + } return nanoseconds; } @@ -1060,5 +1071,8 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, */ PyArray_DatetimeMetaData get_datetime_metadata_from_dtype(PyArray_Descr *dtype) { - return (((PyArray_DatetimeDTypeMetaData *)dtype->c_metadata)->meta); +#if NPY_ABI_VERSION < 0x02000000 +#define PyDataType_C_METADATA(dtype) ((dtype)->c_metadata) +#endif + return ((PyArray_DatetimeDTypeMetaData *)PyDataType_C_METADATA(dtype))->meta; } diff --git a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c index 7cc20a52f1849..4cfead8ac77a5 100644 --- a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c @@ -38,9 +38,11 @@ Numeric decoder derived from TCL library // Licence at LICENSES/ULTRAJSON_LICENSE -#include "pandas/vendored/ujson/lib/ultrajson.h" +// clang-format off #define PY_SSIZE_T_CLEAN #include +#include "pandas/vendored/ujson/lib/ultrajson.h" +// clang-format on static int Object_objectAddKey(void *Py_UNUSED(prv), JSOBJ obj, JSOBJ name, JSOBJ value) { diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 8bba95dd456de..5f35860c59cb7 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -74,7 +74,6 @@ typedef struct __NpyArrContext { npy_intp ndim; npy_intp index[NPY_MAXDIMS]; int type_num; - PyArray_GetItemFunc *getitem; char **rowLabels; char **columnLabels; @@ -405,15 +404,14 @@ static void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { } npyarr->array = (PyObject *)obj; - npyarr->getitem = (PyArray_GetItemFunc *)PyArray_DESCR(obj)->f->getitem; npyarr->dataptr = PyArray_DATA(obj); npyarr->ndim = PyArray_NDIM(obj) - 1; npyarr->curdim = 0; npyarr->type_num = PyArray_DESCR(obj)->type_num; if (GET_TC(tc)->transpose) { - npyarr->dim = PyArray_DIM(obj, npyarr->ndim); - npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); + npyarr->dim = PyArray_DIM(obj, (int)npyarr->ndim); + npyarr->stride = PyArray_STRIDE(obj, (int)npyarr->ndim); 
npyarr->stridedim = npyarr->ndim; npyarr->index[npyarr->ndim] = 0; npyarr->inc = -1; @@ -447,8 +445,15 @@ static void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { npyarr->curdim--; npyarr->dataptr -= npyarr->stride * npyarr->index[npyarr->stridedim]; npyarr->stridedim -= npyarr->inc; - npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); + + if (!PyArray_Check(npyarr->array)) { + PyErr_SetString(PyExc_TypeError, + "NpyArrayPassThru_iterEnd received a non-array object"); + return; + } + const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array; + npyarr->dim = PyArray_DIM(arrayobj, (int)npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(arrayobj, (int)npyarr->stridedim); npyarr->dataptr += npyarr->stride; NpyArr_freeItemValue(obj, tc); @@ -467,18 +472,25 @@ static int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { NpyArr_freeItemValue(obj, tc); - if (PyArray_ISDATETIME(npyarr->array)) { + if (!PyArray_Check(npyarr->array)) { + PyErr_SetString(PyExc_TypeError, + "NpyArr_iterNextItem received a non-array object"); + return 0; + } + PyArrayObject *arrayobj = (PyArrayObject *)npyarr->array; + + if (PyArray_ISDATETIME(arrayobj)) { GET_TC(tc)->itemValue = obj; Py_INCREF(obj); - ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array); + ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(arrayobj); // Also write the resolution (unit) of the ndarray - PyArray_Descr *dtype = PyArray_DESCR(npyarr->array); + PyArray_Descr *dtype = PyArray_DESCR(arrayobj); ((PyObjectEncoder *)tc->encoder)->valueUnit = get_datetime_metadata_from_dtype(dtype).base; ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr; ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; } else { - GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array); + GET_TC(tc)->itemValue = PyArray_GETITEM(arrayobj, npyarr->dataptr); } npyarr->dataptr += npyarr->stride; @@ -505,8 +517,15 @@ static int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) { npyarr->curdim++; npyarr->stridedim += npyarr->inc; - npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); + if (!PyArray_Check(npyarr->array)) { + PyErr_SetString(PyExc_TypeError, + "NpyArr_iterNext received a non-array object"); + return 0; + } + const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array; + + npyarr->dim = PyArray_DIM(arrayobj, (int)npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(arrayobj, (int)npyarr->stridedim); npyarr->index[npyarr->stridedim] = 0; ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; @@ -1610,7 +1629,14 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (!values) { goto INVALID; } - pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); + + if (!PyArray_Check(pc->newObj)) { + PyErr_SetString(PyExc_TypeError, + "Object_beginTypeContext received a non-array object"); + goto INVALID; + } + const PyArrayObject *arrayobj = (const PyArrayObject *)pc->newObj; + pc->columnLabelsLen = PyArray_DIM(arrayobj, 0); pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->columnLabelsLen); if (!pc->columnLabels) { diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 017fdc4bc834f..dd23c2f27ca09 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -277,7 +277,7 @@ def array_with_unit_to_datetime( bint is_raise = errors == "raise" ndarray[int64_t] iresult 
tzinfo tz = None - float fval + double fval assert is_ignore or is_coerce or is_raise diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index cb2658d343772..a8ac80a2d0f39 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -89,7 +89,7 @@ cdef int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, - format: str | None = *, + str format = *, bint exact = * ) except? -1 diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index aa01a05d0d932..779d1e3111932 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -331,7 +331,7 @@ cdef int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, - format: str | None=None, + str format=None, bint exact=True, ) except? -1: cdef: diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index b3788b6003e67..5dacd7dd55231 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -756,11 +756,14 @@ cdef class BaseOffset: raise ValueError(f"{self} is a non-fixed frequency") def is_anchored(self) -> bool: - # TODO: Does this make sense for the general case? It would help - # if there were a canonical docstring for what is_anchored means. + # GH#55388 """ Return boolean whether the frequency is a unit frequency (n=1). + .. deprecated:: 2.2.0 + is_anchored is deprecated and will be removed in a future version. + Use ``obj.n == 1`` instead. + Examples -------- >>> pd.DateOffset().is_anchored() @@ -768,6 +771,12 @@ cdef class BaseOffset: >>> pd.DateOffset(2).is_anchored() False """ + warnings.warn( + f"{type(self).__name__}.is_anchored is deprecated and will be removed " + f"in a future version, please use \'obj.n == 1\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self.n == 1 # ------------------------------------------------------------------ @@ -954,6 +963,27 @@ cdef class Tick(SingleConstructorOffset): return True def is_anchored(self) -> bool: + # GH#55388 + """ + Return False. + + .. deprecated:: 2.2.0 + is_anchored is deprecated and will be removed in a future version. + Use ``False`` instead. 
+ + Examples + -------- + >>> pd.offsets.Hour().is_anchored() + False + >>> pd.offsets.Hour(2).is_anchored() + False + """ + warnings.warn( + f"{type(self).__name__}.is_anchored is deprecated and will be removed " + f"in a future version, please use False instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return False # This is identical to BaseOffset.__hash__, but has to be redefined here @@ -1428,13 +1458,22 @@ cdef class RelativeDeltaOffset(BaseOffset): "minutes", "seconds", "microseconds", + "milliseconds", } # relativedelta/_offset path only valid for base DateOffset if self._use_relativedelta and set(kwds).issubset(relativedelta_fast): + td_args = { + "days", + "hours", + "minutes", + "seconds", + "microseconds", + "milliseconds" + } td_kwds = { key: val for key, val in kwds.items() - if key in ["days", "hours", "minutes", "seconds", "microseconds"] + if key in td_args } if "weeks" in kwds: days = td_kwds.get("days", 0) @@ -1444,6 +1483,8 @@ cdef class RelativeDeltaOffset(BaseOffset): delta = Timedelta(**td_kwds) if "microseconds" in kwds: delta = delta.as_unit("us") + elif "milliseconds" in kwds: + delta = delta.as_unit("ms") else: delta = delta.as_unit("s") else: @@ -1461,6 +1502,8 @@ cdef class RelativeDeltaOffset(BaseOffset): delta = Timedelta(self._offset * self.n) if "microseconds" in kwds: delta = delta.as_unit("us") + elif "milliseconds" in kwds: + delta = delta.as_unit("ms") else: delta = delta.as_unit("s") return delta @@ -2663,6 +2706,13 @@ cdef class QuarterOffset(SingleConstructorOffset): return f"{self._prefix}-{month}" def is_anchored(self) -> bool: + warnings.warn( + f"{type(self).__name__}.is_anchored is deprecated and will be removed " + f"in a future version, please use \'obj.n == 1 " + f"and obj.startingMonth is not None\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self.n == 1 and self.startingMonth is not None def is_on_offset(self, dt: datetime) -> bool: @@ -3308,6 +3358,13 @@ cdef class Week(SingleConstructorOffset): self._cache = state.pop("_cache", {}) def is_anchored(self) -> bool: + warnings.warn( + f"{type(self).__name__}.is_anchored is deprecated and will be removed " + f"in a future version, please use \'obj.n == 1 " + f"and obj.weekday is not None\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self.n == 1 and self.weekday is not None @apply_wraps @@ -3597,6 +3654,12 @@ cdef class FY5253Mixin(SingleConstructorOffset): self.variation = state.pop("variation") def is_anchored(self) -> bool: + warnings.warn( + f"{type(self).__name__}.is_anchored is deprecated and will be removed " + f"in a future version, please use \'obj.n == 1\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return ( self.n == 1 and self.startingMonth is not None and self.weekday is not None ) @@ -4221,9 +4284,7 @@ cdef class CustomBusinessDay(BusinessDay): @property def _period_dtype_code(self): # GH#52534 - raise TypeError( - "CustomBusinessDay is not supported as period frequency" - ) + raise ValueError(f"{self.base} is not supported as period frequency") _apply_array = BaseOffset._apply_array @@ -4661,29 +4722,7 @@ _lite_rule_alias = { "ns": "ns", } -_dont_uppercase = { - "h", - "bh", - "cbh", - "MS", - "ms", - "s", - "me", - "qe", - "qe-dec", - "qe-jan", - "qe-feb", - "qe-mar", - "qe-apr", - "qe-may", - "qe-jun", - "qe-jul", - "qe-aug", - "qe-sep", - "qe-oct", - "qe-nov", - "ye", -} +_dont_uppercase = {"h", "bh", "cbh", "MS", "ms", "s"} INVALID_FREQ_ERR_MSG = "Invalid frequency: 
{0}" @@ -4702,7 +4741,29 @@ def _get_offset(name: str) -> BaseOffset: -------- _get_offset('EOM') --> BMonthEnd(1) """ - if name.lower() not in _dont_uppercase: + if ( + name not in _lite_rule_alias + and (name.upper() in _lite_rule_alias) + and name != "ms" + ): + warnings.warn( + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use \'{name.upper()}\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + elif ( + name not in _lite_rule_alias + and (name.lower() in _lite_rule_alias) + and name != "MS" + ): + warnings.warn( + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use \'{name.lower()}\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if name not in _dont_uppercase: name = name.upper() name = _lite_rule_alias.get(name, name) name = _lite_rule_alias.get(name.lower(), name) @@ -4772,19 +4833,19 @@ cpdef to_offset(freq, bint is_period=False): if freq is None: return None - if isinstance(freq, BaseOffset): - return freq - if isinstance(freq, tuple): raise TypeError( f"to_offset does not support tuples {freq}, pass as a string instead" ) + if isinstance(freq, BaseOffset): + result = freq + elif PyDelta_Check(freq): - return delta_to_tick(freq) + result = delta_to_tick(freq) elif isinstance(freq, str): - delta = None + result = None stride_sign = None try: @@ -4795,40 +4856,61 @@ cpdef to_offset(freq, bint is_period=False): tups = zip(split[0::4], split[1::4], split[2::4]) for n, (sep, stride, name) in enumerate(tups): - if is_period is False and name in c_OFFSET_DEPR_FREQSTR: + if not is_period and name.upper() in c_OFFSET_DEPR_FREQSTR: warnings.warn( f"\'{name}\' is deprecated and will be removed " f"in a future version, please use " - f"\'{c_OFFSET_DEPR_FREQSTR.get(name)}\' instead.", + f"\'{c_OFFSET_DEPR_FREQSTR.get(name.upper())}\' instead.", FutureWarning, stacklevel=find_stack_level(), ) - name = c_OFFSET_DEPR_FREQSTR[name] - if is_period is True and name in c_REVERSE_OFFSET_DEPR_FREQSTR: - if name.startswith("Y"): + name = c_OFFSET_DEPR_FREQSTR[name.upper()] + if (not is_period and + name != name.upper() and + name.lower() not in {"s", "ms", "us", "ns"} and + name.upper().split("-")[0].endswith(("S", "E"))): + warnings.warn( + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use " + f"\'{name.upper()}\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + name = name.upper() + if is_period and name.upper() in c_REVERSE_OFFSET_DEPR_FREQSTR: + if name.upper().startswith("Y"): raise ValueError( - f"for Period, please use \'Y{name[2:]}\' " + f"for Period, please use \'Y{name.upper()[2:]}\' " f"instead of \'{name}\'" ) - if (name.startswith("B") or - name.startswith("S") or name.startswith("C")): + if (name.upper().startswith("B") or + name.upper().startswith("S") or + name.upper().startswith("C")): raise ValueError(INVALID_FREQ_ERR_MSG.format(name)) else: raise ValueError( f"for Period, please use " - f"\'{c_REVERSE_OFFSET_DEPR_FREQSTR.get(name)}\' " + f"\'{c_REVERSE_OFFSET_DEPR_FREQSTR.get(name.upper())}\' " f"instead of \'{name}\'" ) - elif is_period is True and name in c_OFFSET_DEPR_FREQSTR: - if name.startswith("A"): + elif is_period and name.upper() in c_OFFSET_DEPR_FREQSTR: + if name.upper().startswith("A"): warnings.warn( f"\'{name}\' is deprecated and will be removed in a future " - f"version, please use \'{c_DEPR_ABBREVS.get(name)}\' " + f"version, please use " + f"\'{c_DEPR_ABBREVS.get(name.upper())}\' instead.", + FutureWarning, + 
stacklevel=find_stack_level(), + ) + if name.upper() != name: + warnings.warn( + f"\'{name}\' is deprecated and will be removed in " + f"a future version, please use \'{name.upper()}\' " f"instead.", FutureWarning, stacklevel=find_stack_level(), ) - name = c_OFFSET_DEPR_FREQSTR.get(name) + name = c_OFFSET_DEPR_FREQSTR.get(name.upper()) if sep != "" and not sep.isspace(): raise ValueError("separator must be spaces") @@ -4864,21 +4946,32 @@ cpdef to_offset(freq, bint is_period=False): offset = _get_offset(prefix) offset = offset * int(np.fabs(stride) * stride_sign) - if delta is None: - delta = offset + if result is None: + result = offset else: - delta = delta + offset + result = result + offset except (ValueError, TypeError) as err: raise ValueError(INVALID_FREQ_ERR_MSG.format( f"{freq}, failed to parse with error message: {repr(err)}") ) else: - delta = None + result = None - if delta is None: + if result is None: raise ValueError(INVALID_FREQ_ERR_MSG.format(freq)) - return delta + try: + has_period_dtype_code = hasattr(result, "_period_dtype_code") + except ValueError: + has_period_dtype_code = False + + if is_period and not has_period_dtype_code: + if isinstance(freq, str): + raise ValueError(f"{result.name} is not supported as period frequency") + else: + raise ValueError(f"{freq} is not supported as period frequency") + + return result # ---------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 2c4f0cd14db13..e3facd3d9599b 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -607,7 +607,8 @@ cdef ndarray[int64_t] _get_dst_hours( ndarray[uint8_t, cast=True] mismatch ndarray[int64_t] delta, dst_hours ndarray[intp_t] switch_idxs, trans_idx, grp, a_idx, b_idx, one_diff - list trans_grp + # TODO: Can uncomment when numpy >=2 is the minimum + # tuple trans_grp intp_t switch_idx int64_t left, right diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 672c16a85086c..d7197f23ce1e4 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -8,12 +8,12 @@ TYPE_CHECKING, Callable, ContextManager, - cast, ) import warnings import numpy as np +from pandas._config import using_string_dtype from pandas._config.localization import ( can_set_locale, get_locales, @@ -22,8 +22,6 @@ from pandas.compat import pa_version_under10p1 -from pandas.core.dtypes.common import is_string_dtype - import pandas as pd from pandas import ( ArrowDtype, @@ -82,8 +80,8 @@ with_csv_dialect, ) from pandas.core.arrays import ( + ArrowExtensionArray, BaseMaskedArray, - ExtensionArray, NumpyExtensionArray, ) from pandas.core.arrays._mixins import NDArrayBackedExtensionArray @@ -95,7 +93,6 @@ NpDtype, ) - from pandas.core.arrays import ArrowExtensionArray UNSIGNED_INT_NUMPY_DTYPES: list[NpDtype] = ["uint8", "uint16", "uint32", "uint64"] UNSIGNED_INT_EA_DTYPES: list[Dtype] = ["UInt8", "UInt16", "UInt32", "UInt64"] @@ -110,7 +107,11 @@ ALL_FLOAT_DTYPES: list[Dtype] = [*FLOAT_NUMPY_DTYPES, *FLOAT_EA_DTYPES] COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"] -STRING_DTYPES: list[Dtype] = [str, "str", "U"] +if using_string_dtype(): + STRING_DTYPES: list[Dtype] = ["U"] +else: + STRING_DTYPES: list[Dtype] = [str, "str", "U"] # type: ignore[no-redef] +COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES] DATETIME64_DTYPES: list[Dtype] = ["datetime64[ns]", "M8[ns]"] TIMEDELTA64_DTYPES: list[Dtype] = 
["timedelta64[ns]", "m8[ns]"] @@ -236,11 +237,18 @@ + TIMEDELTA_PYARROW_DTYPES + BOOL_PYARROW_DTYPES ) + ALL_REAL_PYARROW_DTYPES_STR_REPR = ( + ALL_INT_PYARROW_DTYPES_STR_REPR + FLOAT_PYARROW_DTYPES_STR_REPR + ) else: FLOAT_PYARROW_DTYPES_STR_REPR = [] ALL_INT_PYARROW_DTYPES_STR_REPR = [] ALL_PYARROW_DTYPES = [] + ALL_REAL_PYARROW_DTYPES_STR_REPR = [] +ALL_REAL_NULLABLE_DTYPES = ( + FLOAT_NUMPY_DTYPES + ALL_REAL_EXTENSION_DTYPES + ALL_REAL_PYARROW_DTYPES_STR_REPR +) arithmetic_dunder_methods = [ "__add__", @@ -507,6 +515,8 @@ def shares_memory(left, right) -> bool: if isinstance(left, MultiIndex): return shares_memory(left._codes, right) if isinstance(left, (Index, Series)): + if isinstance(right, (Index, Series)): + return shares_memory(left._values, right._values) return shares_memory(left._values, right) if isinstance(left, NDArrayBackedExtensionArray): @@ -516,24 +526,18 @@ def shares_memory(left, right) -> bool: if isinstance(left, pd.core.arrays.IntervalArray): return shares_memory(left._left, right) or shares_memory(left._right, right) - if ( - isinstance(left, ExtensionArray) - and is_string_dtype(left.dtype) - and left.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined] - ): - # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669 - left = cast("ArrowExtensionArray", left) - if ( - isinstance(right, ExtensionArray) - and is_string_dtype(right.dtype) - and right.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined] - ): - right = cast("ArrowExtensionArray", right) + if isinstance(left, ArrowExtensionArray): + if isinstance(right, ArrowExtensionArray): + # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669 left_pa_data = left._pa_array right_pa_data = right._pa_array left_buf1 = left_pa_data.chunk(0).buffers()[1] right_buf1 = right_pa_data.chunk(0).buffers()[1] - return left_buf1 == right_buf1 + return left_buf1.address == right_buf1.address + else: + # if we have one ArrowExtensionArray and one other array, assume + # they can only share memory if they share the same numpy buffer + return np.shares_memory(left, right) if isinstance(left, BaseMaskedArray) and isinstance(right, BaseMaskedArray): # By convention, we'll say these share memory if they share *either* diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py index f11dc11f6ac0d..c9a287942f2da 100644 --- a/pandas/_testing/_warnings.py +++ b/pandas/_testing/_warnings.py @@ -218,7 +218,12 @@ def _assert_raised_with_correct_stacklevel( frame = inspect.currentframe() for _ in range(4): frame = frame.f_back # type: ignore[union-attr] - caller_filename = inspect.getfile(frame) # type: ignore[arg-type] + try: + caller_filename = inspect.getfile(frame) # type: ignore[arg-type] + finally: + # See note in + # https://docs.python.org/3/library/inspect.html#inspect.Traceback + del frame msg = ( "Warning not set with correct stacklevel. 
" f"File where warning is raised: {actual_warning.filename} != " diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index e342f76dc724b..a1f9844669c8c 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -4,11 +4,13 @@ from typing import ( TYPE_CHECKING, Literal, + NoReturn, cast, ) import numpy as np +from pandas._libs import lib from pandas._libs.missing import is_matching_na from pandas._libs.sparse import SparseIndex import pandas._libs.testing as _testing @@ -143,7 +145,7 @@ def assert_almost_equal( ) -def _check_isinstance(left, right, cls): +def _check_isinstance(left, right, cls) -> None: """ Helper method for our assert_* methods that ensures that the two objects being compared have the right type before @@ -576,7 +578,7 @@ def assert_timedelta_array_equal( def raise_assert_detail( obj, message, left, right, diff=None, first_diff=None, index_values=None -): +) -> NoReturn: __tracebackhide__ = True msg = f"""{obj} are different @@ -591,13 +593,19 @@ def raise_assert_detail( if isinstance(left, np.ndarray): left = pprint_thing(left) - elif isinstance(left, (CategoricalDtype, NumpyEADtype, StringDtype)): + elif isinstance(left, (CategoricalDtype, NumpyEADtype)): left = repr(left) + elif isinstance(left, StringDtype): + # TODO(infer_string) this special case could be avoided if we have + # a more informative repr https://github.com/pandas-dev/pandas/issues/59342 + left = f"StringDtype(storage={left.storage}, na_value={left.na_value})" if isinstance(right, np.ndarray): right = pprint_thing(right) - elif isinstance(right, (CategoricalDtype, NumpyEADtype, StringDtype)): + elif isinstance(right, (CategoricalDtype, NumpyEADtype)): right = repr(right) + elif isinstance(right, StringDtype): + right = f"StringDtype(storage={right.storage}, na_value={right.na_value})" msg += f""" [left]: {left} @@ -664,7 +672,7 @@ def _get_base(obj): if left_base is right_base: raise AssertionError(f"{repr(left_base)} is {repr(right_base)}") - def _raise(left, right, err_msg): + def _raise(left, right, err_msg) -> NoReturn: if err_msg is None: if left.shape != right.shape: raise_assert_detail( @@ -697,9 +705,9 @@ def assert_extension_array_equal( right, check_dtype: bool | Literal["equiv"] = True, index_values=None, - check_exact: bool = False, - rtol: float = 1.0e-5, - atol: float = 1.0e-8, + check_exact: bool | lib.NoDefault = lib.no_default, + rtol: float | lib.NoDefault = lib.no_default, + atol: float | lib.NoDefault = lib.no_default, obj: str = "ExtensionArray", ) -> None: """ @@ -714,7 +722,12 @@ def assert_extension_array_equal( index_values : Index | numpy.ndarray, default None Optional index (shared by both left and right), used in output. check_exact : bool, default False - Whether to compare number exactly. Only takes effect for float dtypes. + Whether to compare number exactly. + + .. versionchanged:: 2.2.0 + + Defaults to True for integer dtypes if none of + ``check_exact``, ``rtol`` and ``atol`` are specified. rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. 
atol : float, default 1e-8 @@ -738,6 +751,23 @@ def assert_extension_array_equal( >>> b, c = a.array, a.array >>> tm.assert_extension_array_equal(b, c) """ + if ( + check_exact is lib.no_default + and rtol is lib.no_default + and atol is lib.no_default + ): + check_exact = ( + is_numeric_dtype(left.dtype) + and not is_float_dtype(left.dtype) + or is_numeric_dtype(right.dtype) + and not is_float_dtype(right.dtype) + ) + elif check_exact is lib.no_default: + check_exact = False + + rtol = rtol if rtol is not lib.no_default else 1.0e-5 + atol = atol if atol is not lib.no_default else 1.0e-8 + assert isinstance(left, ExtensionArray), "left is not an ExtensionArray" assert isinstance(right, ExtensionArray), "right is not an ExtensionArray" if check_dtype: @@ -781,12 +811,27 @@ def assert_extension_array_equal( left_na, right_na, obj=f"{obj} NA mask", index_values=index_values ) + # Specifically for StringArrayNumpySemantics, validate here we have a valid array + if ( + isinstance(left.dtype, StringDtype) + and left.dtype.storage == "python" + and left.dtype.na_value is np.nan + ): + assert np.all( + [np.isnan(val) for val in left._ndarray[left_na]] # type: ignore[attr-defined] + ), "wrong missing value sentinels" + if ( + isinstance(right.dtype, StringDtype) + and right.dtype.storage == "python" + and right.dtype.na_value is np.nan + ): + assert np.all( + [np.isnan(val) for val in right._ndarray[right_na]] # type: ignore[attr-defined] + ), "wrong missing value sentinels" + left_valid = left[~left_na].to_numpy(dtype=object) right_valid = right[~right_na].to_numpy(dtype=object) - if check_exact or ( - (is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype)) - or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype)) - ): + if check_exact: assert_numpy_array_equal( left_valid, right_valid, obj=obj, index_values=index_values ) @@ -810,14 +855,14 @@ def assert_series_equal( check_index_type: bool | Literal["equiv"] = "equiv", check_series_type: bool = True, check_names: bool = True, - check_exact: bool = False, + check_exact: bool | lib.NoDefault = lib.no_default, check_datetimelike_compat: bool = False, check_categorical: bool = True, check_category_order: bool = True, check_freq: bool = True, check_flags: bool = True, - rtol: float = 1.0e-5, - atol: float = 1.0e-8, + rtol: float | lib.NoDefault = lib.no_default, + atol: float | lib.NoDefault = lib.no_default, obj: str = "Series", *, check_index: bool = True, @@ -840,7 +885,12 @@ def assert_series_equal( check_names : bool, default True Whether to check the Series and Index names attribute. check_exact : bool, default False - Whether to compare number exactly. Only takes effect for float dtypes. + Whether to compare numbers exactly. + + .. versionchanged:: 2.2.0 + + Defaults to True for integer dtypes if none of + ``check_exact``, ``rtol`` and ``atol`` are specified. check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. 
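The effect of the defaults resolution above: integer-dtype comparisons become exact unless the caller opts out, while float comparisons keep the ``rtol=1e-5``/``atol=1e-8`` tolerances. A sketch:

>>> import pandas as pd
>>> import pandas._testing as tm
>>> tm.assert_extension_array_equal(
...     pd.array([1, 2], dtype="Int64"), pd.array([1, 2], dtype="Int64")
... )  # integer dtypes are now compared exactly by default
>>> tm.assert_extension_array_equal(
...     pd.array([2.0], dtype="Float64"), pd.array([2.0 + 1e-9], dtype="Float64")
... )  # floats still pass within the default tolerances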
check_categorical : bool, default True @@ -876,6 +926,23 @@ >>> tm.assert_series_equal(a, b) """ __tracebackhide__ = True + check_exact_index = False if check_exact is lib.no_default else check_exact + if ( + check_exact is lib.no_default + and rtol is lib.no_default + and atol is lib.no_default + ): + check_exact = ( + is_numeric_dtype(left.dtype) + and not is_float_dtype(left.dtype) + or is_numeric_dtype(right.dtype) + and not is_float_dtype(right.dtype) + ) + elif check_exact is lib.no_default: + check_exact = False + + rtol = rtol if rtol is not lib.no_default else 1.0e-5 + atol = atol if atol is not lib.no_default else 1.0e-8 if not check_index and check_like: raise ValueError("check_like must be False if check_index is False") @@ -902,7 +969,7 @@ def assert_series_equal( right.index, exact=check_index_type, check_names=check_names, - check_exact=check_exact, + check_exact=check_exact_index, check_categorical=check_categorical, check_order=not check_like, rtol=rtol, @@ -930,10 +997,7 @@ def assert_series_equal( pass else: assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") - if check_exact or ( - (is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype)) - or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype)) - ): + if check_exact: left_values = left._values right_values = right._values # Only check exact if dtype is numeric @@ -948,9 +1012,15 @@ def assert_series_equal( obj=str(obj), ) else: + # convert both to NumPy; if dtypes differed, check_dtype would have raised earlier + lv, rv = left_values, right_values + if isinstance(left_values, ExtensionArray): + lv = left_values.to_numpy() + if isinstance(right_values, ExtensionArray): + rv = right_values.to_numpy() assert_numpy_array_equal( - left_values, - right_values, + lv, + rv, check_dtype=check_dtype, obj=str(obj), index_values=left.index, @@ -1054,14 +1124,14 @@ def assert_frame_equal( check_frame_type: bool = True, check_names: bool = True, by_blocks: bool = False, - check_exact: bool = False, + check_exact: bool | lib.NoDefault = lib.no_default, check_datetimelike_compat: bool = False, check_categorical: bool = True, check_like: bool = False, check_freq: bool = True, check_flags: bool = True, - rtol: float = 1.0e-5, - atol: float = 1.0e-8, + rtol: float | lib.NoDefault = lib.no_default, + atol: float | lib.NoDefault = lib.no_default, obj: str = "DataFrame", ) -> None: """ @@ -1096,7 +1166,12 @@ def assert_frame_equal( Specify how to compare internal data. If False, compare by columns. If True, compare by blocks. check_exact : bool, default False - Whether to compare number exactly. Only takes effect for float dtypes. + Whether to compare numbers exactly. + + .. versionchanged:: 2.2.0 + + Defaults to True for integer dtypes if none of + ``check_exact``, ``rtol`` and ``atol`` are specified. check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. 
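Note that ``check_exact_index`` is resolved before ``check_exact`` is re-derived from the value dtypes, so the new integer-dtype default does not leak into index comparison: with nothing specified, integer values are compared exactly while a float index is still compared approximately. A sketch:

>>> left = pd.Series([1, 2], index=[0.5, 1.5])
>>> right = pd.Series([1, 2], index=[0.5, 1.5 + 1e-9])
>>> tm.assert_series_equal(left, right)  # passes: exact values, approximate index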
check_categorical : bool, default True @@ -1151,6 +1226,9 @@ def assert_frame_equal( >>> assert_frame_equal(df1, df2, check_dtype=False) """ __tracebackhide__ = True + _rtol = rtol if rtol is not lib.no_default else 1.0e-5 + _atol = atol if atol is not lib.no_default else 1.0e-8 + _check_exact = check_exact if check_exact is not lib.no_default else False # instance validation _check_isinstance(left, right, DataFrame) @@ -1174,11 +1252,11 @@ def assert_frame_equal( right.index, exact=check_index_type, check_names=check_names, - check_exact=check_exact, + check_exact=_check_exact, check_categorical=check_categorical, check_order=not check_like, - rtol=rtol, - atol=atol, + rtol=_rtol, + atol=_atol, obj=f"{obj}.index", ) @@ -1188,11 +1266,11 @@ def assert_frame_equal( right.columns, exact=check_column_type, check_names=check_names, - check_exact=check_exact, + check_exact=_check_exact, check_categorical=check_categorical, check_order=not check_like, - rtol=rtol, - atol=atol, + rtol=_rtol, + atol=_atol, obj=f"{obj}.columns", ) diff --git a/pandas/_testing/contexts.py b/pandas/_testing/contexts.py index eb6e4a917889a..48616ee134582 100644 --- a/pandas/_testing/contexts.py +++ b/pandas/_testing/contexts.py @@ -78,14 +78,15 @@ def set_timezone(tz: str) -> Generator[None, None, None]: import time def setTZ(tz) -> None: - if tz is None: - try: - del os.environ["TZ"] - except KeyError: - pass - else: - os.environ["TZ"] = tz - time.tzset() + if hasattr(time, "tzset"): + if tz is None: + try: + del os.environ["TZ"] + except KeyError: + pass + else: + os.environ["TZ"] = tz + time.tzset() orig_tz = os.environ.get("TZ") setTZ(tz) diff --git a/pandas/_version.py b/pandas/_version.py index 5d610b5e1ea7e..f8a960630126d 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -386,7 +386,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): return pieces -def plus_or_dot(pieces): +def plus_or_dot(pieces) -> str: """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 738442fab8c70..ff99d6b759d66 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -25,11 +25,17 @@ import pandas.compat.compressors from pandas.compat.numpy import is_numpy_dev from pandas.compat.pyarrow import ( + HAS_PYARROW, pa_version_under10p1, pa_version_under11p0, pa_version_under13p0, pa_version_under14p0, pa_version_under14p1, + pa_version_under16p0, + pa_version_under17p0, + pa_version_under18p0, + pa_version_under19p0, + pa_version_under20p0, ) if TYPE_CHECKING: @@ -186,6 +192,12 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under13p0", "pa_version_under14p0", "pa_version_under14p1", + "pa_version_under16p0", + "pa_version_under17p0", + "pa_version_under18p0", + "pa_version_under19p0", + "pa_version_under20p0", + "HAS_PYARROW", "IS64", "ISMUSL", "PY310", diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 9d04d7c0a1216..2bc6cd46f09a7 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -120,9 +120,8 @@ def import_optional_dependency( The imported module, when found and the version is correct. None is returned when the package is not found and `errors` is False, or when the package's version is too old and `errors` - is ``'warn'``. + is ``'warn'`` or ``'ignore'``. 
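With the ``else`` branch added above, ``errors='ignore'`` now also returns ``None`` when an installed package is older than the minimum version, matching the not-installed case instead of falling through and returning the too-old module. A sketch (package name hypothetical):

>>> from pandas.compat._optional import import_optional_dependency
>>> import_optional_dependency("not_a_real_package", errors="ignore") is None
True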
""" - assert errors in {"warn", "raise", "ignore"} package_name = INSTALL_MAPPING.get(name) @@ -163,5 +162,7 @@ def import_optional_dependency( return None elif errors == "raise": raise ImportError(msg) + else: + return None return module diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 3014bd652d8c4..a06761d03887b 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -12,7 +12,7 @@ np_version_gte1p24 = _nlv >= Version("1.24") np_version_gte1p24p3 = _nlv >= Version("1.24.3") np_version_gte1p25 = _nlv >= Version("1.25") -np_version_gt2 = _nlv >= Version("2.0.0.dev0") +np_version_gt2 = _nlv >= Version("2.0.0") is_numpy_dev = _nlv.dev is not None _min_numpy_ver = "1.22.4" diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index a36e25a9df410..4df30f7f4a8a7 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -138,6 +138,7 @@ def validate_argmax_with_skipna(skipna: bool | ndarray | None, args, kwargs) -> ARGSORT_DEFAULTS["kind"] = "quicksort" ARGSORT_DEFAULTS["order"] = None ARGSORT_DEFAULTS["kind"] = None +ARGSORT_DEFAULTS["stable"] = None validate_argsort = CompatValidator( @@ -149,6 +150,7 @@ def validate_argmax_with_skipna(skipna: bool | ndarray | None, args, kwargs) -> ARGSORT_DEFAULTS_KIND: dict[str, int | None] = {} ARGSORT_DEFAULTS_KIND["axis"] = -1 ARGSORT_DEFAULTS_KIND["order"] = None +ARGSORT_DEFAULTS_KIND["stable"] = None validate_argsort_kind = CompatValidator( ARGSORT_DEFAULTS_KIND, fname="argsort", max_fname_arg_count=0, method="both" ) diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index beb4814914101..d78827042e95c 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -15,6 +15,12 @@ pa_version_under14p0 = _palv < Version("14.0.0") pa_version_under14p1 = _palv < Version("14.0.1") pa_version_under15p0 = _palv < Version("15.0.0") + pa_version_under16p0 = _palv < Version("16.0.0") + pa_version_under17p0 = _palv < Version("17.0.0") + pa_version_under18p0 = _palv < Version("18.0.0") + pa_version_under19p0 = _palv < Version("19.0.0") + pa_version_under20p0 = _palv < Version("20.0.0") + HAS_PYARROW = True except ImportError: pa_version_under10p1 = True pa_version_under11p0 = True @@ -23,3 +29,9 @@ pa_version_under14p0 = True pa_version_under14p1 = True pa_version_under15p0 = True + pa_version_under16p0 = True + pa_version_under17p0 = True + pa_version_under18p0 = True + pa_version_under19p0 = True + pa_version_under20p0 = True + HAS_PYARROW = False diff --git a/pandas/conftest.py b/pandas/conftest.py index 983272d79081e..35fe5cb475cde 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -190,10 +190,6 @@ def pytest_collection_modifyitems(items, config) -> None: if is_doctest: for item in items: - # autouse=True for the add_doctest_imports can lead to expensive teardowns - # since doctest_namespace is a session fixture - item.add_marker(pytest.mark.usefixtures("add_doctest_imports")) - for path, message in ignored_doctest_warnings: ignore_doctest_warning(item, path, message) @@ -250,7 +246,14 @@ def pytest_collection_modifyitems(items, config) -> None: ) -@pytest.fixture +# ---------------------------------------------------------------- +# Autouse fixtures +# ---------------------------------------------------------------- + + +# https://github.com/pytest-dev/pytest/issues/11873 +# Would like to avoid autouse=True, but cannot as of pytest 8.0.0 +@pytest.fixture(autouse=True) def 
add_doctest_imports(doctest_namespace) -> None: """ Make `np` and `pd` names available for doctests. @@ -259,9 +262,6 @@ def add_doctest_imports(doctest_namespace) -> None: doctest_namespace["pd"] = pd -# ---------------------------------------------------------------- -# Autouse fixtures -# ---------------------------------------------------------------- @pytest.fixture(autouse=True) def configure_tests() -> None: """ @@ -548,7 +548,7 @@ def multiindex_year_month_day_dataframe_random_data(): """ tdf = DataFrame( np.random.default_rng(2).standard_normal((100, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100, freq="B"), ) ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() @@ -615,7 +615,8 @@ def _create_mi_with_dt64tz_level(): indices_dict = { - "string": Index([f"pandas_{i}" for i in range(100)]), + "object": Index([f"pandas_{i}" for i in range(100)], dtype=object), + "string": Index([f"pandas_{i}" for i in range(100)], dtype="str"), "datetime": date_range("2020-01-01", periods=100), "datetime-tz": date_range("2020-01-01", periods=100, tz="US/Pacific"), "period": period_range("2020-01-01", periods=100, freq="D"), @@ -742,7 +743,7 @@ def string_series() -> Series: """ return Series( np.arange(30, dtype=np.float64) * 1.1, - index=Index([f"i_{i}" for i in range(30)], dtype=object), + index=Index([f"i_{i}" for i in range(30)]), name="series", ) @@ -753,7 +754,7 @@ def object_series() -> Series: Fixture for Series of dtype object with Index of unique strings """ data = [f"foo_{i}" for i in range(30)] - index = Index([f"bar_{i}" for i in range(30)], dtype=object) + index = Index([f"bar_{i}" for i in range(30)]) return Series(data, index=index, name="objects", dtype=object) @@ -845,8 +846,8 @@ def int_frame() -> DataFrame: """ return DataFrame( np.ones((30, 4), dtype=np.int64), - index=Index([f"foo_{i}" for i in range(30)], dtype=object), - columns=Index(list("ABCD"), dtype=object), + index=Index([f"foo_{i}" for i in range(30)]), + columns=Index(list("ABCD")), ) @@ -1228,6 +1229,34 @@ def string_dtype(request): return request.param +@pytest.fixture( + params=[ + ("python", pd.NA), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ("python", np.nan), + ], + ids=[ + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + "string=str[python]", + ], +) +def string_dtype_no_object(request): + """ + Parametrized fixture for string dtypes. + * 'string[python]' (NA variant) + * 'string[pyarrow]' (NA variant) + * 'str' (NaN variant, with pyarrow) + * 'str' (NaN variant, without pyarrow) + """ + # need to instantiate the StringDtype here instead of in the params + # to avoid importing pyarrow during test collection + storage, na_value = request.param + return pd.StringDtype(storage, na_value) + + @pytest.fixture( params=[ "string[python]", @@ -1244,11 +1273,26 @@ def nullable_string_dtype(request): return request.param +@pytest.fixture( + params=[ + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + ] +) +def pyarrow_string_dtype(request): + """ + Parametrized fixture for string dtypes backed by Pyarrow. 
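These fixtures construct the dtype at request time because instantiating ``pd.StringDtype`` with pyarrow storage imports pyarrow, and instances placed in ``params`` would trigger that import during test collection. Spelled out as plain constructor calls:

>>> import numpy as np
>>> import pandas as pd
>>> na_variant = pd.StringDtype("pyarrow", na_value=pd.NA)    # 'string[pyarrow]'
>>> nan_variant = pd.StringDtype("pyarrow", na_value=np.nan)  # 'str' with pyarrow storage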
+ + * 'str[pyarrow]' + * 'string[pyarrow]' + """ + return pd.StringDtype(*request.param) + + @pytest.fixture( params=[ "python", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), - pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")), ] ) def string_storage(request): @@ -1257,7 +1301,31 @@ def string_storage(request): * 'python' * 'pyarrow' - * 'pyarrow_numpy' + """ + return request.param + + +@pytest.fixture( + params=[ + ("python", pd.NA), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ("python", np.nan), + ], + ids=[ + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + "string=str[python]", + ], +) +def string_dtype_arguments(request): + """ + Parametrized fixture for StringDtype storage and na_value. + + * 'python' + pd.NA + * 'pyarrow' + pd.NA + * 'pyarrow' + np.nan """ return request.param @@ -1280,6 +1348,7 @@ def dtype_backend(request): # Alias so we can test with cartesian product of string_storage string_storage2 = string_storage +string_dtype_arguments2 = string_dtype_arguments @pytest.fixture(params=tm.BYTES_DTYPES) @@ -1306,20 +1375,36 @@ def object_dtype(request): @pytest.fixture( params=[ - "object", - "string[python]", - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - ] + np.dtype("object"), + ("python", pd.NA), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ("python", np.nan), + ], + ids=[ + "string=object", + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + "string=str[python]", + ], ) def any_string_dtype(request): """ Parametrized fixture for string dtypes. * 'object' - * 'string[python]' - * 'string[pyarrow]' + * 'string[python]' (NA variant) + * 'string[pyarrow]' (NA variant) + * 'str' (NaN variant, with pyarrow) + * 'str' (NaN variant, without pyarrow) """ - return request.param + if isinstance(request.param, np.dtype): + return request.param + else: + # need to instantiate the StringDtype here instead of in the params + # to avoid importing pyarrow during test collection + storage, na_value = request.param + return pd.StringDtype(storage, na_value) @pytest.fixture(params=tm.DATETIME64_DTYPES) @@ -1403,6 +1488,21 @@ def complex_dtype(request): return request.param +@pytest.fixture(params=tm.COMPLEX_FLOAT_DTYPES) +def complex_or_float_dtype(request): + """ + Parameterized fixture for complex and numpy float dtypes. + + * complex + * 'complex64' + * 'complex128' + * float + * 'float32' + * 'float64' + """ + return request.param + + @pytest.fixture(params=tm.SIGNED_INT_NUMPY_DTYPES) def any_signed_int_numpy_dtype(request): """ @@ -1642,6 +1742,38 @@ def any_numpy_dtype(request): return request.param +@pytest.fixture(params=tm.ALL_REAL_NULLABLE_DTYPES) +def any_real_nullable_dtype(request): + """ + Parameterized fixture for all real dtypes that can hold NA. 
+ + * float + * 'float32' + * 'float64' + * 'Float32' + * 'Float64' + * 'UInt8' + * 'UInt16' + * 'UInt32' + * 'UInt64' + * 'Int8' + * 'Int16' + * 'Int32' + * 'Int64' + * 'uint8[pyarrow]' + * 'uint16[pyarrow]' + * 'uint32[pyarrow]' + * 'uint64[pyarrow]' + * 'int8[pyarrow]' + * 'int16[pyarrow]' + * 'int32[pyarrow]' + * 'int64[pyarrow]' + * 'float[pyarrow]' + * 'double[pyarrow]' + """ + return request.param + + @pytest.fixture(params=tm.ALL_NUMERIC_DTYPES) def any_numeric_dtype(request): """ diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index ee09c9380fb0f..b05f12295a729 100644 --- a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -49,7 +49,8 @@ @contextmanager def set_numba_data(index: Index): numba_data = index._data - if numba_data.dtype == object: + if numba_data.dtype in (object, "string"): + numba_data = np.asarray(numba_data) if not lib.is_string_array(numba_data): raise ValueError( "The numba engine only supports using string or numeric column names" diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 15a07da76d2f7..c6084880bea5d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -931,11 +931,11 @@ def value_counts_internal( # For backwards compatibility, we let Index do its normal type # inference, _except_ for if if infers from object to bool. idx = Index(keys) - if idx.dtype == bool and keys.dtype == object: + if idx.dtype in [bool, "string"] and keys.dtype == object: idx = idx.astype(object) elif ( idx.dtype != keys.dtype # noqa: PLR1714 # # pylint: disable=R1714 - and idx.dtype != "string[pyarrow_numpy]" + and idx.dtype != "string" ): warnings.warn( # GH#56161 @@ -1053,7 +1053,7 @@ def mode( return npresult, res_mask # type: ignore[return-value] try: - npresult = np.sort(npresult) + npresult = safe_sort(npresult) except TypeError as err: warnings.warn( f"Unable to sort modes: {err}", diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 25a71ce5b5f4f..fafc9ee1b6928 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1174,12 +1174,7 @@ def apply_with_numba(self) -> dict[int, Any]: from pandas.core._numba.extensions import set_numba_data index = self.obj.index - if index.dtype == "string": - index = index.astype(object) - columns = self.obj.columns - if columns.dtype == "string": - columns = columns.astype(object) # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 335fa1afc0f4e..6bf97729a79b1 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -62,6 +62,10 @@ def _reductions( ): return libmissing.NA + if values.dtype == np.dtype(object): + # object dtype does not support `where` without passing an initial + values = values[~mask] + return func(values, axis=axis, **kwargs) return func(values, where=~mask, axis=axis, **kwargs) diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index ee6f00b219a15..5c933294fb944 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -102,7 +102,7 @@ def quantile_with_mask( interpolation=interpolation, ) - result = np.array(result, copy=False) + result = np.asarray(result) result = result.T return result @@ -201,9 +201,9 @@ def _nanpercentile( ] if values.dtype.kind == "f": # preserve itemsize - result = 
np.array(result, dtype=values.dtype, copy=False).T + result = np.asarray(result, dtype=values.dtype).T else: - result = np.array(result, copy=False).T + result = np.asarray(result).T if ( result.dtype != values.dtype and not mask.all() diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index 5f377276be480..7d40fb985a593 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -149,4 +149,6 @@ def re_replacer(s): if mask is None: values[:] = f(values) else: + if values.ndim != mask.ndim: + mask = np.broadcast_to(mask, values.shape) values[mask] = f(values[mask]) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index cc41985843574..e136b4f92031d 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -1,22 +1,84 @@ from __future__ import annotations -from typing import Literal +from functools import partial +import re +from typing import ( + TYPE_CHECKING, + Any, + Literal, +) import numpy as np -from pandas.compat import pa_version_under10p1 +from pandas._libs import lib +from pandas.compat import ( + pa_version_under10p1, + pa_version_under11p0, + pa_version_under13p0, + pa_version_under17p0, +) if not pa_version_under10p1: import pyarrow as pa import pyarrow.compute as pc +if TYPE_CHECKING: + from collections.abc import Callable + + from pandas._typing import ( + Scalar, + Self, + ) + class ArrowStringArrayMixin: - _pa_array = None + _pa_array: pa.ChunkedArray def __init__(self, *args, **kwargs) -> None: raise NotImplementedError + def _convert_bool_result(self, result, na=lib.no_default, method_name=None): + # Convert a bool-dtype result to the appropriate result type + raise NotImplementedError + + def _convert_int_result(self, result): + # Convert an integer-dtype result to the appropriate result type + raise NotImplementedError + + def _apply_elementwise(self, func: Callable) -> list[list[Any]]: + raise NotImplementedError + + def _str_len(self): + result = pc.utf8_length(self._pa_array) + return self._convert_int_result(result) + + def _str_lower(self) -> Self: + return type(self)(pc.utf8_lower(self._pa_array)) + + def _str_upper(self) -> Self: + return type(self)(pc.utf8_upper(self._pa_array)) + + def _str_strip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_trim_whitespace(self._pa_array) + else: + result = pc.utf8_trim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_lstrip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_ltrim_whitespace(self._pa_array) + else: + result = pc.utf8_ltrim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_rstrip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_rtrim_whitespace(self._pa_array) + else: + result = pc.utf8_rtrim(self._pa_array, characters=to_strip) + return type(self)(result) + def _str_pad( self, width: int, @@ -28,7 +90,19 @@ def _str_pad( elif side == "right": pa_pad = pc.utf8_rpad elif side == "both": - pa_pad = pc.utf8_center + if pa_version_under17p0: + # GH#59624 fall back to object dtype + from pandas import array as pd_array + + obj_arr = self.astype(object, copy=False) # type: ignore[attr-defined] + obj = pd_array(obj_arr, dtype=object) + result = obj._str_pad(width, side, fillchar) # type: ignore[attr-defined] + return type(self)._from_sequence(result, dtype=self.dtype) # type: ignore[attr-defined] + else: + # GH#54792 + # 
https://github.com/apache/arrow/issues/15053#issuecomment-2317032347 + lean_left = (width % 2) == 0 + pa_pad = partial(pc.utf8_center, lean_left_on_odd_padding=lean_left) else: raise ValueError( f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'" @@ -51,12 +125,29 @@ def _str_get(self, i: int): selected = pc.utf8_slice_codeunits( self._pa_array, start=start, stop=stop, step=step ) - null_value = pa.scalar( - None, type=self._pa_array.type # type: ignore[attr-defined] - ) + null_value = pa.scalar(None, type=self._pa_array.type) result = pc.if_else(not_out_of_bounds, selected, null_value) return type(self)(result) + def _str_slice( + self, start: int | None = None, stop: int | None = None, step: int | None = None + ): + if pa_version_under11p0: + # GH#59724 + result = self._apply_elementwise(lambda val: val[start:stop:step]) + return type(self)(pa.chunked_array(result, type=self._pa_array.type)) + if start is None: + if step is not None and step < 0: + # GH#59710 + start = -1 + else: + start = 0 + if step is None: + step = 1 + return type(self)( + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) + ) + def _str_slice_replace( self, start: int | None = None, stop: int | None = None, repl: str | None = None ): @@ -68,7 +159,34 @@ def _str_slice_replace( stop = np.iinfo(np.int64).max return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl)) - def _str_capitalize(self): + def _str_replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool = True, + flags: int = 0, + regex: bool = True, + ) -> Self: + if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: + raise NotImplementedError( + "replace is not supported with a re.Pattern, callable repl, " + "case=False, or flags!=0" + ) + + func = pc.replace_substring_regex if regex else pc.replace_substring + # https://github.com/apache/arrow/issues/39149 + # GH 56404, unexpected behavior with negative max_replacements with pyarrow. + pa_max_replacements = None if n < 0 else n + result = func( + self._pa_array, + pattern=pat, + replacement=repl, + max_replacements=pa_max_replacements, + ) + return type(self)(result) + + def _str_capitalize(self) -> Self: return type(self)(pc.utf8_capitalize(self._pa_array)) def _str_title(self): @@ -77,8 +195,162 @@ def _str_title(self): def _str_swapcase(self): return type(self)(pc.utf8_swapcase(self._pa_array)) + def _str_removeprefix(self, prefix: str): + if not pa_version_under13p0: + starts_with = pc.starts_with(self._pa_array, pattern=prefix) + removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) + result = pc.if_else(starts_with, removed, self._pa_array) + return type(self)(result) + predicate = lambda val: val.removeprefix(prefix) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + def _str_removesuffix(self, suffix: str): ends_with = pc.ends_with(self._pa_array, pattern=suffix) removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) result = pc.if_else(ends_with, removed, self._pa_array) return type(self)(result) + + def _str_startswith( + self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default + ): + if isinstance(pat, str): + result = pc.starts_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # For empty tuple we return null for missing values and False + # for valid values. 
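The ``start = -1`` default in ``_str_slice`` above mirrors Python slice semantics: with a negative ``step`` and no explicit ``start``, slicing has to begin from the end of the string, not from codeunit 0. A sketch of the behaviour this preserves, assuming pyarrow-backed strings:

>>> s = pd.Series(["abcde"], dtype="string[pyarrow]")
>>> s.str.slice(step=-1)  # -> "edcba", matching "abcde"[::-1]
>>> s.str.slice(stop=2)   # -> "ab"; start still defaults to 0 for forward steps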
+ result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.starts_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) + return self._convert_bool_result(result, na=na, method_name="startswith") + + def _str_endswith( + self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default + ): + if isinstance(pat, str): + result = pc.ends_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # For empty tuple we return null for missing values and False + # for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.ends_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) + return self._convert_bool_result(result, na=na, method_name="endswith") + + def _str_isalnum(self): + result = pc.utf8_is_alnum(self._pa_array) + return self._convert_bool_result(result) + + def _str_isalpha(self): + result = pc.utf8_is_alpha(self._pa_array) + return self._convert_bool_result(result) + + def _str_isdecimal(self): + result = pc.utf8_is_decimal(self._pa_array) + return self._convert_bool_result(result) + + def _str_isdigit(self): + result = pc.utf8_is_digit(self._pa_array) + return self._convert_bool_result(result) + + def _str_islower(self): + result = pc.utf8_is_lower(self._pa_array) + return self._convert_bool_result(result) + + def _str_isnumeric(self): + result = pc.utf8_is_numeric(self._pa_array) + return self._convert_bool_result(result) + + def _str_isspace(self): + result = pc.utf8_is_space(self._pa_array) + return self._convert_bool_result(result) + + def _str_istitle(self): + result = pc.utf8_is_title(self._pa_array) + return self._convert_bool_result(result) + + def _str_isupper(self): + result = pc.utf8_is_upper(self._pa_array) + return self._convert_bool_result(result) + + def _str_contains( + self, + pat, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, + regex: bool = True, + ): + if flags: + raise NotImplementedError(f"contains not implemented with {flags=}") + + if regex: + pa_contains = pc.match_substring_regex + else: + pa_contains = pc.match_substring + result = pa_contains(self._pa_array, pat, ignore_case=not case) + return self._convert_bool_result(result, na=na, method_name="contains") + + def _str_match( + self, + pat: str, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, + ): + if not pat.startswith("^"): + pat = f"^{pat}" + return self._str_contains(pat, case, flags, na, regex=True) + + def _str_fullmatch( + self, + pat, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, + ): + if not pat.endswith("$") or pat.endswith("\\$"): + pat = f"{pat}$" + return self._str_match(pat, case, flags, na) + + def _str_find(self, sub: str, start: int = 0, end: int | None = None): + if ( + pa_version_under13p0 + and not (start != 0 and end is not None) + and not (start == 0 and end is None) + ): + # GH#59562 + res_list = self._apply_elementwise(lambda val: val.find(sub, start, end)) + return self._convert_int_result(pa.chunked_array(res_list)) + + if (start == 0 or start is None) and end is None: + result = pc.find_substring(self._pa_array, sub) + else: + if sub == "": + # GH#56792 + res_list = self._apply_elementwise( + lambda val: val.find(sub, start, end) + ) + return self._convert_int_result(pa.chunked_array(res_list)) + if start is None: + start_offset = 
0 + start = 0 + elif start < 0: + start_offset = pc.add(start, pc.utf8_length(self._pa_array)) + start_offset = pc.if_else(pc.less(start_offset, 0), 0, start_offset) + else: + start_offset = start + slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) + result = pc.find_substring(slices, sub) + found = pc.not_equal(result, pa.scalar(-1, type=result.type)) + offset_result = pc.add(result, start_offset) + result = pc.if_else(found, offset_result, -1) + return self._convert_int_result(result) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 9ece12cf51a7b..cb6861a8dd00f 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -305,7 +305,12 @@ def _fill_mask_inplace( func(self._ndarray.T, limit=limit, mask=mask.T) def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: mask = self.isna() if mask.any(): @@ -315,7 +320,7 @@ def _pad_or_backfill( npvalues = self._ndarray.T if copy: npvalues = npvalues.copy() - func(npvalues, limit=limit, mask=mask.T) + func(npvalues, limit=limit, limit_area=limit_area, mask=mask.T) npvalues = npvalues.T if copy: @@ -510,17 +515,14 @@ def _quantile( fill_value = self._internal_fill_value res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation) - - res_values = self._cast_quantile_result(res_values) - return self._from_backing_data(res_values) - - # TODO: see if we can share this with other dispatch-wrapping methods - def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: - """ - Cast the result of quantile_with_mask to an appropriate dtype - to pass to _from_backing_data in _quantile. - """ - return res_values + if res_values.dtype == self._ndarray.dtype: + return self._from_backing_data(res_values) + else: + # e.g. test_quantile_empty we are empty integer dtype and res_values + # has floating dtype + # TODO: technically __init__ isn't defined here. + # Should we raise NotImplementedError and handle this on NumpyEA? 
+ return type(self)(res_values) # type: ignore[call-arg] # ------------------------------------------------------------------------ # numpy-like methods diff --git a/pandas/core/arrays/_utils.py b/pandas/core/arrays/_utils.py index c75ec7f843ed2..6b46396d5efdf 100644 --- a/pandas/core/arrays/_utils.py +++ b/pandas/core/arrays/_utils.py @@ -44,7 +44,16 @@ def to_numpy_dtype_inference( dtype_given = True if na_value is lib.no_default: - na_value = arr.dtype.na_value + if dtype is None or not hasna: + na_value = arr.dtype.na_value + elif dtype.kind == "f": # type: ignore[union-attr] + na_value = np.nan + elif dtype.kind == "M": # type: ignore[union-attr] + na_value = np.datetime64("nat") + elif dtype.kind == "m": # type: ignore[union-attr] + na_value = np.timedelta64("nat") + else: + na_value = arr.dtype.na_value if not dtype_given and hasna: try: diff --git a/pandas/core/arrays/arrow/_arrow_utils.py b/pandas/core/arrays/arrow/_arrow_utils.py index 2a053fac2985c..285c3fd465ffc 100644 --- a/pandas/core/arrays/arrow/_arrow_utils.py +++ b/pandas/core/arrays/arrow/_arrow_utils.py @@ -1,24 +1,8 @@ from __future__ import annotations -import warnings - import numpy as np import pyarrow -from pandas.errors import PerformanceWarning -from pandas.util._exceptions import find_stack_level - - -def fallback_performancewarning(version: str | None = None) -> None: - """ - Raise a PerformanceWarning for falling back to ExtensionArray's - non-pyarrow method - """ - msg = "Falling back on a non-pyarrow code path which may decrease performance." - if version is not None: - msg += f" Upgrade to pyarrow >={version} to possibly suppress this warning." - warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level()) - def pyarrow_array_to_numpy_and_mask( arr, dtype: np.dtype diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 7f88267943526..65f0784eaa3fd 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -6,13 +6,18 @@ ABCMeta, abstractmethod, ) -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + cast, +) from pandas.compat import ( pa_version_under10p1, pa_version_under11p0, ) +from pandas.core.dtypes.common import is_list_like + if not pa_version_under10p1: import pyarrow as pa import pyarrow.compute as pc @@ -41,7 +46,7 @@ def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: def _validate(self, data): dtype = data.dtype - if not isinstance(dtype, ArrowDtype): + if pa_version_under10p1 or not isinstance(dtype, ArrowDtype): # Raise AttributeError so that inspect can handle non-struct Series. raise AttributeError(self._validation_msg.format(dtype=dtype)) @@ -267,15 +272,27 @@ def dtypes(self) -> Series: names = [struct.name for struct in pa_type] return Series(types, index=Index(names)) - def field(self, name_or_index: str | int) -> Series: + def field( + self, + name_or_index: list[str] + | list[bytes] + | list[int] + | pc.Expression + | bytes + | str + | int, + ) -> Series: """ Extract a child field of a struct as a Series. Parameters ---------- - name_or_index : str | int + name_or_index : str | bytes | int | expression | list Name or index of the child field to extract. + For list-like inputs, this will index into a nested + struct. + Returns ------- pandas.Series @@ -285,6 +302,19 @@ def field(self, name_or_index: str | int) -> Series: -------- Series.struct.explode : Return all child fields as a DataFrame. 
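The ``na_value`` inference added to ``to_numpy_dtype_inference`` above means that when a target dtype is supplied and the array holds missing values, the sentinel follows the target dtype (``nan`` for floats, ``NaT`` for datetime/timedelta kinds) rather than the array's own ``na_value``. A sketch:

>>> import pandas as pd
>>> arr = pd.array([1, None], dtype="Int64")
>>> arr.to_numpy(dtype="float64")  # previously this tried to write pd.NA into a float array
array([ 1., nan])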
+ Notes + ----- + The name of the resulting Series will be set using the following + rules: + + - For string, bytes, or integer `name_or_index` (or a list of these, for + a nested selection), the Series name is set to the selected + field's name. + - For a :class:`pyarrow.compute.Expression`, this is set to + the string form of the expression. + - For list-like `name_or_index`, the name will be set to the + name of the final field selected. + Examples -------- >>> import pyarrow as pa @@ -314,27 +344,92 @@ def field(self, name_or_index: str | int) -> Series: 1 2 2 1 Name: version, dtype: int64[pyarrow] + + Or an expression + + >>> import pyarrow.compute as pc + >>> s.struct.field(pc.field("project")) + 0 pandas + 1 pandas + 2 numpy + Name: project, dtype: string[pyarrow] + + For nested struct types, you can pass a list of values to index + multiple levels: + + >>> version_type = pa.struct([ + ... ("major", pa.int64()), + ... ("minor", pa.int64()), + ... ]) + >>> s = pd.Series( + ... [ + ... {"version": {"major": 1, "minor": 5}, "project": "pandas"}, + ... {"version": {"major": 2, "minor": 1}, "project": "pandas"}, + ... {"version": {"major": 1, "minor": 26}, "project": "numpy"}, + ... ], + ... dtype=pd.ArrowDtype(pa.struct( + ... [("version", version_type), ("project", pa.string())] + ... )) + ... ) + >>> s.struct.field(["version", "minor"]) + 0 5 + 1 1 + 2 26 + Name: minor, dtype: int64[pyarrow] + >>> s.struct.field([0, 0]) + 0 1 + 1 2 + 2 1 + Name: major, dtype: int64[pyarrow] """ from pandas import Series + def get_name( + level_name_or_index: list[str] + | list[bytes] + | list[int] + | pc.Expression + | bytes + | str + | int, + data: pa.ChunkedArray, + ): + if isinstance(level_name_or_index, int): + name = data.type.field(level_name_or_index).name + elif isinstance(level_name_or_index, (str, bytes)): + name = level_name_or_index + elif isinstance(level_name_or_index, pc.Expression): + name = str(level_name_or_index) + elif is_list_like(level_name_or_index): + # For nested input like [2, 1, 2] + # iteratively get the struct and field name. The last + # one is used for the name of the index. + level_name_or_index = list(reversed(level_name_or_index)) + selected = data + while level_name_or_index: + # we need the cast, otherwise mypy complains about + # getting ints, bytes, or str here, which isn't possible. 
+ level_name_or_index = cast(list, level_name_or_index) + name_or_index = level_name_or_index.pop() + name = get_name(name_or_index, selected) + selected = selected.type.field(selected.type.get_field_index(name)) + name = selected.name + else: + raise ValueError( + "name_or_index must be an int, str, bytes, " + "pyarrow.compute.Expression, or list of those" + ) + return name + pa_arr = self._data.array._pa_array - if isinstance(name_or_index, int): - index = name_or_index - elif isinstance(name_or_index, str): - index = pa_arr.type.get_field_index(name_or_index) - else: - raise ValueError( - "name_or_index must be an int or str, " - f"got {type(name_or_index).__name__}" - ) + name = get_name(name_or_index, pa_arr) + field_arr = pc.struct_field(pa_arr, name_or_index) - pa_field = pa_arr.type[index] - field_arr = pc.struct_field(pa_arr, [index]) return Series( field_arr, dtype=ArrowDtype(field_arr.type), index=self._data.index, - name=pa_field.name, + name=name, ) def explode(self) -> DataFrame: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 23b5448029dd9..010a0cb608de1 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -12,11 +12,13 @@ cast, ) import unicodedata +import warnings import numpy as np from pandas._libs import lib from pandas._libs.tslibs import ( + NaT, Timedelta, Timestamp, timezones, @@ -27,6 +29,7 @@ pa_version_under13p0, ) from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.cast import ( @@ -37,10 +40,12 @@ CategoricalDtype, is_array_like, is_bool_dtype, + is_float_dtype, is_integer, is_list_like, is_numeric_dtype, is_scalar, + is_string_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna @@ -67,6 +72,7 @@ unpack_tuple_and_ellipses, validate_indices, ) +from pandas.core.nanops import check_below_min_count from pandas.core.strings.base import BaseStringArrayMethods from pandas.io._util import _arrow_dtype_mapping @@ -107,25 +113,50 @@ def cast_for_truediv( arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar - ) -> pa.ChunkedArray: + ) -> tuple[pa.ChunkedArray, pa.Array | pa.Scalar]: # Ensure int / int -> float mirroring Python/Numpy behavior # as pc.divide_checked(int, int) -> int if pa.types.is_integer(arrow_array.type) and pa.types.is_integer( pa_object.type ): - return arrow_array.cast(pa.float64()) - return arrow_array + # GH: 56645. + # https://github.com/apache/arrow/issues/35563 + return pc.cast(arrow_array, pa.float64(), safe=False), pc.cast( + pa_object, pa.float64(), safe=False + ) + + return arrow_array, pa_object def floordiv_compat( left: pa.ChunkedArray | pa.Array | pa.Scalar, right: pa.ChunkedArray | pa.Array | pa.Scalar, ) -> pa.ChunkedArray: - # Ensure int // int -> int mirroring Python/Numpy behavior - # as pc.floor(pc.divide_checked(int, int)) -> float - converted_left = cast_for_truediv(left, right) - result = pc.floor(pc.divide(converted_left, right)) + # TODO: Replace with pyarrow floordiv kernel. 
+ # https://github.com/apache/arrow/issues/39386 if pa.types.is_integer(left.type) and pa.types.is_integer(right.type): + divided = pc.divide_checked(left, right) + if pa.types.is_signed_integer(divided.type): + # GH 56676 + has_remainder = pc.not_equal(pc.multiply(divided, right), left) + has_one_negative_operand = pc.less( + pc.bit_wise_xor(left, right), + pa.scalar(0, type=divided.type), + ) + result = pc.if_else( + pc.and_( + has_remainder, + has_one_negative_operand, + ), + # GH: 55561 + pc.subtract(divided, pa.scalar(1, type=divided.type)), + divided, + ) + else: + result = divided result = result.cast(left.type) + else: + divided = pc.divide(left, right) + result = pc.floor(divided) return result ARROW_ARITHMETIC_FUNCS = { @@ -135,8 +166,8 @@ def floordiv_compat( "rsub": lambda x, y: pc.subtract_checked(y, x), "mul": pc.multiply_checked, "rmul": lambda x, y: pc.multiply_checked(y, x), - "truediv": lambda x, y: pc.divide(cast_for_truediv(x, y), y), - "rtruediv": lambda x, y: pc.divide(y, cast_for_truediv(x, y)), + "truediv": lambda x, y: pc.divide(*cast_for_truediv(x, y)), + "rtruediv": lambda x, y: pc.divide(*cast_for_truediv(y, x)), "floordiv": lambda x, y: floordiv_compat(x, y), "rfloordiv": lambda x, y: floordiv_compat(y, x), "mod": NotImplemented, @@ -155,6 +186,7 @@ def floordiv_compat( AxisInt, Dtype, FillnaOptions, + InterpolateOptions, Iterator, NpDtype, NumpySorter, @@ -542,10 +574,11 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, np.ndarray): if not len(item): # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] - if self._dtype.name == "string" and self._dtype.storage in ( - "pyarrow", - "pyarrow_numpy", + if ( + isinstance(self._dtype, StringDtype) + and self._dtype.storage == "pyarrow" ): + # TODO(infer_string) should this be large_string? pa_dtype = pa.string() else: pa_dtype = self._dtype.pyarrow_dtype @@ -628,9 +661,26 @@ def __arrow_array__(self, type=None): """Convert myself to a pyarrow ChunkedArray.""" return self._pa_array - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" - return self.to_numpy(dtype=dtype) + if copy is False: + warnings.warn( + "Starting with NumPy 2.0, the behavior of the 'copy' keyword has " + "changed and passing 'copy=False' raises an error when returning " + "a zero-copy NumPy array is not possible. pandas will follow " + "this behavior starting with pandas 3.0.\nThis conversion to " + "NumPy requires a copy, but 'copy=False' was passed. Consider " + "using 'np.asarray(..)' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + elif copy is None: + # `to_numpy(copy=False)` has the meaning of NumPy `copy=None`. 
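The remainder/sign correction in ``floordiv_compat`` above exists because ``pc.divide_checked`` truncates toward zero while Python floor division rounds toward negative infinity; the two disagree exactly when there is a remainder and the operands have opposite signs. A sketch:

>>> import pandas as pd
>>> pd.array([-7], dtype="int64[pyarrow]") // 2  # -> [-4]; plain truncation would give -3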
+ copy = False + + return self.to_numpy(dtype=dtype, copy=copy) def __invert__(self) -> Self: # This is a bit wise op for integer types @@ -645,7 +695,12 @@ def __invert__(self) -> Self: return type(self)(pc.invert(self._pa_array)) def __neg__(self) -> Self: - return type(self)(pc.negate_checked(self._pa_array)) + try: + return type(self)(pc.negate_checked(self._pa_array)) + except pa.ArrowNotImplementedError as err: + raise TypeError( + f"unary '-' not supported for dtype '{self.dtype}'" + ) from err def __pos__(self) -> Self: return type(self)(self._pa_array) @@ -673,7 +728,13 @@ def _cmp_method(self, other, op): if isinstance( other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray) ) or isinstance(getattr(other, "dtype", None), CategoricalDtype): - result = pc_func(self._pa_array, self._box_pa(other)) + try: + result = pc_func(self._pa_array, self._box_pa(other)) + except pa.ArrowNotImplementedError: + # TODO: could this be wrong if other is object dtype? + # in which case we need to operate pointwise? + result = ops.invalid_comparison(self, other, op) + result = pa.array(result, type=pa.bool_()) elif is_scalar(other): try: result = pc_func(self._pa_array, self._box_pa(other)) @@ -685,7 +746,7 @@ def _cmp_method(self, other, op): try: result[valid] = op(np_array[valid], other) except TypeError: - result = ops.invalid_comparison(np_array, other, op) + result = ops.invalid_comparison(self, other, op) result = pa.array(result, type=pa.bool_()) result = pc.if_else(valid, result, None) else: @@ -694,8 +755,19 @@ def _cmp_method(self, other, op): ) return ArrowExtensionArray(result) - def _evaluate_op_method(self, other, op, arrow_funcs): + def _op_method_error_message(self, other, op) -> str: + if hasattr(other, "dtype"): + other_type = f"dtype '{other.dtype}'" + else: + other_type = f"object of type {type(other)}" + return ( + f"operation '{op.__name__}' not supported for " + f"dtype '{self.dtype}' with {other_type}" + ) + + def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: pa_type = self._pa_array.type + other_original = other other = self._box_pa(other) if ( @@ -705,10 +777,15 @@ def _evaluate_op_method(self, other, op, arrow_funcs): ): if op in [operator.add, roperator.radd]: sep = pa.scalar("", type=pa_type) - if op is operator.add: - result = pc.binary_join_element_wise(self._pa_array, other, sep) - elif op is roperator.radd: - result = pc.binary_join_element_wise(other, self._pa_array, sep) + try: + if op is operator.add: + result = pc.binary_join_element_wise(self._pa_array, other, sep) + elif op is roperator.radd: + result = pc.binary_join_element_wise(other, self._pa_array, sep) + except pa.ArrowNotImplementedError as err: + raise TypeError( + self._op_method_error_message(other_original, op) + ) from err return type(self)(result) elif op in [operator.mul, roperator.rmul]: binary = self._pa_array @@ -740,9 +817,14 @@ def _evaluate_op_method(self, other, op, arrow_funcs): pc_func = arrow_funcs[op.__name__] if pc_func is NotImplemented: + if pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type): + raise TypeError(self._op_method_error_message(other_original, op)) raise NotImplementedError(f"{op.__name__} not implemented.") - result = pc_func(self._pa_array, other) + try: + result = pc_func(self._pa_array, other) + except pa.ArrowNotImplementedError as err: + raise TypeError(self._op_method_error_message(other_original, op)) from err return type(self)(result) def _logical_method(self, other, op): @@ -998,13 +1080,18 @@ def dropna(self) -> Self: return 
type(self)(pc.drop_null(self._pa_array)) def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: if not self._hasna: # TODO(CoW): Not necessary anymore when CoW is the default return self.copy() - if limit is None: + if limit is None and limit_area is None: method = missing.clean_fill_method(method) try: if method == "pad": @@ -1020,7 +1107,9 @@ def _pad_or_backfill( # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove # this method entirely. - return super()._pad_or_backfill(method=method, limit=limit, copy=copy) + return super()._pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) @doc(ExtensionArray.fillna) def fillna( @@ -1054,7 +1143,7 @@ def fillna( try: fill_value = self._box_pa(value, pa_type=self._pa_array.type) except pa.ArrowTypeError as err: - msg = f"Invalid value '{str(value)}' for dtype {self.dtype}" + msg = f"Invalid value '{value!s}' for dtype '{self.dtype}'" raise TypeError(msg) from err try: @@ -1313,6 +1402,11 @@ def _to_timedeltaarray(self) -> TimedeltaArray: np_array = np_array.astype(np_dtype) return TimedeltaArray._simple_new(np_array, dtype=np_dtype) + def _values_for_json(self) -> np.ndarray: + if is_numeric_dtype(self.dtype): + return np.asarray(self, dtype=object) + return super()._values_for_json() + @doc(ExtensionArray.to_numpy) def to_numpy( self, @@ -1320,6 +1414,7 @@ def to_numpy( copy: bool = False, na_value: object = lib.no_default, ) -> np.ndarray: + original_na_value = na_value dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, self._hasna) pa_type = self._pa_array.type if not self._hasna or isna(na_value) or pa.types.is_null(pa_type): @@ -1345,7 +1440,14 @@ def to_numpy( if dtype is not None and isna(na_value): na_value = None result = np.full(len(data), fill_value=na_value, dtype=dtype) - elif not data._hasna or (pa.types.is_floating(pa_type) and na_value is np.nan): + elif not data._hasna or ( + pa.types.is_floating(pa_type) + and ( + na_value is np.nan + or original_na_value is lib.no_default + and is_float_dtype(dtype) + ) + ): result = data._pa_array.to_numpy() if dtype is not None: result = result.astype(dtype, copy=False) @@ -1366,7 +1468,7 @@ def to_numpy( def map(self, mapper, na_action=None): if is_numeric_dtype(self.dtype): - return map_array(self.to_numpy(), mapper, na_action=None) + return map_array(self.to_numpy(), mapper, na_action=na_action) else: return super().map(mapper, na_action) @@ -1516,6 +1618,9 @@ def _accumulate( ------ NotImplementedError : subclass does not define accumulations """ + if is_string_dtype(self): + return self._str_accumulate(name=name, skipna=skipna, **kwargs) + pyarrow_name = { "cummax": "cumulative_max", "cummin": "cumulative_min", @@ -1540,13 +1645,68 @@ def _accumulate( else: data_to_accum = data_to_accum.cast(pa.int64()) - result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs) + try: + result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs) + except pa.ArrowNotImplementedError as err: + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) from err if convert_to_int: result = result.cast(pa_dtype) return type(self)(result) + def _str_accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> ArrowExtensionArray | ExtensionArray: + """ + Accumulate implementation for 
strings, see `_accumulate` docstring for details. + + pyarrow.compute does not implement these methods for strings. + """ + if name == "cumprod": + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) + + # We may need to strip out trailing NA values + tail: pa.array | None = None + na_mask: pa.array | None = None + pa_array = self._pa_array + np_func = { + "cumsum": np.cumsum, + "cummin": np.minimum.accumulate, + "cummax": np.maximum.accumulate, + }[name] + + if self._hasna: + na_mask = pc.is_null(pa_array) + if pc.all(na_mask) == pa.scalar(True): + return type(self)(pa_array) + if skipna: + if name == "cumsum": + pa_array = pc.fill_null(pa_array, "") + else: + # We can retain the running min/max by forward/backward filling. + pa_array = pc.fill_null_forward(pa_array) + pa_array = pc.fill_null_backward(pa_array) + else: + # When not skipping NA values, the result should be null from + # the first NA value onward. + idx = pc.index(na_mask, True).as_py() + tail = pa.nulls(len(pa_array) - idx, type=pa_array.type) + pa_array = pa_array[:idx] + + # error: Cannot call function of unknown type + pa_result = pa.array(np_func(pa_array), type=pa_array.type) # type: ignore[operator] + + if tail is not None: + pa_result = pa.concat_arrays([pa_result, tail]) + elif na_mask is not None: + pa_result = pc.if_else(na_mask, None, pa_result) + + result = type(self)(pa_result) + return result + def _reduce_pyarrow(self, name: str, *, skipna: bool = True, **kwargs) -> pa.Scalar: """ Return a pyarrow scalar result of performing the reduction operation. @@ -1611,6 +1771,37 @@ def pyarrow_meth(data, skip_nulls, **kwargs): denominator = pc.sqrt_checked(pc.count(self._pa_array)) return pc.divide_checked(numerator, denominator) + elif name == "sum" and ( + pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type) + ): + + def pyarrow_meth(data, skip_nulls, min_count=0): # type: ignore[misc] + mask = pc.is_null(data) if data.null_count > 0 else None + if skip_nulls: + if min_count > 0 and check_below_min_count( + (len(data),), + None if mask is None else mask.to_numpy(), + min_count, + ): + return pa.scalar(None, type=data.type) + if data.null_count > 0: + # binary_join returns null if there is any null -> + # have to filter out any nulls + data = data.filter(pc.invert(mask)) + else: + if mask is not None or check_below_min_count( + (len(data),), None, min_count + ): + return pa.scalar(None, type=data.type) + + if pa.types.is_large_string(data.type): + # binary_join only supports string, not large_string + data = data.cast(pa.string()) + data_list = pa.ListArray.from_arrays( + [0, len(data)], data.combine_chunks() + )[0] + return pc.binary_join(data_list, "") + else: pyarrow_name = { "median": "quantile", @@ -1906,7 +2097,7 @@ def _rank( """ See Series.rank.__doc__. """ - return type(self)( + return self._convert_rank_result( self._rank_calc( axis=axis, method=method, @@ -2002,10 +2193,52 @@ def _maybe_convert_setitem_value(self, value): try: value = self._box_pa(value, self._pa_array.type) except pa.ArrowTypeError as err: - msg = f"Invalid value '{str(value)}' for dtype {self.dtype}" + msg = f"Invalid value '{value!s}' for dtype '{self.dtype}'" raise TypeError(msg) from err return value + def interpolate( + self, + *, + method: InterpolateOptions, + axis: int, + index, + limit, + limit_direction, + limit_area, + copy: bool, + **kwargs, + ) -> Self: + """ + See NDFrame.interpolate.__doc__. 
+ """ + # NB: we return type(self) even if copy=False + if not self.dtype._is_numeric: + raise TypeError(f"Cannot interpolate with {self.dtype} dtype") + + mask = self.isna() + if self.dtype.kind == "f": + data = self._pa_array.to_numpy() + elif self.dtype.kind in "iu": + data = self.to_numpy(dtype="f8", na_value=0.0) + else: + raise NotImplementedError( + f"interpolate is not implemented for dtype={self.dtype}" + ) + + missing.interpolate_2d_inplace( + data, + method=method, + axis=0, + index=index, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + mask=mask, + **kwargs, + ) + return type(self)(self._box_pa_array(pa.array(data, mask=mask))) + @classmethod def _if_else( cls, @@ -2122,6 +2355,20 @@ def _groupby_op( **kwargs, ): if isinstance(self.dtype, StringDtype): + if how in [ + "prod", + "mean", + "median", + "cumsum", + "cumprod", + "std", + "sem", + "var", + "skew", + ]: + raise TypeError( + f"dtype '{self.dtype}' does not support operation '{how}'" + ) return super()._groupby_op( how=how, has_dropped_na=has_dropped_na, @@ -2163,86 +2410,23 @@ def _apply_elementwise(self, func: Callable) -> list[list[Any]]: for chunk in self._pa_array.iterchunks() ] - def _str_count(self, pat: str, flags: int = 0): - if flags: - raise NotImplementedError(f"count not implemented with {flags=}") - return type(self)(pc.count_substring_regex(self._pa_array, pat)) - - def _str_contains( - self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True - ): - if flags: - raise NotImplementedError(f"contains not implemented with {flags=}") - - if regex: - pa_contains = pc.match_substring_regex - else: - pa_contains = pc.match_substring - result = pa_contains(self._pa_array, pat, ignore_case=not case) - if not isna(na): + def _convert_bool_result(self, result, na=lib.no_default, method_name=None): + if na is not lib.no_default and not isna( + na + ): # pyright: ignore [reportGeneralTypeIssues] result = result.fill_null(na) return type(self)(result) - def _str_startswith(self, pat: str | tuple[str, ...], na=None): - if isinstance(pat, str): - result = pc.starts_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # For empty tuple, pd.StringDtype() returns null for missing values - # and false for valid values. - result = pc.if_else(pc.is_null(self._pa_array), None, False) - else: - result = pc.starts_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) + def _convert_int_result(self, result): return type(self)(result) - def _str_endswith(self, pat: str | tuple[str, ...], na=None): - if isinstance(pat, str): - result = pc.ends_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # For empty tuple, pd.StringDtype() returns null for missing values - # and false for valid values. 
- result = pc.if_else(pc.is_null(self._pa_array), None, False) - else: - result = pc.ends_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) + def _convert_rank_result(self, result): return type(self)(result) - def _str_replace( - self, - pat: str | re.Pattern, - repl: str | Callable, - n: int = -1, - case: bool = True, - flags: int = 0, - regex: bool = True, - ): - if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: - raise NotImplementedError( - "replace is not supported with a re.Pattern, callable repl, " - "case=False, or flags!=0" - ) - - func = pc.replace_substring_regex if regex else pc.replace_substring - # https://github.com/apache/arrow/issues/39149 - # GH 56404, unexpected behavior with negative max_replacements with pyarrow. - pa_max_replacements = None if n < 0 else n - result = func( - self._pa_array, - pattern=pat, - replacement=repl, - max_replacements=pa_max_replacements, - ) - return type(self)(result) + def _str_count(self, pat: str, flags: int = 0): + if flags: + raise NotImplementedError(f"count not implemented with {flags=}") + return type(self)(pc.count_substring_regex(self._pa_array, pat)) def _str_repeat(self, repeats: int | Sequence[int]): if not isinstance(repeats, int): @@ -2252,37 +2436,6 @@ def _str_repeat(self, repeats: int | Sequence[int]): else: return type(self)(pc.binary_repeat(self._pa_array, repeats)) - def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.startswith("^"): - pat = f"^{pat}" - return self._str_contains(pat, case, flags, na, regex=True) - - def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.endswith("$") or pat.endswith("//$"): - pat = f"{pat}$" - return self._str_match(pat, case, flags, na) - - def _str_find(self, sub: str, start: int = 0, end: int | None = None): - if start != 0 and end is not None: - slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) - result = pc.find_substring(slices, sub) - not_found = pc.equal(result, -1) - start_offset = max(0, start) - offset_result = pc.add(result, start_offset) - result = pc.if_else(not_found, result, offset_result) - elif start == 0 and end is None: - slices = self._pa_array - result = pc.find_substring(slices, sub) - else: - raise NotImplementedError( - f"find not implemented with {sub=}, {start=}, {end=}" - ) - return type(self)(result) - def _str_join(self, sep: str): if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( self._pa_array.type @@ -2303,84 +2456,6 @@ def _str_rpartition(self, sep: str, expand: bool): result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_slice( - self, start: int | None = None, stop: int | None = None, step: int | None = None - ): - if start is None: - start = 0 - if step is None: - step = 1 - return type(self)( - pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) - ) - - def _str_isalnum(self): - return type(self)(pc.utf8_is_alnum(self._pa_array)) - - def _str_isalpha(self): - return type(self)(pc.utf8_is_alpha(self._pa_array)) - - def _str_isdecimal(self): - return type(self)(pc.utf8_is_decimal(self._pa_array)) - - def _str_isdigit(self): - return type(self)(pc.utf8_is_digit(self._pa_array)) - - def _str_islower(self): - return type(self)(pc.utf8_is_lower(self._pa_array)) - - def _str_isnumeric(self): 
- return type(self)(pc.utf8_is_numeric(self._pa_array)) - - def _str_isspace(self): - return type(self)(pc.utf8_is_space(self._pa_array)) - - def _str_istitle(self): - return type(self)(pc.utf8_is_title(self._pa_array)) - - def _str_isupper(self): - return type(self)(pc.utf8_is_upper(self._pa_array)) - - def _str_len(self): - return type(self)(pc.utf8_length(self._pa_array)) - - def _str_lower(self): - return type(self)(pc.utf8_lower(self._pa_array)) - - def _str_upper(self): - return type(self)(pc.utf8_upper(self._pa_array)) - - def _str_strip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_trim_whitespace(self._pa_array) - else: - result = pc.utf8_trim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_lstrip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_ltrim_whitespace(self._pa_array) - else: - result = pc.utf8_ltrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_rstrip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_rtrim_whitespace(self._pa_array) - else: - result = pc.utf8_rtrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_removeprefix(self, prefix: str): - if not pa_version_under13p0: - starts_with = pc.starts_with(self._pa_array, pattern=prefix) - removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - result = pc.if_else(starts_with, removed, self._pa_array) - return type(self)(result) - predicate = lambda val: val.removeprefix(prefix) - result = self._apply_elementwise(predicate) - return type(self)(pa.chunked_array(result)) - def _str_casefold(self): predicate = lambda val: val.casefold() result = self._apply_elementwise(predicate) @@ -2489,6 +2564,92 @@ def _str_wrap(self, width: int, **kwargs): result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) + @property + def _dt_days(self): + return type(self)( + pa.array(self._to_timedeltaarray().days, from_pandas=True, type=pa.int32()) + ) + + @property + def _dt_hours(self): + return type(self)( + pa.array( + [ + td.components.hours if td is not NaT else None + for td in self._to_timedeltaarray() + ], + type=pa.int32(), + ) + ) + + @property + def _dt_minutes(self): + return type(self)( + pa.array( + [ + td.components.minutes if td is not NaT else None + for td in self._to_timedeltaarray() + ], + type=pa.int32(), + ) + ) + + @property + def _dt_seconds(self): + return type(self)( + pa.array( + self._to_timedeltaarray().seconds, from_pandas=True, type=pa.int32() + ) + ) + + @property + def _dt_milliseconds(self): + return type(self)( + pa.array( + [ + td.components.milliseconds if td is not NaT else None + for td in self._to_timedeltaarray() + ], + type=pa.int32(), + ) + ) + + @property + def _dt_microseconds(self): + return type(self)( + pa.array( + self._to_timedeltaarray().microseconds, + from_pandas=True, + type=pa.int32(), + ) + ) + + @property + def _dt_nanoseconds(self): + return type(self)( + pa.array( + self._to_timedeltaarray().nanoseconds, from_pandas=True, type=pa.int32() + ) + ) + + def _dt_to_pytimedelta(self): + data = self._pa_array.to_pylist() + if self._dtype.pyarrow_dtype.unit == "ns": + data = [None if ts is None else ts.to_pytimedelta() for ts in data] + return np.array(data, dtype=object) + + def _dt_total_seconds(self): + return type(self)( + pa.array(self._to_timedeltaarray().total_seconds(), from_pandas=True) + ) + + def _dt_as_unit(self, unit: str): + if pa.types.is_date(self.dtype.pyarrow_dtype): + raise 
NotImplementedError("as_unit not implemented for date types") + pd_array = self._maybe_convert_datelike_array() + # Don't just cast _pa_array in order to follow pandas unit conversion rules + return type(self)(pa.array(pd_array.as_unit(unit), from_pandas=True)) + @property def _dt_year(self): return type(self)(pc.year(self._pa_array)) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 59c6d911cfaef..62ca2a45fb941 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -70,6 +70,7 @@ unique, ) from pandas.core.array_algos.quantile import quantile_with_mask +from pandas.core.missing import _fill_limit_area_1d from pandas.core.sorting import ( nargminmax, nargsort, @@ -718,7 +719,10 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: return TimedeltaArray._from_sequence(self, dtype=dtype, copy=copy) - return np.array(self, dtype=dtype, copy=copy) + if not copy: + return np.asarray(self, dtype=dtype) + else: + return np.array(self, dtype=dtype, copy=copy) def isna(self) -> np.ndarray | ExtensionArraySupportsAnyAll: """ @@ -954,7 +958,12 @@ def interpolate( ) def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: """ Pad or backfill values, used by Series/DataFrame ffill and bfill. @@ -1012,6 +1021,12 @@ def _pad_or_backfill( DeprecationWarning, stacklevel=find_stack_level(), ) + if limit_area is not None: + raise NotImplementedError( + f"{type(self).__name__} does not implement limit_area " + "(added in pandas 2.2). 3rd-party ExtnsionArray authors " + "need to add this argument to _pad_or_backfill." 
+ ) return self.fillna(method=method, limit=limit) mask = self.isna() @@ -1021,6 +1036,8 @@ def _pad_or_backfill( meth = missing.clean_fill_method(method) npmask = np.asarray(mask) + if limit_area is not None and not npmask.all(): + _fill_limit_area_1d(npmask, limit_area) if meth == "pad": indexer = libalgos.get_fill_indexer(npmask, limit=limit) return self.take(indexer, allow_fill=True) @@ -2352,6 +2369,20 @@ def _groupby_op( # GH#43682 if isinstance(self.dtype, StringDtype): # StringArray + if op.how in [ + "prod", + "mean", + "median", + "cumsum", + "cumprod", + "std", + "sem", + "var", + "skew", + ]: + raise TypeError( + f"dtype '{self.dtype}' does not support operation '{how}'" + ) if op.how not in ["any", "all"]: # Fail early to avoid conversion to object op._get_cython_function(op.kind, op.how, np.dtype(object), False) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 065a942cae768..0fe69f6d1ebc2 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -44,7 +44,9 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import ( + ArrowDtype, CategoricalDtype, + CategoricalDtypeType, ExtensionDtype, ) from pandas.core.dtypes.generic import ( @@ -443,24 +445,32 @@ def __init__( values = arr if dtype.categories is None: - if not isinstance(values, ABCIndex): - # in particular RangeIndex xref test_index_equal_range_categories - values = sanitize_array(values, None) - try: - codes, categories = factorize(values, sort=True) - except TypeError as err: - codes, categories = factorize(values, sort=False) - if dtype.ordered: - # raise, as we don't have a sortable data structure and so - # the user should give us one by specifying categories - raise TypeError( - "'values' is not ordered, please " - "explicitly specify the categories order " - "by passing in a categories argument." - ) from err - - # we're inferring from values - dtype = CategoricalDtype(categories, dtype.ordered) + if isinstance(values.dtype, ArrowDtype) and issubclass( + values.dtype.type, CategoricalDtypeType + ): + arr = values._pa_array.combine_chunks() + categories = arr.dictionary.to_pandas(types_mapper=ArrowDtype) + codes = arr.indices.to_numpy() + dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered) + else: + if not isinstance(values, ABCIndex): + # in particular RangeIndex xref test_index_equal_range_categories + values = sanitize_array(values, None) + try: + codes, categories = factorize(values, sort=True) + except TypeError as err: + codes, categories = factorize(values, sort=False) + if dtype.ordered: + # raise, as we don't have a sortable data structure and so + # the user should give us one by specifying categories + raise TypeError( + "'values' is not ordered, please " + "explicitly specify the categories order " + "by passing in a categories argument." + ) from err + + # we're inferring from values + dtype = CategoricalDtype(categories, dtype.ordered) elif isinstance(values.dtype, CategoricalDtype): old_codes = extract_array(values)._codes @@ -567,11 +577,12 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: raise ValueError("Cannot convert float NaN to integer") elif len(self.codes) == 0 or len(self.categories) == 0: - result = np.array( - self, - dtype=dtype, - copy=copy, - ) + # For NumPy 1.x compatibility we cannot use copy=None. 
And + # `copy=False` has the meaning of `copy=None` here: + if not copy: + result = np.asarray(self, dtype=dtype) + else: + result = np.array(self, dtype=dtype) else: # GH8628 (PERF): astype category codes instead of astyping array @@ -1626,10 +1637,23 @@ def _validate_codes_for_dtype(cls, codes, *, dtype: CategoricalDtype) -> np.ndar # ------------------------------------------------------------- @ravel_compat - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: """ The numpy array interface. + Users should not call this directly. Rather, it is invoked by + :func:`numpy.array` and :func:`numpy.asarray`. + + Parameters + ---------- + dtype : np.dtype or None + Specifies the dtype for the array. + + copy : bool or None, optional + See :func:`numpy.asarray`. + Returns ------- numpy.array @@ -1647,13 +1671,25 @@ def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: >>> np.asarray(cat) array(['a', 'b'], dtype=object) """ + if copy is False: + warnings.warn( + "Starting with NumPy 2.0, the behavior of the 'copy' keyword has " + "changed and passing 'copy=False' raises an error when returning " + "a zero-copy NumPy array is not possible. pandas will follow " + "this behavior starting with pandas 3.0.\nThis conversion to " + "NumPy requires a copy, but 'copy=False' was passed. Consider " + "using 'np.asarray(..)' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + ret = take_nd(self.categories._values, self._codes) - if dtype and np.dtype(dtype) != self.categories.dtype: - return np.asarray(ret, dtype) # When we're a Categorical[ExtensionArray], like Interval, # we need to ensure __array__ gets all the way to an # ndarray. - return np.asarray(ret) + + # `take_nd` should already make a copy, so don't force again. + return np.asarray(ret, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # for binary ops, use our custom dunder methods @@ -2463,11 +2499,6 @@ def unique(self) -> Self: # pylint: disable=useless-parent-delegation return super().unique() - def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: - # make sure we have correct itemsize for resulting codes - assert res_values.dtype == self._ndarray.dtype - return res_values - def equals(self, other: object) -> bool: """ Returns True if categorical arrays are equal. @@ -2668,23 +2699,37 @@ def _replace(self, *, to_replace, value, inplace: bool = False): # ------------------------------------------------------------------------ # String methods interface def _str_map( - self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True + self, f, na_value=lib.no_default, dtype=np.dtype("object"), convert: bool = True ): # Optimization to apply the callable `f` to the categories once # and rebuild the result by `take`ing from the result with the codes. # Returns the same type as the object-dtype implementation though.
- from pandas.core.arrays import NumpyExtensionArray - categories = self.categories codes = self.codes - result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype) + if categories.dtype == "string": + result = categories.array._str_map(f, na_value, dtype) # type: ignore[attr-defined] + if ( + categories.dtype.na_value is np.nan # type: ignore[union-attr] + and is_bool_dtype(dtype) + and (na_value is lib.no_default or isna(na_value)) + ): + # NaN propagates as False for functions with boolean return type + na_value = False + else: + from pandas.core.arrays import NumpyExtensionArray + + result = NumpyExtensionArray(categories.to_numpy())._str_map( + f, na_value, dtype + ) return take_nd(result, codes, fill_value=na_value) def _str_get_dummies(self, sep: str = "|"): # sep may not be in categories. Just bail on this. from pandas.core.arrays import NumpyExtensionArray - return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep) + return NumpyExtensionArray(self.to_numpy(str, na_value="NaN"))._str_get_dummies( + sep + ) # ------------------------------------------------------------------------ # GroupBy Methods diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 11a0c7bf18fcb..cfe1f3acd9143 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -20,6 +20,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas._libs import ( algos, lib, @@ -92,6 +94,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import ( + ArrowDtype, CategoricalDtype, DatetimeTZDtype, ExtensionDtype, @@ -350,10 +353,27 @@ def _formatter(self, boxed: bool = False): # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: # used for Timedelta/DatetimeArray, overwritten by PeriodArray if is_object_dtype(dtype): + if copy is False: + warnings.warn( + "Starting with NumPy 2.0, the behavior of the 'copy' keyword has " + "changed and passing 'copy=False' raises an error when returning " + "a zero-copy NumPy array is not possible. pandas will follow this " + "behavior starting with pandas 3.0.\nThis conversion to NumPy " + "requires a copy, but 'copy=False' was passed. Consider using " + "'np.asarray(..)' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return np.array(list(self), dtype=object) + + if copy is True: + return np.array(self._ndarray, dtype=dtype) return self._ndarray @overload @@ -467,10 +487,16 @@ def astype(self, dtype, copy: bool = True): return self._box_values(self.asi8.ravel()).reshape(self.shape) + elif is_string_dtype(dtype): + if isinstance(dtype, ExtensionDtype): + arr_object = self._format_native_types(na_rep=dtype.na_value) # type: ignore[arg-type] + cls = dtype.construct_array_type() + return cls._from_sequence(arr_object, dtype=dtype, copy=False) + else: + return self._format_native_types() + elif isinstance(dtype, ExtensionDtype): return super().astype(dtype, copy=copy) - elif is_string_dtype(dtype): - return self._format_native_types() elif dtype.kind in "iu": # we deliberately ignore int32 vs. int64 here. # See https://github.com/pandas-dev/pandas/issues/24381 for more. 
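Side note on the `__array__(dtype, copy)` signatures added throughout this diff (Categorical, datetime-like, interval, masked, period, sparse, and NumPy-backed arrays): NumPy 2.0 forwards the `copy` keyword to `__array__`, where `copy=None` means copy only if needed, `copy=True` forces a fresh array, and `copy=False` demands zero-copy or an error (during the transition pandas emits a FutureWarning instead of raising). Below is a minimal sketch of that contract on a toy container under NumPy 2.x; it is illustrative only, not pandas code:

    import numpy as np

    class Wrapper:
        """Toy container implementing the NumPy 2.0 __array__ protocol."""

        def __init__(self, values: np.ndarray) -> None:
            self._values = values

        def __array__(self, dtype=None, copy=None):
            if copy:
                # np.array(obj, copy=True) forwards copy=True: always copy
                return np.array(self._values, dtype=dtype)
            # copy=None (np.asarray) returns the data zero-copy when possible;
            # a strict implementation would raise for copy=False when a copy
            # is unavoidable, which is what pandas 3.0 intends to do.
            return np.asarray(self._values, dtype=dtype)

    w = Wrapper(np.arange(3))
    view = np.asarray(w)             # no copy: shares the underlying buffer
    forced = np.array(w, copy=True)  # independent copy
    forced[0] = 99
    assert view[0] == 0              # the original buffer is untouched
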
@@ -1786,6 +1812,10 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: dtype='object') """ result = self._format_native_types(date_format=date_format, na_rep=np.nan) + if using_string_dtype(): + from pandas import StringDtype + + return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value] return result.astype(object, copy=False) @@ -2531,7 +2561,7 @@ def _validate_inferred_freq( return freq -def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype) -> str: +def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype | ArrowDtype) -> str: """ Return the unit str corresponding to the dtype's resolution. @@ -2546,4 +2576,8 @@ def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype) -> str: """ if isinstance(dtype, DatetimeTZDtype): return dtype.unit + elif isinstance(dtype, ArrowDtype): + if dtype.kind not in "mM": + raise ValueError(f"{dtype=} does not have a resolution.") + return dtype.pyarrow_dtype.unit return np.datetime_data(dtype)[0] diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 6b7ddc4a72957..0db25db02e75a 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -14,6 +14,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas._libs import ( lib, tslib, @@ -39,10 +41,7 @@ tz_convert_from_utc, tzconversion, ) -from pandas._libs.tslibs.dtypes import ( - abbrev_to_npy_unit, - freq_to_period_freqstr, -) +from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit from pandas.errors import PerformanceWarning from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_inclusive @@ -638,12 +637,12 @@ def _resolution_obj(self) -> Resolution: # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods - def __array__(self, dtype=None) -> np.ndarray: + def __array__(self, dtype=None, copy=None) -> np.ndarray: if dtype is None and self.tz: # The default for tz-aware is object, to preserve tz info dtype = object - return super().__array__(dtype=dtype) + return super().__array__(dtype=dtype, copy=copy) def __iter__(self) -> Iterator: """ @@ -1232,8 +1231,10 @@ def to_period(self, freq=None) -> PeriodArray: if freq is None: freq = self.freqstr or self.inferred_freq - if isinstance(self.freq, BaseOffset): - freq = freq_to_period_freqstr(self.freq.n, self.freq.name) + if isinstance(self.freq, BaseOffset) and hasattr( + self.freq, "_period_dtype_code" + ): + freq = PeriodDtype(self.freq)._freqstr if freq is None: raise ValueError( @@ -1307,6 +1308,13 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: values, "month_name", locale=locale, reso=self._creso ) result = self._maybe_mask_results(result, fill_value=None) + if using_string_dtype(): + from pandas import ( + StringDtype, + array as pd_array, + ) + + return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value] return result def day_name(self, locale=None) -> npt.NDArray[np.object_]: @@ -1364,6 +1372,14 @@ def day_name(self, locale=None) -> npt.NDArray[np.object_]: values, "day_name", locale=locale, reso=self._creso ) result = self._maybe_mask_results(result, fill_value=None) + if using_string_dtype(): + # TODO: no tests that check for dtype of result as of 2024-08-15 + from pandas import ( + StringDtype, + array as pd_array, + ) + + return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value] return result @property @@ -2394,7 +2410,7 @@ def objects_to_datetime64( assert 
errors in ["raise", "ignore", "coerce"] # if str-dtype, convert - data = np.array(data, copy=False, dtype=np.object_) + data = np.asarray(data, dtype=np.object_) result, tz_parsed = tslib.array_to_datetime( data, diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index a19b304529383..da57e4ceed87e 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -42,6 +42,7 @@ from pandas.compat.numpy import function as nv from pandas.errors import IntCastingNaNError from pandas.util._decorators import Appender +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import ( LossySetitemError, @@ -79,6 +80,7 @@ unique, value_counts_internal as value_counts, ) +from pandas.core.arrays import ArrowExtensionArray from pandas.core.arrays.base import ( ExtensionArray, _extension_array_shared_docs, @@ -370,11 +372,18 @@ def _ensure_simple_new_inputs( right = ensure_wrapped_if_datetimelike(right) right = extract_array(right, extract_numpy=True) - lbase = getattr(left, "_ndarray", left).base - rbase = getattr(right, "_ndarray", right).base - if lbase is not None and lbase is rbase: - # If these share data, then setitem could corrupt our IA - right = right.copy() + if isinstance(left, ArrowExtensionArray) or isinstance( + right, ArrowExtensionArray + ): + pass + else: + lbase = getattr(left, "_ndarray", left) + lbase = getattr(lbase, "_data", lbase).base + rbase = getattr(right, "_ndarray", right) + rbase = getattr(rbase, "_data", rbase).base + if lbase is not None and lbase is rbase: + # If these share data, then setitem could corrupt our IA + right = right.copy() dtype = IntervalDtype(left.dtype, closed=closed) @@ -890,11 +899,18 @@ def max(self, *, axis: AxisInt | None = None, skipna: bool = True) -> IntervalOr return obj[indexer] def _pad_or_backfill( # pylint: disable=useless-parent-delegation - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove # this method entirely. - return super()._pad_or_backfill(method=method, limit=limit, copy=copy) + return super()._pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) def fillna( self, value=None, method=None, limit: int | None = None, copy: bool = True @@ -1552,11 +1568,25 @@ def is_non_overlapping_monotonic(self) -> bool: # --------------------------------------------------------------------- # Conversion - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: """ Return the IntervalArray's data as a numpy array of Interval objects (with dtype='object') """ + if copy is False: + warnings.warn( + "Starting with NumPy 2.0, the behavior of the 'copy' keyword has " + "changed and passing 'copy=False' raises an error when returning " + "a zero-copy NumPy array is not possible. pandas will follow " + "this behavior starting with pandas 3.0.\nThis conversion to " + "NumPy requires a copy, but 'copy=False' was passed. 
Consider " + "using 'np.asarray(..)' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + left = self._left right = self._right mask = self.isna() diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 03c09c5b2fd18..da656a2768901 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -22,6 +22,7 @@ AxisInt, DtypeObj, FillnaOptions, + InterpolateOptions, NpDtype, PositionalIndexer, Scalar, @@ -37,6 +38,7 @@ ) from pandas.errors import AbstractMethodError from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.base import ExtensionDtype @@ -98,6 +100,7 @@ NumpySorter, NumpyValueArrayLike, ) + from pandas.core.arrays import FloatingArray from pandas.compat.numpy import function as nv @@ -192,7 +195,12 @@ def __getitem__(self, item: PositionalIndexer) -> Self | Any: return self._simple_new(self._data[item], newmask) def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: mask = self._mask @@ -204,7 +212,21 @@ def _pad_or_backfill( if copy: npvalues = npvalues.copy() new_mask = new_mask.copy() + elif limit_area is not None: + mask = mask.copy() func(npvalues, limit=limit, mask=new_mask) + + if limit_area is not None and not mask.all(): + mask = mask.T + neg_mask = ~mask + first = neg_mask.argmax() + last = len(neg_mask) - neg_mask[::-1].argmax() - 1 + if limit_area == "inside": + new_mask[:first] |= mask[:first] + new_mask[last + 1 :] |= mask[last + 1 :] + elif limit_area == "outside": + new_mask[first + 1 : last] |= mask[first + 1 : last] + if copy: return self._simple_new(npvalues.T, new_mask.T) else: @@ -281,7 +303,7 @@ def _validate_setitem_value(self, value): # Note: without the "str" here, the f-string rendering raises in # py38 builds. - raise TypeError(f"Invalid value '{str(value)}' for dtype {self.dtype}") + raise TypeError(f"Invalid value '{value!s}' for dtype '{self.dtype}'") def __setitem__(self, key, value) -> None: key = check_array_indexer(self, key) @@ -384,6 +406,8 @@ def round(self, decimals: int = 0, *args, **kwargs): DataFrame.round : Round values of a DataFrame. Series.round : Round values of a Series. 
""" + if self.dtype.kind == "b": + return self nv.validate_round(args, kwargs) values = np.round(self._data, decimals=decimals, **kwargs) @@ -407,6 +431,9 @@ def __abs__(self) -> Self: # ------------------------------------------------------------------ + def _values_for_json(self) -> np.ndarray: + return np.asarray(self, dtype=object) + def to_numpy( self, dtype: npt.DTypeLike | None = None, @@ -475,6 +502,8 @@ def to_numpy( """ hasna = self._hasna dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, hasna) + if dtype is None: + dtype = object if hasna: if ( @@ -565,12 +594,32 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: __array_priority__ = 1000 # higher than ndarray so ops dispatch to us - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: """ the array interface, return my values We return an object array here to preserve our scalar values """ - return self.to_numpy(dtype=dtype) + if copy is False: + if not self._hasna: + # special case, here we can simply return the underlying data + return np.array(self._data, dtype=dtype, copy=copy) + + warnings.warn( + "Starting with NumPy 2.0, the behavior of the 'copy' keyword has " + "changed and passing 'copy=False' raises an error when returning " + "a zero-copy NumPy array is not possible. pandas will follow " + "this behavior starting with pandas 3.0.\nThis conversion to " + "NumPy requires a copy, but 'copy=False' was passed. Consider " + "using 'np.asarray(..)' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if copy is None: + copy = False # The NumPy copy=False meaning is different here. + return self.to_numpy(dtype=dtype, copy=copy) _HANDLED_TYPES: tuple[type, ...] @@ -1305,7 +1354,7 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis) def map(self, mapper, na_action=None): - return map_array(self.to_numpy(), mapper, na_action=None) + return map_array(self.to_numpy(), mapper, na_action=na_action) def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): """ @@ -1470,6 +1519,58 @@ def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): else: return self.dtype.na_value + def interpolate( + self, + *, + method: InterpolateOptions, + axis: int, + index, + limit, + limit_direction, + limit_area, + copy: bool, + **kwargs, + ) -> FloatingArray: + """ + See NDFrame.interpolate.__doc__. 
+ """ + # NB: we return type(self) even if copy=False + if self.dtype.kind == "f": + if copy: + data = self._data.copy() + mask = self._mask.copy() + else: + data = self._data + mask = self._mask + elif self.dtype.kind in "iu": + copy = True + data = self._data.astype("f8") + mask = self._mask.copy() + else: + raise NotImplementedError( + f"interpolate is not implemented for dtype={self.dtype}" + ) + + missing.interpolate_2d_inplace( + data, + method=method, + axis=0, + index=index, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + mask=mask, + **kwargs, + ) + if not copy: + return self # type: ignore[return-value] + if self.dtype.kind == "f": + return type(self)._simple_new(data, mask) # type: ignore[return-value] + else: + from pandas.core.arrays import FloatingArray + + return FloatingArray._simple_new(data, mask) + def _accumulate( self, name: str, *, skipna: bool = True, **kwargs ) -> BaseMaskedArray: @@ -1541,13 +1642,24 @@ def transpose_homogeneous_masked_arrays( same dtype. The caller is responsible for ensuring validity of input data. """ masked_arrays = list(masked_arrays) + dtype = masked_arrays[0].dtype + values = [arr._data.reshape(1, -1) for arr in masked_arrays] - transposed_values = np.concatenate(values, axis=0) + transposed_values = np.concatenate( + values, + axis=0, + out=np.empty( + (len(masked_arrays), len(masked_arrays[0])), + order="F", + dtype=dtype.numpy_dtype, + ), + ) masks = [arr._mask.reshape(1, -1) for arr in masked_arrays] - transposed_masks = np.concatenate(masks, axis=0) + transposed_masks = np.concatenate( + masks, axis=0, out=np.empty_like(transposed_values, dtype=bool) + ) - dtype = masked_arrays[0].dtype arr_type = dtype.construct_array_type() transposed_arrays: list[BaseMaskedArray] = [] for i in range(transposed_values.shape[1]): diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 210450e868698..68fa7fcb6573c 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -159,7 +159,10 @@ def _coerce_to_data_and_mask( return values, mask, dtype, inferred_type original = values - values = np.array(values, copy=copy) + if not copy: + values = np.asarray(values) + else: + values = np.array(values, copy=copy) inferred_type = None if values.dtype == object or is_string_dtype(values.dtype): inferred_type = lib.infer_dtype(values, skipna=True) @@ -168,7 +171,10 @@ def _coerce_to_data_and_mask( raise TypeError(f"{values.dtype} cannot be converted to {name}") elif values.dtype.kind == "b" and checker(dtype): - values = np.array(values, dtype=default_dtype, copy=copy) + if not copy: + values = np.asarray(values, dtype=default_dtype) + else: + values = np.array(values, dtype=default_dtype, copy=copy) elif values.dtype.kind not in "iuf": name = dtype_cls.__name__.strip("_") @@ -207,9 +213,9 @@ def _coerce_to_data_and_mask( inferred_type not in ["floating", "mixed-integer-float"] and not mask.any() ): - values = np.array(original, dtype=dtype, copy=False) + values = np.asarray(original, dtype=dtype) else: - values = np.array(original, dtype="object", copy=False) + values = np.asarray(original, dtype="object") # we copy as need to coerce here if mask.any(): diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index d83a37088daec..e0031d3db6ca7 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -2,6 +2,7 @@ from typing import ( TYPE_CHECKING, + Any, Literal, ) @@ -29,6 +30,8 @@ from pandas.core.strings.object_array import 
ObjectStringArrayMixin if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( AxisInt, Dtype, @@ -137,9 +140,6 @@ def _from_sequence( result = result.copy() return cls(result) - def _from_backing_data(self, arr: np.ndarray) -> NumpyExtensionArray: - return type(self)(arr) - # ------------------------------------------------------------------------ # Data @@ -150,7 +150,12 @@ def dtype(self) -> NumpyEADtype: # ------------------------------------------------------------------------ # NumPy Array Interface - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: + if copy is not None: + # Note: branch avoids `copy=None` for NumPy 1.x support + return np.array(self._ndarray, dtype=dtype, copy=copy) return np.asarray(self._ndarray, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): @@ -285,6 +290,9 @@ def interpolate( See NDFrame.interpolate.__doc__. """ # NB: we return type(self) even if copy=False + if not self.dtype._is_numeric: + raise TypeError(f"Cannot interpolate with {self.dtype} dtype") + if not copy: out_data = self._ndarray else: @@ -556,6 +564,11 @@ def _wrap_ndarray_result(self, result: np.ndarray): return TimedeltaArray._simple_new(result, dtype=result.dtype) return type(self)(result) - # ------------------------------------------------------------------------ - # String methods interface - _str_na_value = np.nan + def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]: + # NEP 51: https://github.com/numpy/numpy/pull/22449 + if self.dtype.kind in "SU": + return "'{}'".format + elif self.dtype == "object": + return repr + else: + return str diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 2930b979bfe78..2947ba7b8c72a 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -256,7 +256,10 @@ def __init__( raise raise_on_incompatible(values, dtype.freq) values, dtype = values._ndarray, values.dtype - values = np.array(values, dtype="int64", copy=copy) + if not copy: + values = np.asarray(values, dtype="int64") + else: + values = np.array(values, dtype="int64", copy=copy) if dtype is None: raise ValueError("dtype is not specified and cannot be inferred") dtype = cast(PeriodDtype, dtype) @@ -400,10 +403,30 @@ def freq(self) -> BaseOffset: def freqstr(self) -> str: return freq_to_period_freqstr(self.freq.n, self.freq.name) - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: if dtype == "i8": - return self.asi8 - elif dtype == bool: + # For NumPy 1.x compatibility we cannot use copy=None. And + # `copy=False` has the meaning of `copy=None` here: + if not copy: + return np.asarray(self.asi8, dtype=dtype) + else: + return np.array(self.asi8, dtype=dtype) + + if copy is False: + warnings.warn( + "Starting with NumPy 2.0, the behavior of the 'copy' keyword has " + "changed and passing 'copy=False' raises an error when returning " + "a zero-copy NumPy array is not possible. pandas will follow " + "this behavior starting with pandas 3.0.\nThis conversion to " + "NumPy requires a copy, but 'copy=False' was passed. 
Consider " + "using 'np.asarray(..)' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if dtype == bool: return ~self._isnan # This will raise TypeError for non-object dtypes @@ -733,8 +756,8 @@ def asfreq(self, freq=None, how: str = "E") -> Self: '2015-01'], dtype='period[M]') """ how = libperiod.validate_end_alias(how) - if isinstance(freq, BaseOffset): - freq = freq_to_period_freqstr(freq.n, freq.name) + if isinstance(freq, BaseOffset) and hasattr(freq, "_period_dtype_code"): + freq = PeriodDtype(freq)._freqstr freq = Period._maybe_convert_freq(freq) base1 = self._dtype._dtype_code @@ -810,12 +833,19 @@ def searchsorted( return m8arr.searchsorted(npvalue, side=side, sorter=sorter) def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: # view as dt64 so we get treated as timelike in core.missing, # similar to dtl._period_dispatch dta = self.view("M8[ns]") - result = dta._pad_or_backfill(method=method, limit=limit, copy=copy) + result = dta._pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) if copy: return cast("Self", result.view(self.dtype)) else: @@ -1179,12 +1209,7 @@ def dt64arr_to_periodarr( reso = get_unit_from_dtype(data.dtype) freq = Period._maybe_convert_freq(freq) - try: - base = freq._period_dtype_code - except (AttributeError, TypeError): - # AttributeError: _period_dtype_code might not exist - # TypeError: _period_dtype_code might intentionally raise - raise TypeError(f"{freq.name} is not supported as period frequency") + base = freq._period_dtype_code return c_dt64arr_to_periodarr(data.view("i8"), base, tz, reso=reso), freq diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index fc7debb1f31e4..67bb417865475 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -92,8 +92,8 @@ def from_coo(cls, A, dense_index: bool = False) -> Series: ... ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4) ... ) >>> A - <3x4 sparse matrix of type '' - with 3 stored elements in COOrdinate format> + >>> A.todense() matrix([[0., 0., 1., 2.], @@ -178,8 +178,8 @@ def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False) ... row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True ... ) >>> A - <3x4 sparse matrix of type '' - with 3 stored elements in COOrdinate format> + >>> A.todense() matrix([[0., 0., 1., 3.], [3., 0., 0., 0.], @@ -350,8 +350,8 @@ def to_coo(self): -------- >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])}) >>> df.sparse.to_coo() - <4x1 sparse matrix of type '' - with 2 stored elements in COOrdinate format> + """ import_optional_dependency("scipy") from scipy.sparse import coo_matrix diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 5db77db2a9c66..07ff592f491a8 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -551,12 +551,30 @@ def from_spmatrix(cls, data: spmatrix) -> Self: return cls._simple_new(arr, index, dtype) - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: - fill_value = self.fill_value - + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: if self.sp_index.ngaps == 0: # Compat for na dtype and int values. 
- return self.sp_values + if copy is True: + return np.array(self.sp_values) + else: + return self.sp_values + + if copy is False: + warnings.warn( + "Starting with NumPy 2.0, the behavior of the 'copy' keyword has " + "changed and passing 'copy=False' raises an error when returning " + "a zero-copy NumPy array is not possible. pandas will follow " + "this behavior starting with pandas 3.0.\nThis conversion to " + "NumPy requires a copy, but 'copy=False' was passed. Consider " + "using 'np.asarray(..)' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + fill_value = self.fill_value + if dtype is None: # Can NumPy represent this type? # If not, `np.result_type` will raise. We catch that @@ -716,11 +734,18 @@ def isna(self) -> Self: # type: ignore[override] return type(self)(mask, fill_value=False, dtype=dtype) def _pad_or_backfill( # pylint: disable=useless-parent-delegation - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: # TODO(3.0): We can remove this method once deprecation for fillna method # keyword is enforced. - return super()._pad_or_backfill(method=method, limit=limit, copy=copy) + return super()._pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) def fillna( self, diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 00197a150fb97..c1048e806ff9a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,14 +1,21 @@ from __future__ import annotations +from functools import partial +import operator from typing import ( TYPE_CHECKING, - ClassVar, + Any, Literal, + cast, ) +import warnings import numpy as np -from pandas._config import get_option +from pandas._config import ( + get_option, + using_string_dtype, +) from pandas._libs import ( lib, @@ -16,9 +23,13 @@ ) from pandas._libs.arrays import NDArrayBacked from pandas._libs.lib import ensure_string_array -from pandas.compat import pa_version_under10p1 +from pandas.compat import ( + HAS_PYARROW, + pa_version_under10p1, +) from pandas.compat.numpy import function as nv from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import ( ExtensionDtype, @@ -34,7 +45,12 @@ pandas_dtype, ) -from pandas.core import ops +from pandas.core import ( + missing, + nanops, + ops, +) +from pandas.core.algorithms import isin from pandas.core.array_algos import masked_reductions from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.floating import ( @@ -50,10 +66,13 @@ from pandas.core.indexers import check_array_indexer from pandas.core.missing import isna +from pandas.io.formats import printing + if TYPE_CHECKING: import pyarrow from pandas._typing import ( + ArrayLike, AxisInt, Dtype, DtypeObj, @@ -80,8 +99,10 @@ class StringDtype(StorageExtensionDtype): Parameters ---------- - storage : {"python", "pyarrow", "pyarrow_numpy"}, optional + storage : {"python", "pyarrow"}, optional If not given, the value of ``pd.options.mode.string_storage``. + na_value : {np.nan, pd.NA}, default pd.NA + Whether the dtype follows NaN or NA missing value semantics. 
Attributes ---------- @@ -100,38 +121,104 @@ class StringDtype(StorageExtensionDtype): string[pyarrow] """ - # error: Cannot override instance variable (previously declared on - # base class "StorageExtensionDtype") with class variable - name: ClassVar[str] = "string" # type: ignore[misc] + @property + def name(self) -> str: # type: ignore[override] + if self._na_value is libmissing.NA: + return "string" + else: + return "str" #: StringDtype().na_value uses pandas.NA except the implementation that # follows NumPy semantics, which uses nan. @property def na_value(self) -> libmissing.NAType | float: # type: ignore[override] - if self.storage == "pyarrow_numpy": - return np.nan - else: - return libmissing.NA + return self._na_value - _metadata = ("storage",) + _metadata = ("storage", "_na_value") # type: ignore[assignment] - def __init__(self, storage=None) -> None: + def __init__( + self, + storage: str | None = None, + na_value: libmissing.NAType | float = libmissing.NA, + ) -> None: + # infer defaults if storage is None: - infer_string = get_option("future.infer_string") - if infer_string: - storage = "pyarrow_numpy" + if na_value is not libmissing.NA: + storage = get_option("mode.string_storage") + if storage == "auto": + if HAS_PYARROW: + storage = "pyarrow" + else: + storage = "python" else: storage = get_option("mode.string_storage") - if storage not in {"python", "pyarrow", "pyarrow_numpy"}: + if storage == "auto": + storage = "python" + + if storage == "pyarrow_numpy": + warnings.warn( + "The 'pyarrow_numpy' storage option name is deprecated and will be " + 'removed in pandas 3.0. Use \'pd.StringDtype(storage="pyarrow", ' + "na_value=np.nan)' to construct the same dtype.\nOr enable the " + "'pd.options.future.infer_string = True' option globally and use " + 'the "str" alias as a shorthand notation to specify a dtype ' + '(instead of "string[pyarrow_numpy]").', + FutureWarning, + stacklevel=find_stack_level(), + ) + storage = "pyarrow" + na_value = np.nan + + # validate options + if storage not in {"python", "pyarrow"}: raise ValueError( - f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. " - f"Got {storage} instead." + f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." ) - if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1: + if storage == "pyarrow" and pa_version_under10p1: raise ImportError( "pyarrow>=10.0.1 is required for PyArrow backed StringArray." ) - self.storage = storage + + if isinstance(na_value, float) and np.isnan(na_value): + # when passed a NaN value, always set to np.nan to ensure we use + # a consistent NaN value (and we can use `dtype.na_value is np.nan`) + na_value = np.nan + elif na_value is not libmissing.NA: + raise ValueError(f"'na_value' must be np.nan or pd.NA, got {na_value}") + + self.storage = cast(str, storage) + self._na_value = na_value + + def __repr__(self) -> str: + if self._na_value is libmissing.NA: + return f"{self.name}[{self.storage}]" + else: + # TODO add more informative repr + return self.name + + def __eq__(self, other: object) -> bool: + # we need to override the base class __eq__ because na_value (NA or NaN) + # cannot be checked with normal `==` + if isinstance(other, str): + # TODO should dtype == "string" work for the NaN variant?
+ if other == "string" or other == self.name: # noqa: PLR1714 + return True + try: + other = self.construct_from_string(other) + except (TypeError, ImportError): + # TypeError if `other` is not a valid string for StringDtype + # ImportError if pyarrow is not installed for "string[pyarrow]" + return False + if isinstance(other, type(self)): + return self.storage == other.storage and self.na_value is other.na_value + return False + + def __hash__(self) -> int: + # need to override __hash__ as well because of overriding __eq__ + return super().__hash__() + + def __reduce__(self): + return StringDtype, (self.storage, self.na_value) @property def type(self) -> type[str]: @@ -171,11 +258,14 @@ def construct_from_string(cls, string) -> Self: ) if string == "string": return cls() + elif string == "str" and using_string_dtype(): + return cls(na_value=np.nan) elif string == "string[python]": return cls(storage="python") elif string == "string[pyarrow]": return cls(storage="pyarrow") elif string == "string[pyarrow_numpy]": + # this is deprecated in the dtype __init__, remove this in pandas 3.0 return cls(storage="pyarrow_numpy") else: raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") @@ -198,13 +288,43 @@ def construct_array_type( # type: ignore[override] ArrowStringArrayNumpySemantics, ) - if self.storage == "python": + if self.storage == "python" and self._na_value is libmissing.NA: return StringArray - elif self.storage == "pyarrow": + elif self.storage == "pyarrow" and self._na_value is libmissing.NA: return ArrowStringArray + elif self.storage == "python": + return StringArrayNumpySemantics else: return ArrowStringArrayNumpySemantics + def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: + storages = set() + na_values = set() + + for dtype in dtypes: + if isinstance(dtype, StringDtype): + storages.add(dtype.storage) + na_values.add(dtype.na_value) + elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "T"): + continue + else: + return None + + if len(storages) == 2: + # if both python and pyarrow storage -> priority to pyarrow + storage = "pyarrow" + else: + storage = next(iter(storages)) # type: ignore[assignment] + + na_value: libmissing.NAType | float + if len(na_values) == 2: + # if both NaN and NA -> priority to NA + na_value = libmissing.NA + else: + na_value = next(iter(na_values)) + + return StringDtype(storage=storage, na_value=na_value) + def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray ) -> BaseStringArray: @@ -212,13 +332,17 @@ def __from_arrow__( Construct StringArray from pyarrow Array/ChunkedArray. 
""" if self.storage == "pyarrow": - from pandas.core.arrays.string_arrow import ArrowStringArray + if self._na_value is libmissing.NA: + from pandas.core.arrays.string_arrow import ArrowStringArray - return ArrowStringArray(array) - elif self.storage == "pyarrow_numpy": - from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics + return ArrowStringArray(array) + else: + from pandas.core.arrays.string_arrow import ( + ArrowStringArrayNumpySemantics, + ) + + return ArrowStringArrayNumpySemantics(array) - return ArrowStringArrayNumpySemantics(array) else: import pyarrow @@ -233,7 +357,7 @@ def __from_arrow__( # convert chunk by chunk to numpy and concatenate then, to avoid # overflow for large string data when concatenating the pyarrow arrays arr = arr.to_numpy(zero_copy_only=False) - arr = ensure_string_array(arr, na_value=libmissing.NA) + arr = ensure_string_array(arr, na_value=self.na_value) results.append(arr) if len(chunks) == 0: @@ -243,11 +367,7 @@ def __from_arrow__( # Bypass validation inside StringArray constructor, see GH#47781 new_string_array = StringArray.__new__(StringArray) - NDArrayBacked.__init__( - new_string_array, - arr, - StringDtype(storage="python"), - ) + NDArrayBacked.__init__(new_string_array, arr, self) return new_string_array @@ -256,6 +376,8 @@ class BaseStringArray(ExtensionArray): Mixin class for StringArray, ArrowStringArray. """ + dtype: StringDtype + @doc(ExtensionArray.tolist) def tolist(self): if self.ndim > 1: @@ -269,6 +391,152 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self: raise ValueError return cls._from_sequence(scalars, dtype=dtype) + def _formatter(self, boxed: bool = False): + formatter = partial( + printing.pprint_thing, + escape_chars=("\t", "\r", "\n"), + quote_strings=not boxed, + ) + return formatter + + def _str_map( + self, + f, + na_value=lib.no_default, + dtype: Dtype | None = None, + convert: bool = True, + ): + if self.dtype.na_value is np.nan: + return self._str_map_nan_semantics( + f, na_value=na_value, dtype=dtype, convert=convert + ) + + from pandas.arrays import BooleanArray + + if dtype is None: + dtype = self.dtype + if na_value is lib.no_default: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + constructor: type[IntegerArray | BooleanArray] + if is_integer_dtype(dtype): + constructor = IntegerArray + else: + constructor = BooleanArray + + na_value_is_na = isna(na_value) + if na_value_is_na: + na_value = 1 + elif dtype == np.dtype("bool"): + # GH#55736 + na_value = bool(na_value) + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + # error: Argument 1 to "dtype" has incompatible type + # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected + # "Type[object]" + dtype=np.dtype(cast(type, dtype)), + ) + + if not na_value_is_na: + mask[:] = False + + return constructor(result, mask) + + else: + return self._str_map_str_or_object(dtype, na_value, arr, f, mask) + + def _str_map_str_or_object( + self, + dtype, + na_value, + arr: np.ndarray, + f, + mask: npt.NDArray[np.bool_], + ): + # _str_map helper for case where dtype is either string dtype or object + if is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. 
StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value + ) + if self.dtype.storage == "pyarrow": + import pyarrow as pa + + result = pa.array( + result, mask=mask, type=pa.large_string(), from_pandas=True + ) + # error: Too many arguments for "BaseStringArray" + return type(self)(result) # type: ignore[call-arg] + + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. + return lib.map_infer_mask(arr, f, mask.view("uint8")) + + def _str_map_nan_semantics( + self, + f, + na_value=lib.no_default, + dtype: Dtype | None = None, + convert: bool = True, + ): + if dtype is None: + dtype = self.dtype + if na_value is lib.no_default: + if is_bool_dtype(dtype): + # NaN propagates as False + na_value = False + else: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + na_value_is_na = isna(na_value) + if na_value_is_na: + if is_integer_dtype(dtype): + na_value = 0 + else: + # NaN propagates as False + na_value = False + + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(cast(type, dtype)), + ) + if na_value_is_na and is_integer_dtype(dtype) and mask.any(): + # TODO: we could alternatively do this check before map_infer_mask + # and adjust the dtype/na_value we pass there. Which is more + # performant? + result = result.astype("float64") + result[mask] = np.nan + + return result + + else: + return self._str_map_str_or_object(dtype, na_value, arr, f, mask) + + def view(self, dtype: Dtype | None = None) -> ArrayLike: + if dtype is not None: + raise TypeError("Cannot change data-type for string array.") + return super().view(dtype=dtype) + # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is # incompatible with definition in base class "ExtensionArray" @@ -355,6 +623,8 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] # undo the NumpyExtensionArray hack _typ = "extension" + _storage = "python" + _na_value: libmissing.NAType | float = libmissing.NA def __init__(self, values, copy: bool = False) -> None: values = extract_array(values) @@ -362,7 +632,11 @@ def __init__(self, values, copy: bool = False) -> None: super().__init__(values, copy=copy) if not isinstance(values, type(self)): self._validate() - NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python")) + NDArrayBacked.__init__( + self, + self._ndarray, + StringDtype(storage=self._storage, na_value=self._na_value), + ) def _validate(self): """Validate that we only store NA or strings.""" @@ -380,20 +654,37 @@ def _validate(self): else: lib.convert_nans_to_NA(self._ndarray) + def _validate_scalar(self, value): + # used by NDArrayBackedExtensionIndex.insert + if isna(value): + return self.dtype.na_value + elif not isinstance(value, str): + raise TypeError( + f"Invalid value '{value}' for dtype '{self.dtype}'. Value should be a " + f"string or missing value, got '{type(value).__name__}' instead." 
+ ) + return value + @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) assert isinstance(dtype, StringDtype) and dtype.storage == "python" + else: + if using_string_dtype(): + dtype = StringDtype(storage="python", na_value=np.nan) + else: + dtype = StringDtype(storage="python") from pandas.core.arrays.masked import BaseMaskedArray + na_value = dtype.na_value if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype na_values = scalars._mask result = scalars._data result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - result[na_values] = libmissing.NA + result[na_values] = na_value else: if lib.is_pyarrow_array(scalars): @@ -402,12 +693,12 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal # zero_copy_only to True which caused problems see GH#52076 scalars = np.array(scalars) # convert non-na-likes to str, and nan-likes to StringDtype().na_value - result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy) + result = lib.ensure_string_array(scalars, na_value=na_value, copy=copy) # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? new_string_array = cls.__new__(cls) - NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python")) + NDArrayBacked.__init__(new_string_array, result, dtype) return new_string_array @@ -436,42 +727,57 @@ def __arrow_array__(self, type=None): values[self.isna()] = None return pa.array(values, type=type, from_pandas=True) - def _values_for_factorize(self): + def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]: # type: ignore[override] arr = self._ndarray.copy() - mask = self.isna() - arr[mask] = None - return arr, None - def __setitem__(self, key, value) -> None: - value = extract_array(value, extract_numpy=True) - if isinstance(value, type(self)): - # extract_array doesn't extract NumpyExtensionArray subclasses - value = value._ndarray + return arr, self.dtype.na_value - key = check_array_indexer(self, key) - scalar_key = lib.is_scalar(key) - scalar_value = lib.is_scalar(value) - if scalar_key and not scalar_value: - raise ValueError("setting an array element with a sequence.") - - # validate new items - if scalar_value: + def _maybe_convert_setitem_value(self, value): + """Maybe convert value to be pyarrow compatible.""" + if lib.is_scalar(value): if isna(value): - value = libmissing.NA + value = self.dtype.na_value elif not isinstance(value, str): raise TypeError( - f"Cannot set non-string value '{value}' into a StringArray." + f"Invalid value '{value}' for dtype '{self.dtype}'. Value should " + f"be a string or missing value, got '{type(value).__name__}' " + "instead." ) else: + value = extract_array(value, extract_numpy=True) if not is_array_like(value): value = np.asarray(value, dtype=object) + elif isinstance(value.dtype, type(self.dtype)): + return value + else: + # cast categories and friends to arrays to see if values are + # compatible, compatibility with arrow backed strings + value = np.asarray(value) if len(value) and not lib.is_string_array(value, skipna=True): - raise TypeError("Must provide strings.") + raise TypeError( + "Invalid value for dtype 'str'. Value should be a " + "string or missing value (or array of those)." 
+ ) + return value + + def __setitem__(self, key, value) -> None: + value = self._maybe_convert_setitem_value(value) + + key = check_array_indexer(self, key) + scalar_key = lib.is_scalar(key) + scalar_value = lib.is_scalar(value) + if scalar_key and not scalar_value: + raise ValueError("setting an array element with a sequence.") - mask = isna(value) - if mask.any(): - value = value.copy() - value[isna(value)] = libmissing.NA + if not scalar_value: + if value.dtype == self.dtype: + value = value._ndarray + else: + value = np.asarray(value) + mask = isna(value) + if mask.any(): + value = value.copy() + value[isna(value)] = self.dtype.na_value super().__setitem__(key, value) @@ -481,6 +787,30 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: # base class implementation that uses __setitem__ ExtensionArray._putmask(self, mask, value) + def _where(self, mask: npt.NDArray[np.bool_], value) -> Self: + # the super() method NDArrayBackedExtensionArray._where uses + # np.putmask which doesn't properly handle None/pd.NA, so using the + # base class implementation that uses __setitem__ + return ExtensionArray._where(self, mask, value) + + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: + if isinstance(values, BaseStringArray) or ( + isinstance(values, ExtensionArray) and is_string_dtype(values.dtype) + ): + values = values.astype(self.dtype, copy=False) + else: + if not lib.is_string_array(np.asarray(values), skipna=True): + values = np.array( + [val for val in values if isinstance(val, str) or isna(val)], + dtype=object, + ) + if not len(values): + return np.zeros(self.shape, dtype=bool) + + values = self._from_sequence(values, dtype=self.dtype) + + return isin(np.asarray(self), np.asarray(values)) + def astype(self, dtype, copy: bool = True): dtype = pandas_dtype(dtype) @@ -515,13 +845,115 @@ def astype(self, dtype, copy: bool = True): return super().astype(dtype, copy) def _reduce( - self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs + self, + name: str, + *, + skipna: bool = True, + keepdims: bool = False, + axis: AxisInt | None = 0, + **kwargs, ): - if name in ["min", "max"]: - return getattr(self, name)(skipna=skipna, axis=axis) + if self.dtype.na_value is np.nan and name in ["any", "all"]: + if name == "any": + return nanops.nanany(self._ndarray, skipna=skipna) + else: + return nanops.nanall(self._ndarray, skipna=skipna) + if name in ["min", "max", "argmin", "argmax", "sum"]: + result = getattr(self, name)(skipna=skipna, axis=axis, **kwargs) + if keepdims: + return self._from_sequence([result], dtype=self.dtype) + return result raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArray: + """ + Return an ExtensionArray performing an accumulation operation. + + The underlying data type might change. + + Parameters + ---------- + name : str + Name of the function, supported values are: + - cummin + - cummax + - cumsum + - cumprod + skipna : bool, default True + If True, skip NA values. + **kwargs + Additional keyword arguments passed to the accumulation function. + Currently, there is no supported kwarg. 
+ + Returns + ------- + array + + Raises + ------ + NotImplementedError : subclass does not define accumulations + """ + if name == "cumprod": + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) + + # We may need to strip out trailing NA values + tail: np.ndarray | None = None + na_mask: np.ndarray | None = None + ndarray = self._ndarray + np_func = { + "cumsum": np.cumsum, + "cummin": np.minimum.accumulate, + "cummax": np.maximum.accumulate, + }[name] + + if self._hasna: + na_mask = cast("npt.NDArray[np.bool_]", isna(ndarray)) + if np.all(na_mask): + return type(self)(ndarray) + if skipna: + if name == "cumsum": + ndarray = np.where(na_mask, "", ndarray) + else: + # We can retain the running min/max by forward/backward filling. + ndarray = ndarray.copy() + missing.pad_or_backfill_inplace( + ndarray, + method="pad", + axis=0, + ) + missing.pad_or_backfill_inplace( + ndarray, + method="backfill", + axis=0, + ) + else: + # When not skipping NA values, the result should be null from + # the first NA value onward. + idx = np.argmax(na_mask) + tail = np.empty(len(ndarray) - idx, dtype="object") + tail[:] = self.dtype.na_value + ndarray = ndarray[:idx] + + # mypy: Cannot call function of unknown type + np_result = np_func(ndarray) # type: ignore[operator] + + if tail is not None: + np_result = np.hstack((np_result, tail)) + elif na_mask is not None: + # Argument 2 to "where" has incompatible type "NAType | float" + np_result = np.where(na_mask, self.dtype.na_value, np_result) # type: ignore[arg-type] + + result = type(self)(np_result) + return result + + def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: + if self.dtype.na_value is np.nan and result is libmissing.NA: + # the masked_reductions use pd.NA -> convert to np.nan + return np.nan + return super()._wrap_reduction_result(axis, result) + def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: nv.validate_min((), kwargs) result = masked_reductions.min( @@ -536,11 +968,29 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: ) return self._wrap_reduction_result(axis, result) + def sum( + self, + *, + axis: AxisInt | None = None, + skipna: bool = True, + min_count: int = 0, + **kwargs, + ) -> Scalar: + nv.validate_sum((), kwargs) + result = masked_reductions.sum( + values=self._ndarray, mask=self.isna(), skipna=skipna + ) + return self._wrap_reduction_result(axis, result) + def value_counts(self, dropna: bool = True) -> Series: from pandas.core.algorithms import value_counts_internal as value_counts - result = value_counts(self._ndarray, dropna=dropna).astype("Int64") + result = value_counts(self._ndarray, sort=False, dropna=dropna) result.index = result.index.astype(self.dtype) + + if self.dtype.na_value is libmissing.NA: + result = result.astype("Int64") return result def memory_usage(self, deep: bool = False) -> int: @@ -579,79 +1029,52 @@ def _cmp_method(self, other, op): f"Lengths of operands do not match: {len(self)} != {len(other)}" ) - other = np.asarray(other) + # for array-likes, first filter out NAs before converting to numpy + if not is_array_like(other): + other = np.asarray(other) other = other[valid] if op.__name__ in ops.ARITHMETIC_BINOPS: result = np.empty_like(self._ndarray, dtype="object") - result[mask] = libmissing.NA + result[mask] = self.dtype.na_value result[valid] = op(self._ndarray[valid], other) - return StringArray(result) + return self._from_backing_data(result) else: # logical result = np.zeros(len(self._ndarray), dtype="bool")
result[valid] = op(self._ndarray[valid], other) - return BooleanArray(result, mask) + res_arr = BooleanArray(result, mask) + if self.dtype.na_value is np.nan: + if op == operator.ne: + return res_arr.to_numpy(np.bool_, na_value=True) + else: + return res_arr.to_numpy(np.bool_, na_value=False) + return res_arr _arith_method = _cmp_method - # ------------------------------------------------------------------------ - # String methods interface - # error: Incompatible types in assignment (expression has type "NAType", - # base class "NumpyExtensionArray" defined the type as "float") - _str_na_value = libmissing.NA # type: ignore[assignment] - - def _str_map( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - from pandas.arrays import BooleanArray - - if dtype is None: - dtype = StringDtype(storage="python") - if na_value is None: - na_value = self.dtype.na_value - mask = isna(self) - arr = np.asarray(self) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: type[IntegerArray | BooleanArray] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray +class StringArrayNumpySemantics(StringArray): + _storage = "python" + _na_value = np.nan - na_value_is_na = isna(na_value) - if na_value_is_na: - na_value = 1 - elif dtype == np.dtype("bool"): - na_value = bool(na_value) - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - # error: Argument 1 to "dtype" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected - # "Type[object]" - dtype=np.dtype(dtype), # type: ignore[arg-type] + def _validate(self) -> None: + """Validate that we only store NaN or strings.""" + if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): + raise ValueError( + "StringArrayNumpySemantics requires a sequence of strings or NaN" ) - - if not na_value_is_na: - mask[:] = False - - return constructor(result, mask) - - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, f, mask.view("uint8"), convert=False, na_value=na_value + if self._ndarray.dtype != "object": + raise ValueError( + "StringArrayNumpySemantics requires a sequence of strings or NaN. Got " + f"'{self._ndarray.dtype}' dtype instead." ) - return StringArray(result) - else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. 
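(A minimal sketch of the comparison semantics that `_cmp_method` implements above for the NaN-based dtype, assuming this branch: missing entries fill in as False, or as True for `!=`, and the result is a plain numpy bool array rather than a masked BooleanArray.)

    import numpy as np
    import pandas as pd

    s = pd.Series(["a", None, "b"], dtype=pd.StringDtype("python", na_value=np.nan))

    print((s == "a").to_numpy())  # [ True False False], NaN compares as False
    print((s != "a").to_numpy())  # [False  True  True], but as True for !=
    print((s == "a").dtype)       # bool, not the nullable "boolean" dtype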
- return lib.map_infer_mask(arr, f, mask.view("uint8")) + # TODO validate or force NA/None to NaN + + @classmethod + def _from_sequence( + cls, scalars, *, dtype: Dtype | None = None, copy: bool = False + ) -> Self: + if dtype is None: + dtype = StringDtype(storage="python", na_value=np.nan) + return super()._from_sequence(scalars, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index d5a76811a12e6..c8aea6f6bab5a 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,6 +1,5 @@ from __future__ import annotations -from functools import partial import operator import re from typing import ( @@ -19,15 +18,12 @@ from pandas.compat import ( pa_version_under10p1, pa_version_under13p0, + pa_version_under16p0, ) from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( - is_bool_dtype, - is_integer_dtype, - is_object_dtype, is_scalar, - is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.missing import isna @@ -35,30 +31,27 @@ from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays.arrow import ArrowExtensionArray from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.arrays.floating import Float64Dtype from pandas.core.arrays.integer import Int64Dtype from pandas.core.arrays.numeric import NumericDtype from pandas.core.arrays.string_ import ( BaseStringArray, StringDtype, ) -from pandas.core.ops import invalid_comparison from pandas.core.strings.object_array import ObjectStringArrayMixin if not pa_version_under10p1: import pyarrow as pa import pyarrow.compute as pc - from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning - if TYPE_CHECKING: from collections.abc import Sequence from pandas._typing import ( ArrayLike, - AxisInt, Dtype, - Scalar, + Self, npt, ) @@ -74,6 +67,10 @@ def _chk_pyarrow_available() -> None: raise ImportError(msg) +def _is_string_view(typ): + return not pa_version_under16p0 and pa.types.is_string_view(typ) + + # TODO: Inherit directly from BaseStringArrayMethods. 
Currently we inherit from # ObjectStringArrayMixin because we want to have the object-dtype based methods as # fallback for the ones that pyarrow doesn't yet support @@ -125,21 +122,28 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr # base class "ArrowExtensionArray" defined the type as "ArrowDtype") _dtype: StringDtype # type: ignore[assignment] _storage = "pyarrow" + _na_value: libmissing.NAType | float = libmissing.NA def __init__(self, values) -> None: _chk_pyarrow_available() - if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string( - values.type + if isinstance(values, (pa.Array, pa.ChunkedArray)) and ( + pa.types.is_string(values.type) + or _is_string_view(values.type) + or ( + pa.types.is_dictionary(values.type) + and ( + pa.types.is_string(values.type.value_type) + or pa.types.is_large_string(values.type.value_type) + or _is_string_view(values.type.value_type) + ) + ) ): values = pc.cast(values, pa.large_string()) super().__init__(values) - self._dtype = StringDtype(storage=self._storage) + self._dtype = StringDtype(storage=self._storage, na_value=self._na_value) - if not pa.types.is_large_string(self._pa_array.type) and not ( - pa.types.is_dictionary(self._pa_array.type) - and pa.types.is_large_string(self._pa_array.type.value_type) - ): + if not pa.types.is_large_string(self._pa_array.type): raise ValueError( "ArrowStringArray requires a PyArrow (chunked) array of " "large_string type" @@ -179,10 +183,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage in ( - "pyarrow", - "pyarrow_numpy", - ) + assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow" if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype in ensure_string_array and @@ -190,13 +191,13 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal na_values = scalars._mask result = scalars._data result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - return cls(pa.array(result, mask=na_values, type=pa.string())) + return cls(pa.array(result, mask=na_values, type=pa.large_string())) elif isinstance(scalars, (pa.Array, pa.ChunkedArray)): - return cls(pc.cast(scalars, pa.string())) + return cls(pc.cast(scalars, pa.large_string())) # convert non-na-likes to str result = lib.ensure_string_array(scalars, copy=copy) - return cls(pa.array(result, type=pa.string(), from_pandas=True)) + return cls(pa.array(result, type=pa.large_string(), from_pandas=True)) @classmethod def _from_sequence_of_strings( @@ -212,12 +213,38 @@ def dtype(self) -> StringDtype: # type: ignore[override] return self._dtype def insert(self, loc: int, item) -> ArrowStringArray: + if self.dtype.na_value is np.nan and item is np.nan: + item = libmissing.NA if not isinstance(item, str) and item is not libmissing.NA: - raise TypeError("Scalar must be NA or str") + raise TypeError( + f"Invalid value '{item}' for dtype 'str'. Value should be a " + f"string or missing value, got '{type(item).__name__}' instead." 
+ ) return super().insert(loc, item) - @classmethod - def _result_converter(cls, values, na=None): + def _convert_bool_result(self, values, na=lib.no_default, method_name=None): + if na is not lib.no_default and not isna(na) and not isinstance(na, bool): + # GH#59561 + warnings.warn( + f"Allowing a non-bool 'na' in obj.str.{method_name} is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + na = bool(na) + + if self.dtype.na_value is np.nan: + if na is lib.no_default or isna(na): + # NaN propagates as False + values = values.fill_null(False) + else: + values = values.fill_null(na) + return values.to_numpy() + else: + if na is not lib.no_default and not isna( + na + ): # pyright: ignore [reportGeneralTypeIssues] + values = values.fill_null(na) return BooleanDtype().__from_arrow__(values) def _maybe_convert_setitem_value(self, value): @@ -226,20 +253,26 @@ def _maybe_convert_setitem_value(self, value): if isna(value): value = None elif not isinstance(value, str): - raise TypeError("Scalar must be NA or str") + raise TypeError( + f"Invalid value '{value}' for dtype 'str'. Value should be a " + f"string or missing value, got '{type(value).__name__}' instead." + ) else: value = np.array(value, dtype=object, copy=True) value[isna(value)] = None for v in value: if not (v is None or isinstance(v, str)): - raise TypeError("Scalar must be NA or str") + raise TypeError( + "Invalid value for dtype 'str'. Value should be a " + "string or missing value (or array of those)." + ) return super()._maybe_convert_setitem_value(value) def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: value_set = [ pa_scalar.as_py() for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values] - if pa_scalar.type in (pa.string(), pa.null()) + if pa_scalar.type in (pa.string(), pa.null(), pa.large_string()) ] # short-circuit to return all False array. @@ -282,123 +315,48 @@ def _data(self): # ------------------------------------------------------------------------ # String methods interface - # error: Incompatible types in assignment (expression has type "NAType", - # base class "ObjectStringArrayMixin" defined the type as "float") - _str_na_value = libmissing.NA # type: ignore[assignment] - - def _str_map( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - # TODO: de-duplicate with StringArray method. This method is moreless copy and - # paste. - - from pandas.arrays import ( - BooleanArray, - IntegerArray, - ) - - if dtype is None: - dtype = self.dtype - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: type[IntegerArray | BooleanArray] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray - - na_value_is_na = isna(na_value) - if na_value_is_na: - na_value = 1 - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - # error: Argument 1 to "dtype" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected - # "Type[object]" - dtype=np.dtype(dtype), # type: ignore[arg-type] - ) - - if not na_value_is_na: - mask[:] = False - - return constructor(result, mask) - - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. 
StringDtype - result = lib.map_infer_mask( - arr, f, mask.view("uint8"), convert=False, na_value=na_value - ) - result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True) - return type(self)(result) - else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. - return lib.map_infer_mask(arr, f, mask.view("uint8")) + _str_isalnum = ArrowStringArrayMixin._str_isalnum + _str_isalpha = ArrowStringArrayMixin._str_isalpha + _str_isdecimal = ArrowStringArrayMixin._str_isdecimal + _str_isdigit = ArrowStringArrayMixin._str_isdigit + _str_islower = ArrowStringArrayMixin._str_islower + _str_isnumeric = ArrowStringArrayMixin._str_isnumeric + _str_isspace = ArrowStringArrayMixin._str_isspace + _str_istitle = ArrowStringArrayMixin._str_istitle + _str_isupper = ArrowStringArrayMixin._str_isupper + + _str_map = BaseStringArray._str_map + _str_startswith = ArrowStringArrayMixin._str_startswith + _str_endswith = ArrowStringArrayMixin._str_endswith + _str_pad = ArrowStringArrayMixin._str_pad + _str_match = ArrowStringArrayMixin._str_match + _str_fullmatch = ArrowStringArrayMixin._str_fullmatch + _str_lower = ArrowStringArrayMixin._str_lower + _str_upper = ArrowStringArrayMixin._str_upper + _str_strip = ArrowStringArrayMixin._str_strip + _str_lstrip = ArrowStringArrayMixin._str_lstrip + _str_rstrip = ArrowStringArrayMixin._str_rstrip + _str_removesuffix = ArrowStringArrayMixin._str_removesuffix + _str_get = ArrowStringArrayMixin._str_get + _str_capitalize = ArrowStringArrayMixin._str_capitalize + _str_title = ArrowStringArrayMixin._str_title + _str_swapcase = ArrowStringArrayMixin._str_swapcase + _str_slice_replace = ArrowStringArrayMixin._str_slice_replace + _str_len = ArrowStringArrayMixin._str_len + _str_slice = ArrowStringArrayMixin._str_slice def _str_contains( - self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True + self, + pat, + case: bool = True, + flags: int = 0, + na=lib.no_default, + regex: bool = True, ): if flags: - fallback_performancewarning() return super()._str_contains(pat, case, flags, na, regex) - if regex: - result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case) - else: - result = pc.match_substring(self._pa_array, pat, ignore_case=not case) - result = self._result_converter(result, na=na) - if not isna(na): - result[isna(result)] = bool(na) - return result - - def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): - if isinstance(pat, str): - result = pc.starts_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # mimic existing behaviour of string extension array - # and python string method - result = pa.array( - np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array) - ) - else: - result = pc.starts_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) - return self._result_converter(result) - - def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): - if isinstance(pat, str): - result = pc.ends_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # mimic existing behaviour of string extension array - # and python string method - result = pa.array( - np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array) - ) - else: - 
result = pc.ends_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) - return self._result_converter(result) + return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex) def _str_replace( self, @@ -410,146 +368,38 @@ def _str_replace( regex: bool = True, ): if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: - fallback_performancewarning() return super()._str_replace(pat, repl, n, case, flags, regex) - func = pc.replace_substring_regex if regex else pc.replace_substring - result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n) - return type(self)(result) + return ArrowStringArrayMixin._str_replace( + self, pat, repl, n, case, flags, regex + ) def _str_repeat(self, repeats: int | Sequence[int]): if not isinstance(repeats, int): return super()._str_repeat(repeats) else: - return type(self)(pc.binary_repeat(self._pa_array, repeats)) - - def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.startswith("^"): - pat = f"^{pat}" - return self._str_contains(pat, case, flags, na, regex=True) - - def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.endswith("$") or pat.endswith("//$"): - pat = f"{pat}$" - return self._str_match(pat, case, flags, na) - - def _str_slice( - self, start: int | None = None, stop: int | None = None, step: int | None = None - ): - if stop is None: - return super()._str_slice(start, stop, step) - if start is None: - start = 0 - if step is None: - step = 1 - return type(self)( - pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) - ) - - def _str_isalnum(self): - result = pc.utf8_is_alnum(self._pa_array) - return self._result_converter(result) - - def _str_isalpha(self): - result = pc.utf8_is_alpha(self._pa_array) - return self._result_converter(result) - - def _str_isdecimal(self): - result = pc.utf8_is_decimal(self._pa_array) - return self._result_converter(result) - - def _str_isdigit(self): - result = pc.utf8_is_digit(self._pa_array) - return self._result_converter(result) - - def _str_islower(self): - result = pc.utf8_is_lower(self._pa_array) - return self._result_converter(result) - - def _str_isnumeric(self): - result = pc.utf8_is_numeric(self._pa_array) - return self._result_converter(result) - - def _str_isspace(self): - result = pc.utf8_is_space(self._pa_array) - return self._result_converter(result) - - def _str_istitle(self): - result = pc.utf8_is_title(self._pa_array) - return self._result_converter(result) - - def _str_isupper(self): - result = pc.utf8_is_upper(self._pa_array) - return self._result_converter(result) - - def _str_len(self): - result = pc.utf8_length(self._pa_array) - return self._convert_int_dtype(result) - - def _str_lower(self): - return type(self)(pc.utf8_lower(self._pa_array)) - - def _str_upper(self): - return type(self)(pc.utf8_upper(self._pa_array)) - - def _str_strip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_trim_whitespace(self._pa_array) - else: - result = pc.utf8_trim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_lstrip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_ltrim_whitespace(self._pa_array) - else: - result = pc.utf8_ltrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_rstrip(self, to_strip=None): - if 
to_strip is None: - result = pc.utf8_rtrim_whitespace(self._pa_array) - else: - result = pc.utf8_rtrim(self._pa_array, characters=to_strip) - return type(self)(result) + return ArrowExtensionArray._str_repeat(self, repeats=repeats) def _str_removeprefix(self, prefix: str): if not pa_version_under13p0: - starts_with = pc.starts_with(self._pa_array, pattern=prefix) - removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - result = pc.if_else(starts_with, removed, self._pa_array) - return type(self)(result) + return ArrowStringArrayMixin._str_removeprefix(self, prefix) return super()._str_removeprefix(prefix) - def _str_removesuffix(self, suffix: str): - ends_with = pc.ends_with(self._pa_array, pattern=suffix) - removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) - result = pc.if_else(ends_with, removed, self._pa_array) - return type(self)(result) - def _str_count(self, pat: str, flags: int = 0): if flags: return super()._str_count(pat, flags) result = pc.count_substring_regex(self._pa_array, pat) - return self._convert_int_dtype(result) + return self._convert_int_result(result) def _str_find(self, sub: str, start: int = 0, end: int | None = None): - if start != 0 and end is not None: - slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) - result = pc.find_substring(slices, sub) - not_found = pc.equal(result, -1) - offset_result = pc.add(result, end - start) - result = pc.if_else(not_found, result, offset_result) - elif start == 0 and end is None: - slices = self._pa_array - result = pc.find_substring(slices, sub) - else: + if ( + pa_version_under13p0 + and not (start != 0 and end is not None) + and not (start == 0 and end is None) + ): + # GH#59562 return super()._str_find(sub, start, end) - return self._convert_int_dtype(result) + return ArrowStringArrayMixin._str_find(self, sub, start, end) def _str_get_dummies(self, sep: str = "|"): dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep) @@ -558,158 +408,78 @@ def _str_get_dummies(self, sep: str = "|"): dummies = np.vstack(dummies_pa.to_numpy()) return dummies.astype(np.int64, copy=False), labels - def _convert_int_dtype(self, result): + def _convert_int_result(self, result): + if self.dtype.na_value is np.nan: + if isinstance(result, pa.Array): + result = result.to_numpy(zero_copy_only=False) + else: + result = result.to_numpy() + if result.dtype == np.int32: + result = result.astype(np.int64) + return result + return Int64Dtype().__from_arrow__(result) + def _convert_rank_result(self, result): + if self.dtype.na_value is np.nan: + if isinstance(result, pa.Array): + result = result.to_numpy(zero_copy_only=False) + else: + result = result.to_numpy() + return result.astype("float64", copy=False) + + return Float64Dtype().__from_arrow__(result) + def _reduce( self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs ): - result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) + if self.dtype.na_value is np.nan and name in ["any", "all"]: + if not skipna: + nas = pc.is_null(self._pa_array) + arr = pc.or_kleene(nas, pc.not_equal(self._pa_array, "")) + else: + arr = pc.not_equal(self._pa_array, "") + result = ArrowExtensionArray(arr)._reduce( + name, skipna=skipna, keepdims=keepdims, **kwargs + ) + if keepdims: + # ArrowExtensionArray will return a length-1 bool[pyarrow] array + return result.astype(np.bool_) + return result + + if name in ("min", "max", "sum", "argmin", "argmax"): + result = self._reduce_calc(name, skipna=skipna, 
keepdims=keepdims, **kwargs) + else: + raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + if name in ("argmin", "argmax") and isinstance(result, pa.Array): - return self._convert_int_dtype(result) + return self._convert_int_result(result) elif isinstance(result, pa.Array): return type(self)(result) else: return result - def _rank( - self, - *, - axis: AxisInt = 0, - method: str = "average", - na_option: str = "keep", - ascending: bool = True, - pct: bool = False, - ): - """ - See Series.rank.__doc__. - """ - return self._convert_int_dtype( - self._rank_calc( - axis=axis, - method=method, - na_option=na_option, - ascending=ascending, - pct=pct, - ) - ) - - -class ArrowStringArrayNumpySemantics(ArrowStringArray): - _storage = "pyarrow_numpy" - - @classmethod - def _result_converter(cls, values, na=None): - if not isna(na): - values = values.fill_null(bool(na)) - return ArrowExtensionArray(values).to_numpy(na_value=np.nan) - - def __getattribute__(self, item): - # ArrowStringArray and we both inherit from ArrowExtensionArray, which - # creates inheritance problems (Diamond inheritance) - if item in ArrowStringArrayMixin.__dict__ and item not in ( - "_pa_array", - "__dict__", - ): - return partial(getattr(ArrowStringArrayMixin, item), self) - return super().__getattribute__(item) - - def _str_map( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - if dtype is None: - dtype = self.dtype - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - if is_integer_dtype(dtype): - na_value = np.nan - else: - na_value = False - try: - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(dtype), # type: ignore[arg-type] - ) - return result - - except ValueError: - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - ) - if convert and result.dtype == object: - result = lib.maybe_convert_objects(result) - return result - - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, f, mask.view("uint8"), convert=False, na_value=na_value + def value_counts(self, dropna: bool = True) -> Series: + result = super().value_counts(dropna=dropna) + if self.dtype.na_value is np.nan: + res_values = result._values.to_numpy() + return result._constructor( + res_values, index=result.index, name=result.name, copy=False ) - result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True) - return type(self)(result) - else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. 
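(A short sketch of the `any`/`all` reduction added in the `_reduce` hunk above for the NaN-based pyarrow dtype; this assumes this branch plus an installed pyarrow. Truthiness follows Python's rule that only the empty string is falsy, and `pc.or_kleene` keeps nulls in play when `skipna=False`.)

    import numpy as np
    import pandas as pd

    s = pd.Series(["", "x", None], dtype=pd.StringDtype("pyarrow", na_value=np.nan))

    print(s.any())  # True, "x" is truthy
    print(s.all())  # False, "" is falsy (the None is skipped by default)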
- return lib.map_infer_mask(arr, f, mask.view("uint8")) - - def _convert_int_dtype(self, result): - if isinstance(result, pa.Array): - result = result.to_numpy(zero_copy_only=False) - else: - result = result.to_numpy() - if result.dtype == np.int32: - result = result.astype(np.int64) return result def _cmp_method(self, other, op): - try: - result = super()._cmp_method(other, op) - except pa.ArrowNotImplementedError: - return invalid_comparison(self, other, op) - if op == operator.ne: - return result.to_numpy(np.bool_, na_value=True) - else: - return result.to_numpy(np.bool_, na_value=False) - - def value_counts(self, dropna: bool = True) -> Series: - from pandas import Series + result = super()._cmp_method(other, op) + if self.dtype.na_value is np.nan: + if op == operator.ne: + return result.to_numpy(np.bool_, na_value=True) + else: + return result.to_numpy(np.bool_, na_value=False) + return result - result = super().value_counts(dropna) - return Series( - result._values.to_numpy(), index=result.index, name=result.name, copy=False - ) + def __pos__(self) -> Self: + raise TypeError(f"bad operand type for unary +: '{self.dtype}'") - def _reduce( - self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs - ): - if name in ["any", "all"]: - if not skipna and name == "all": - nas = pc.invert(pc.is_null(self._pa_array)) - arr = pc.and_kleene(nas, pc.not_equal(self._pa_array, "")) - else: - arr = pc.not_equal(self._pa_array, "") - return ArrowExtensionArray(arr)._reduce( - name, skipna=skipna, keepdims=keepdims, **kwargs - ) - else: - return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) - def insert(self, loc: int, item) -> ArrowStringArrayNumpySemantics: - if item is np.nan: - item = libmissing.NA - return super().insert(loc, item) # type: ignore[return-value] +class ArrowStringArrayNumpySemantics(ArrowStringArray): + _na_value = np.nan diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 1b885a2bdcd47..d4caec4bfd58a 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -468,6 +468,10 @@ def __mul__(self, other) -> Self: if is_scalar(other): # numpy will accept float and int, raise TypeError for others result = self._ndarray * other + if result.dtype.kind != "m": + # numpy >= 2.1 may not raise a TypeError + # and seems to dispatch to others.__rmul__? + raise TypeError(f"Cannot multiply with {type(other).__name__}") freq = None if self.freq is not None and not isna(other): freq = self.freq * other @@ -495,6 +499,10 @@ def __mul__(self, other) -> Self: # numpy will accept float or int dtype, raise TypeError for others result = self._ndarray * other + if result.dtype.kind != "m": + # numpy >= 2.1 may not raise a TypeError + # and seems to dispatch to others.__rmul__? + raise TypeError(f"Cannot multiply with {type(other).__name__}") return type(self)._simple_new(result, dtype=result.dtype) __rmul__ = __mul__ @@ -1072,7 +1080,10 @@ def sequence_to_td64ns( # This includes datetime64-dtype, see GH#23539, GH#29794 raise TypeError(f"dtype {data.dtype} cannot be converted to timedelta64[ns]") - data = np.array(data, copy=copy) + if not copy: + data = np.asarray(data) + else: + data = np.array(data, copy=copy) assert data.dtype.kind == "m" assert data.dtype != "m8" # i.e. not unit-less @@ -1150,7 +1161,7 @@ def _objects_to_td64ns(data, unit=None, errors: DateTimeErrorChoices = "raise"): higher level. 
""" # coerce Index to np.ndarray, converting string-dtype if necessary - values = np.array(data, dtype=np.object_, copy=False) + values = np.asarray(data, dtype=np.object_) result = array_to_timedelta64(values, unit=unit, errors=errors) return result.view("timedelta64[ns]") diff --git a/pandas/core/base.py b/pandas/core/base.py index e98f1157572bb..af8f80db6a347 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -48,6 +48,7 @@ from pandas.core.dtypes.generic import ( ABCDataFrame, ABCIndex, + ABCMultiIndex, ABCSeries, ) from pandas.core.dtypes.missing import ( @@ -360,8 +361,11 @@ def __len__(self) -> int: # We need this defined here for mypy raise AbstractMethodError(self) + # Temporarily avoid using `-> Literal[1]:` because of an IPython (jedi) bug + # https://github.com/ipython/ipython/issues/14412 + # https://github.com/davidhalter/jedi/issues/1990 @property - def ndim(self) -> Literal[1]: + def ndim(self) -> int: """ Number of dimensions of the underlying data, by definition 1. @@ -1198,13 +1202,18 @@ def factorize( if uniques.dtype == np.float16: uniques = uniques.astype(np.float32) - if isinstance(self, ABCIndex): - # preserve e.g. MultiIndex + if isinstance(self, ABCMultiIndex): + # preserve MultiIndex uniques = self._constructor(uniques) else: from pandas import Index - uniques = Index(uniques) + try: + uniques = Index(uniques, dtype=self.dtype) + except NotImplementedError: + # not all dtypes are supported in Index that are allowed for Series + # e.g. float16 or bytes + uniques = Index(uniques) return codes, uniques _shared_docs[ diff --git a/pandas/core/common.py b/pandas/core/common.py index 7d864e02be54e..9f024498d66ed 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -233,6 +233,8 @@ def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLi values = list(values) elif isinstance(values, ABCIndex): return values._values + elif isinstance(values, ABCSeries): + return values._values if isinstance(values, list) and dtype in [np.object_, object]: return construct_1d_object_array_from_listlike(values) diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index f1fe528de06f8..7bb623cba3755 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -10,7 +10,10 @@ from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg -from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.core.dtypes.common import ( + is_extension_array_dtype, + is_string_dtype, +) from pandas.core.computation.engines import ENGINES from pandas.core.computation.expr import ( @@ -336,10 +339,13 @@ def eval( parsed_expr = Expr(expr, engine=engine, parser=parser, env=env) if engine == "numexpr" and ( - is_extension_array_dtype(parsed_expr.terms.return_type) + ( + is_extension_array_dtype(parsed_expr.terms.return_type) + and not is_string_dtype(parsed_expr.terms.return_type) + ) or getattr(parsed_expr.terms, "operand_types", None) is not None and any( - is_extension_array_dtype(elem) + (is_extension_array_dtype(elem) and not is_string_dtype(elem)) for elem in parsed_expr.terms.operand_types ) ): diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 4770f403b1bdb..34055d2177626 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -20,6 +20,8 @@ from pandas.errors import UndefinedVariableError +from pandas.core.dtypes.common import is_string_dtype + import pandas.core.common as com from 
pandas.core.computation.ops import ( ARITH_OPS_SYMS, @@ -31,7 +33,6 @@ UNARY_OPS_SYMS, BinOp, Constant, - Div, FuncNode, Op, Term, @@ -370,7 +371,7 @@ class BaseExprVisitor(ast.NodeVisitor): "Add", "Sub", "Mult", - None, + "Div", "Pow", "FloorDiv", "Mod", @@ -521,10 +522,12 @@ def _maybe_evaluate_binop( elif self.engine != "pytables": if ( getattr(lhs, "return_type", None) == object + or is_string_dtype(getattr(lhs, "return_type", None)) or getattr(rhs, "return_type", None) == object + or is_string_dtype(getattr(rhs, "return_type", None)) ): # evaluate "==" and "!=" in python if either of our operands - # has an object return type + # has an object or string return type return self._maybe_eval(res, eval_in_python + maybe_eval_in_python) return res @@ -533,9 +536,6 @@ def visit_BinOp(self, node, **kwargs): left, right = self._maybe_downcast_constants(left, right) return self._maybe_evaluate_binop(op, op_class, left, right) - def visit_Div(self, node, **kwargs): - return lambda lhs, rhs: Div(lhs, rhs) - def visit_UnaryOp(self, node, **kwargs): op = self.visit(node.op) operand = self.visit(node.operand) @@ -695,8 +695,8 @@ def visit_Call(self, node, side=None, **kwargs): if not isinstance(key, ast.keyword): # error: "expr" has no attribute "id" raise ValueError( - "keyword error in function call " # type: ignore[attr-defined] - f"'{node.func.id}'" + "keyword error in function call " + f"'{node.func.id}'" # type: ignore[attr-defined] ) if key.arg: diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index 95ac20ba39edc..d8265456dfced 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -332,31 +332,6 @@ def _not_in(x, y): _binary_ops_dict.update(d) -def _cast_inplace(terms, acceptable_dtypes, dtype) -> None: - """ - Cast an expression inplace. - - Parameters - ---------- - terms : Op - The expression that should cast. - acceptable_dtypes : list of acceptable numpy.dtype - Will not cast if term's dtype in this list. - dtype : str or numpy.dtype - The dtype to cast to. - """ - dt = np.dtype(dtype) - for term in terms: - if term.type in acceptable_dtypes: - continue - - try: - new_value = term.value.astype(dt) - except AttributeError: - new_value = dt.type(term.value) - term.update(new_value) - - def is_term(obj) -> bool: return isinstance(obj, Term) @@ -517,30 +492,6 @@ def isnumeric(dtype) -> bool: return issubclass(np.dtype(dtype).type, np.number) -class Div(BinOp): - """ - Div operator to special case casting. - - Parameters - ---------- - lhs, rhs : Term or Op - The Terms or Ops in the ``/`` expression. 
- """ - - def __init__(self, lhs, rhs) -> None: - super().__init__("/", lhs, rhs) - - if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type): - raise TypeError( - f"unsupported operand type(s) for {self.op}: " - f"'{lhs.return_type}' and '{rhs.return_type}'" - ) - - # do not upcast float32s to float64 un-necessarily - acceptable_dtypes = [np.float32, np.float64] - _cast_inplace(com.flatten(self), acceptable_dtypes, np.float64) - - UNARY_OPS_SYMS = ("+", "-", "~", "not") _unary_ops_funcs = (operator.pos, operator.neg, operator.invert, operator.invert) _unary_ops_dict = dict(zip(UNARY_OPS_SYMS, _unary_ops_funcs)) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index a8b63f97141c2..a1df455eebacf 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -12,7 +12,10 @@ from __future__ import annotations import os -from typing import Callable +from typing import ( + Any, + Callable, +) import pandas._config.config as cf from pandas._config.config import ( @@ -502,16 +505,30 @@ def use_inf_as_na_cb(key) -> None: string_storage_doc = """ : string - The default storage for StringDtype. This option is ignored if - ``future.infer_string`` is set to True. + The default storage for StringDtype. """ + +def is_valid_string_storage(value: Any) -> None: + legal_values = ["auto", "python", "pyarrow"] + if value not in legal_values: + msg = "Value must be one of python|pyarrow" + if value == "pyarrow_numpy": + # TODO: we can remove extra message after 3.0 + msg += ( + ". 'pyarrow_numpy' was specified, but this option should be " + "enabled using pandas.options.future.infer_string instead" + ) + raise ValueError(msg) + + with cf.config_prefix("mode"): cf.register_option( "string_storage", - "python", + "auto", string_storage_doc, - validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]), + # validator=is_one_of_factory(["python", "pyarrow"]), + validator=is_valid_string_storage, ) @@ -905,7 +922,7 @@ def register_converter_cb(key) -> None: with cf.config_prefix("future"): cf.register_option( "infer_string", - False, + True if os.environ.get("PANDAS_FUTURE_INFER_STRING", "0") == "1" else False, "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", diff --git a/pandas/core/construction.py b/pandas/core/construction.py index d41a9c80a10ec..59e87f28a3dce 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -19,7 +19,7 @@ import numpy as np from numpy import ma -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas._libs.tslibs import ( @@ -566,14 +566,10 @@ def sanitize_array( if not is_list_like(data): if index is None: raise ValueError("index must be specified when data is not list-like") - if ( - isinstance(data, str) - and using_pyarrow_string_dtype() - and original_dtype is None - ): + if isinstance(data, str) and using_string_dtype() and original_dtype is None: from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype("pyarrow_numpy") + dtype = StringDtype(na_value=np.nan) data = construct_1d_arraylike_from_scalar(data, len(index), dtype) return data @@ -593,6 +589,8 @@ def sanitize_array( # create an extension array from its dtype _sanitize_non_ordered(data) cls = dtype.construct_array_type() + if not hasattr(data, "__array__"): + data = list(data) subarr = cls._from_sequence(data, dtype=dtype, copy=copy) # 
GH#846 @@ -604,20 +602,19 @@ def sanitize_array( subarr = data if data.dtype == object: subarr = maybe_infer_to_datetimelike(data) - if ( - object_index - and using_pyarrow_string_dtype() - and is_string_dtype(subarr) - ): + if object_index and using_string_dtype() and is_string_dtype(subarr): # Avoid inference when string option is set subarr = data - elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): + elif data.dtype.kind == "U" and using_string_dtype(): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(na_value=np.nan) subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) - if subarr is data and copy: + if ( + subarr is data + or (subarr.dtype == "str" and subarr.dtype.storage == "python") # type: ignore[union-attr] + ) and copy: subarr = subarr.copy() else: @@ -626,7 +623,10 @@ def sanitize_array( elif hasattr(data, "__array__"): # e.g. dask array GH#38645 - data = np.array(data, copy=copy) + if not copy: + data = np.asarray(data) + else: + data = np.array(data, copy=copy) return sanitize_array( data, index=index, @@ -744,8 +744,11 @@ def _sanitize_str_dtypes( # GH#19853: If data is a scalar, result has already the result if not lib.is_scalar(data): if not np.all(isna(data)): - data = np.array(data, dtype=dtype, copy=False) - result = np.array(data, dtype=object, copy=copy) + data = np.asarray(data, dtype=dtype) + if not copy: + result = np.asarray(data, dtype=object) + else: + result = np.array(data, dtype=object, copy=copy) return result @@ -810,6 +813,8 @@ def _try_cast( # this will raise if we have e.g. floats subarr = maybe_cast_to_integer_array(arr, dtype) + elif not copy: + subarr = np.asarray(arr, dtype=dtype) else: subarr = np.array(arr, dtype=dtype, copy=copy) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7a088bf84c48e..d4263f7488a14 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -18,7 +18,7 @@ import numpy as np -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import ( Interval, @@ -39,7 +39,6 @@ is_supported_dtype, ) from pandas._libs.tslibs.timedeltas import array_to_timedelta64 -from pandas.compat.numpy import np_version_gt2 from pandas.errors import ( IntCastingNaNError, LossySetitemError, @@ -88,8 +87,8 @@ if TYPE_CHECKING: from collections.abc import ( + Collection, Sequence, - Sized, ) from pandas._typing import ( @@ -799,10 +798,10 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: # coming out as np.str_! 
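(For orientation, not part of the patch: the `using_string_dtype()` branches in `sanitize_array` above are what make plain Python strings infer as the new NaN-backed `StringDtype` once the future option is enabled. A rough usage sketch, assuming this branch; the exact dtype repr may still differ at this stage of the series.)

    import pandas as pd

    pd.set_option("future.infer_string", True)

    s = pd.Series(["a", "b"])
    print(s.dtype)           # str (StringDtype with na_value=NaN)
    print(s.dtype.na_value)  # nan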
dtype = _dtype_obj - if using_pyarrow_string_dtype(): + if using_string_dtype(): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(na_value=np.nan) elif isinstance(val, (np.datetime64, dt.datetime)): try: @@ -1026,6 +1025,8 @@ def convert_dtypes( ------- np.dtype, or ExtensionDtype """ + from pandas.core.arrays.string_ import StringDtype + inferred_dtype: str | DtypeObj if ( @@ -1104,12 +1105,18 @@ def convert_dtypes( # If we couldn't do anything else, then we retain the dtype inferred_dtype = input_array.dtype + elif ( + convert_string + and isinstance(input_array.dtype, StringDtype) + and input_array.dtype.na_value is np.nan + ): + inferred_dtype = pandas_dtype_func("string") + else: inferred_dtype = input_array.dtype if dtype_backend == "pyarrow": from pandas.core.arrays.arrow.array import to_pyarrow_type - from pandas.core.arrays.string_ import StringDtype assert not isinstance(inferred_dtype, str) @@ -1156,6 +1163,7 @@ def convert_dtypes( def maybe_infer_to_datetimelike( value: npt.NDArray[np.object_], + convert_to_nullable_dtype: bool = False, ) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray | IntervalArray: """ we might have a array (or single object) that is datetime like, @@ -1193,6 +1201,7 @@ def maybe_infer_to_datetimelike( # numpy would have done it for us. convert_numeric=False, convert_non_numeric=True, + convert_to_nullable_dtype=convert_to_nullable_dtype, dtype_if_all_nat=np.dtype("M8[ns]"), ) @@ -1332,7 +1341,7 @@ def find_result_type(left_dtype: DtypeObj, right: Any) -> DtypeObj: right = left_dtype elif ( not np.issubdtype(left_dtype, np.unsignedinteger) - and 0 < right <= 2 ** (8 * right_dtype.itemsize - 1) - 1 + and 0 < right <= np.iinfo(right_dtype).max ): # If left dtype isn't unsigned, check if it fits in the signed dtype right = np.dtype(f"i{right_dtype.itemsize}") @@ -1501,7 +1510,10 @@ def construct_2d_arraylike_from_scalar( # Attempt to coerce to a numpy array try: - arr = np.array(value, dtype=dtype, copy=copy) + if not copy: + arr = np.asarray(value, dtype=dtype) + else: + arr = np.array(value, dtype=dtype, copy=copy) except (ValueError, TypeError) as err: raise TypeError( f"DataFrame constructor called with incompatible data and dtype: {err}" @@ -1574,7 +1586,7 @@ def _maybe_box_and_unbox_datetimelike(value: Scalar, dtype: DtypeObj): return _maybe_unbox_datetimelike(value, dtype) -def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: +def construct_1d_object_array_from_listlike(values: Collection) -> np.ndarray: """ Transform any list-like object in a 1-dimensional numpy array of object dtype. @@ -1592,10 +1604,11 @@ def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: ------- 1-dimensional numpy array of dtype object """ - # numpy will try to interpret nested lists as further dimensions, hence - # making a 1D array that contains list-likes is a bit tricky: + # numpy will try to interpret nested lists as further dimensions in np.array(), + # hence explicitly making a 1D array using np.fromiter result = np.empty(len(values), dtype="object") - result[:] = values + for i, obj in enumerate(values): + result[i] = obj return result @@ -1644,14 +1657,12 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n with warnings.catch_warnings(): # We already disallow dtype=uint w/ negative numbers # (test_constructor_coercion_signed_to_unsigned) so safe to ignore. 
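(Background on the recurring `np.asarray` versus `np.array(..., copy=copy)` rewrite in this file and several earlier ones: NumPy 2.0 changed `copy=False` to mean "never copy", raising when a copy is unavoidable, while `np.asarray` keeps the old copy-only-if-needed behavior. A standalone demonstration, requiring NumPy >= 2.0:)

    import numpy as np

    data = [1, 2, 3]                # converting a list always needs a copy
    arr = np.asarray(data)          # fine, copies as needed
    try:
        np.array(data, copy=False)  # NumPy >= 2.0 raises here
    except ValueError as exc:
        print(exc)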
-                if not np_version_gt2:
-                    warnings.filterwarnings(
-                        "ignore",
-                        "NumPy will stop allowing conversion of "
-                        "out-of-bound Python int",
-                        DeprecationWarning,
-                    )
-                casted = np.array(arr, dtype=dtype, copy=False)
+                warnings.filterwarnings(
+                    "ignore",
+                    "NumPy will stop allowing conversion of out-of-bound Python int",
+                    DeprecationWarning,
+                )
+                casted = np.asarray(arr, dtype=dtype)
         else:
             with warnings.catch_warnings():
                 warnings.filterwarnings("ignore", category=RuntimeWarning)
@@ -1682,6 +1693,7 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n
     arr = np.asarray(arr)

     if np.issubdtype(arr.dtype, str):
+        # TODO(numpy-2.0 min): This case will raise an OverflowError above
         if (casted.astype(str) == arr).all():
             return casted
         raise ValueError(f"string values cannot be losslessly cast to {dtype}")
@@ -1745,6 +1757,13 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool:
         except (ValueError, TypeError):
             return False

+    if dtype == "string":
+        try:
+            arr._maybe_convert_setitem_value(element)  # type: ignore[union-attr]
+            return True
+        except (ValueError, TypeError):
+            return False
+
     # This is technically incorrect, but maintains the behavior of
     # ExtensionBlock._can_hold_element
     return True
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index 2245359fd8eac..6dea15ac0bc24 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -12,6 +12,8 @@

 import numpy as np

+from pandas._config import using_string_dtype
+
 from pandas._libs import (
     Interval,
     Period,
@@ -169,6 +171,9 @@ def is_sparse(arr) -> bool:
     """
     Check whether an array-like is a 1-D pandas sparse array.

+    .. deprecated:: 2.1.0
+        Use isinstance(dtype, pd.SparseDtype) instead.
+
     Check that the one-dimensional array-like is a pandas sparse array.
     Returns True if it is a pandas sparse array, not another type of sparse
     array.
@@ -295,6 +300,9 @@ def is_datetime64tz_dtype(arr_or_dtype) -> bool:
     """
     Check whether an array-like or dtype is of a DatetimeTZDtype dtype.

+    .. deprecated:: 2.1.0
+        Use isinstance(dtype, pd.DatetimeTZDtype) instead.
+
     Parameters
     ----------
     arr_or_dtype : array-like or dtype
@@ -381,6 +389,9 @@ def is_period_dtype(arr_or_dtype) -> bool:
     """
     Check whether an array-like or dtype is of the Period dtype.

+    .. deprecated:: 2.2.0
+        Use isinstance(dtype, pd.PeriodDtype) instead.
+
     Parameters
     ----------
     arr_or_dtype : array-like or dtype
@@ -424,6 +435,9 @@ def is_interval_dtype(arr_or_dtype) -> bool:
     """
     Check whether an array-like or dtype is of the Interval dtype.

+    .. deprecated:: 2.2.0
+        Use isinstance(dtype, pd.IntervalDtype) instead.
+
     Parameters
     ----------
     arr_or_dtype : array-like or dtype
@@ -470,6 +484,9 @@ def is_categorical_dtype(arr_or_dtype) -> bool:
     """
     Check whether an array-like or dtype is of the Categorical dtype.

+    .. deprecated:: 2.2.0
+        Use isinstance(dtype, pd.CategoricalDtype) instead.
+
     Parameters
     ----------
     arr_or_dtype : array-like or dtype
@@ -1310,7 +1327,15 @@ def is_extension_array_dtype(arr_or_dtype) -> bool:
     elif isinstance(dtype, np.dtype):
         return False
     else:
-        return registry.find(dtype) is not None
+        try:
+            with warnings.catch_warnings():
+                # pandas_dtype(..) can raise UserWarning for class input
+                warnings.simplefilter("ignore", UserWarning)
+                dtype = pandas_dtype(dtype)
+        except (TypeError, ValueError):
+            # np.dtype(..)
can raise ValueError + return False + return isinstance(dtype, ExtensionDtype) def is_ea_or_datetimelike_dtype(dtype: DtypeObj | None) -> bool: @@ -1605,6 +1630,12 @@ def pandas_dtype(dtype) -> DtypeObj: elif isinstance(dtype, (np.dtype, ExtensionDtype)): return dtype + # builtin aliases + if dtype is str and using_string_dtype(): + from pandas.core.arrays.string_ import StringDtype + + return StringDtype(na_value=np.nan) + # registered extension types result = registry.find(dtype) if result is not None: @@ -1623,6 +1654,8 @@ def pandas_dtype(dtype) -> DtypeObj: # raise a consistent TypeError if failed try: with warnings.catch_warnings(): + # TODO: warnings.catch_warnings can be removed when numpy>2.3.0 + # is the minimum version # GH#51523 - Series.astype(np.integer) doesn't show # numpy deprecation warning of np.integer # Hence enabling DeprecationWarning diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index ed5256922377a..542bc85110cad 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -453,7 +453,7 @@ def __eq__(self, other: object) -> bool: # Because left and right have the same length and are unique, # `indexer` not having any -1s implies that there is a # bijection between `left` and `right`. - return (indexer != -1).all() + return bool((indexer != -1).all()) # With object-dtype we need a comparison that identifies # e.g. int(2) as distinct from float(2) @@ -919,7 +919,7 @@ def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> DatetimeArray: else: np_arr = array.to_numpy() - return DatetimeArray._from_sequence(np_arr, dtype=self, copy=False) + return DatetimeArray._simple_new(np_arr, dtype=self) def __setstate__(self, state) -> None: # for pickle compat. __get_state__ is defined in the @@ -1791,7 +1791,7 @@ def _is_na_fill_value(self) -> bool: @property def _is_numeric(self) -> bool: - return not self.subtype == object + return self.subtype != object @property def _is_boolean(self) -> bool: @@ -2190,7 +2190,9 @@ def numpy_dtype(self) -> np.dtype: # This can be removed if/when pyarrow addresses it: # https://github.com/apache/arrow/issues/34462 return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]") - if pa.types.is_string(self.pyarrow_dtype): + if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string( + self.pyarrow_dtype + ): # pa.string().to_pandas_dtype() = object which we don't want return np.dtype(str) try: @@ -2240,7 +2242,7 @@ def construct_from_string(cls, string: str) -> ArrowDtype: ) if not string.endswith("[pyarrow]"): raise TypeError(f"'{string}' must end with '[pyarrow]'") - if string == "string[pyarrow]": + if string in ("string[pyarrow]", "str[pyarrow]"): # Ensure Registry.find skips ArrowDtype to use StringDtype instead raise TypeError("string[pyarrow] should be constructed by StringDtype") diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 4dc0d477f89e8..c341ff9dff7e6 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -632,7 +632,7 @@ def infer_fill_value(val): """ if not is_list_like(val): val = [val] - val = np.array(val, copy=False) + val = np.asarray(val) if val.dtype.kind in "mM": return np.array("NaT", dtype=val.dtype) elif val.dtype == object: @@ -647,6 +647,20 @@ def infer_fill_value(val): return np.nan +def construct_1d_array_from_inferred_fill_value( + value: object, length: int +) -> ArrayLike: + # Find our empty_value dtype by constructing an array + # from our value and doing a .take on it + from 
pandas.core.algorithms import take_nd
+    from pandas.core.construction import sanitize_array
+    from pandas.core.indexes.base import Index
+
+    arr = sanitize_array(value, Index(range(1)), copy=False)
+    taker = -1 * np.ones(length, dtype=np.intp)
+    return take_nd(arr, taker)
+
+
 def maybe_fill(arr: np.ndarray) -> np.ndarray:
     """
     Fill numpy.ndarray with NaN, unless we have a integer or boolean dtype.
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 3e2e589440bd9..ef48090f02c3f 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -656,26 +656,37 @@ class DataFrame(NDFrame, OpsMixin):
     def _constructor(self) -> Callable[..., DataFrame]:
         return DataFrame

-    def _constructor_from_mgr(self, mgr, axes):
-        if self._constructor is DataFrame:
-            # we are pandas.DataFrame (or a subclass that doesn't override _constructor)
-            return DataFrame._from_mgr(mgr, axes=axes)
-        else:
-            assert axes is mgr.axes
+    def _constructor_from_mgr(self, mgr, axes) -> DataFrame:
+        df = DataFrame._from_mgr(mgr, axes=axes)
+
+        if type(self) is DataFrame:
+            # This would also work `if self._constructor is DataFrame`, but
+            # this check is slightly faster, benefiting the most-common case.
+            return df
+
+        elif type(self).__name__ == "GeoDataFrame":
+            # Shim until geopandas can override their _constructor_from_mgr
+            # because they have different behavior for Managers than for DataFrames
             return self._constructor(mgr)

+        # We assume that the subclass __init__ knows how to handle a
+        # pd.DataFrame object.
+        return self._constructor(df)
+
     _constructor_sliced: Callable[..., Series] = Series

-    def _sliced_from_mgr(self, mgr, axes) -> Series:
-        return Series._from_mgr(mgr, axes)
+    def _constructor_sliced_from_mgr(self, mgr, axes) -> Series:
+        ser = Series._from_mgr(mgr, axes)
+        ser._name = None  # caller is responsible for setting real name

-    def _constructor_sliced_from_mgr(self, mgr, axes):
-        if self._constructor_sliced is Series:
-            ser = self._sliced_from_mgr(mgr, axes)
-            ser._name = None  # caller is responsible for setting real name
+        if type(self) is DataFrame:
+            # This would also work `if self._constructor_sliced is Series`, but
+            # this check is slightly faster, benefiting the most-common case.
             return ser
-        assert axes is mgr.axes
-        return self._constructor_sliced(mgr)
+
+        # We assume that the subclass __init__ knows how to handle a
+        # pd.Series object.
+        return self._constructor_sliced(ser)

     # ----------------------------------------------------------------------
     # Constructors
@@ -987,6 +998,33 @@ def __dataframe_consortium_standard__(
         )
         return convert_to_standard_compliant_dataframe(self, api_version=api_version)

+    def __arrow_c_stream__(self, requested_schema=None):
+        """
+        Export the pandas DataFrame as an Arrow C stream PyCapsule.
+
+        This relies on pyarrow to convert the pandas DataFrame to the Arrow
+        format (and follows the default behaviour of ``pyarrow.Table.from_pandas``
+        in its handling of the index, i.e. store the index as a column except
+        for RangeIndex).
+        This conversion is not necessarily zero-copy.
+
+        Parameters
+        ----------
+        requested_schema : PyCapsule, default None
+            The schema to which the dataframe should be cast, passed as a
+            PyCapsule containing a C ArrowSchema representation of the
+            requested schema.
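+            Such a capsule can be obtained, for example, from any object
+            implementing the Arrow PyCapsule protocol via its
+            ``__arrow_c_schema__`` method.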
+ + Returns + ------- + PyCapsule + """ + pa = import_optional_dependency("pyarrow", min_version="14.0.0") + if requested_schema is not None: + requested_schema = pa.Schema._import_from_c_capsule(requested_schema) + table = pa.Table.from_pandas(self, schema=requested_schema) + return table.__arrow_c_stream__() + # ---------------------------------------------------------------------- @property @@ -1376,7 +1414,8 @@ def _get_values_for_csv( na_rep=na_rep, quoting=quoting, ) - return self._constructor_from_mgr(mgr, axes=mgr.axes) + # error: Incompatible return value type (got "DataFrame", expected "Self") + return self._constructor_from_mgr(mgr, axes=mgr.axes) # type: ignore[return-value] # ---------------------------------------------------------------------- @@ -1400,6 +1439,11 @@ def style(self) -> Styler: Please see `Table Visualization <../../user_guide/style.ipynb>`_ for more examples. """ + # Raise AttributeError so that inspect works even if jinja2 is not installed. + has_jinja2 = import_optional_dependency("jinja2", errors="ignore") + if not has_jinja2: + raise AttributeError("The '.style' accessor requires jinja2") + from pandas.io.formats.style import Styler return Styler(self) @@ -1953,7 +1997,7 @@ def to_numpy( dtype = np.dtype(dtype) result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value) if result.dtype is not dtype: - result = np.array(result, dtype=dtype, copy=False) + result = np.asarray(result, dtype=dtype) return result @@ -4016,7 +4060,9 @@ def _getitem_nocopy(self, key: list): copy=False, only_slice=True, ) - return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + result = result.__finalize__(self) + return result def __getitem__(self, key): check_dict_or_set_indexers(key) @@ -4938,7 +4984,9 @@ def select_dtypes(self, include=None, exclude=None) -> Self: ----- * To select all *numeric* types, use ``np.number`` or ``'number'`` * To select strings you must use the ``object`` dtype, but note that - this will return *all* object dtype columns + this will return *all* object dtype columns. With + ``pd.options.future.infer_string`` enabled, using ``"str"`` will + work to select all string columns. * See the `numpy dtype hierarchy `__ * To select datetimes, use ``np.datetime64``, ``'datetime'`` or @@ -5048,7 +5096,8 @@ def predicate(arr: ArrayLike) -> bool: return True mgr = self._mgr._get_data_subset(predicate).copy(deep=None) - return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self) + # error: Incompatible return value type (got "DataFrame", expected "Self") + return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self) # type: ignore[return-value] def insert( self, @@ -5830,6 +5879,9 @@ def shift( ) fill_value = lib.no_default + if self.empty: + return self.copy() + axis = self._get_axis_number(axis) if is_list_like(periods): @@ -8930,6 +8982,7 @@ def update( 1 2 500.0 2 3 6.0 """ + if not PYPY and using_copy_on_write(): if sys.getrefcount(self) <= REF_COUNT: warnings.warn( @@ -8978,7 +9031,17 @@ def update( if mask.all(): continue - self.loc[:, col] = self[col].where(mask, that) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message="Downcasting behavior", + category=FutureWarning, + ) + # GH#57124 - `that` might get upcasted because of NA values, and then + # downcasted in where because of the mask. Ignoring the warning + # is a stopgap, will replace with a new implementation of update + # in 3.0. 
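+                # Illustrative example of GH#57124: updating an int64 column
+                # from a frame containing NaN upcasts `that` to float64;
+                # `where` then tries to downcast the combined result back to
+                # int64, which is what would emit the warning.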
+ self.loc[:, col] = self[col].where(mask, that) # ---------------------------------------------------------------------- # Data reshaping diff --git a/pandas/core/generic.py b/pandas/core/generic.py index de25a02c6b37c..70b72577dd5d1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -336,6 +336,7 @@ def _as_manager(self, typ: str, copy: bool_t = True) -> Self: # fastpath of passing a manager doesn't check the option/manager class return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self) + @final @classmethod def _from_mgr(cls, mgr: Manager, axes: list[Index]) -> Self: """ @@ -657,7 +658,7 @@ def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]: return { clean_column_name(k): Series( - v, copy=False, index=self.index, name=k + v, copy=False, index=self.index, name=k, dtype=self.dtypes[k] ).__finalize__(self) for k, v in zip(self.columns, self._iter_column_arrays()) if not isinstance(k, int) @@ -2145,11 +2146,32 @@ def empty(self) -> bool_t: # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented __array_priority__: int = 1000 - def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: + def __array__( + self, dtype: npt.DTypeLike | None = None, copy: bool_t | None = None + ) -> np.ndarray: + if copy is False and not self._mgr.is_single_block and not self.empty: + # check this manually, otherwise ._values will already return a copy + # and np.array(values, copy=False) will not raise a warning + warnings.warn( + "Starting with NumPy 2.0, the behavior of the 'copy' keyword has " + "changed and passing 'copy=False' raises an error when returning " + "a zero-copy NumPy array is not possible. pandas will follow " + "this behavior starting with pandas 3.0.\nThis conversion to " + "NumPy requires a copy, but 'copy=False' was passed. Consider " + "using 'np.asarray(..)' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) values = self._values - arr = np.asarray(values, dtype=dtype) + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + arr = np.asarray(values, dtype=dtype) + else: + arr = np.array(values, dtype=dtype, copy=copy) + if ( - astype_is_view(values.dtype, arr.dtype) + copy is not True + and astype_is_view(values.dtype, arr.dtype) and using_copy_on_write() and self._mgr.is_single_block ): @@ -2969,6 +2991,9 @@ def to_sql( database. Otherwise, the datetimes will be stored as timezone unaware timestamps local to the original timezone. + Not all datastores support ``method="multi"``. Oracle, for example, + does not support multi-value insert. + References ---------- .. [1] https://docs.sqlalchemy.org @@ -3275,18 +3300,18 @@ def to_xarray(self): 2 lion mammal 80.5 4 3 monkey mammal NaN 4 - >>> df.to_xarray() + >>> df.to_xarray() # doctest: +SKIP Dimensions: (index: 4) Coordinates: - * index (index) int64 0 1 2 3 + * index (index) int64 32B 0 1 2 3 Data variables: - name (index) object 'falcon' 'parrot' 'lion' 'monkey' - class (index) object 'bird' 'bird' 'mammal' 'mammal' - max_speed (index) float64 389.0 24.0 80.5 nan - num_legs (index) int64 2 2 4 4 + name (index) object 32B 'falcon' 'parrot' 'lion' 'monkey' + class (index) object 32B 'bird' 'bird' 'mammal' 'mammal' + max_speed (index) float64 32B 389.0 24.0 80.5 nan + num_legs (index) int64 32B 2 2 4 4 - >>> df['max_speed'].to_xarray() + >>> df['max_speed'].to_xarray() # doctest: +SKIP array([389. , 24. 
, 80.5, nan]) Coordinates: @@ -3308,7 +3333,7 @@ class (index) object 'bird' 'bird' 'mammal' 'mammal' 2018-01-02 falcon 361 parrot 15 - >>> df_multiindex.to_xarray() + >>> df_multiindex.to_xarray() # doctest: +SKIP Dimensions: (date: 2, animal: 2) Coordinates: @@ -7187,6 +7212,8 @@ def fillna( or the string 'infer' which will try to downcast to an appropriate equal type (e.g. float64 to int64 if possible). + .. deprecated:: 2.2.0 + Returns ------- {klass} or None @@ -7522,6 +7549,8 @@ def ffill( or the string 'infer' which will try to downcast to an appropriate equal type (e.g. float64 to int64 if possible). + .. deprecated:: 2.2.0 + Returns ------- {klass} or None @@ -7713,6 +7742,8 @@ def bfill( or the string 'infer' which will try to downcast to an appropriate equal type (e.g. float64 to int64 if possible). + .. deprecated:: 2.2.0 + Returns ------- {klass} or None @@ -12120,19 +12151,20 @@ def pct_change( if limit is lib.no_default: cols = self.items() if self.ndim == 2 else [(None, self)] for _, col in cols: - mask = col.isna().values - mask = mask[np.argmax(~mask) :] - if mask.any(): - warnings.warn( - "The default fill_method='pad' in " - f"{type(self).__name__}.pct_change is deprecated and will " - "be removed in a future version. Either fill in any " - "non-leading NA values prior to calling pct_change or " - "specify 'fill_method=None' to not fill NA values.", - FutureWarning, - stacklevel=find_stack_level(), - ) - break + if len(col) > 0: + mask = col.isna().values + mask = mask[np.argmax(~mask) :] + if mask.any(): + warnings.warn( + "The default fill_method='pad' in " + f"{type(self).__name__}.pct_change is deprecated and " + "will be removed in a future version. Either fill in " + "any non-leading NA values prior to calling pct_change " + "or specify 'fill_method=None' to not fill NA values.", + FutureWarning, + stacklevel=find_stack_level(), + ) + break fill_method = "pad" if limit is lib.no_default: limit = None diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 089e15afd465b..c8e2ccc7bdaeb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3335,9 +3335,13 @@ def max( ) @final - def first(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: + def first( + self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True + ) -> NDFrameT: """ - Compute the first non-null entry of each column. + Compute the first entry of each column within each group. + + Defaults to skipping NA elements. Parameters ---------- @@ -3345,12 +3349,17 @@ def first(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: Include only float, int, boolean columns. min_count : int, default -1 The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. + than ``min_count`` valid values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 2.2.1 Returns ------- Series or DataFrame - First non-null of values within each group. + First values within each group. 
See Also -------- @@ -3402,12 +3411,17 @@ def first(x: Series): min_count=min_count, alias="first", npfunc=first_compat, + skipna=skipna, ) @final - def last(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: + def last( + self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True + ) -> NDFrameT: """ - Compute the last non-null entry of each column. + Compute the last entry of each column within each group. + + Defaults to skipping NA elements. Parameters ---------- @@ -3416,12 +3430,17 @@ def last(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: everything, then use only numeric data. min_count : int, default -1 The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. + than ``min_count`` valid values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 2.2.1 Returns ------- Series or DataFrame - Last non-null of values within each group. + Last of values within each group. See Also -------- @@ -3461,6 +3480,7 @@ def last(x: Series): min_count=min_count, alias="last", npfunc=last_compat, + skipna=skipna, ) @final @@ -4374,9 +4394,9 @@ def quantile( starts, ends = lib.generate_slices(splitter._slabels, splitter.ngroups) def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]: - if is_object_dtype(vals.dtype): + if isinstance(vals.dtype, StringDtype) or is_object_dtype(vals.dtype): raise TypeError( - "'quantile' cannot be performed against 'object' dtypes!" + f"dtype '{vals.dtype}' does not support operation 'quantile'" ) inference: DtypeObj | None = None diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e2224caad9e84..4bf2e8b90a0b0 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -1023,7 +1023,7 @@ def is_in_obj(gpr) -> bool: return False for gpr, level in zip(keys, levels): - if is_in_obj(gpr): # df.groupby(df['name']) + if isinstance(obj, DataFrame) and is_in_obj(gpr): # df.groupby(df['name']) in_axis = True exclusions.add(gpr.name) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 5e83eaee02afc..e2ddf9aa5c0c1 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -424,6 +424,7 @@ def _call_cython_op( mask=mask, result_mask=result_mask, is_datetimelike=is_datetimelike, + **kwargs, ) elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]: if self.how in ["std", "sem"]: diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 929c7f4a63f8f..7e3ba4089ff60 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -148,6 +148,20 @@ def _delegate_method(self, name: str, *args, **kwargs): return result +@delegate_names( + delegate=ArrowExtensionArray, + accessors=TimedeltaArray._datetimelike_ops, + typ="property", + accessor_mapping=lambda x: f"_dt_{x}", + raise_on_missing=False, +) +@delegate_names( + delegate=ArrowExtensionArray, + accessors=TimedeltaArray._datetimelike_methods, + typ="method", + accessor_mapping=lambda x: f"_dt_{x}", + raise_on_missing=False, +) @delegate_names( delegate=ArrowExtensionArray, accessors=DatetimeArray._datetimelike_ops, @@ -213,6 +227,9 @@ def _delegate_method(self, name: str, *args, **kwargs): return result + def to_pytimedelta(self): + return cast(ArrowExtensionArray, self._parent.array)._dt_to_pytimedelta() + def 
to_pydatetime(self): # GH#20306 warnings.warn( @@ -241,6 +258,26 @@ def isocalendar(self) -> DataFrame: ) return iso_calendar_df + @property + def components(self) -> DataFrame: + from pandas import DataFrame + + components_df = DataFrame( + { + col: getattr(self._parent.array, f"_dt_{col}") + for col in [ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + ] + } + ) + return components_df + @delegate_names( delegate=DatetimeArray, @@ -592,7 +629,7 @@ def __new__(cls, data: Series): # pyright: ignore[reportInconsistentConstructor index=orig.index, ) - if isinstance(data.dtype, ArrowDtype) and data.dtype.kind == "M": + if isinstance(data.dtype, ArrowDtype) and data.dtype.kind in "Mm": return ArrowTemporalProperties(data, orig) if lib.is_np_dtype(data.dtype, "M"): return DatetimeProperties(data, orig) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 560285bd57a22..15292953e72d0 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -295,6 +295,7 @@ def _find_common_index_dtype(inds): raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") if len(dtis) == len(indexes): + sort = True result = indexes[0] elif len(dtis) > 1: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 88a08dd55f739..ad39907e7400e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -23,7 +23,7 @@ from pandas._config import ( get_option, using_copy_on_write, - using_pyarrow_string_dtype, + using_string_dtype, ) from pandas._libs import ( @@ -506,7 +506,8 @@ def __new__( elif is_ea_or_datetimelike_dtype(dtype): # non-EA dtype indexes have special casting logic, so we punt here - pass + if isinstance(data, (set, frozenset)): + data = list(data) elif is_ea_or_datetimelike_dtype(data_dtype): pass @@ -883,6 +884,8 @@ def _engine( # error: Item "ExtensionArray" of "Union[ExtensionArray, # ndarray[Any, Any]]" has no attribute "_ndarray" [union-attr] target_values = self._data._ndarray # type: ignore[union-attr] + elif is_string_dtype(self.dtype) and not is_object_dtype(self.dtype): + return libindex.StringObjectEngine(target_values, self.dtype.na_value) # type: ignore[union-attr] # error: Argument 1 to "ExtensionEngine" has incompatible type # "ndarray[Any, Any]"; expected "ExtensionArray" @@ -912,11 +915,15 @@ def __len__(self) -> int: """ return len(self._data) - def __array__(self, dtype=None) -> np.ndarray: + def __array__(self, dtype=None, copy=None) -> np.ndarray: """ The array interface, return my values. """ - return np.asarray(self._data, dtype=dtype) + if copy is None: + # Note, that the if branch exists for NumPy 1.x support + return np.asarray(self._data, dtype=dtype) + + return np.array(self._data, dtype=dtype, copy=copy) def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs): if any(isinstance(other, (ABCSeries, ABCDataFrame)) for other in inputs): @@ -956,7 +963,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs): return self.__array_wrap__(result) @final - def __array_wrap__(self, result, context=None): + def __array_wrap__(self, result, context=None, return_scalar=False): """ Gets called after a ufunc and other functions e.g. np.split. 
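+
+        ``return_scalar`` is accepted to match the ``__array_wrap__``
+        signature passed by NumPy 2.0 and newer.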
""" @@ -3663,9 +3670,12 @@ def difference(self, other, sort=None): def _difference(self, other, sort): # overridden by RangeIndex + this = self + if isinstance(self, ABCCategoricalIndex) and self.hasnans and other.hasnans: + this = this.dropna() other = other.unique() - the_diff = self[other.get_indexer_for(self) == -1] - the_diff = the_diff if self.is_unique else the_diff.unique() + the_diff = this[other.get_indexer_for(this) == -1] + the_diff = the_diff if this.is_unique else the_diff.unique() the_diff = _maybe_try_sort(the_diff, sort) return the_diff @@ -4615,38 +4625,12 @@ def join( if level is not None and (self._is_multi or other._is_multi): return self._join_level(other, level, how=how) - lidx: np.ndarray | None - ridx: np.ndarray | None - - if len(other) == 0: - if how in ("left", "outer"): - if sort and not self.is_monotonic_increasing: - lidx = self.argsort() - join_index = self.take(lidx) - else: - lidx = None - join_index = self._view() - ridx = np.broadcast_to(np.intp(-1), len(join_index)) - return join_index, lidx, ridx - elif how in ("right", "inner", "cross"): - join_index = other._view() - lidx = np.array([], dtype=np.intp) - return join_index, lidx, None - - if len(self) == 0: - if how in ("right", "outer"): - if sort and not other.is_monotonic_increasing: - ridx = other.argsort() - join_index = other.take(ridx) - else: - ridx = None - join_index = other._view() - lidx = np.broadcast_to(np.intp(-1), len(join_index)) - return join_index, lidx, ridx - elif how in ("left", "inner", "cross"): - join_index = self._view() - ridx = np.array([], dtype=np.intp) - return join_index, None, ridx + if len(self) == 0 or len(other) == 0: + try: + return self._join_empty(other, how, sort) + except TypeError: + # object dtype; non-comparable objects + pass if self.dtype != other.dtype: dtype = self._find_common_type_compat(other) @@ -4681,6 +4665,33 @@ def join( return self._join_via_get_indexer(other, how, sort) + @final + def _join_empty( + self, other: Index, how: JoinHow, sort: bool + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + assert len(self) == 0 or len(other) == 0 + _validate_join_method(how) + + lidx: np.ndarray | None + ridx: np.ndarray | None + + if len(other): + how = cast(JoinHow, {"left": "right", "right": "left"}.get(how, how)) + join_index, ridx, lidx = other._join_empty(self, how, sort) + elif how in ["left", "outer"]: + if sort and not self.is_monotonic_increasing: + lidx = self.argsort() + join_index = self.take(lidx) + else: + lidx = None + join_index = self._view() + ridx = np.broadcast_to(np.intp(-1), len(join_index)) + else: + join_index = other._view() + lidx = np.array([], dtype=np.intp) + ridx = None + return join_index, lidx, ridx + @final def _join_via_get_indexer( self, other: Index, how: JoinHow, sort: bool @@ -5068,7 +5079,10 @@ def _can_use_libjoin(self) -> bool: return ( isinstance(self.dtype, np.dtype) or isinstance(self._values, (ArrowExtensionArray, BaseMaskedArray)) - or self.dtype == "string[python]" + or ( + isinstance(self.dtype, StringDtype) + and self.dtype.storage == "python" + ) ) # Exclude index types where the conversion to numpy converts to object dtype, # which negates the performance benefit of libjoin @@ -5314,7 +5328,9 @@ def _is_memory_usage_qualified(self) -> bool: """ Return a boolean if we need a qualified .info display. 
""" - return is_object_dtype(self.dtype) + return is_object_dtype(self.dtype) or ( + is_string_dtype(self.dtype) and self.dtype.storage == "python" # type: ignore[union-attr] + ) def __contains__(self, key: Any) -> bool: """ @@ -5616,9 +5632,10 @@ def equals(self, other: Any) -> bool: if ( isinstance(self.dtype, StringDtype) - and self.dtype.storage == "pyarrow_numpy" + and self.dtype.na_value is np.nan and other.dtype != self.dtype ): + # TODO(infer_string) can we avoid this special case? # special case for object behavior return other.equals(self.astype(object)) @@ -5913,17 +5930,14 @@ def sort_values( (Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2])) """ if key is None and ( - self.is_monotonic_increasing or self.is_monotonic_decreasing + (ascending and self.is_monotonic_increasing) + or (not ascending and self.is_monotonic_decreasing) ): - reverse = ascending != self.is_monotonic_increasing - sorted_index = self[::-1] if reverse else self.copy() if return_indexer: indexer = np.arange(len(self), dtype=np.intp) - if reverse: - indexer = indexer[::-1] - return sorted_index, indexer + return self.copy(), indexer else: - return sorted_index + return self.copy() # GH 35584. Sort missing values according to na_position kwarg # ignore na_position for MultiIndex @@ -6121,7 +6135,6 @@ def _should_fallback_to_positional(self) -> bool: def get_indexer_non_unique( self, target ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: - target = ensure_index(target) target = self._maybe_cast_listlike_indexer(target) if not self._should_compare(target) and not self._should_partial_index(target): @@ -6409,7 +6422,11 @@ def _should_compare(self, other: Index) -> bool: return False dtype = _unpack_nested_dtype(other) - return self._is_comparable_dtype(dtype) or is_object_dtype(dtype) + return ( + self._is_comparable_dtype(dtype) + or is_object_dtype(dtype) + or is_string_dtype(dtype) + ) def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ @@ -6679,7 +6696,16 @@ def _maybe_cast_listlike_indexer(self, target) -> Index: """ Analogue to maybe_cast_indexer for get_indexer instead of get_loc. """ - return ensure_index(target) + target_index = ensure_index(target) + if ( + not hasattr(target, "dtype") + and self.dtype == object + and target_index.dtype == "string" + ): + # If we started with a list-like, avoid inference to string dtype if self + # is object dtype (coercing to string dtype will alter the missing values) + target_index = Index(target, dtype=self.dtype) + return target_index @final def _validate_indexer( @@ -6990,6 +7016,9 @@ def insert(self, loc: int, item) -> Index: # We cannot keep the same dtype, so cast to the (often object) # minimal shared dtype before doing the insert. 
dtype = self._find_common_type_compat(item)
+            if dtype == self.dtype:
+                # EAs might run into recursion errors if loc is invalid
+                raise
             return self.astype(dtype).insert(loc, item)

         if arr.dtype != object or not isinstance(
@@ -7010,7 +7039,7 @@ def insert(self, loc: int, item) -> Index:
         out = Index._with_infer(new_values, name=self.name)
         if (
-            using_pyarrow_string_dtype()
+            using_string_dtype()
             and is_string_dtype(out.dtype)
             and new_values.dtype == object
         ):
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index c978abd8c2427..3204a9c97ee73 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -276,7 +276,7 @@ def _engine_type(self) -> type[libindex.DatetimeEngine]:
     @doc(DatetimeArray.strftime)
     def strftime(self, date_format) -> Index:
         arr = self._data.strftime(date_format)
-        return Index(arr, name=self.name, dtype=object)
+        return Index(arr, name=self.name, dtype=arr.dtype)

     @doc(DatetimeArray.tz_convert)
     def tz_convert(self, tz) -> Self:
diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py
index 61949531f37df..371d3c811e772 100644
--- a/pandas/core/indexes/extension.py
+++ b/pandas/core/indexes/extension.py
@@ -71,7 +71,7 @@ def fget(self):
                 return type(self)._simple_new(result, name=self.name)
             elif isinstance(result, ABCDataFrame):
                 return result.set_index(self)
-            return Index(result, name=self.name)
+            return Index(result, name=self.name, dtype=result.dtype)
         return result

     def fset(self, value) -> None:
@@ -98,7 +98,7 @@ def method(self, *args, **kwargs):  # type: ignore[misc]
                 return type(self)._simple_new(result, name=self.name)
             elif isinstance(result, ABCDataFrame):
                 return result.set_index(self)
-            return Index(result, name=self.name)
+            return Index(result, name=self.name, dtype=result.dtype)
         return result

     # error: "property" has no attribute "__name__"
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index 4fcdb87974511..635924674d9f4 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -50,6 +50,7 @@
     is_number,
     is_object_dtype,
     is_scalar,
+    is_string_dtype,
     pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import (
@@ -699,7 +700,7 @@ def _get_indexer(
             # left/right get_indexer, compare elementwise, equality -> match
             indexer = self._get_indexer_unique_sides(target)

-        elif not is_object_dtype(target.dtype):
+        elif not (is_object_dtype(target.dtype) or is_string_dtype(target.dtype)):
             # homogeneous scalar index: use IntervalTree
             # we should always have self._should_partial_index(target) here
             target = self._maybe_convert_i8(target)
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 2a4e027e2b806..8954d49649a2b 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -65,6 +65,7 @@
     is_list_like,
     is_object_dtype,
     is_scalar,
+    is_string_dtype,
     pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import (
@@ -774,7 +775,7 @@ def _values(self) -> np.ndarray:
             ):
                 vals = vals.astype(object)

-            vals = np.array(vals, copy=False)
+            vals = np.asarray(vals)
             vals = algos.take_nd(vals, codes, fill_value=index._na_value)
             values.append(vals)

@@ -1309,8 +1310,24 @@ def copy(  # type: ignore[override]
         new_index._id = self._id
         return new_index

-    def __array__(self, dtype=None) -> np.ndarray:
+    def __array__(self, dtype=None, copy=None) -> np.ndarray:
         """the array interface, return my values"""
+        if copy is False:
+            # self.values is always a newly constructed array, so raise.
+ warnings.warn( + "Starting with NumPy 2.0, the behavior of the 'copy' keyword has " + "changed and passing 'copy=False' raises an error when returning " + "a zero-copy NumPy array is not possible. pandas will follow " + "this behavior starting with pandas 3.0.\nThis conversion to " + "NumPy requires a copy, but 'copy=False' was passed. Consider " + "using 'np.asarray(..)' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if copy is True: + # explicit np.array call to ensure a copy is made and unique objects + # are returned, because self.values is cached + return np.array(self.values, dtype=dtype) return self.values def view(self, cls=None) -> Self: @@ -1335,10 +1352,12 @@ def dtype(self) -> np.dtype: def _is_memory_usage_qualified(self) -> bool: """return a boolean if we need a qualified .info display""" - def f(level) -> bool: - return "mixed" in level or "string" in level or "unicode" in level + def f(dtype) -> bool: + return is_object_dtype(dtype) or ( + is_string_dtype(dtype) and dtype.storage == "python" + ) - return any(f(level) for level in self._inferred_type_levels) + return any(f(level.dtype) for level in self.levels) # Cannot determine type of "memory_usage" @doc(Index.memory_usage) # type: ignore[has-type] @@ -3397,7 +3416,7 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): locs = (level_codes >= idx.start) & (level_codes < idx.stop) return locs - locs = np.array(level_codes == idx, dtype=bool, copy=False) + locs = np.asarray(level_codes == idx, dtype=bool) if not locs.any(): # The label is present in self.levels[level] but unused: @@ -3488,6 +3507,8 @@ def _to_bool_indexer(indexer) -> npt.NDArray[np.bool_]: "is not the same length as the index" ) lvl_indexer = np.asarray(k) + if indexer is None: + lvl_indexer = lvl_indexer.copy() elif is_list_like(k): # a collection of labels to include from this level (these are or'd) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 4be7e17035128..869e511fc0720 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -57,6 +57,7 @@ ABCSeries, ) from pandas.core.dtypes.missing import ( + construct_1d_array_from_inferred_fill_value, infer_fill_value, is_valid_na_for_dtype, isna, @@ -68,7 +69,6 @@ from pandas.core.construction import ( array as pd_array, extract_array, - sanitize_array, ) from pandas.core.indexers import ( check_array_indexer, @@ -844,7 +844,6 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None: if self.ndim != 2: return - orig_key = key if isinstance(key, tuple) and len(key) > 1: # key may be a tuple if we are .loc # if length of key is > 1 set key to column part @@ -862,7 +861,7 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None: keys = self.obj.columns.union(key, sort=False) diff = Index(key).difference(self.obj.columns, sort=False) - if len(diff) and com.is_null_slice(orig_key[0]): + if len(diff): # e.g. 
if we are doing df.loc[:, ["A", "B"]] = 7 and "B"
                # is a new column, add the new columns with dtype=np.void
                # so that later when we go through setitem_single_column
@@ -1878,12 +1877,9 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc"):
                 self.obj[key] = empty_value

             elif not is_list_like(value):
-                # Find our empty_value dtype by constructing an array
-                # from our value and doing a .take on it
-                arr = sanitize_array(value, Index(range(1)), copy=False)
-                taker = -1 * np.ones(len(self.obj), dtype=np.intp)
-                empty_value = algos.take_nd(arr, taker)
-                self.obj[key] = empty_value
+                self.obj[key] = construct_1d_array_from_inferred_fill_value(
+                    value, len(self.obj)
+                )
             else:
                 # FIXME: GH#42099#issuecomment-864326014
                 self.obj[key] = infer_fill_value(value)
@@ -2141,10 +2137,41 @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None:
             # If we're setting an entire column and we can't do it inplace,
             # then we can use value's dtype (or inferred dtype)
             # instead of object
+            dtype = self.obj.dtypes.iloc[loc]
+            if dtype not in (np.void, object) and not self.obj.empty:
+                # - Exclude np.void, as that is a special case for expansion.
+                #   We want to warn for
+                #       df = pd.DataFrame({'a': [1, 2]})
+                #       df.loc[:, 'a'] = .3
+                #   but not for
+                #       df = pd.DataFrame({'a': [1, 2]})
+                #       df.loc[:, 'b'] = .3
+                # - Exclude `object`, as then no upcasting happens.
+                # - Exclude empty initial object with enlargement,
+                #   as then there's nothing to be inconsistent with.
+                warnings.warn(
+                    f"Setting an item of incompatible dtype is deprecated "
+                    "and will raise an error in a future version of pandas. "
+                    f"Value '{value}' has dtype incompatible with {dtype}, "
+                    "please explicitly cast to a compatible dtype first.",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
             self.obj.isetitem(loc, value)
         else:
             # set value into the column (first attempting to operate inplace, then
             # falling back to casting if necessary)
+            dtype = self.obj.dtypes.iloc[loc]
+            if dtype == np.void:
+                # This means we're expanding, with multiple columns, e.g.
+                #     df = pd.DataFrame({'A': [1,2,3], 'B': [4,5,6]})
+                #     df.loc[df.index <= 2, ['F', 'G']] = (1, 'abc')
+                # Columns F and G will initially be set to np.void.
+                # Here, we replace those temporary `np.void` columns with
+                # columns of the appropriate dtype, based on `value`.
+                self.obj.iloc[:, loc] = construct_1d_array_from_inferred_fill_value(
+                    value, len(self.obj)
+                )

             self.obj._mgr.column_setitem(loc, plane_indexer, value)
             self.obj._clear_item_cache()
diff --git a/pandas/core/interchange/buffer.py b/pandas/core/interchange/buffer.py
index a54e4428bd836..5d24325e67f62 100644
--- a/pandas/core/interchange/buffer.py
+++ b/pandas/core/interchange/buffer.py
@@ -12,6 +12,7 @@
 if TYPE_CHECKING:
     import numpy as np

+    import pyarrow as pa


 class PandasBuffer(Buffer):
@@ -23,7 +24,7 @@ def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None:
         """
         Handle only regular columns (= numpy arrays) for now.
         """
-        if not x.strides == (x.dtype.itemsize,):
+        if x.strides[0] and not x.strides == (x.dtype.itemsize,):
             # The protocol does not support strided buffers, so a copy is
             # necessary. If that's not allowed, we need to raise an exception.
             if allow_copy:
@@ -76,3 +77,60 @@ def __repr__(self) -> str:
             )
             + ")"
         )
+
+
+class PandasBufferPyarrow(Buffer):
+    """
+    Data in the buffer is guaranteed to be contiguous in memory.
+    """
+
+    def __init__(
+        self,
+        buffer: pa.Buffer,
+        *,
+        length: int,
+    ) -> None:
+        """
+        Handle pyarrow chunked arrays.
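+
+        Parameters
+        ----------
+        buffer : pa.Buffer
+            The pyarrow buffer to wrap; a reference is held to keep the
+            backing memory alive.
+        length : int
+            Number of elements (not bytes) described by the buffer.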
+ """ + self._buffer = buffer + self._length = length + + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + return self._buffer.size + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + return self._buffer.address + + def __dlpack__(self) -> Any: + """ + Represent this structure as DLPack interface. + """ + raise NotImplementedError() + + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + """ + Device type and device ID for where the data in the buffer resides. + """ + return (DlpackDeviceType.CPU, None) + + def __repr__(self) -> str: + return ( + "PandasBuffer[pyarrow](" + + str( + { + "bufsize": self.bufsize, + "ptr": self.ptr, + "device": "CPU", + } + ) + + ")" + ) diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index acfbc5d9e6c62..d59a3df694bb3 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -1,6 +1,9 @@ from __future__ import annotations -from typing import Any +from typing import ( + TYPE_CHECKING, + Any, +) import numpy as np @@ -9,14 +12,18 @@ from pandas.errors import NoBufferPresent from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.dtypes import ( +from pandas.core.dtypes.dtypes import BaseMaskedDtype + +import pandas as pd +from pandas import ( ArrowDtype, DatetimeTZDtype, ) - -import pandas as pd from pandas.api.types import is_string_dtype -from pandas.core.interchange.buffer import PandasBuffer +from pandas.core.interchange.buffer import ( + PandasBuffer, + PandasBufferPyarrow, +) from pandas.core.interchange.dataframe_protocol import ( Column, ColumnBuffers, @@ -29,6 +36,9 @@ dtype_to_arrow_c_fmt, ) +if TYPE_CHECKING: + from pandas.core.interchange.dataframe_protocol import Buffer + _NP_KINDS = { "i": DtypeKind.INT, "u": DtypeKind.UINT, @@ -76,6 +86,14 @@ def __init__(self, column: pd.Series, allow_copy: bool = True) -> None: Note: doesn't deal with extension arrays yet, just assume a regular Series/ndarray for now. """ + if isinstance(column, pd.DataFrame): + raise TypeError( + "Expected a Series, got a DataFrame. This likely happened " + "because you called __dataframe__ on a DataFrame which, " + "after converting column names to string, resulted in duplicated " + f"names: {column.columns}. Please rename these columns before " + "using the interchange protocol." 
+ ) if not isinstance(column, pd.Series): raise NotImplementedError(f"Columns of type {type(column)} not handled yet") @@ -116,7 +134,7 @@ def dtype(self) -> tuple[DtypeKind, int, str, str]: Endianness.NATIVE, ) elif is_string_dtype(dtype): - if infer_dtype(self._col) == "string": + if infer_dtype(self._col) in ("string", "empty"): return ( DtypeKind.STRING, 8, @@ -143,9 +161,21 @@ def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]: byteorder = dtype.numpy_dtype.byteorder elif isinstance(dtype, DatetimeTZDtype): byteorder = dtype.base.byteorder # type: ignore[union-attr] + elif isinstance(dtype, BaseMaskedDtype): + byteorder = dtype.numpy_dtype.byteorder else: byteorder = dtype.byteorder + if dtype == "bool[pyarrow]": + # return early to avoid the `* 8` below, as this is a bitmask + # rather than a bytemask + return ( + kind, + dtype.itemsize, # pyright: ignore[reportGeneralTypeIssues] + ArrowCTypes.BOOL, + byteorder, + ) + return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), byteorder @property @@ -179,6 +209,16 @@ def describe_categorical(self): @property def describe_null(self): + if isinstance(self._col.dtype, BaseMaskedDtype): + column_null_dtype = ColumnNullType.USE_BYTEMASK + null_value = 1 + return column_null_dtype, null_value + if isinstance(self._col.dtype, ArrowDtype): + # We already rechunk (if necessary / allowed) upon initialization, so this + # is already single-chunk by the time we get here. + if self._col.array._pa_array.chunks[0].buffers()[0] is None: # type: ignore[attr-defined] + return ColumnNullType.NON_NULLABLE, None + return ColumnNullType.USE_BITMASK, 0 kind = self.dtype[0] try: null, value = _NULL_DESCRIPTION[kind] @@ -263,10 +303,11 @@ def get_buffers(self) -> ColumnBuffers: def _get_data_buffer( self, - ) -> tuple[PandasBuffer, Any]: # Any is for self.dtype tuple + ) -> tuple[Buffer, tuple[DtypeKind, int, str, str]]: """ Return the buffer containing the data and the buffer's associated dtype. """ + buffer: Buffer if self.dtype[0] in ( DtypeKind.INT, DtypeKind.UINT, @@ -276,12 +317,25 @@ def _get_data_buffer( ): # self.dtype[2] is an ArrowCTypes.TIMESTAMP where the tz will make # it longer than 4 characters + dtype = self.dtype if self.dtype[0] == DtypeKind.DATETIME and len(self.dtype[2]) > 4: np_arr = self._col.dt.tz_convert(None).to_numpy() else: - np_arr = self._col.to_numpy() + arr = self._col.array + if isinstance(self._col.dtype, BaseMaskedDtype): + np_arr = arr._data # type: ignore[attr-defined] + elif isinstance(self._col.dtype, ArrowDtype): + # We already rechunk (if necessary / allowed) upon initialization, + # so this is already single-chunk by the time we get here. 
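+                # For fixed-width pyarrow arrays, Array.buffers() returns
+                # [validity bitmap, data buffer]; buffers()[1] below is the
+                # data buffer.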
+ arr = arr._pa_array.chunks[0] # type: ignore[attr-defined] + buffer = PandasBufferPyarrow( + arr.buffers()[1], # type: ignore[attr-defined] + length=len(arr), + ) + return buffer, dtype + else: + np_arr = arr._ndarray # type: ignore[attr-defined] buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy) - dtype = self.dtype elif self.dtype[0] == DtypeKind.CATEGORICAL: codes = self._col.values._codes buffer = PandasBuffer(codes, allow_copy=self._allow_copy) @@ -301,24 +355,40 @@ def _get_data_buffer( buffer = PandasBuffer(np.frombuffer(b, dtype="uint8")) # Define the dtype for the returned buffer - dtype = ( - DtypeKind.STRING, - 8, - ArrowCTypes.STRING, - Endianness.NATIVE, - ) # note: currently only support native endianness + # TODO: this will need correcting + # https://github.com/pandas-dev/pandas/issues/54781 + dtype = self.dtype else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") return buffer, dtype - def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]: + def _get_validity_buffer(self) -> tuple[Buffer, Any] | None: """ Return the buffer containing the mask values indicating missing data and the buffer's associated dtype. Raises NoBufferPresent if null representation is not a bit or byte mask. """ null, invalid = self.describe_null + buffer: Buffer + if isinstance(self._col.dtype, ArrowDtype): + # We already rechunk (if necessary / allowed) upon initialization, so this + # is already single-chunk by the time we get here. + arr = self._col.array._pa_array.chunks[0] # type: ignore[attr-defined] + dtype = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, Endianness.NATIVE) + if arr.buffers()[0] is None: + return None + buffer = PandasBufferPyarrow( + arr.buffers()[0], + length=len(arr), + ) + return buffer, dtype + + if isinstance(self._col.dtype, BaseMaskedDtype): + mask = self._col.array._mask # type: ignore[attr-defined] + buffer = PandasBuffer(mask) + dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE) + return buffer, dtype if self.dtype[0] == DtypeKind.STRING: # For now, use byte array as the mask. diff --git a/pandas/core/interchange/dataframe.py b/pandas/core/interchange/dataframe.py index 4f08b2c2b3a7b..1abacddfc7e3b 100644 --- a/pandas/core/interchange/dataframe.py +++ b/pandas/core/interchange/dataframe.py @@ -5,6 +5,7 @@ from pandas.core.interchange.column import PandasColumn from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg +from pandas.core.interchange.utils import maybe_rechunk if TYPE_CHECKING: from collections.abc import ( @@ -32,8 +33,12 @@ def __init__(self, df: DataFrame, allow_copy: bool = True) -> None: Constructor - an instance of this (private) class is returned from `pd.DataFrame.__dataframe__`. 
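+
+        Column names are converted to strings, and any multi-chunk
+        pyarrow-backed column is rechunked into a single chunk (copying only
+        if ``allow_copy`` permits), as the column implementation assumes a
+        single chunk.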
""" - self._df = df + self._df = df.rename(columns=str, copy=False) self._allow_copy = allow_copy + for i, _col in enumerate(self._df.columns): + rechunked = maybe_rechunk(self._df.iloc[:, i], allow_copy=allow_copy) + if rechunked is not None: + self._df.isetitem(i, rechunked) def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index d45ae37890ba7..53f18883ea3ad 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -6,6 +6,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas.compat._optional import import_optional_dependency from pandas.errors import SettingWithCopyError @@ -124,8 +126,6 @@ def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame: ------- pd.DataFrame """ - # We need a dict of columns here, with each column being a NumPy array (at - # least for now, deal with non-NumPy dtypes later). columns: dict[str, Any] = {} buffers = [] # hold on to buffers, keeps memory alive for name in df.column_names(): @@ -295,13 +295,14 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: null_pos = None if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): - assert buffers["validity"], "Validity buffers cannot be empty for masks" - valid_buff, valid_dtype = buffers["validity"] - null_pos = buffer_to_ndarray( - valid_buff, valid_dtype, offset=col.offset, length=col.size() - ) - if sentinel_val == 0: - null_pos = ~null_pos + validity = buffers["validity"] + if validity is not None: + valid_buff, valid_dtype = validity + null_pos = buffer_to_ndarray( + valid_buff, valid_dtype, offset=col.offset, length=col.size() + ) + if sentinel_val == 0: + null_pos = ~null_pos # Assemble the strings from the code units str_list: list[None | float | str] = [None] * col.size() @@ -323,8 +324,12 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: # Add to our list of strings str_list[i] = string - # Convert the string list to a NumPy array - return np.asarray(str_list, dtype="object"), buffers + if using_string_dtype(): + res = pd.Series(str_list, dtype="str") + else: + res = np.asarray(str_list, dtype="object") # type: ignore[assignment] + + return res, buffers # type: ignore[return-value] def parse_datetime_format_str(format_str, data) -> pd.Series | np.ndarray: @@ -486,6 +491,8 @@ def set_nulls( np.ndarray or pd.Series Data with the nulls being set. 
""" + if validity is None: + return data null_kind, sentinel_val = col.describe_null null_pos = None diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py index 4ac063080e62d..035a1f8abdbc5 100644 --- a/pandas/core/interchange/utils.py +++ b/pandas/core/interchange/utils.py @@ -16,6 +16,8 @@ DatetimeTZDtype, ) +import pandas as pd + if typing.TYPE_CHECKING: from pandas._typing import DtypeObj @@ -37,6 +39,7 @@ "float": "f", # float32 "double": "g", # float64 "string": "u", + "large_string": "U", "binary": "z", "time32[s]": "tts", "time32[ms]": "ttm", @@ -132,7 +135,12 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str: if format_str is not None: return format_str - if lib.is_np_dtype(dtype, "M"): + if isinstance(dtype, pd.StringDtype): + # TODO(infer_string) this should be LARGE_STRING for pyarrow storage, + # but current tests don't cover this distinction + return ArrowCTypes.STRING + + elif lib.is_np_dtype(dtype, "M"): # Selecting the first char of resolution string: # dtype.str -> ' 'n' resolution = np.datetime_data(dtype)[0][0] @@ -141,6 +149,35 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str: elif isinstance(dtype, DatetimeTZDtype): return ArrowCTypes.TIMESTAMP.format(resolution=dtype.unit[0], tz=dtype.tz) + elif isinstance(dtype, pd.BooleanDtype): + return ArrowCTypes.BOOL + raise NotImplementedError( f"Conversion of {dtype} to Arrow C format string is not implemented." ) + + +def maybe_rechunk(series: pd.Series, *, allow_copy: bool) -> pd.Series | None: + """ + Rechunk a multi-chunk pyarrow array into a single-chunk array, if necessary. + + - Returns `None` if the input series is not backed by a multi-chunk pyarrow array + (and so doesn't need rechunking) + - Returns a single-chunk-backed-Series if the input is backed by a multi-chunk + pyarrow array and `allow_copy` is `True`. + - Raises a `RuntimeError` if `allow_copy` is `False` and input is a + based by a multi-chunk pyarrow array. + """ + if not isinstance(series.dtype, pd.ArrowDtype): + return None + chunked_array = series.array._pa_array # type: ignore[attr-defined] + if len(chunked_array.chunks) == 1: + return None + if not allow_copy: + raise RuntimeError( + "Found multi-chunk pyarrow array, but `allow_copy` is False. " + "Please rechunk the array before calling this function, or set " + "`allow_copy=True`." + ) + arr = chunked_array.combine_chunks() + return pd.Series(arr, dtype=series.dtype, name=series.name, index=series.index) diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index e5ef44d07061e..b0b3937ca47ea 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -9,12 +9,10 @@ from __future__ import annotations from typing import TYPE_CHECKING -import warnings import numpy as np from pandas._libs.internals import BlockPlacement -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.dtypes import ( @@ -52,14 +50,6 @@ def make_block( - Block.make_block_same_class - Block.__init__ """ - warnings.warn( - # GH#40226 - "make_block is deprecated and will be removed in a future version. 
" - "Use public APIs instead.", - DeprecationWarning, - stacklevel=find_stack_level(), - ) - if dtype is not None: dtype = pandas_dtype(dtype) @@ -123,6 +113,7 @@ def maybe_infer_ndim(values, placement: BlockPlacement, ndim: int | None) -> int def __getattr__(name: str): # GH#55139 + import warnings if name in [ "Block", diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 20eff9315bc80..452c919449ec4 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1,6 +1,7 @@ from __future__ import annotations from functools import wraps +import inspect import re from typing import ( TYPE_CHECKING, @@ -83,6 +84,7 @@ ABCNumpyExtensionArray, ABCSeries, ) +from pandas.core.dtypes.inference import is_re from pandas.core.dtypes.missing import ( is_valid_na_for_dtype, isna, @@ -114,6 +116,7 @@ PeriodArray, TimedeltaArray, ) +from pandas.core.arrays.string_ import StringDtype from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.computation import expressions @@ -475,7 +478,9 @@ def split_and_operate(self, func, *args, **kwargs) -> list[Block]: # Up/Down-casting @final - def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: + def coerce_to_target_dtype( + self, other, warn_on_upcast: bool = False, using_cow: bool = False + ) -> Block: """ coerce the current block to a dtype compat for other we will return a block, possibly object, and not raise @@ -498,6 +503,9 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: and is_integer_dtype(self.values.dtype) and isna(other) and other is not NaT + and not ( + isinstance(other, (np.datetime64, np.timedelta64)) and np.isnat(other) + ) ): warn_on_upcast = False elif ( @@ -512,7 +520,7 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: if warn_on_upcast: warnings.warn( f"Setting an item of incompatible dtype is deprecated " - "and will raise in a future error of pandas. " + "and will raise an error in a future version of pandas. " f"Value '{other}' has dtype incompatible with {self.values.dtype}, " "please explicitly cast to a compatible dtype first.", FutureWarning, @@ -524,7 +532,14 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: f"{self.values.dtype}. Please report a bug at " "https://github.com/pandas-dev/pandas/issues." ) - return self.astype(new_dtype, copy=False) + copy = False + if ( + not using_cow + and isinstance(self.dtype, StringDtype) + and self.dtype.storage == "python" + ): + copy = True + return self.astype(new_dtype, copy=copy, using_cow=using_cow) @final def _maybe_downcast( @@ -548,7 +563,12 @@ def _maybe_downcast( return blocks nbs = extend_blocks( - [blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks] + [ + blk.convert( + using_cow=using_cow, copy=not using_cow, convert_string=False + ) + for blk in blocks + ] ) if caller == "fillna": if len(nbs) != len(blocks) or not all( @@ -621,6 +641,7 @@ def convert( *, copy: bool = True, using_cow: bool = False, + convert_string: bool = True, ) -> list[Block]: """ Attempt to coerce any object types to better types. 
Return a copy @@ -633,7 +654,10 @@ def convert( if self.ndim != 1 and self.shape[0] != 1: blocks = self.split_and_operate( - Block.convert, copy=copy, using_cow=using_cow + Block.convert, + copy=copy, + using_cow=using_cow, + convert_string=convert_string, ) if all(blk.dtype.kind == "O" for blk in blocks): # Avoid fragmenting the block if convert is a no-op @@ -651,10 +675,16 @@ def convert( res_values = lib.maybe_convert_objects( values, # type: ignore[arg-type] convert_non_numeric=True, + convert_string=convert_string, ) refs = None - if copy and res_values is values: - res_values = values.copy() + if ( + copy + and res_values is values + or isinstance(res_values, NumpyExtensionArray) + and res_values._ndarray is values + ): + res_values = res_values.copy() elif res_values is values: refs = self.refs @@ -831,6 +861,7 @@ def replace( mask: npt.NDArray[np.bool_] | None = None, using_cow: bool = False, already_warned=None, + convert_string=None, ) -> list[Block]: """ replace the to_replace value with value, possible to create new @@ -870,7 +901,7 @@ def replace( else: return [self] if inplace else [self.copy()] - elif self._can_hold_element(value): + elif self._can_hold_element(value) or (self.dtype == "string" and is_re(value)): # TODO(CoW): Maybe split here as well into columns where mask has True # and rest? blk = self._maybe_copy(using_cow, inplace) @@ -895,7 +926,11 @@ def replace( if get_option("future.no_silent_downcasting") is True: blocks = [blk] else: - blocks = blk.convert(copy=False, using_cow=using_cow) + blocks = blk.convert( + copy=False, + using_cow=using_cow, + convert_string=convert_string or self.dtype == "string", + ) if len(blocks) > 1 or blocks[0].dtype != blk.dtype: warnings.warn( # GH#54710 @@ -917,12 +952,14 @@ def replace( if value is None or value is NA: blk = self.astype(np.dtype(object)) else: - blk = self.coerce_to_target_dtype(value) + blk = self.coerce_to_target_dtype(value, using_cow=using_cow) return blk.replace( to_replace=to_replace, value=value, inplace=True, mask=mask, + using_cow=using_cow, + convert_string=convert_string, ) else: @@ -937,6 +974,7 @@ def replace( inplace=True, mask=mask[i : i + 1], using_cow=using_cow, + convert_string=convert_string, ) ) return blocks @@ -949,6 +987,7 @@ def _replace_regex( inplace: bool = False, mask=None, using_cow: bool = False, + convert_string=None, already_warned=None, ) -> list[Block]: """ @@ -971,16 +1010,26 @@ def _replace_regex( ------- List[Block] """ - if not self._can_hold_element(to_replace): + if not is_re(to_replace) and not self._can_hold_element(to_replace): # i.e. 
only if self.is_object is True, but could in principle include a # String ExtensionBlock if using_cow: return [self.copy(deep=False)] return [self] if inplace else [self.copy()] - rx = re.compile(to_replace) + if is_re(to_replace) and self.dtype not in [object, "string"]: + # only object or string dtype can hold strings, and a regex object + # will only match strings + return [self.copy(deep=False)] + + if not ( + self._can_hold_element(value) or (self.dtype == "string" and is_re(value)) + ): + block = self.astype(np.dtype(object)) + else: + block = self._maybe_copy(using_cow, inplace) - block = self._maybe_copy(using_cow, inplace) + rx = re.compile(to_replace) replace_regex(block.values, rx, value, mask) @@ -998,9 +1047,19 @@ def _replace_regex( ) already_warned.warned_already = True - nbs = block.convert(copy=False, using_cow=using_cow) + nbs = block.convert( + copy=False, + using_cow=using_cow, + convert_string=convert_string or self.dtype == "string", + ) opt = get_option("future.no_silent_downcasting") - if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt: + if ( + len(nbs) > 1 + or ( + nbs[0].dtype != block.dtype + and not (self.dtype == "string" and nbs[0].dtype == "string") + ) + ) and not opt: warnings.warn( # GH#54710 "Downcasting behavior in `replace` is deprecated and " @@ -1037,9 +1096,13 @@ def replace_list( values._replace(to_replace=src_list, value=dest_list, inplace=True) return [blk] + convert_string = self.dtype == "string" + # Exclude anything that we know we won't contain pairs = [ - (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) + (x, y) + for x, y in zip(src_list, dest_list) + if (self._can_hold_element(x) or (self.dtype == "string" and is_re(x))) ] if not len(pairs): if using_cow: @@ -1119,6 +1182,7 @@ def replace_list( inplace=inplace, regex=regex, using_cow=using_cow, + convert_string=convert_string, ) if using_cow and i != src_len: @@ -1141,7 +1205,9 @@ def replace_list( nbs = [] for res_blk in result: converted = res_blk.convert( - copy=True and not using_cow, using_cow=using_cow + copy=True and not using_cow, + using_cow=using_cow, + convert_string=convert_string, ) if len(converted) > 1 or converted[0].dtype != res_blk.dtype: warnings.warn( @@ -1171,6 +1237,7 @@ def _replace_coerce( inplace: bool = True, regex: bool = False, using_cow: bool = False, + convert_string: bool = True, ) -> list[Block]: """ Replace value corresponding to the given boolean array with another @@ -1199,6 +1266,8 @@ def _replace_coerce( value, inplace=inplace, mask=mask, + using_cow=using_cow, + convert_string=convert_string, ) else: if value is None: @@ -1214,7 +1283,7 @@ def _replace_coerce( putmask_inplace(nb.values, mask, value) return [nb] if using_cow: - return [self] + return [self.copy(deep=False)] return [self] if inplace else [self.copy()] return self.replace( to_replace=to_replace, @@ -1222,6 +1291,7 @@ def _replace_coerce( inplace=inplace, mask=mask, using_cow=using_cow, + convert_string=convert_string, ) # --------------------------------------------------------------------- @@ -1421,7 +1491,14 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block: if isinstance(casted, np.ndarray) and casted.ndim == 1 and len(casted) == 1: # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 casted = casted[0, ...] - values[indexer] = casted + try: + values[indexer] = casted + except (TypeError, ValueError) as err: + if is_list_like(casted): + raise ValueError( + "setting an array element with a sequence." 
+ ) from err + raise return self def putmask( @@ -1669,7 +1746,7 @@ def fillna( return nbs if limit is not None: - mask[mask.cumsum(self.ndim - 1) > limit] = False + mask[mask.cumsum(self.values.ndim - 1) > limit] = False if inplace: nbs = self.putmask( @@ -2095,9 +2172,16 @@ def where( res_values = arr._where(cond, other).T except (ValueError, TypeError): if self.ndim == 1 or self.shape[0] == 1: - if isinstance(self.dtype, IntervalDtype): + if isinstance(self.dtype, (IntervalDtype, StringDtype)): # TestSetitemFloatIntervalWithIntIntervalValues blk = self.coerce_to_target_dtype(orig_other) + if ( + self.ndim == 2 + and isinstance(orig_cond, np.ndarray) + and orig_cond.ndim == 1 + and not is_1d_only_ea_dtype(blk.dtype) + ): + orig_cond = orig_cond[:, None] nbs = blk.where(orig_other, orig_cond, using_cow=using_cow) return self._maybe_downcast( nbs, downcast=_downcast, using_cow=using_cow, caller="where" @@ -2256,11 +2340,21 @@ def pad_or_backfill( ) -> list[Block]: values = self.values + kwargs: dict[str, Any] = {"method": method, "limit": limit} + if "limit_area" in inspect.signature(values._pad_or_backfill).parameters: + kwargs["limit_area"] = limit_area + elif limit_area is not None: + raise NotImplementedError( + f"{type(values).__name__} does not implement limit_area " + "(added in pandas 2.2). 3rd-party ExtensionArray authors " + "need to add this argument to _pad_or_backfill." + ) + if values.ndim == 2 and axis == 1: # NDArrayBackedExtensionArray.fillna assumes axis=0 - new_values = values.T._pad_or_backfill(method=method, limit=limit).T + new_values = values.T._pad_or_backfill(**kwargs).T else: - new_values = values._pad_or_backfill(method=method, limit=limit) + new_values = values._pad_or_backfill(**kwargs) return [self.make_block_same_class(new_values)] @@ -2287,7 +2381,7 @@ def fillna( using_cow: bool = False, already_warned=None, ) -> list[Block]: - if isinstance(self.dtype, IntervalDtype): + if isinstance(self.dtype, (IntervalDtype, StringDtype)): # Block.fillna handles coercion (test_fillna_interval) return super().fillna( value=value, diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 609d2c9a7a285..64fac5fcfcdc2 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -13,7 +13,7 @@ import numpy as np from numpy import ma -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib @@ -305,12 +305,12 @@ def ndarray_to_mgr( elif isinstance(values, (np.ndarray, ExtensionArray)): # drop subclass info - _copy = ( - copy_on_sanitize - if (dtype is None or astype_is_view(values.dtype, dtype)) - else False - ) - values = np.array(values, copy=_copy) + if copy_on_sanitize and (dtype is None or astype_is_view(values.dtype, dtype)): + # only force a copy now if copy=True was requested + # and a subsequent `astype` will not already result in a copy + values = np.array(values, copy=True, order="F") + else: + values = np.asarray(values) values = _ensure_2d(values) else: @@ -375,8 +375,8 @@ def ndarray_to_mgr( bp = BlockPlacement(slice(len(columns))) nb = new_block_2d(values, placement=bp, refs=refs) block_values = [nb] - elif dtype is None and values.dtype.kind == "U" and using_pyarrow_string_dtype(): - dtype = StringDtype(storage="pyarrow_numpy") + elif dtype is None and values.dtype.kind == "U" and using_string_dtype(): + dtype = StringDtype(na_value=np.nan) obj_columns = list(values) block_values = [ @@ -1042,8 +1042,9 @@ def 
convert(arr): if dtype is None: if arr.dtype == np.dtype("O"): # i.e. maybe_convert_objects didn't convert - arr = maybe_infer_to_datetimelike(arr) - if dtype_backend != "numpy" and arr.dtype == np.dtype("O"): + convert_to_nullable_dtype = dtype_backend != "numpy" + arr = maybe_infer_to_datetimelike(arr, convert_to_nullable_dtype) + if convert_to_nullable_dtype and arr.dtype == np.dtype("O"): new_dtype = StringDtype() arr_cls = new_dtype.construct_array_type() arr = arr_cls._from_sequence(arr, dtype=new_dtype) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 3719bf1f77f85..2e0e04717373f 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -12,7 +12,6 @@ cast, ) import warnings -import weakref import numpy as np @@ -282,8 +281,8 @@ def references_same_values(self, mgr: BaseBlockManager, blkno: int) -> bool: Checks if two blocks from two different block managers reference the same underlying values. """ - ref = weakref.ref(self.blocks[blkno]) - return ref in mgr.blocks[blkno].refs.referenced_blocks + blk = self.blocks[blkno] + return any(blk is ref() for ref in mgr.blocks[blkno].refs.referenced_blocks) def get_dtypes(self) -> npt.NDArray[np.object_]: dtypes = np.array([blk.dtype for blk in self.blocks], dtype=object) @@ -1683,6 +1682,8 @@ def as_array( na_value=na_value, copy=copy, ).reshape(blk.shape) + elif not copy: + arr = np.asarray(blk.values, dtype=dtype) else: arr = np.array(blk.values, dtype=dtype, copy=copy) diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py index 7bd4851425c3b..accbd92a91ed6 100644 --- a/pandas/core/methods/to_dict.py +++ b/pandas/core/methods/to_dict.py @@ -171,13 +171,9 @@ def to_dict( return into_c( ( k, - list( - map( - maybe_box_native, v.to_numpy(na_value=box_na_values[i]).tolist() - ) - ) + list(map(maybe_box_native, v.to_numpy(na_value=box_na_values[i]))) if i in object_dtype_indices_as_set - else v.to_numpy().tolist(), + else list(map(maybe_box_native, v.to_numpy())), ) for i, (k, v) in enumerate(df.items()) ) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index d275445983b6f..c016aab8ad074 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -3,10 +3,7 @@ """ from __future__ import annotations -from functools import ( - partial, - wraps, -) +from functools import wraps from typing import ( TYPE_CHECKING, Any, @@ -34,6 +31,7 @@ from pandas.core.dtypes.cast import infer_dtype_from from pandas.core.dtypes.common import ( is_array_like, + is_bool_dtype, is_numeric_dtype, is_numeric_v_string_like, is_object_dtype, @@ -103,21 +101,34 @@ def mask_missing(arr: ArrayLike, values_to_mask) -> npt.NDArray[np.bool_]: # GH 21977 mask = np.zeros(arr.shape, dtype=bool) - for x in nonna: - if is_numeric_v_string_like(arr, x): - # GH#29553 prevent numpy deprecation warnings - pass - else: - if potential_na: - new_mask = np.zeros(arr.shape, dtype=np.bool_) - new_mask[arr_mask] = arr[arr_mask] == x + if ( + is_numeric_dtype(arr.dtype) + and not is_bool_dtype(arr.dtype) + and is_bool_dtype(nonna.dtype) + ): + pass + elif ( + is_bool_dtype(arr.dtype) + and is_numeric_dtype(nonna.dtype) + and not is_bool_dtype(nonna.dtype) + ): + pass + else: + for x in nonna: + if is_numeric_v_string_like(arr, x): + # GH#29553 prevent numpy deprecation warnings + pass else: - new_mask = arr == x + if potential_na: + new_mask = np.zeros(arr.shape, dtype=np.bool_) + new_mask[arr_mask] = arr[arr_mask] == x + else: + new_mask = arr == x - if not isinstance(new_mask, 
np.ndarray): - # usually BooleanArray - new_mask = new_mask.to_numpy(dtype=bool, na_value=False) - mask |= new_mask + if not isinstance(new_mask, np.ndarray): + # usually BooleanArray + new_mask = new_mask.to_numpy(dtype=bool, na_value=False) + mask |= new_mask if na_mask.any(): mask |= isna(arr) @@ -338,6 +349,7 @@ def interpolate_2d_inplace( limit_direction: str = "forward", limit_area: str | None = None, fill_value: Any | None = None, + mask=None, **kwargs, ) -> None: """ @@ -385,6 +397,7 @@ def func(yvalues: np.ndarray) -> None: limit_area=limit_area_validated, fill_value=fill_value, bounds_error=False, + mask=mask, **kwargs, ) @@ -429,6 +442,7 @@ def _interpolate_1d( fill_value: Any | None = None, bounds_error: bool = False, order: int | None = None, + mask=None, **kwargs, ) -> None: """ @@ -442,8 +456,10 @@ def _interpolate_1d( ----- Fills 'yvalues' in-place. """ - - invalid = isna(yvalues) + if mask is not None: + invalid = mask + else: + invalid = isna(yvalues) valid = ~invalid if not valid.any(): @@ -520,7 +536,10 @@ def _interpolate_1d( **kwargs, ) - if is_datetimelike: + if mask is not None: + mask[:] = False + mask[preserve_nans] = True + elif is_datetimelike: yvalues[preserve_nans] = NaT.value else: yvalues[preserve_nans] = np.nan @@ -823,6 +842,7 @@ def _interpolate_with_limit_area( values, method=method, limit=limit, + limit_area=limit_area, ) if limit_area == "inside": @@ -863,27 +883,6 @@ def pad_or_backfill_inplace( ----- Modifies values in-place. """ - if limit_area is not None: - np.apply_along_axis( - # error: Argument 1 to "apply_along_axis" has incompatible type - # "partial[None]"; expected - # "Callable[..., Union[_SupportsArray[dtype[]], - # Sequence[_SupportsArray[dtype[]]], - # Sequence[Sequence[_SupportsArray[dtype[]]]], - # Sequence[Sequence[Sequence[_SupportsArray[dtype[]]]]], - # Sequence[Sequence[Sequence[Sequence[_ - # SupportsArray[dtype[]]]]]]]]" - partial( # type: ignore[arg-type] - _interpolate_with_limit_area, - method=method, - limit=limit, - limit_area=limit_area, - ), - axis, - values, - ) - return - transf = (lambda x: x) if axis == 0 else (lambda x: x.T) # reshape a 1 dim if needed @@ -897,8 +896,7 @@ def pad_or_backfill_inplace( func = get_fill_func(method, ndim=2) # _pad_2d and _backfill_2d both modify tvalues inplace - func(tvalues, limit=limit) - return + func(tvalues, limit=limit, limit_area=limit_area) def _fillna_prep( @@ -909,7 +907,6 @@ def _fillna_prep( if mask is None: mask = isna(values) - mask = mask.view(np.uint8) return mask @@ -919,16 +916,23 @@ def _datetimelike_compat(func: F) -> F: """ @wraps(func) - def new_func(values, limit: int | None = None, mask=None): + def new_func( + values, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + mask=None, + ): if needs_i8_conversion(values.dtype): if mask is None: # This needs to occur before casting to int64 mask = isna(values) - result, mask = func(values.view("i8"), limit=limit, mask=mask) + result, mask = func( + values.view("i8"), limit=limit, limit_area=limit_area, mask=mask + ) return result.view(values.dtype), mask - return func(values, limit=limit, mask=mask) + return func(values, limit=limit, limit_area=limit_area, mask=mask) return cast(F, new_func) @@ -937,9 +941,12 @@ def new_func(values, limit: int | None = None, mask=None): def _pad_1d( values: np.ndarray, limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, mask: npt.NDArray[np.bool_] | None = None, ) -> tuple[np.ndarray, npt.NDArray[np.bool_]]: mask = 
_fillna_prep(values, mask) + if limit_area is not None and not mask.all(): + _fill_limit_area_1d(mask, limit_area) algos.pad_inplace(values, mask, limit=limit) return values, mask @@ -948,9 +955,12 @@ def _pad_1d( def _backfill_1d( values: np.ndarray, limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, mask: npt.NDArray[np.bool_] | None = None, ) -> tuple[np.ndarray, npt.NDArray[np.bool_]]: mask = _fillna_prep(values, mask) + if limit_area is not None and not mask.all(): + _fill_limit_area_1d(mask, limit_area) algos.backfill_inplace(values, mask, limit=limit) return values, mask @@ -959,9 +969,12 @@ def _backfill_1d( def _pad_2d( values: np.ndarray, limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, mask: npt.NDArray[np.bool_] | None = None, ): mask = _fillna_prep(values, mask) + if limit_area is not None: + _fill_limit_area_2d(mask, limit_area) if values.size: algos.pad_2d_inplace(values, mask, limit=limit) @@ -973,9 +986,14 @@ def _pad_2d( @_datetimelike_compat def _backfill_2d( - values, limit: int | None = None, mask: npt.NDArray[np.bool_] | None = None + values, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + mask: npt.NDArray[np.bool_] | None = None, ): mask = _fillna_prep(values, mask) + if limit_area is not None: + _fill_limit_area_2d(mask, limit_area) if values.size: algos.backfill_2d_inplace(values, mask, limit=limit) @@ -985,6 +1003,63 @@ def _backfill_2d( return values, mask +def _fill_limit_area_1d( + mask: npt.NDArray[np.bool_], limit_area: Literal["outside", "inside"] +) -> None: + """Prepare 1d mask for ffill/bfill with limit_area. + + Caller is responsible for checking at least one value of mask is False. + When called, mask will no longer faithfully represent when + the corresponding values are NA or not. + + Parameters + ---------- + mask : np.ndarray[bool, ndim=1] + Mask representing NA values when filling. + limit_area : { "outside", "inside" } + Whether to limit filling to outside or inside the outermost non-NA value. + """ + neg_mask = ~mask + first = neg_mask.argmax() + last = len(neg_mask) - neg_mask[::-1].argmax() - 1 + if limit_area == "inside": + mask[:first] = False + mask[last + 1 :] = False + elif limit_area == "outside": + mask[first + 1 : last] = False + + +def _fill_limit_area_2d( + mask: npt.NDArray[np.bool_], limit_area: Literal["outside", "inside"] +) -> None: + """Prepare 2d mask for ffill/bfill with limit_area. + + When called, mask will no longer faithfully represent when + the corresponding values are NA or not. + + Parameters + ---------- + mask : np.ndarray[bool, ndim=2] + Mask representing NA values when filling. + limit_area : { "outside", "inside" } + Whether to limit filling to outside or inside the outermost non-NA value. 
+ """ + neg_mask = ~mask.T + if limit_area == "outside": + # Identify inside + la_mask = ( + np.maximum.accumulate(neg_mask, axis=0) + & np.maximum.accumulate(neg_mask[::-1], axis=0)[::-1] + ) + else: + # Identify outside + la_mask = ( + ~np.maximum.accumulate(neg_mask, axis=0) + | ~np.maximum.accumulate(neg_mask[::-1], axis=0)[::-1] + ) + mask[la_mask.T] = False + + _fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d} diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 48a5f85e1c388..229595202cccb 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1306,12 +1306,15 @@ def first( self, numeric_only: bool = False, min_count: int = 0, + skipna: bool = True, *args, **kwargs, ): maybe_warn_args_and_kwargs(type(self), "first", args, kwargs) nv.validate_resampler_func("first", args, kwargs) - return self._downsample("first", numeric_only=numeric_only, min_count=min_count) + return self._downsample( + "first", numeric_only=numeric_only, min_count=min_count, skipna=skipna + ) @final @doc(GroupBy.last) @@ -1319,12 +1322,15 @@ def last( self, numeric_only: bool = False, min_count: int = 0, + skipna: bool = True, *args, **kwargs, ): maybe_warn_args_and_kwargs(type(self), "last", args, kwargs) nv.validate_resampler_func("last", args, kwargs) - return self._downsample("last", numeric_only=numeric_only, min_count=min_count) + return self._downsample( + "last", numeric_only=numeric_only, min_count=min_count, skipna=skipna + ) @final @doc(GroupBy.median) @@ -2542,7 +2548,8 @@ def _take_new_index( if axis == 1: raise NotImplementedError("axis 1 is not supported") new_mgr = obj._mgr.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1) - return obj._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + # error: Incompatible return value type (got "DataFrame", expected "NDFrameT") + return obj._constructor_from_mgr(new_mgr, axes=new_mgr.axes) # type: ignore[return-value] else: raise ValueError("'obj' should be either a Series or a DataFrame") diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index aacea92611697..dc18bb65b35bc 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -205,8 +205,10 @@ def concat( Check whether the new concatenated axis contains duplicates. This can be very expensive relative to the actual data concatenation. sort : bool, default False - Sort non-concatenation axis if it is not already aligned. - + Sort non-concatenation axis if it is not already aligned. One exception to + this is when the non-concatentation axis is a DatetimeIndex and join='outer' + and the axis is not already aligned. In that case, the non-concatenation + axis is always sorted lexicographically. copy : bool, default True If False, do not copy data unnecessarily. 
diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 3ed67bb7b7c02..85c10f1166577 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -13,6 +13,7 @@ import numpy as np +from pandas._libs import missing as libmissing from pandas._libs.sparse import IntIndex from pandas.core.dtypes.common import ( @@ -260,7 +261,7 @@ def _get_dummies_1d( dtype = ArrowDtype(pa.bool_()) # type: ignore[assignment] elif ( isinstance(input_dtype, StringDtype) - and input_dtype.storage != "pyarrow_numpy" + and input_dtype.na_value is libmissing.NA ): dtype = pandas_dtype("boolean") # type: ignore[assignment] else: diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index bb1cd0d738dac..e54f847895f1a 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -458,8 +458,7 @@ def wide_to_long( def get_var_names(df, stub: str, sep: str, suffix: str): regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$" - pattern = re.compile(regex) - return df.columns[df.columns.str.match(pattern)] + return df.columns[df.columns.str.match(regex)] def melt_stub(df, stub: str, i, j, value_vars, sep: str): newdf = melt( diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 690e3c2700c6c..dc2df25c3f786 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1526,6 +1526,11 @@ def _maybe_coerce_merge_keys(self) -> None: ) or (lk.dtype.kind == "M" and rk.dtype.kind == "M"): # allows datetime with different resolutions continue + # datetime and timedelta not allowed + elif lk.dtype.kind == "M" and rk.dtype.kind == "m": + raise ValueError(msg) + elif lk.dtype.kind == "m" and rk.dtype.kind == "M": + raise ValueError(msg) elif is_object_dtype(lk.dtype) and is_object_dtype(rk.dtype): continue @@ -1925,10 +1930,9 @@ def get_result(self, copy: bool | None = True) -> DataFrame: if self.fill_method == "ffill": if left_indexer is None: - raise TypeError("left_indexer cannot be None") - left_indexer = cast("npt.NDArray[np.intp]", left_indexer) - right_indexer = cast("npt.NDArray[np.intp]", right_indexer) - left_join_indexer = libjoin.ffill_indexer(left_indexer) + left_join_indexer = None + else: + left_join_indexer = libjoin.ffill_indexer(left_indexer) if right_indexer is None: right_join_indexer = None else: @@ -2469,8 +2473,7 @@ def _factorize_keys( elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( - isinstance(lk.dtype, StringDtype) - and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"] + isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow" ): import pyarrow as pa import pyarrow.compute as pc @@ -2483,18 +2486,30 @@ def _factorize_keys( .combine_chunks() .dictionary_encode() ) - length = len(dc.dictionary) llab, rlab, count = ( - pc.fill_null(dc.indices[slice(len_lk)], length) + pc.fill_null(dc.indices[slice(len_lk)], -1) .to_numpy() .astype(np.intp, copy=False), - pc.fill_null(dc.indices[slice(len_lk, None)], length) + pc.fill_null(dc.indices[slice(len_lk, None)], -1) .to_numpy() .astype(np.intp, copy=False), len(dc.dictionary), ) + + if sort: + uniques = dc.dictionary.to_numpy(zero_copy_only=False) + llab, rlab = _sort_labels(uniques, llab, rlab) + if dc.null_count > 0: + lmask = llab == -1 + lany = lmask.any() + rmask = rlab == -1 + rany = rmask.any() + if lany: + np.putmask(llab, lmask, count) + if rany: + np.putmask(rlab, rmask, count) count += 1 return llab, rlab, count diff --git 
a/pandas/core/series.py b/pandas/core/series.py index e3b401cd3c88b..4e2e363885594 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -67,6 +67,9 @@ from pandas.core.dtypes.astype import astype_is_view from pandas.core.dtypes.cast import ( LossySetitemError, + construct_1d_arraylike_from_scalar, + find_common_type, + infer_dtype_from, maybe_box_native, maybe_cast_pointwise_result, ) @@ -83,8 +86,12 @@ from pandas.core.dtypes.dtypes import ( CategoricalDtype, ExtensionDtype, + SparseDtype, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, ) -from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import ( isna, @@ -113,6 +120,7 @@ from pandas.core.arrays.sparse import SparseAccessor from pandas.core.arrays.string_ import StringDtype from pandas.core.construction import ( + array as pd_array, extract_array, sanitize_array, ) @@ -525,7 +533,7 @@ def __init__( data = data.reindex(index, copy=copy) copy = False data = data._mgr - elif is_dict_like(data): + elif isinstance(data, Mapping): data, index = self._init_dict(data, index, dtype) dtype = None copy = False @@ -597,7 +605,7 @@ def __init__( ) def _init_dict( - self, data, index: Index | None = None, dtype: DtypeObj | None = None + self, data: Mapping, index: Index | None = None, dtype: DtypeObj | None = None ): """ Derive the "_mgr" and "index" attributes of a new Series from a @@ -654,14 +662,17 @@ def _constructor(self) -> Callable[..., Series]: return Series def _constructor_from_mgr(self, mgr, axes): - if self._constructor is Series: - # we are pandas.Series (or a subclass that doesn't override _constructor) - ser = Series._from_mgr(mgr, axes=axes) - ser._name = None # caller is responsible for setting real name + ser = Series._from_mgr(mgr, axes=axes) + ser._name = None # caller is responsible for setting real name + + if type(self) is Series: + # This would also work `if self._constructor is Series`, but + # this check is slightly faster, benefiting the most-common case. return ser - else: - assert axes is mgr.axes - return self._constructor(mgr) + + # We assume that the subclass __init__ knows how to handle a + # pd.Series object. + return self._constructor(ser) @property def _constructor_expanddim(self) -> Callable[..., DataFrame]: @@ -673,18 +684,19 @@ def _constructor_expanddim(self) -> Callable[..., DataFrame]: return DataFrame - def _expanddim_from_mgr(self, mgr, axes) -> DataFrame: + def _constructor_expanddim_from_mgr(self, mgr, axes): from pandas.core.frame import DataFrame - return DataFrame._from_mgr(mgr, axes=mgr.axes) + df = DataFrame._from_mgr(mgr, axes=mgr.axes) - def _constructor_expanddim_from_mgr(self, mgr, axes): - from pandas.core.frame import DataFrame + if type(self) is Series: + # This would also work `if self._constructor_expanddim is DataFrame`, + # but this check is slightly faster, benefiting the most-common case. + return df - if self._constructor_expanddim is DataFrame: - return self._expanddim_from_mgr(mgr, axes) - assert axes is mgr.axes - return self._constructor_expanddim(mgr) + # We assume that the subclass __init__ knows how to handle a + # pd.DataFrame object. 
+ return self._constructor_expanddim(df) # types @property @@ -963,7 +975,9 @@ def view(self, dtype: Dtype | None = None) -> Series: # ---------------------------------------------------------------------- # NDArray Compat - def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: + def __array__( + self, dtype: npt.DTypeLike | None = None, copy: bool | None = None + ) -> np.ndarray: """ Return the values as a NumPy array. @@ -976,6 +990,9 @@ def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: The dtype to use for the resulting NumPy array. By default, the dtype is inferred from the data. + copy : bool or None, optional + See :func:`numpy.asarray`. + Returns ------- numpy.ndarray @@ -1011,8 +1028,17 @@ def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: dtype='datetime64[ns]') """ values = self._values - arr = np.asarray(values, dtype=dtype) - if using_copy_on_write() and astype_is_view(values.dtype, arr.dtype): + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + arr = np.asarray(values, dtype=dtype) + else: + arr = np.array(values, dtype=dtype, copy=copy) + + if copy is True: + return arr + if using_copy_on_write() and ( + copy is False or astype_is_view(values.dtype, arr.dtype) + ): arr = arr.view() arr.flags.writeable = False return arr @@ -2788,13 +2814,11 @@ def round(self, decimals: int = 0, *args, **kwargs) -> Series: dtype: float64 """ nv.validate_round(args, kwargs) - result = self._values.round(decimals) - result = self._constructor(result, index=self.index, copy=False).__finalize__( + new_mgr = self._mgr.round(decimals=decimals, using_cow=using_copy_on_write()) + return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__( self, method="round" ) - return result - @overload def quantile( self, q: float = ..., interpolation: QuantileInterpolation = ... @@ -3505,6 +3529,13 @@ def combine_first(self, other) -> Series: """ from pandas.core.reshape.concat import concat + if self.dtype == other.dtype: + if self.index.equals(other.index): + return self.mask(self.isna(), other) + elif self._can_hold_na and not isinstance(self.dtype, SparseDtype): + this, other = self.align(other, join="outer") + return this.mask(this.isna(), other) + new_index = self.index.union(other.index) this = self @@ -4061,6 +4092,7 @@ def argsort( axis: Axis = 0, kind: SortKind = "quicksort", order: None = None, + stable: None = None, ) -> Series: """ Return the integer indices that would sort the Series values. @@ -4077,6 +4109,8 @@ def argsort( information. 'mergesort' and 'stable' are the only stable algorithms. order : None Has no effect but is accepted for compatibility with numpy. + stable : None + Has no effect but is accepted for compatibility with numpy. Returns ------- @@ -5629,6 +5663,121 @@ def between( return lmask & rmask + def case_when( + self, + caselist: list[ + tuple[ + ArrayLike | Callable[[Series], Series | np.ndarray | Sequence[bool]], + ArrayLike | Scalar | Callable[[Series], Series | np.ndarray], + ], + ], + ) -> Series: + """ + Replace values where the conditions are True. + + Parameters + ---------- + caselist : A list of tuples of conditions and expected replacements + Takes the form: ``(condition0, replacement0)``, + ``(condition1, replacement1)``, ... . + ``condition`` should be a 1-D boolean array-like object + or a callable. If ``condition`` is a callable, + it is computed on the Series + and should return a boolean Series or array. 
+ The callable must not change the input Series + (though pandas doesn't check it). ``replacement`` should be a + 1-D array-like object, a scalar or a callable. + If ``replacement`` is a callable, it is computed on the Series + and should return a scalar or Series. The callable + must not change the input Series + (though pandas doesn't check it). + + .. versionadded:: 2.2.0 + + Returns + ------- + Series + + See Also + -------- + Series.mask : Replace values where the condition is True. + + Examples + -------- + >>> c = pd.Series([6, 7, 8, 9], name='c') + >>> a = pd.Series([0, 0, 1, 2]) + >>> b = pd.Series([0, 3, 4, 5]) + + >>> c.case_when(caselist=[(a.gt(0), a), # condition, replacement + ... (b.gt(0), b)]) + 0 6 + 1 3 + 2 1 + 3 2 + Name: c, dtype: int64 + """ + if not isinstance(caselist, list): + raise TypeError( + f"The caselist argument should be a list; instead got {type(caselist)}" + ) + + if not caselist: + raise ValueError( + "provide at least one boolean condition, " + "with a corresponding replacement." + ) + + for num, entry in enumerate(caselist): + if not isinstance(entry, tuple): + raise TypeError( + f"Argument {num} must be a tuple; instead got {type(entry)}." + ) + if len(entry) != 2: + raise ValueError( + f"Argument {num} must have length 2; " + "a condition and replacement; " + f"instead got length {len(entry)}." + ) + caselist = [ + ( + com.apply_if_callable(condition, self), + com.apply_if_callable(replacement, self), + ) + for condition, replacement in caselist + ] + default = self.copy() + conditions, replacements = zip(*caselist) + common_dtypes = [infer_dtype_from(arg)[0] for arg in [*replacements, default]] + if len(set(common_dtypes)) > 1: + common_dtype = find_common_type(common_dtypes) + updated_replacements = [] + for condition, replacement in zip(conditions, replacements): + if is_scalar(replacement): + replacement = construct_1d_arraylike_from_scalar( + value=replacement, length=len(condition), dtype=common_dtype + ) + elif isinstance(replacement, ABCSeries): + replacement = replacement.astype(common_dtype) + else: + replacement = pd_array(replacement, dtype=common_dtype) + updated_replacements.append(replacement) + replacements = updated_replacements + default = default.astype(common_dtype) + + counter = reversed(range(len(conditions))) + for position, condition, replacement in zip( + counter, conditions[::-1], replacements[::-1] + ): + try: + default = default.mask( + condition, other=replacement, axis=0, inplace=False, level=None + ) + except Exception as error: + raise ValueError( + f"Failed to apply condition{position} and replacement{position}." + ) from error + return default + # error: Cannot determine type of 'isna' @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) # type: ignore[has-type] def isna(self) -> Series: diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 1b7d632c0fa80..c0e458f7968e7 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -13,6 +13,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs import lib from pandas._typing import ( AlignJoin, @@ -31,6 +33,7 @@ is_list_like, is_object_dtype, is_re, + is_string_dtype, ) from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -387,7 +390,9 @@ def cons_row(x): # This is a mess. 
_dtype: DtypeObj | str | None = dtype vdtype = getattr(result, "dtype", None) - if self._is_string: + if _dtype is not None: + pass + elif self._is_string: if is_bool_dtype(vdtype): _dtype = result.dtype elif returns_string: @@ -1199,7 +1204,12 @@ def join(self, sep: str): @forbid_nonstring_types(["bytes"]) def contains( - self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True + self, + pat, + case: bool = True, + flags: int = 0, + na=lib.no_default, + regex: bool = True, ): r""" Test if pattern or regex is contained within a string of a Series or Index. @@ -1217,8 +1227,9 @@ def contains( Flags to pass through to the re module, e.g. re.IGNORECASE. na : scalar, optional Fill value for missing values. The default depends on dtype of the - array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, - ``pandas.NA`` is used. + array. For object-dtype, ``numpy.nan`` is used. For the nullable + ``StringDtype``, ``pandas.NA`` is used. For the ``"str"`` dtype, + ``False`` is used. regex : bool, default True If True, assumes the pat is a regular expression. @@ -1336,22 +1347,23 @@ def contains( return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) - def match(self, pat, case: bool = True, flags: int = 0, na=None): + def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default): """ Determine if each string starts with a match of a regular expression. Parameters ---------- pat : str - Character sequence or regular expression. + Character sequence. case : bool, default True If True, case sensitive. flags : int, default 0 (no flags) Regex module flags, e.g. re.IGNORECASE. na : scalar, optional Fill value for missing values. The default depends on dtype of the - array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, - ``pandas.NA`` is used. + array. For object-dtype, ``numpy.nan`` is used. For the nullable + ``StringDtype``, ``pandas.NA`` is used. For the ``"str"`` dtype, + ``False`` is used. Returns ------- @@ -1377,7 +1389,7 @@ def match(self, pat, case: bool = True, flags: int = 0, na=None): return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) - def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None): + def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default): """ Determine if each string entirely matches a regular expression. @@ -1391,8 +1403,9 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None): Regex module flags, e.g. re.IGNORECASE. na : scalar, optional Fill value for missing values. The default depends on dtype of the - array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, - ``pandas.NA`` is used. + array. For object-dtype, ``numpy.nan`` is used. For the nullable + ``StringDtype``, ``pandas.NA`` is used. For the ``"str"`` dtype, + ``False`` is used. Returns ------- @@ -1969,7 +1982,9 @@ def slice_replace(self, start=None, stop=None, repl=None): result = self._data.array._str_slice_replace(start, stop, repl) return self._wrap_result(result) - def decode(self, encoding, errors: str = "strict"): + def decode( + self, encoding, errors: str = "strict", dtype: str | DtypeObj | None = None + ): """ Decode character string in the Series/Index using indicated encoding. @@ -1980,6 +1995,14 @@ def decode(self, encoding, errors: str = "strict"): ---------- encoding : str errors : str, optional + Specifies the error handling scheme. 
+ Possible values are those supported by :meth:`bytes.decode`. + dtype : str or dtype, optional + The dtype of the result. When not ``None``, must be either a string or + object dtype. When ``None``, the dtype of the result is determined by + ``pd.options.future.infer_string``. + + .. versionadded:: 2.3.0 Returns ------- @@ -1996,6 +2019,10 @@ def decode(self, encoding, errors: str = "strict"): 2 () dtype: object """ + if dtype is not None and not is_string_dtype(dtype): + raise ValueError(f"dtype must be string or object, got {dtype=}") + if dtype is None and get_option("future.infer_string"): + dtype = "str" # TODO: Add a similar _bytes interface. if encoding in _cpython_optimized_decoders: # CPython optimized implementation @@ -2004,9 +2031,8 @@ def decode(self, encoding, errors: str = "strict"): decoder = codecs.getdecoder(encoding) f = lambda x: decoder(x, errors)[0] arr = self._data.array - # assert isinstance(arr, (StringArray,)) result = arr._str_map(f) - return self._wrap_result(result) + return self._wrap_result(result, dtype=dtype) @forbid_nonstring_types(["bytes"]) def encode(self, encoding, errors: str = "strict"): @@ -2415,7 +2441,7 @@ def count(self, pat, flags: int = 0): @forbid_nonstring_types(["bytes"]) def startswith( - self, pat: str | tuple[str, ...], na: Scalar | None = None + self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default ) -> Series | Index: """ Test if the start of each string element matches a pattern. @@ -2427,10 +2453,11 @@ def startswith( pat : str or tuple[str, ...] Character sequence or tuple of strings. Regular expressions are not accepted. - na : object, default NaN + na : scalar, optional Object shown if element tested is not a string. The default depends on dtype of the array. For object-dtype, ``numpy.nan`` is used. - For ``StringDtype``, ``pandas.NA`` is used. + For the nullable ``StringDtype``, ``pandas.NA`` is used. + For the ``"str"`` dtype, ``False`` is used. Returns ------- @@ -2485,7 +2512,7 @@ def startswith( @forbid_nonstring_types(["bytes"]) def endswith( - self, pat: str | tuple[str, ...], na: Scalar | None = None + self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default ) -> Series | Index: """ Test if the end of each string element matches a pattern. @@ -2497,10 +2524,11 @@ def endswith( pat : str or tuple[str, ...] Character sequence or tuple of strings. Regular expressions are not accepted. - na : object, default NaN + na : scalar, optional Object shown if element tested is not a string. The default depends on dtype of the array. For object-dtype, ``numpy.nan`` is used. - For ``StringDtype``, ``pandas.NA`` is used. + For the nullable ``StringDtype``, ``pandas.NA`` is used. + For the ``"str"`` dtype, ``False`` is used. 
Returns ------- diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index 96b0352666b41..316c86d152db3 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -7,7 +7,7 @@ Literal, ) -import numpy as np +from pandas._libs import lib if TYPE_CHECKING: from collections.abc import Sequence @@ -85,7 +85,11 @@ def _str_repeat(self, repeats: int | Sequence[int]): @abc.abstractmethod def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar = np.nan + self, + pat: str, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, ): pass @@ -95,7 +99,7 @@ def _str_fullmatch( pat: str | re.Pattern, case: bool = True, flags: int = 0, - na: Scalar = np.nan, + na: Scalar | lib.NoDefault = lib.no_default, ): pass diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 0029beccc40a8..e82c6c20e86d9 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -10,12 +10,14 @@ cast, ) import unicodedata +import warnings import numpy as np from pandas._libs import lib import pandas._libs.missing as libmissing import pandas._libs.ops as libops +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.missing import isna @@ -37,14 +39,16 @@ class ObjectStringArrayMixin(BaseStringArrayMethods): String Methods operating on object-dtype ndarrays. """ - _str_na_value = np.nan - def __len__(self) -> int: # For typing, _str_map relies on the object being sized. raise NotImplementedError def _str_map( - self, f, na_value=None, dtype: NpDtype | None = None, convert: bool = True + self, + f, + na_value=lib.no_default, + dtype: NpDtype | None = None, + convert: bool = True, ): """ Map a callable over valid elements of the array. @@ -56,7 +60,7 @@ def _str_map( na_value : Scalar, optional The value to set for NA values. Might also be used for the fill value if the callable `f` raises an exception. - This defaults to ``self._str_na_value`` which is ``np.nan`` + This defaults to ``self.dtype.na_value`` which is ``np.nan`` for object-dtype and Categorical and ``pd.NA`` for StringArray. dtype : Dtype, optional The dtype of the result array. 
@@ -65,8 +69,8 @@ def _str_map( """ if dtype is None: dtype = np.dtype("object") - if na_value is None: - na_value = self._str_na_value + if na_value is lib.no_default: + na_value = self.dtype.na_value # type: ignore[attr-defined] if not len(self): return np.array([], dtype=dtype) @@ -127,7 +131,12 @@ def _str_pad( return self._str_map(f) def _str_contains( - self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True + self, + pat, + case: bool = True, + flags: int = 0, + na=lib.no_default, + regex: bool = True, ): if regex: if not case: @@ -142,14 +151,38 @@ def _str_contains( else: upper_pat = pat.upper() f = lambda x: upper_pat in x.upper() + if na is not lib.no_default and not isna(na) and not isinstance(na, bool): + # GH#59561 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.contains is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._str_map(f, na, dtype=np.dtype("bool")) - def _str_startswith(self, pat, na=None): + def _str_startswith(self, pat, na=lib.no_default): f = lambda x: x.startswith(pat) + if na is not lib.no_default and not isna(na) and not isinstance(na, bool): + # GH#59561 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.startswith is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._str_map(f, na_value=na, dtype=np.dtype(bool)) - def _str_endswith(self, pat, na=None): + def _str_endswith(self, pat, na=lib.no_default): f = lambda x: x.endswith(pat) + if na is not lib.no_default and not isna(na) and not isinstance(na, bool): + # GH#59561 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.endswith is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._str_map(f, na_value=na, dtype=np.dtype(bool)) def _str_replace( @@ -211,7 +244,11 @@ def rep(x, r): return result def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None + self, + pat: str, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, ): if not case: flags |= re.IGNORECASE @@ -226,7 +263,7 @@ def _str_fullmatch( pat: str | re.Pattern, case: bool = True, flags: int = 0, - na: Scalar | None = None, + na: Scalar | lib.NoDefault = lib.no_default, ): if not case: flags |= re.IGNORECASE @@ -270,7 +307,7 @@ def f(x): return x.get(i) elif len(x) > i >= -len(x): return x[i] - return self._str_na_value + return self.dtype.na_value # type: ignore[attr-defined] return self._str_map(f) @@ -473,7 +510,7 @@ def _str_removesuffix(self, suffix: str) -> Series: def _str_extract(self, pat: str, flags: int = 0, expand: bool = True): regex = re.compile(pat, flags=flags) - na_value = self._str_na_value + na_value = self.dtype.na_value # type: ignore[attr-defined] if not expand: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 05262c235568d..8f700cfa63132 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -16,6 +16,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas._libs import ( lib, tslib, @@ -476,6 +478,9 @@ def _array_strptime_with_fallback( unit = np.datetime_data(result.dtype)[0] res = Index(result, dtype=f"M8[{unit}, UTC]", name=name) return res + elif using_string_dtype() and result.dtype == object: + if lib.is_string_array(result): + return Index(result, dtype="str", name=name) return Index(result, dtype=result.dtype, 
name=name) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 09652a7d8bc92..ca703e0362611 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -8,7 +8,10 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import ( + lib, + missing as libmissing, +) from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend @@ -235,7 +238,7 @@ def to_numeric( coerce_numeric=coerce_numeric, convert_to_masked_nullable=dtype_backend is not lib.no_default or isinstance(values_dtype, StringDtype) - and not values_dtype.storage == "pyarrow_numpy", + and values_dtype.na_value is libmissing.NA, ) except (ValueError, TypeError): if errors == "raise": @@ -250,7 +253,7 @@ def to_numeric( dtype_backend is not lib.no_default and new_mask is None or isinstance(values_dtype, StringDtype) - and not values_dtype.storage == "pyarrow_numpy" + and values_dtype.na_value is libmissing.NA ): new_mask = np.zeros(values.shape, dtype=np.bool_) diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py index b8d489179338b..4825c9fee24b1 100644 --- a/pandas/core/util/numba_.py +++ b/pandas/core/util/numba_.py @@ -1,11 +1,14 @@ """Common utilities for Numba operations""" from __future__ import annotations +import types from typing import ( TYPE_CHECKING, Callable, ) +import numpy as np + from pandas.compat._optional import import_optional_dependency from pandas.errors import NumbaUtilError @@ -83,6 +86,12 @@ def jit_user_function(func: Callable) -> Callable: if numba.extending.is_jitted(func): # Don't jit a user passed jitted function numba_func = func + elif getattr(np, func.__name__, False) is func or isinstance( + func, types.BuiltinFunctionType + ): + # Not necessary to jit builtins or np functions + # This will mess up register_jitable + numba_func = func else: numba_func = numba.extending.register_jitable(func) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index e78bd258c11ff..68cec16ec9eca 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -14,7 +14,6 @@ Any, Callable, Literal, - cast, ) import numpy as np @@ -39,6 +38,7 @@ is_numeric_dtype, needs_i8_conversion, ) +from pandas.core.dtypes.dtypes import ArrowDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -104,6 +104,7 @@ NDFrameT, QuantileInterpolation, WindowingRankType, + npt, ) from pandas import ( @@ -404,11 +405,12 @@ def _insert_on_column(self, result: DataFrame, obj: DataFrame) -> None: result[name] = extra_col @property - def _index_array(self): + def _index_array(self) -> npt.NDArray[np.int64] | None: # TODO: why do we get here with e.g. MultiIndex? 
- if needs_i8_conversion(self._on.dtype): - idx = cast("PeriodIndex | DatetimeIndex | TimedeltaIndex", self._on) - return idx.asi8 + if isinstance(self._on, (PeriodIndex, DatetimeIndex, TimedeltaIndex)): + return self._on.asi8 + elif isinstance(self._on.dtype, ArrowDtype) and self._on.dtype.kind in "mM": + return self._on.to_numpy(dtype=np.int64) return None def _resolve_output(self, out: DataFrame, obj: DataFrame) -> DataFrame: @@ -439,7 +441,7 @@ def _apply_series( self, homogeneous_func: Callable[..., ArrayLike], name: str | None = None ) -> Series: """ - Series version of _apply_blockwise + Series version of _apply_columnwise """ obj = self._create_data(self._selected_obj) @@ -455,7 +457,7 @@ def _apply_series( index = self._slice_axis_for_step(obj.index, result) return obj._constructor(result, index=index, name=obj.name) - def _apply_blockwise( + def _apply_columnwise( self, homogeneous_func: Callable[..., ArrayLike], name: str, @@ -614,7 +616,7 @@ def calc(x): return result if self.method == "single": - return self._apply_blockwise(homogeneous_func, name, numeric_only) + return self._apply_columnwise(homogeneous_func, name, numeric_only) else: return self._apply_tablewise(homogeneous_func, name, numeric_only) @@ -1232,7 +1234,9 @@ def calc(x): return result - return self._apply_blockwise(homogeneous_func, name, numeric_only)[:: self.step] + return self._apply_columnwise(homogeneous_func, name, numeric_only)[ + :: self.step + ] @doc( _shared_docs["aggregate"], @@ -1868,6 +1872,7 @@ def _validate(self): if ( self.obj.empty or isinstance(self._on, (DatetimeIndex, TimedeltaIndex, PeriodIndex)) + or (isinstance(self._on.dtype, ArrowDtype) and self._on.dtype.kind in "mM") ) and isinstance(self.window, (str, BaseOffset, timedelta)): self._validate_datetimelike_monotonic() diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 3b2ae5daffdba..35fdfb1a9ee82 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -1,11 +1,30 @@ from __future__ import annotations -from typing import Callable +from typing import ( + TYPE_CHECKING, + Literal, +) +import numpy as np + +from pandas._config import using_string_dtype + +from pandas._libs import lib +from pandas.compat import ( + pa_version_under18p0, + pa_version_under19p0, +) from pandas.compat._optional import import_optional_dependency import pandas as pd +if TYPE_CHECKING: + from collections.abc import Callable + + import pyarrow + + from pandas._typing import DtypeBackend + def _arrow_dtype_mapping() -> dict: pa = import_optional_dependency("pyarrow") @@ -22,13 +41,54 @@ def _arrow_dtype_mapping() -> dict: pa.string(): pd.StringDtype(), pa.float32(): pd.Float32Dtype(), pa.float64(): pd.Float64Dtype(), + pa.string(): pd.StringDtype(), + pa.large_string(): pd.StringDtype(), } -def arrow_string_types_mapper() -> Callable: +def _arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") - return { - pa.string(): pd.StringDtype(storage="pyarrow_numpy"), - pa.large_string(): pd.StringDtype(storage="pyarrow_numpy"), - }.get + mapping = { + pa.string(): pd.StringDtype(na_value=np.nan), + pa.large_string(): pd.StringDtype(na_value=np.nan), + } + if not pa_version_under18p0: + mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan) + + return mapping.get + + +def arrow_table_to_pandas( + table: pyarrow.Table, + dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default, + null_to_int64: bool = False, + to_pandas_kwargs: dict | None = None, +) -> pd.DataFrame: + if to_pandas_kwargs is None: + 
to_pandas_kwargs = {} + + pa = import_optional_dependency("pyarrow") + + types_mapper: type[pd.ArrowDtype] | None | Callable + if dtype_backend == "numpy_nullable": + mapping = _arrow_dtype_mapping() + if null_to_int64: + # Modify the default mapping to also map null to Int64 + # (to match other engines - only for CSV parser) + mapping[pa.null()] = pd.Int64Dtype() + types_mapper = mapping.get + elif dtype_backend == "pyarrow": + types_mapper = pd.ArrowDtype + elif using_string_dtype(): + if pa_version_under19p0: + types_mapper = _arrow_string_types_mapper() + else: + types_mapper = None + elif dtype_backend is lib.no_default or dtype_backend == "numpy": + types_mapper = None + else: + raise NotImplementedError + + df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs) + return df diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index bce890c6f73b0..786f719337b84 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -160,36 +160,24 @@ If converters are specified, they will be applied INSTEAD of dtype conversion. If you use ``None``, it will infer the dtype of each column based on the data. -engine : str, default None +engine : {{'openpyxl', 'calamine', 'odf', 'pyxlsb', 'xlrd'}}, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", "calamine". Engine compatibility : - - ``xlr`` supports old-style Excel files (.xls). - ``openpyxl`` supports newer Excel file formats. - - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). - - ``pyxlsb`` supports Binary Excel files. - ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb) and OpenDocument (.ods) file formats. + - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). + - ``pyxlsb`` supports Binary Excel files. + - ``xlrd`` supports old-style Excel files (.xls). - .. versionchanged:: 1.2.0 - The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ - now only supports old-style ``.xls`` files. - When ``engine=None``, the following logic will be - used to determine the engine: - - - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), - then `odf <https://pypi.org/project/odfpy/>`_ will be used. - - Otherwise if ``path_or_buffer`` is an xls format, - ``xlrd`` will be used. - - Otherwise if ``path_or_buffer`` is in xlsb format, - ``pyxlsb`` will be used. - - .. versionadded:: 1.3.0 - - Otherwise ``openpyxl`` will be used. - - .. versionchanged:: 1.3.0 + When ``engine=None``, the following logic will be used to determine the engine: + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), + then `odf <https://pypi.org/project/odfpy/>`_ will be used. + - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used. + - Otherwise if ``path_or_buffer`` is in xlsb format, ``pyxlsb`` will be used. + - Otherwise ``openpyxl`` will be used. converters : dict, default None Dict of functions for converting values in certain columns. Keys can either be integers or column labels, values are functions that take one
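A minimal usage sketch for the consolidated `arrow_table_to_pandas` helper above; note this is a private pandas helper, so the import path and behavior shown are illustrative assumptions rather than supported API:

import pyarrow as pa

from pandas.io._util import arrow_table_to_pandas

# dtype_backend="numpy_nullable" selects _arrow_dtype_mapping(), so integer
# nulls come back as pd.NA-backed Int64 instead of being cast to float NaN.
table = pa.table({"a": [1, None, 3], "b": ["x", "y", None]})
df = arrow_table_to_pandas(table, dtype_backend="numpy_nullable")
# df["a"].dtype -> Int64, df["b"].dtype -> string (nullable)
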
Keys can either be integers or column labels, values are functions that take one diff --git a/pandas/io/excel/_calamine.py b/pandas/io/excel/_calamine.py index 4f65acf1aa40e..5259469f7a569 100644 --- a/pandas/io/excel/_calamine.py +++ b/pandas/io/excel/_calamine.py @@ -74,9 +74,7 @@ def load_workbook( ) -> CalamineWorkbook: from python_calamine import load_workbook - return load_workbook( - filepath_or_buffer, **engine_kwargs # type: ignore[arg-type] - ) + return load_workbook(filepath_or_buffer, **engine_kwargs) @property def sheet_names(self) -> list[str]: diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index d0aaf83b84cb2..1bdb732cb10de 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -6,18 +6,17 @@ Any, ) -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas.core.api import DataFrame from pandas.core.shared_docs import _shared_docs -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import get_handle if TYPE_CHECKING: @@ -120,7 +119,7 @@ def read_feather( with get_handle( path, "rb", storage_options=storage_options, is_text=False ) as handles: - if dtype_backend is lib.no_default and not using_pyarrow_string_dtype(): + if dtype_backend is lib.no_default and not using_string_dtype(): return feather.read_feather( handles.handle, columns=columns, use_threads=bool(use_threads) ) @@ -128,16 +127,4 @@ def read_feather( pa_table = feather.read_table( handles.handle, columns=columns, use_threads=bool(use_threads) ) - - if dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get) - - elif dtype_backend == "pyarrow": - return pa_table.to_pandas(types_mapper=pd.ArrowDtype) - - elif using_pyarrow_string_dtype(): - return pa_table.to_pandas(types_mapper=arrow_string_types_mapper()) - else: - raise NotImplementedError + return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index b62f7581ac220..987577057e058 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1580,7 +1580,7 @@ def _update_ctx_header(self, attrs: DataFrame, axis: AxisInt) -> None: for j in attrs.columns: ser = attrs[j] for i, c in ser.items(): - if not c: + if not c or pd.isna(c): continue css_list = maybe_convert_css_to_tuples(c) if axis == 0: diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 350002bf461ff..24e4e0b7cef0a 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -11,7 +11,7 @@ from pandas.util._exceptions import find_stack_level if TYPE_CHECKING: - import google.auth + from google.auth.credentials import Credentials from pandas import DataFrame @@ -37,7 +37,7 @@ def read_gbq( dialect: str | None = None, location: str | None = None, configuration: dict[str, Any] | None = None, - credentials: google.auth.credentials.Credentials | None = None, + credentials: Credentials | None = None, use_bqstorage_api: bool | None = None, max_results: int | None = None, progress_bar_type: str | None = None, @@ -230,7 +230,7 @@ def to_gbq( table_schema: list[dict[str, str]] | None = None, location: str | None = None, progress_bar: bool = 
True, - credentials: google.auth.credentials.Credentials | None = None, + credentials: Credentials | None = None, ) -> None: warnings.warn( "to_gbq is deprecated and will be removed in a future version. " diff --git a/pandas/io/html.py b/pandas/io/html.py index 5d5bf079784be..4eeeb1b655f8a 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -269,7 +269,7 @@ def _attr_getter(self, obj, attr): # Both lxml and BeautifulSoup have the same implementation: return obj.get(attr) - def _href_getter(self, obj): + def _href_getter(self, obj) -> str | None: """ Return a href if the DOM node contains a child or None. @@ -392,7 +392,7 @@ def _parse_tables(self, document, match, attrs): """ raise AbstractMethodError(self) - def _equals_tag(self, obj, tag): + def _equals_tag(self, obj, tag) -> bool: """ Return whether an individual DOM node matches a tag @@ -591,14 +591,8 @@ class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): :class:`pandas.io.html._HtmlFrameParser`. """ - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - from bs4 import SoupStrainer - - self._strainer = SoupStrainer("table") - def _parse_tables(self, document, match, attrs): - element_name = self._strainer.name + element_name = "table" tables = document.find_all(element_name, attrs=attrs) if not tables: raise ValueError("No tables found") @@ -629,7 +623,7 @@ def _href_getter(self, obj) -> str | None: def _text_getter(self, obj): return obj.text - def _equals_tag(self, obj, tag): + def _equals_tag(self, obj, tag) -> bool: return obj.name == tag def _parse_td(self, row): @@ -758,7 +752,7 @@ def _parse_tables(self, document, match, kwargs): raise ValueError(f"No tables found matching regex {repr(pattern)}") return tables - def _equals_tag(self, obj, tag): + def _equals_tag(self, obj, tag) -> bool: return obj.tag == tag def _build_doc(self): diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index ed66e46b300f7..c0499ce750cf0 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -40,7 +40,6 @@ from pandas.core.dtypes.dtypes import PeriodDtype from pandas import ( - ArrowDtype, DataFrame, Index, MultiIndex, @@ -52,6 +51,7 @@ from pandas.core.reshape.concat import concat from pandas.core.shared_docs import _shared_docs +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( IOHandles, dedup_names, @@ -255,7 +255,7 @@ def __init__( self.is_copy = None self._format_axes() - def _format_axes(self): + def _format_axes(self) -> None: raise AbstractMethodError(self) def write(self) -> str: @@ -287,7 +287,7 @@ def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]: else: return self.obj - def _format_axes(self): + def _format_axes(self) -> None: if not self.obj.index.is_unique and self.orient == "index": raise ValueError(f"Series index must be unique for orient='{self.orient}'") @@ -304,7 +304,7 @@ def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]: obj_to_write = self.obj return obj_to_write - def _format_axes(self): + def _format_axes(self) -> None: """ Try to format axes if they are datelike. 
""" @@ -997,18 +997,7 @@ def read(self) -> DataFrame | Series: if self.engine == "pyarrow": pyarrow_json = import_optional_dependency("pyarrow.json") pa_table = pyarrow_json.read_json(self.data) - - mapping: type[ArrowDtype] | None | Callable - if self.dtype_backend == "pyarrow": - mapping = ArrowDtype - elif self.dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping().get - else: - mapping = None - - return pa_table.to_pandas(types_mapper=mapping) + return arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend) elif self.engine == "ujson": if self.lines: if self.chunksize: @@ -1193,7 +1182,7 @@ def parse(self): self._try_convert_types() return self.obj - def _parse(self): + def _parse(self) -> None: raise AbstractMethodError(self) @final @@ -1217,7 +1206,7 @@ def _convert_axes(self) -> None: new_axis = Index(new_ser, dtype=new_ser.dtype, copy=False) setattr(self.obj, axis_name, new_axis) - def _try_convert_types(self): + def _try_convert_types(self) -> None: raise AbstractMethodError(self) @final @@ -1266,6 +1255,7 @@ def _try_convert_data( if result: return new_data, True + converted = False if self.dtype_backend is not lib.no_default and not is_axis: # Fall through for conversion later on return data, True @@ -1273,16 +1263,17 @@ def _try_convert_data( # try float try: data = data.astype("float64") + converted = True except (TypeError, ValueError): pass - if data.dtype.kind == "f": - if data.dtype != "float64": - # coerce floats to 64 - try: - data = data.astype("float64") - except (TypeError, ValueError): - pass + if data.dtype.kind == "f" and data.dtype != "float64": + # coerce floats to 64 + try: + data = data.astype("float64") + converted = True + except (TypeError, ValueError): + pass # don't coerce 0-len data if len(data) and data.dtype in ("float", "object"): @@ -1291,14 +1282,15 @@ def _try_convert_data( new_data = data.astype("int64") if (new_data == data).all(): data = new_data + converted = True except (TypeError, ValueError, OverflowError): pass - # coerce ints to 64 - if data.dtype == "int": - # coerce floats to 64 + if data.dtype == "int" and data.dtype != "int64": + # coerce ints to 64 try: data = data.astype("int64") + converted = True except (TypeError, ValueError): pass @@ -1307,7 +1299,7 @@ def _try_convert_data( if self.orient == "split": return data, False - return data, True + return data, converted @final def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]: diff --git a/pandas/io/orc.py b/pandas/io/orc.py index fed9463c38d5d..d7f473a929568 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -9,16 +9,13 @@ Literal, ) -from pandas._config import using_pyarrow_string_dtype - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas.core.indexes.api import default_index -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( get_handle, is_fsspec_url, @@ -117,21 +114,7 @@ def read_orc( pa_table = orc.read_table( source=source, columns=columns, filesystem=filesystem, **kwargs ) - if dtype_backend is not lib.no_default: - if dtype_backend == "pyarrow": - df = pa_table.to_pandas(types_mapper=pd.ArrowDtype) - else: - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping() - df = pa_table.to_pandas(types_mapper=mapping.get) - return df - else: - if 
using_pyarrow_string_dtype(): - types_mapper = arrow_string_types_mapper() - else: - types_mapper = None - return pa_table.to_pandas(types_mapper=types_mapper) + return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) def to_orc( diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 9570d6f8b26bd..01e320cdb1b72 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -10,9 +10,11 @@ Literal, ) import warnings -from warnings import catch_warnings +from warnings import ( + catch_warnings, + filterwarnings, +) -from pandas._config import using_pyarrow_string_dtype from pandas._config.config import _get_option from pandas._libs import lib @@ -22,14 +24,13 @@ from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas import ( DataFrame, get_option, ) from pandas.core.shared_docs import _shared_docs -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( IOHandles, get_handle, @@ -250,20 +251,10 @@ def read( kwargs["use_pandas_metadata"] = True to_pandas_kwargs = {} - if dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping() - to_pandas_kwargs["types_mapper"] = mapping.get - elif dtype_backend == "pyarrow": - to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] - elif using_pyarrow_string_dtype(): - to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper() manager = _get_option("mode.data_manager", silent=True) if manager == "array": - to_pandas_kwargs["split_blocks"] = True # type: ignore[assignment] - + to_pandas_kwargs["split_blocks"] = True path_or_handle, handles, filesystem = _get_path_or_handle( path, filesystem, @@ -278,7 +269,18 @@ def read( filters=filters, **kwargs, ) - result = pa_table.to_pandas(**to_pandas_kwargs) + + with catch_warnings(): + filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + result = arrow_table_to_pandas( + pa_table, + dtype_backend=dtype_backend, + to_pandas_kwargs=to_pandas_kwargs, + ) if manager == "array": result = result._as_manager("array", copy=False) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 66a7ccacf675b..7fe5ecb0e54c2 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -3,8 +3,6 @@ from typing import TYPE_CHECKING import warnings -from pandas._config import using_pyarrow_string_dtype - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import ( @@ -16,18 +14,14 @@ from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.inference import is_integer -import pandas as pd -from pandas import DataFrame - -from pandas.io._util import ( - _arrow_dtype_mapping, - arrow_string_types_mapper, -) +from pandas.io._util import arrow_table_to_pandas from pandas.io.parsers.base_parser import ParserBase if TYPE_CHECKING: from pandas._typing import ReadBuffer + from pandas import DataFrame + class ArrowParserWrapper(ParserBase): """ @@ -41,7 +35,7 @@ def __init__(self, src: ReadBuffer[bytes], **kwds) -> None: self._parse_kwds() - def _parse_kwds(self): + def _parse_kwds(self) -> None: """ Validates keywords before passing to pyarrow. 
""" @@ -104,7 +98,7 @@ def _get_pyarrow_options(self) -> None: ] = None # PyArrow raises an exception by default elif on_bad_lines == ParserBase.BadLineHandleMethod.WARN: - def handle_warning(invalid_row): + def handle_warning(invalid_row) -> str: warnings.warn( f"Expected {invalid_row.expected_columns} columns, but found " f"{invalid_row.actual_columns}: {invalid_row.text}", @@ -171,7 +165,8 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: # The only way self.names is not the same length as number of cols is # if we have int index_col. We should just pad the names(they will get # removed anyways) to expected length then. - self.names = list(range(num_cols - len(self.names))) + self.names + columns_prefix = [str(x) for x in range(num_cols - len(self.names))] + self.names = columns_prefix + self.names multi_index_named = False frame.columns = self.names # we only need the frame not the names @@ -219,7 +214,7 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: raise ValueError(e) return frame - def _validate_usecols(self, usecols): + def _validate_usecols(self, usecols) -> None: if lib.is_list_like(usecols) and not all(isinstance(x, str) for x in usecols): raise ValueError( "The pyarrow engine does not allow 'usecols' to be integer " @@ -287,17 +282,14 @@ def read(self) -> DataFrame: table = table.cast(new_schema) - if dtype_backend == "pyarrow": - frame = table.to_pandas(types_mapper=pd.ArrowDtype) - elif dtype_backend == "numpy_nullable": - # Modify the default mapping to also - # map null to Int64 (to match other engines) - dtype_mapping = _arrow_dtype_mapping() - dtype_mapping[pa.null()] = pd.Int64Dtype() - frame = table.to_pandas(types_mapper=dtype_mapping.get) - elif using_pyarrow_string_dtype(): - frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + frame = arrow_table_to_pandas( + table, dtype_backend=dtype_backend, null_to_int64=True + ) - else: - frame = table.to_pandas() return self._finalize_pandas_output(frame) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 09f0f2af8e5c6..40e3ea6450647 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -464,7 +464,11 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: arrays = [] converters = self._clean_mapping(self.converters) - for i, arr in enumerate(index): + if self.index_names is not None: + names: Iterable = self.index_names + else: + names = itertools.cycle([None]) + for i, (arr, name) in enumerate(zip(index, names)): if try_parse_dates and self._should_parse_dates(i): arr = self._date_conv( arr, @@ -504,12 +508,17 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: arr, _ = self._infer_types( arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool ) - arrays.append(arr) - - names = self.index_names - index = ensure_index_from_sequences(arrays, names) + if cast_type is not None: + # Don't perform RangeIndex inference + idx = Index(arr, name=name, dtype=cast_type) + else: + idx = ensure_index_from_sequences([arr], [name]) + arrays.append(idx) - return index + if len(arrays) == 1: + return arrays[0] + else: + return MultiIndex.from_arrays(arrays) @final def _convert_to_ndarrays( @@ -1084,12 +1093,11 @@ def _get_empty_meta(self, columns, dtype: DtypeArg | None = None): dtype_dict: defaultdict[Hashable, Any] if not is_dict_like(dtype): # 
if dtype == None, default will be object. - default_dtype = dtype or object - dtype_dict = defaultdict(lambda: default_dtype) + dtype_dict = defaultdict(lambda: dtype) else: dtype = cast(dict, dtype) dtype_dict = defaultdict( - lambda: object, + lambda: None, {columns[k] if is_integer(k) else k: v for k, v in dtype.items()}, ) @@ -1106,8 +1114,14 @@ def _get_empty_meta(self, columns, dtype: DtypeArg | None = None): if (index_col is None or index_col is False) or index_names is None: index = default_index(0) else: - data = [Series([], dtype=dtype_dict[name]) for name in index_names] - index = ensure_index_from_sequences(data, names=index_names) + # TODO: We could return default_index(0) if dtype_dict[name] is None + data = [ + Index([], name=name, dtype=dtype_dict[name]) for name in index_names + ] + if len(data) == 1: + index = data[0] + else: + index = MultiIndex.from_arrays(data) index_col.sort() for i, n in enumerate(index_col): diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index a9b41b45aba2f..e04f27b560610 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -240,6 +240,8 @@ performance of reading a large file. verbose : bool, default False Indicate number of ``NA`` values placed in non-numeric columns. + + .. deprecated:: 2.2.0 skip_blank_lines : bool, default True If ``True``, skip over blank lines rather than interpreting as ``NaN`` values. parse_dates : bool, list of Hashable, list of lists or dict of {{Hashable : list}}, \ @@ -396,7 +398,7 @@ - Callable, function with signature as described in `pyarrow documentation _` when ``engine='pyarrow'`` + #pyarrow.csv.ParseOptions.invalid_row_handler>`_ when ``engine='pyarrow'`` delim_whitespace : bool, default False Specifies whether or not whitespace (e.g. 
``' '`` or ``'\\t'``) will be diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 50611197ad7dd..65f95dab7b42f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -31,7 +31,7 @@ config, get_option, using_copy_on_write, - using_pyarrow_string_dtype, + using_string_dtype, ) from pandas._libs import ( @@ -76,6 +76,7 @@ PeriodIndex, RangeIndex, Series, + StringDtype, TimedeltaIndex, concat, isna, @@ -85,12 +86,16 @@ DatetimeArray, PeriodArray, ) +from pandas.core.arrays.string_ import BaseStringArray import pandas.core.common as com from pandas.core.computation.pytables import ( PyTablesExpr, maybe_expression, ) -from pandas.core.construction import extract_array +from pandas.core.construction import ( + array as pd_array, + extract_array, +) from pandas.core.indexes.api import ensure_index from pandas.core.internals import ( ArrayManager, @@ -1707,7 +1712,7 @@ def info(self) -> str: # ------------------------------------------------------------------------ # private methods - def _check_if_open(self): + def _check_if_open(self) -> None: if not self.is_open: raise ClosedFileError(f"{self._path} file is not open!") @@ -2954,6 +2959,9 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None if isinstance(node, tables.VLArray): ret = node[0][start:stop] + dtype = getattr(attrs, "value_type", None) + if dtype is not None: + ret = pd_array(ret, dtype=dtype) else: dtype = _ensure_decoded(getattr(attrs, "value_type", None)) shape = getattr(attrs, "shape", None) @@ -3192,6 +3200,11 @@ def write_array( elif lib.is_np_dtype(value.dtype, "m"): self._handle.create_array(self.group, key, value.view("i8")) getattr(self.group, key)._v_attrs.value_type = "timedelta64" + elif isinstance(value, BaseStringArray): + vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom()) + vlarr.append(value.to_numpy()) + node = getattr(self.group, key) + node._v_attrs.value_type = str(value.dtype) elif empty_array: self.write_array_empty(key, value) else: @@ -3224,8 +3237,12 @@ def read( index = self.read_index("index", start=start, stop=stop) values = self.read_array("values", start=start, stop=stop) result = Series(values, index=index, name=self.name, copy=False) - if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): - result = result.astype("string[pyarrow_numpy]") + if ( + using_string_dtype() + and isinstance(values, np.ndarray) + and is_string_array(values, skipna=True) + ): + result = result.astype(StringDtype(na_value=np.nan)) return result def write(self, obj, **kwargs) -> None: @@ -3293,8 +3310,12 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) - if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): - df = df.astype("string[pyarrow_numpy]") + if ( + using_string_dtype() + and isinstance(values, np.ndarray) + and is_string_array(values, skipna=True) + ): + df = df.astype(StringDtype(na_value=np.nan)) dfs.append(df) if len(dfs) > 0: @@ -3443,6 +3464,12 @@ def validate(self, other) -> None: # Value of type "Optional[Any]" is not indexable [index] oax = ov[i] # type: ignore[index] if sax != oax: + if c == "values_axes" and sax.kind != oax.kind: + raise ValueError( + f"Cannot serialize the column [{oax.values[0]}] " + f"because its data contents are not [{sax.kind}] " + f"but [{oax.kind}] object dtype" + ) raise ValueError( f"invalid combination of [{c}] on appending data " f"[{sax}] vs current table [{oax}]" @@ -4065,7 +4092,9 @@ 
def _create_axes( if isinstance(data_converted.dtype, CategoricalDtype): ordered = data_converted.ordered meta = "category" - metadata = np.array(data_converted.categories, copy=False).ravel() + metadata = np.asarray(data_converted.categories).ravel() + elif isinstance(blk.dtype, StringDtype): + meta = str(blk.dtype) data, dtype_name = _get_data_and_dtype_name(data_converted) @@ -4333,7 +4362,9 @@ def read_column( encoding=self.encoding, errors=self.errors, ) - return Series(_set_tz(col_values[1], a.tz), name=column, copy=False) + cvs = _set_tz(col_values[1], a.tz) + dtype = getattr(self.table.attrs, f"{column}_meta", None) + return Series(cvs, name=column, copy=False, dtype=dtype) raise KeyError(f"column [{column}] not found in the table") @@ -4679,13 +4710,27 @@ def read( else: # Categorical df = DataFrame._from_arrays([values], columns=cols_, index=index_) - if not (using_pyarrow_string_dtype() and values.dtype.kind == "O"): + if not (using_string_dtype() and values.dtype.kind == "O"): assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) - if using_pyarrow_string_dtype() and is_string_array( - values, # type: ignore[arg-type] - skipna=True, + + # If str / string dtype is stored in meta, use that. + converted = False + for column in cols_: + dtype = getattr(self.table.attrs, f"{column}_meta", None) + if dtype in ["str", "string"]: + df[column] = df[column].astype(dtype) + converted = True + # Otherwise try inference. + if ( + not converted + and using_string_dtype() + and isinstance(values, np.ndarray) + and is_string_array( + values, + skipna=True, + ) ): - df = df.astype("string[pyarrow_numpy]") + df = df.astype(StringDtype(na_value=np.nan)) frames.append(df) if len(frames) == 1: @@ -5062,6 +5107,9 @@ def _maybe_convert_for_string_atom( errors, columns: list[str], ): + if isinstance(bvalues.dtype, StringDtype): + # "ndarray[Any, Any]" has no attribute "to_numpy" + bvalues = bvalues.to_numpy() # type: ignore[union-attr] if bvalues.dtype != object: return bvalues @@ -5086,6 +5134,9 @@ def _maybe_convert_for_string_atom( data = bvalues.copy() data[mask] = nan_rep + if existing_col and mask.any() and len(nan_rep) > existing_col.itemsize: + raise ValueError("NaN representation is too large for existing column size") + # see if we have a valid string type inferred_type = lib.infer_dtype(data, skipna=False) if inferred_type != "string": @@ -5183,7 +5234,9 @@ def _unconvert_string_array( dtype = f"U{itemsize}" if isinstance(data[0], bytes): - data = Series(data, copy=False).str.decode(encoding, errors=errors)._values + ser = Series(data, copy=False).str.decode(encoding, errors=errors) + data = ser.to_numpy() + data.flags.writeable = True else: data = data.astype(dtype, copy=False).astype(object, copy=False) @@ -5273,6 +5326,8 @@ def _dtype_to_kind(dtype_str: str) -> str: kind = "integer" elif dtype_str == "object": kind = "object" + elif dtype_str == "str": + kind = "str" else: raise ValueError(f"cannot interpret dtype of [{dtype_str}]") diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index c5bdfb5541788..1d424425cd927 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -25,6 +25,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs.byteswap import ( read_double_with_byteswap, read_float_with_byteswap, @@ -722,6 +724,7 @@ def _chunk_to_dataframe(self) -> DataFrame: rslt = {} js, jb = 0, 0 + infer_string = get_option("future.infer_string") for j in range(self.column_count): name = self.column_names[j] @@ -738,6 
+741,9 @@ def _chunk_to_dataframe(self) -> DataFrame:
                 rslt[name] = pd.Series(self._string_chunk[js, :], index=ix, copy=False)
                 if self.convert_text and (self.encoding is not None):
                     rslt[name] = self._decode_string(rslt[name].str)
+                if infer_string:
+                    rslt[name] = rslt[name].astype("str")
+
                 js += 1
             else:
                 self.close()
diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py
index e68f4789f0a06..11b2ed0ee7316 100644
--- a/pandas/io/sas/sas_xport.py
+++ b/pandas/io/sas/sas_xport.py
@@ -288,7 +288,7 @@ def close(self) -> None:
     def _get_row(self):
         return self.filepath_or_buffer.read(80).decode()

-    def _read_header(self):
+    def _read_header(self) -> None:
         self.filepath_or_buffer.seek(0)

         # read file header
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index b0fa6bc6e90c4..7027702a696fe 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -32,7 +32,7 @@

 import numpy as np

-from pandas._config import using_pyarrow_string_dtype
+from pandas._config import using_string_dtype

 from pandas._libs import lib
 from pandas.compat._optional import import_optional_dependency
@@ -46,11 +46,10 @@
 from pandas.core.dtypes.common import (
     is_dict_like,
     is_list_like,
+    is_object_dtype,
+    is_string_dtype,
 )
-from pandas.core.dtypes.dtypes import (
-    ArrowDtype,
-    DatetimeTZDtype,
-)
+from pandas.core.dtypes.dtypes import DatetimeTZDtype
 from pandas.core.dtypes.missing import isna

 from pandas import get_option
@@ -59,12 +58,15 @@
     Series,
 )
 from pandas.core.arrays import ArrowExtensionArray
+from pandas.core.arrays.string_ import StringDtype
 from pandas.core.base import PandasObject
 import pandas.core.common as com
 from pandas.core.common import maybe_make_list
 from pandas.core.internals.construction import convert_object_array
 from pandas.core.tools.datetimes import to_datetime

+from pandas.io._util import arrow_table_to_pandas
+
 if TYPE_CHECKING:
     from collections.abc import (
         Iterator,
@@ -1012,22 +1014,19 @@ def _execute_insert(self, conn, keys: list[str], data_iter) -> int:

     def _execute_insert_multi(self, conn, keys: list[str], data_iter) -> int:
         """
-        Alternative to _execute_insert for DBs support multivalue INSERT.
+        Alternative to _execute_insert for DBs that support multi-value INSERT.

         Note: multi-value insert is usually faster for analytics DBs
         and tables containing a few columns
         but performance degrades quickly with increase of columns.
+
         """
         from sqlalchemy import insert

         data = [dict(zip(keys, row)) for row in data_iter]
-        stmt = insert(self.table)
-        # conn.execute is used here to ensure compatibility with Oracle.
-        # Using stmt.values(data) would produce a multi row insert that
-        # isn't supported by Oracle.
-        # see: https://docs.sqlalchemy.org/en/20/core/dml.html#sqlalchemy.sql.expression.Insert.values
-        result = conn.execute(stmt, data)
+        stmt = insert(self.table).values(data)
+        result = conn.execute(stmt)
         return result.rowcount

     def insert_data(self) -> tuple[list[str], list[np.ndarray]]:
@@ -1334,7 +1333,12 @@ def _harmonize_columns(
             elif dtype_backend == "numpy" and col_type is float:
                 # floats support NA, can always convert!
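                 # (editorial note: this cast is always safe because NaN is
                 # itself a float, so missing values need no masking or
                 # sentinel handling before the conversion)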
self.frame[col_name] = df_col.astype(col_type, copy=False) - + elif ( + using_string_dtype() + and is_string_dtype(col_type) + and is_object_dtype(self.frame[col_name]) + ): + self.frame[col_name] = df_col.astype(col_type, copy=False) elif dtype_backend == "numpy" and len(df_col) == df_col.count(): # No NA values, can convert ints and bools if col_type is np.dtype("int64") or col_type is bool: @@ -1421,6 +1425,7 @@ def _get_dtype(self, sqltype): DateTime, Float, Integer, + String, ) if isinstance(sqltype, Float): @@ -1440,6 +1445,10 @@ def _get_dtype(self, sqltype): return date elif isinstance(sqltype, Boolean): return bool + elif isinstance(sqltype, String): + if using_string_dtype(): + return StringDtype(na_value=np.nan) + return object @@ -1514,7 +1523,7 @@ def _create_sql_schema( keys: list[str] | None = None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: pass @@ -2073,7 +2082,7 @@ def _create_sql_schema( keys: list[str] | None = None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: table = SQLTable( table_name, self, @@ -2211,23 +2220,10 @@ def read_table( else: stmt = f"SELECT {select_list} FROM {table_name}" - mapping: type[ArrowDtype] | None | Callable - if dtype_backend == "pyarrow": - mapping = ArrowDtype - elif dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping().get - elif using_pyarrow_string_dtype(): - from pandas.io._util import arrow_string_types_mapper - - arrow_string_types_mapper() - else: - mapping = None - with self.con.cursor() as cur: cur.execute(stmt) - df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping) + pa_table = cur.fetch_arrow_table() + df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) return _wrap_result_adbc( df, @@ -2295,19 +2291,10 @@ def read_query( if chunksize: raise NotImplementedError("'chunksize' is not implemented for ADBC drivers") - mapping: type[ArrowDtype] | None | Callable - if dtype_backend == "pyarrow": - mapping = ArrowDtype - elif dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping().get - else: - mapping = None - with self.con.cursor() as cur: cur.execute(sql) - df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping) + pa_table = cur.fetch_arrow_table() + df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) return _wrap_result_adbc( df, @@ -2403,7 +2390,9 @@ def to_sql( raise ValueError("datatypes not supported") from exc with self.con.cursor() as cur: - total_inserted = cur.adbc_ingest(table_name, tbl, mode=mode) + total_inserted = cur.adbc_ingest( + table_name=name, data=tbl, mode=mode, db_schema_name=schema + ) self.con.commit() return total_inserted @@ -2433,7 +2422,7 @@ def _create_sql_schema( keys: list[str] | None = None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: raise NotImplementedError("not implemented for adbc") @@ -2879,7 +2868,7 @@ def _create_sql_schema( keys=None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: table = SQLiteTable( table_name, self, diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 0f097c6059c7c..b5057a6681638 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -47,9 +47,11 @@ ) from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( ensure_object, is_numeric_dtype, + is_string_dtype, ) from 
pandas.core.dtypes.dtypes import CategoricalDtype @@ -62,8 +64,6 @@ to_datetime, to_timedelta, ) -from pandas.core.arrays.boolean import BooleanDtype -from pandas.core.arrays.integer import IntegerDtype from pandas.core.frame import DataFrame from pandas.core.indexes.base import Index from pandas.core.indexes.range import RangeIndex @@ -591,17 +591,26 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: for col in data: # Cast from unsupported types to supported types - is_nullable_int = isinstance(data[col].dtype, (IntegerDtype, BooleanDtype)) + is_nullable_int = ( + isinstance(data[col].dtype, ExtensionDtype) + and data[col].dtype.kind in "iub" + ) # We need to find orig_missing before altering data below orig_missing = data[col].isna() if is_nullable_int: - missing_loc = data[col].isna() - if missing_loc.any(): - # Replace with always safe value - fv = 0 if isinstance(data[col].dtype, IntegerDtype) else False - data.loc[missing_loc, col] = fv + fv = 0 if data[col].dtype.kind in "iu" else False # Replace with NumPy-compatible column - data[col] = data[col].astype(data[col].dtype.numpy_dtype) + data[col] = data[col].fillna(fv).astype(data[col].dtype.numpy_dtype) + elif isinstance(data[col].dtype, ExtensionDtype): + if getattr(data[col].dtype, "numpy_dtype", None) is not None: + data[col] = data[col].astype(data[col].dtype.numpy_dtype) + elif is_string_dtype(data[col].dtype): + # TODO could avoid converting string dtype to object here, + # but handle string dtype in _encode_strings + data[col] = data[col].astype("object") + # generate_table checks for None values + data.loc[data[col].isna(), col] = None + dtype = data[col].dtype empty_df = data.shape[0] == 0 for c_data in conversion_data: @@ -687,7 +696,7 @@ def __init__( self._prepare_value_labels() - def _prepare_value_labels(self): + def _prepare_value_labels(self) -> None: """Encode value labels.""" self.text_len = 0 @@ -2666,6 +2675,7 @@ def _encode_strings(self) -> None: continue column = self.data[col] dtype = column.dtype + # TODO could also handle string dtype here specifically if dtype.type is np.object_: inferred_dtype = infer_dtype(column, skipna=True) if not ((inferred_dtype == "string") or len(column) == 0): diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index d2b76decaa75d..80f0349b205e6 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -7,6 +7,7 @@ ) import warnings +import matplotlib as mpl from matplotlib.artist import setp import numpy as np @@ -20,6 +21,7 @@ import pandas as pd import pandas.core.common as com +from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.core import ( @@ -54,7 +56,8 @@ def _set_ticklabels(ax: Axes, labels: list[str], is_vertical: bool, **kwargs) -> ticks = ax.get_xticks() if is_vertical else ax.get_yticks() if len(ticks) != len(labels): i, remainder = divmod(len(ticks), len(labels)) - assert remainder == 0, remainder + if Version(mpl.__version__) < Version("3.10"): + assert remainder == 0, remainder labels *= i if is_vertical: ax.set_xticklabels(labels, **kwargs) diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 0eb3318ac96c5..9acb93ce69a9c 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -584,7 +584,8 @@ def _get_periods_per_ymd(freq: BaseOffset) -> tuple[int, int, int]: return ppd, ppm, ppy -def _daily_finder(vmin, 
vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _daily_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: # error: "BaseOffset" has no attribute "_period_dtype_code" dtype_code = freq._period_dtype_code # type: ignore[attr-defined] @@ -783,7 +784,8 @@ def _second_finder(label_interval: int) -> None: return info -def _monthly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _monthly_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: _, _, periodsperyear = _get_periods_per_ymd(freq) vmin_orig = vmin @@ -854,7 +856,8 @@ def _monthly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: return info -def _quarterly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _quarterly_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: _, _, periodsperyear = _get_periods_per_ymd(freq) vmin_orig = vmin (vmin, vmax) = (int(vmin), int(vmax)) @@ -901,7 +904,8 @@ def _quarterly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: return info -def _annual_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _annual_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: # Note: small difference here vs other finders in adding 1 to vmax (vmin, vmax) = (int(vmin), int(vmax + 1)) span = vmax - vmin + 1 diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 479a5e19dc1c5..3a1e589c2279b 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -547,7 +547,7 @@ def _maybe_right_yaxis(self, ax: Axes, axes_num: int) -> Axes: new_ax.set_yscale("log") elif self.logy == "sym" or self.loglog == "sym": new_ax.set_yscale("symlog") - return new_ax # type: ignore[return-value] + return new_ax @final @cache_readonly @@ -662,7 +662,7 @@ def _ensure_frame(self, data) -> DataFrame: return data @final - def _compute_plot_data(self): + def _compute_plot_data(self) -> None: data = self.data # GH15079 reconstruct data if by is defined @@ -699,7 +699,7 @@ def _compute_plot_data(self): self.data = numeric_data.apply(type(self)._convert_to_ndarray) - def _make_plot(self, fig: Figure): + def _make_plot(self, fig: Figure) -> None: raise AbstractMethodError(self) @final @@ -745,7 +745,7 @@ def _post_plot_logic(self, ax: Axes, data) -> None: """Post process for each axes. 
Overridden in child classes""" @final - def _adorn_subplots(self, fig: Figure): + def _adorn_subplots(self, fig: Figure) -> None: """Common post process unrelated to data""" if len(self.axes) > 0: all_axes = self._get_subplots(fig) @@ -893,7 +893,13 @@ def _make_legend(self) -> None: elif self.subplots and self.legend: for ax in self.axes: if ax.get_visible(): - ax.legend(loc="best") + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "No artists with labels found to put in legend.", + UserWarning, + ) + ax.legend(loc="best") @final @staticmethod @@ -1323,7 +1329,7 @@ def __init__( c = self.data.columns[c] self.c = c - def _make_plot(self, fig: Figure): + def _make_plot(self, fig: Figure) -> None: x, y, c, data = self.x, self.y, self.c, self.data ax = self.axes[0] diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index bf1c0f6346f02..c7ddfa55d0417 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -205,7 +205,10 @@ def _get_ax_freq(ax: Axes): def _get_period_alias(freq: timedelta | BaseOffset | str) -> str | None: - freqstr = to_offset(freq, is_period=True).rule_code + if isinstance(freq, BaseOffset): + freqstr = freq.name + else: + freqstr = to_offset(freq, is_period=True).rule_code return get_period_alias(freqstr) diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 898b5b25e7b01..98441c5afbaa4 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -57,7 +57,7 @@ def format_date_labels(ax: Axes, rot) -> None: fig = ax.get_figure() if fig is not None: # should always be a Figure but can technically be None - maybe_adjust_figure(fig, bottom=0.2) + maybe_adjust_figure(fig, bottom=0.2) # type: ignore[arg-type] def table( diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index b7eac6b8f0ea1..1a776892b7bb7 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm + from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd @@ -16,6 +18,7 @@ ) import pandas._testing as tm from pandas.tests.frame.common import zip_frames +from pandas.util.version import Version @pytest.fixture @@ -65,6 +68,9 @@ def test_apply(float_frame, engine, request): @pytest.mark.parametrize("raw", [True, False]) def test_apply_args(float_frame, axis, raw, engine, request): if engine == "numba": + numba = pytest.importorskip("numba") + if Version(numba.__version__) == Version("0.61") and is_platform_arm(): + pytest.skip(f"Segfaults on ARM platforms with numba {numba.__version__}") mark = pytest.mark.xfail(reason="numba engine doesn't support args") request.node.add_marker(mark) result = float_frame.apply( diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index b5ad1094f5bf5..68f3fe36546a0 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -218,11 +218,13 @@ def transform(row): def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_string): # GH 21224 if using_infer_string: - import pyarrow as pa + expected = (expected, NotImplementedError) - expected = (expected, pa.lib.ArrowNotImplementedError) - - msg = "can't multiply sequence by non-int of type 'str'|has no kernel" + msg = ( + "can't multiply sequence by non-int of 
type 'str'" + "|cannot perform cumprod with type str" # NotImplementedError python backend + "|operation 'cumprod' not supported for dtype 'str'" # TypeError pyarrow + ) warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): with tm.assert_produces_warning(warn, match="using DataFrame.cumprod"): @@ -251,12 +253,12 @@ def test_agg_cython_table_raises_series(series, func, expected, using_infer_stri if func == "median" or func is np.nanmedian or func is np.median: msg = r"Cannot convert \['a' 'b' 'c'\] to numeric" - if using_infer_string: - import pyarrow as pa + if using_infer_string and func in ("cumprod", np.cumprod, np.nancumprod): + expected = (expected, NotImplementedError) - expected = (expected, pa.lib.ArrowNotImplementedError) - - msg = msg + "|does not support|has no kernel" + msg = ( + msg + "|does not support|has no kernel|Cannot perform|cannot perform|operation" + ) warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 57b81711ddb48..c211073f75888 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -1,15 +1,26 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm import pandas.util._test_decorators as td +import pandas as pd from pandas import ( DataFrame, Index, ) import pandas._testing as tm +from pandas.util.version import Version -pytestmark = [td.skip_if_no("numba"), pytest.mark.single_cpu] +pytestmark = [td.skip_if_no("numba"), pytest.mark.single_cpu, pytest.mark.skipif()] + +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) @pytest.fixture(params=[0, 1]) @@ -26,11 +37,10 @@ def test_numba_vs_python_noop(float_frame, apply_axis): def test_numba_vs_python_string_index(): # GH#56189 - pytest.importorskip("pyarrow") df = DataFrame( 1, - index=Index(["a", "b"], dtype="string[pyarrow_numpy]"), - columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + index=Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), + columns=Index(["x", "y"], dtype=pd.StringDtype(na_value=np.nan)), ) func = lambda x: x result = df.apply(func, engine="numba", axis=0) @@ -100,13 +110,14 @@ def test_numba_nonunique_unsupported(apply_axis): def test_numba_unsupported_dtypes(apply_axis): + pytest.importorskip("pyarrow") f = lambda x: x df = DataFrame({"a": [1, 2], "b": ["a", "b"], "c": [4, 5]}) df["c"] = df["c"].astype("double[pyarrow]") with pytest.raises( ValueError, - match="Column b must have a numeric dtype. Found 'object|string' instead", + match="Column b must have a numeric dtype. 
Found 'object|str' instead", ): df.apply(f, engine="numba", axis=apply_axis) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index df24fa08f48e1..69f84ca74ab0b 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -244,7 +244,7 @@ def test_apply_categorical(by_row, using_infer_string): result = ser.apply(lambda x: "A") exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) - assert result.dtype == object if not using_infer_string else "string[pyarrow_numpy]" + assert result.dtype == object if not using_infer_string else "str" @pytest.mark.parametrize("series", [["1-1", "1-1", np.nan], ["1-1", "1-2", np.nan]]) diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index dbff88dc6f4f6..a468449efd507 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1586,6 +1586,38 @@ def test_dti_add_sub_nonzero_mth_offset( expected = tm.box_expected(expected, box_with_array, False) tm.assert_equal(result, expected) + def test_dt64arr_series_add_DateOffset_with_milli(self): + # GH 57529 + dti = DatetimeIndex( + [ + "2000-01-01 00:00:00.012345678", + "2000-01-31 00:00:00.012345678", + "2000-02-29 00:00:00.012345678", + ], + dtype="datetime64[ns]", + ) + result = dti + DateOffset(milliseconds=4) + expected = DatetimeIndex( + [ + "2000-01-01 00:00:00.016345678", + "2000-01-31 00:00:00.016345678", + "2000-02-29 00:00:00.016345678", + ], + dtype="datetime64[ns]", + ) + tm.assert_index_equal(result, expected) + + result = dti + DateOffset(days=1, milliseconds=4) + expected = DatetimeIndex( + [ + "2000-01-02 00:00:00.016345678", + "2000-02-01 00:00:00.016345678", + "2000-03-01 00:00:00.016345678", + ], + dtype="datetime64[ns]", + ) + tm.assert_index_equal(result, expected) + class TestDatetime64OverflowHandling: # TODO: box + de-duplicate diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index 4ffd76722286a..44e485d40ba53 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -185,6 +183,10 @@ def test_objarr_add_invalid(self, op, box_with_array): "unsupported operand type", "must be str", "has no kernel", + "operation 'add' not supported", + "operation 'radd' not supported", + "operation 'sub' not supported", + "operation 'rsub' not supported", ] ) with pytest.raises(Exception, match=msg): @@ -303,7 +305,6 @@ def test_iadd_string(self): index += "_x" assert "a_x" in index - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="add doesn't work") def test_add(self): index = pd.Index([str(i) for i in range(10)]) expected = pd.Index(index.values * 2) @@ -318,24 +319,17 @@ def test_add(self): expected = pd.Index(["1a", "1b", "1c"]) tm.assert_index_equal("1" + index, expected) - def test_sub_fail(self, using_infer_string): + def test_sub_fail(self): index = pd.Index([str(i) for i in range(10)]) - if using_infer_string: - import pyarrow as pa - - err = pa.lib.ArrowNotImplementedError - msg = "has no kernel" - else: - err = TypeError - msg = "unsupported operand type|Cannot broadcast" - with pytest.raises(err, match=msg): + msg = "unsupported operand type|Cannot broadcast|sub' not supported" + with pytest.raises(TypeError, 
match=msg): index - "a" - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): index - index - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): index - index.tolist() - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): index.tolist() - index def test_sub_object(self): diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 007d1e670e1e0..d02e827d435cf 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -1454,7 +1454,13 @@ def test_td64arr_mul_int(self, box_with_array): def test_td64arr_mul_tdlike_scalar_raises(self, two_hours, box_with_array): rng = timedelta_range("1 days", "10 days", name="foo") rng = tm.box_expected(rng, box_with_array) - msg = "argument must be an integer|cannot use operands with types dtype" + msg = "|".join( + [ + "argument must be an integer", + "cannot use operands with types dtype", + "Cannot multiply with", + ] + ) with pytest.raises(TypeError, match=msg): rng * two_hours diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py index 0c4fcf149eb20..9ff690cdc914d 100644 --- a/pandas/tests/arrays/boolean/test_arithmetic.py +++ b/pandas/tests/arrays/boolean/test_arithmetic.py @@ -90,16 +90,8 @@ def test_op_int8(left_array, right_array, opname): # ----------------------------------------------------------------------------- -def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): +def test_error_invalid_values(data, all_arithmetic_operators): # invalid ops - - if using_infer_string: - import pyarrow as pa - - err = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) - else: - err = TypeError - op = all_arithmetic_operators s = pd.Series(data) ops = getattr(s, op) @@ -109,7 +101,8 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "did not contain a loop with signature matching types|" "BooleanArray cannot perform the operation|" "not supported for the input types, and the inputs could not be safely coerced " - "to any supported types according to the casting rule ''safe''" + "to any supported types according to the casting rule ''safe''|" + "not supported for dtype" ) with pytest.raises(TypeError, match=msg): ops("foo") @@ -118,9 +111,10 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string r"unsupported operand type\(s\) for", "Concatenation operation is not implemented for NumPy arrays", "has no kernel", + "not supported for dtype", ] ) - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Timestamp("20180101")) # invalid array-likes @@ -133,7 +127,8 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "not all arguments converted during string formatting", "has no kernel", "not implemented", + "not supported for dtype", ] ) - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Series("foo", index=s.index)) diff --git a/pandas/tests/arrays/boolean/test_astype.py b/pandas/tests/arrays/boolean/test_astype.py index 932e903c0e448..8c2672218f273 100644 --- a/pandas/tests/arrays/boolean/test_astype.py +++ b/pandas/tests/arrays/boolean/test_astype.py @@ -5,7 +5,7 @@ import pandas._testing as tm -def test_astype(): +def test_astype(using_infer_string): # with missing values arr = pd.array([True, False, None], dtype="boolean") 
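
[Editor's note] The hunk below pins down what ``astype("str")`` on a masked
array returns once pandas' future string dtype is enabled. A minimal sketch of
the two behaviors, assuming a pandas build that exposes the
``future.infer_string`` option:

    import numpy as np
    import pandas as pd

    pd.set_option("future.infer_string", True)
    arr = pd.array([True, False, None], dtype="boolean")
    result = arr.astype("str")
    # result is a string-dtype extension array equivalent to
    # pd.array(["True", "False", None], dtype=pd.StringDtype(na_value=np.nan));
    # with the option off, astype("str") still returns a NumPy "U5" array in
    # which the missing value is rendered as "".
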
@@ -20,8 +20,14 @@ def test_astype(): tm.assert_numpy_array_equal(result, expected) result = arr.astype("str") - expected = np.array(["True", "False", ""], dtype=f"{tm.ENDIAN}U5") - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array( + ["True", "False", None], dtype=pd.StringDtype(na_value=np.nan) + ) + tm.assert_extension_array_equal(result, expected) + else: + expected = np.array(["True", "False", ""], dtype=f"{tm.ENDIAN}U5") + tm.assert_numpy_array_equal(result, expected) # no missing values arr = pd.array([True, False, True], dtype="boolean") diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py index a5a2dd33940b8..645e763fbf00c 100644 --- a/pandas/tests/arrays/boolean/test_construction.py +++ b/pandas/tests/arrays/boolean/test_construction.py @@ -308,8 +308,6 @@ def test_to_numpy(box): # converting to int or float without specifying na_value raises with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"): arr.to_numpy(dtype="int64") - with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): - arr.to_numpy(dtype="float64") def test_to_numpy_copy(): diff --git a/pandas/tests/arrays/categorical/conftest.py b/pandas/tests/arrays/categorical/conftest.py deleted file mode 100644 index 37249210f28f4..0000000000000 --- a/pandas/tests/arrays/categorical/conftest.py +++ /dev/null @@ -1,9 +0,0 @@ -import pytest - -from pandas import Categorical - - -@pytest.fixture -def factor(): - """Fixture returning a Categorical object""" - return Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index c2c53fbc4637e..9a0356cbc422b 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -296,7 +296,7 @@ def test_nbytes(self): exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories assert cat.nbytes == exp - def test_memory_usage(self): + def test_memory_usage(self, using_infer_string): cat = Categorical([1, 2, 3]) # .categories is an index, so we include the hashtable @@ -304,7 +304,13 @@ def test_memory_usage(self): assert 0 < cat.nbytes <= cat.memory_usage(deep=True) cat = Categorical(["foo", "foo", "bar"]) - assert cat.memory_usage(deep=True) > cat.nbytes + if using_infer_string: + if cat.categories.dtype.storage == "python": + assert cat.memory_usage(deep=True) > cat.nbytes + else: + assert cat.memory_usage(deep=True) >= cat.nbytes + else: + assert cat.memory_usage(deep=True) > cat.nbytes if not PYPY: # sys.getsizeof will call the .memory_usage with diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index b4215b4a6fe21..a939ee5f6f53f 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -385,7 +385,8 @@ def test_remove_unused_categories(self): class TestCategoricalAPIWithFactor: - def test_describe(self, factor): + def test_describe(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) # string type desc = factor.describe() assert factor.ordered diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index a2a53af6ab1ad..ee930ac84aaf2 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -89,7 +89,7 @@ def 
test_astype(self, ordered): expected = np.array(cat) tm.assert_numpy_array_equal(result, expected) - msg = r"Cannot cast object|string dtype to float64" + msg = r"Cannot cast object|str dtype to float64" with pytest.raises(ValueError, match=msg): cat.astype(float) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 373f1c95463fc..8ac479cf8a0a4 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -6,7 +6,9 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW from pandas.core.dtypes.common import ( is_float_dtype, @@ -449,7 +451,9 @@ def test_constructor_str_unknown(self): with pytest.raises(ValueError, match="Unknown dtype"): Categorical([1, 2], dtype="foo") - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="Can't be NumPy strings") + @pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="Can't be NumPy strings" + ) def test_constructor_np_strs(self): # GH#31499 Hashtable.map_locations needs to work on np.str_ objects cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")]) diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 3377c411a7084..5e1c5c64fa660 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -21,7 +21,8 @@ class TestCategoricalIndexingWithFactor: - def test_getitem(self, factor): + def test_getitem(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) assert factor[0] == "a" assert factor[-1] == "c" @@ -31,7 +32,8 @@ def test_getitem(self, factor): subf = factor[np.asarray(factor) == "c"] tm.assert_numpy_array_equal(subf._codes, np.array([2, 2, 2], dtype=np.int8)) - def test_setitem(self, factor): + def test_setitem(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) # int/positional c = factor.copy() c[0] = "b" diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 16b941eab4830..4174d2adc810b 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -17,7 +17,8 @@ def test_categories_none_comparisons(self): factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) tm.assert_categorical_equal(factor, factor) - def test_comparisons(self, factor): + def test_comparisons(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) result = factor[factor == "a"] expected = factor[np.asarray(factor) == "a"] tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index d6f93fbbd912f..3a2c489920eb0 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( Categorical, @@ -17,11 +17,12 @@ class TestCategoricalReprWithFactor: - def test_print(self, factor, using_infer_string): + def test_print(self, using_infer_string): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) if 
using_infer_string: expected = [ "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", - "Categories (3, string): [a < b < c]", + "Categories (3, str): [a < b < c]", ] else: expected = [ @@ -77,7 +78,7 @@ def test_print_none_width(self): assert exp == repr(a) @pytest.mark.skipif( - using_pyarrow_string_dtype(), + using_string_dtype(), reason="Change once infer_string is set to True by default", ) def test_unicode_print(self): diff --git a/pandas/tests/arrays/datetimes/test_constructors.py b/pandas/tests/arrays/datetimes/test_constructors.py index daf4aa3b47f56..3652b5fec46bb 100644 --- a/pandas/tests/arrays/datetimes/test_constructors.py +++ b/pandas/tests/arrays/datetimes/test_constructors.py @@ -223,7 +223,7 @@ def test_2d(self, order): ("s", "ns", "US/Central", "Asia/Kolkata", COARSE_TO_FINE_SAFE), ], ) -def test_from_arrowtest_from_arrow_with_different_units_and_timezones_with_( +def test_from_arrow_with_different_units_and_timezones_with( pa_unit, pd_unit, pa_tz, pd_tz, data ): pa = pytest.importorskip("pyarrow") @@ -233,9 +233,8 @@ def test_from_arrowtest_from_arrow_with_different_units_and_timezones_with_( dtype = DatetimeTZDtype(unit=pd_unit, tz=pd_tz) result = dtype.__from_arrow__(arr) - expected = DatetimeArray._from_sequence( - np.array(data, dtype=f"datetime64[{pa_unit}]").astype(f"datetime64[{pd_unit}]"), - dtype=dtype, + expected = DatetimeArray._from_sequence(data, dtype=f"M8[{pa_unit}, UTC]").astype( + dtype, copy=False ) tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index ba081bd01062a..009fac4c2f5ed 100644 --- a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -122,18 +122,11 @@ def test_arith_zero_dim_ndarray(other): # ----------------------------------------------------------------------------- -def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): +def test_error_invalid_values(data, all_arithmetic_operators): op = all_arithmetic_operators s = pd.Series(data) ops = getattr(s, op) - if using_infer_string: - import pyarrow as pa - - errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) - else: - errs = TypeError - # invalid scalars msg = "|".join( [ @@ -149,15 +142,17 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "Concatenation operation is not implemented for NumPy arrays", "has no kernel", "not implemented", + "not supported for dtype", + "Can only string multiply by an integer", ] ) - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops("foo") - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Timestamp("20180101")) # invalid array-likes - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Series("foo", index=s.index)) msg = "|".join( @@ -178,9 +173,10 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "cannot subtract DatetimeArray from ndarray", "has no kernel", "not implemented", + "not supported for dtype", ] ) - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Series(pd.date_range("20180101", periods=len(s)))) diff --git a/pandas/tests/arrays/floating/test_astype.py b/pandas/tests/arrays/floating/test_astype.py index ade3dbd2c99da..752ebe194ffcf 100644 --- a/pandas/tests/arrays/floating/test_astype.py +++ 
b/pandas/tests/arrays/floating/test_astype.py @@ -63,12 +63,19 @@ def test_astype_to_integer_array(): tm.assert_extension_array_equal(result, expected) -def test_astype_str(): +def test_astype_str(using_infer_string): a = pd.array([0.1, 0.2, None], dtype="Float64") - expected = np.array(["0.1", "0.2", "<NA>"], dtype="U32") - tm.assert_numpy_array_equal(a.astype(str), expected) - tm.assert_numpy_array_equal(a.astype("str"), expected) + if using_infer_string: + expected = pd.array(["0.1", "0.2", None], dtype=pd.StringDtype(na_value=np.nan)) + + tm.assert_extension_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype("str"), expected) + else: + expected = np.array(["0.1", "0.2", "<NA>"], dtype="U32") + + tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_numpy_array_equal(a.astype("str"), expected) def test_astype_copy(): diff --git a/pandas/tests/arrays/floating/test_to_numpy.py b/pandas/tests/arrays/floating/test_to_numpy.py index a25ac40cb3e7c..e954cecba417a 100644 --- a/pandas/tests/arrays/floating/test_to_numpy.py +++ b/pandas/tests/arrays/floating/test_to_numpy.py @@ -33,10 +33,10 @@ def test_to_numpy_float(box): tm.assert_numpy_array_equal(result, expected) arr = con([0.1, 0.2, None], dtype="Float64") - with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): - result = arr.to_numpy(dtype="float64") + result = arr.to_numpy(dtype="float64") + expected = np.array([0.1, 0.2, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) - # need to explicitly specify na_value result = arr.to_numpy(dtype="float64", na_value=np.nan) expected = np.array([0.1, 0.2, np.nan], dtype="float64") tm.assert_numpy_array_equal(result, expected) @@ -100,7 +100,7 @@ def test_to_numpy_dtype(box, dtype): tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("dtype", ["float64", "float32", "int32", "int64", "bool"]) +@pytest.mark.parametrize("dtype", ["int32", "int64", "bool"]) @pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) def test_to_numpy_na_raises(box, dtype): con = pd.Series if box else pd.array diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index d979dd445a61a..dee3deeee0f2f 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -172,18 +172,11 @@ def test_numpy_zero_dim_ndarray(other): # ----------------------------------------------------------------------------- -def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): +def test_error_invalid_values(data, all_arithmetic_operators): op = all_arithmetic_operators s = pd.Series(data) ops = getattr(s, op) - if using_infer_string: - import pyarrow as pa - - errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) - else: - errs = TypeError - # invalid scalars msg = "|".join( [ @@ -197,24 +190,22 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "Addition/subtraction of integers and integer-arrays with Timestamp", "has no kernel", "not implemented", + "The 'out' kwarg is necessary.
Use numpy.strings.multiply without it.", + "not supported for dtype", ] ) - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops("foo") - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Timestamp("20180101")) # invalid array-likes str_ser = pd.Series("foo", index=s.index) # with pytest.raises(TypeError, match=msg): - if ( - all_arithmetic_operators - in [ - "__mul__", - "__rmul__", - ] - and not using_infer_string - ): # (data[~data.isna()] >= 0).all(): + if all_arithmetic_operators in [ + "__mul__", + "__rmul__", + ]: # (data[~data.isna()] >= 0).all(): res = ops(str_ser) expected = pd.Series(["foo" * x for x in data], index=s.index) expected = expected.fillna(np.nan) @@ -223,7 +214,7 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string # more-correct than np.nan here. tm.assert_series_equal(res, expected) else: - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(str_ser) msg = "|".join( @@ -238,9 +229,10 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "cannot subtract DatetimeArray from ndarray", "has no kernel", "not implemented", + "not supported for dtype", ] ) - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Series(pd.date_range("20180101", periods=len(s)))) diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index e3848cdfe3aa9..90879d8bd3063 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -271,19 +271,26 @@ def test_to_numpy_dtype(dtype, in_series): tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("dtype", ["float64", "int64", "bool"]) +@pytest.mark.parametrize("dtype", ["int64", "bool"]) def test_to_numpy_na_raises(dtype): a = pd.array([0, 1, None], dtype="Int64") with pytest.raises(ValueError, match=dtype): a.to_numpy(dtype=dtype) -def test_astype_str(): +def test_astype_str(using_infer_string): a = pd.array([1, 2, None], dtype="Int64") - expected = np.array(["1", "2", "<NA>"], dtype=f"{tm.ENDIAN}U21") - tm.assert_numpy_array_equal(a.astype(str), expected) - tm.assert_numpy_array_equal(a.astype("str"), expected) + if using_infer_string: + expected = pd.array(["1", "2", None], dtype=pd.StringDtype(na_value=np.nan)) + + tm.assert_extension_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype("str"), expected) + else: + expected = np.array(["1", "2", "<NA>"], dtype=f"{tm.ENDIAN}U21") + + tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_numpy_array_equal(a.astype("str"), expected) def test_astype_boolean(): diff --git a/pandas/tests/arrays/integer/test_reduction.py b/pandas/tests/arrays/integer/test_reduction.py index db04862e4ea07..1c91cd25ba69c 100644 --- a/pandas/tests/arrays/integer/test_reduction.py +++ b/pandas/tests/arrays/integer/test_reduction.py @@ -102,9 +102,7 @@ def test_groupby_reductions(op, expected): ["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")], ], ) -def test_mixed_reductions(op, expected, using_infer_string): - if op in ["any", "all"] and using_infer_string: - expected = expected.astype("bool") +def test_mixed_reductions(op, expected): df = DataFrame( { "A": ["a", "b", "b"], diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index 7a89656bd5aa0..293ee4095d02e 100644 ---
a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -8,6 +8,7 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) + pa = pytest.importorskip("pyarrow") from pandas.core.arrays.arrow._arrow_utils import pyarrow_array_to_numpy_and_mask diff --git a/pandas/tests/arrays/masked/test_function.py b/pandas/tests/arrays/masked/test_function.py index 4c7bd6e293ef4..b259018cd6121 100644 --- a/pandas/tests/arrays/masked/test_function.py +++ b/pandas/tests/arrays/masked/test_function.py @@ -5,6 +5,7 @@ import pandas as pd import pandas._testing as tm +from pandas.core.arrays import BaseMaskedArray arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES] arrays += [ @@ -55,3 +56,19 @@ def test_tolist(data): result = data.tolist() expected = list(data) tm.assert_equal(result, expected) + + +def test_to_numpy(): + # GH#56991 + + class MyStringArray(BaseMaskedArray): + dtype = pd.StringDtype() + _dtype_cls = pd.StringDtype + _internal_fill_value = pd.NA + + arr = MyStringArray( + values=np.array(["a", "b", "c"]), mask=np.array([False, True, False]) + ) + result = arr.to_numpy() + expected = np.array(["a", pd.NA, "c"]) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/arrays/masked/test_indexing.py b/pandas/tests/arrays/masked/test_indexing.py index 28ee451a7ddd7..753d562c87ffa 100644 --- a/pandas/tests/arrays/masked/test_indexing.py +++ b/pandas/tests/arrays/masked/test_indexing.py @@ -8,7 +8,7 @@ class TestSetitemValidation: def _check_setitem_invalid(self, arr, invalid): - msg = f"Invalid value '{str(invalid)}' for dtype {arr.dtype}" + msg = f"Invalid value '{invalid!s}' for dtype '{arr.dtype}'" msg = re.escape(msg) with pytest.raises(TypeError, match=msg): arr[0] = invalid diff --git a/pandas/tests/arrays/numpy_/test_numpy.py b/pandas/tests/arrays/numpy_/test_numpy.py index 5112ce262f771..f21fb4ccfba07 100644 --- a/pandas/tests/arrays/numpy_/test_numpy.py +++ b/pandas/tests/arrays/numpy_/test_numpy.py @@ -21,7 +21,7 @@ np.array([True, False], dtype=bool), np.array([0, 1], dtype="datetime64[ns]"), np.array([0, 1], dtype="timedelta64[ns]"), - ] + ], ) def any_numpy_array(request): """ @@ -29,7 +29,7 @@ def any_numpy_array(request): This excludes string and bytes. 
""" - return request.param + return request.param.copy() # ---------------------------------------------------------------------------- @@ -322,3 +322,30 @@ def test_factorize_unsigned(): tm.assert_numpy_array_equal(res_codes, exp_codes) tm.assert_extension_array_equal(res_unique, NumpyExtensionArray(exp_unique)) + + +# ---------------------------------------------------------------------------- +# Output formatting + + +def test_array_repr(any_numpy_array): + # GH#61085 + nparray = any_numpy_array + arr = NumpyExtensionArray(nparray) + if nparray.dtype == "object": + values = "['a', 'b']" + elif nparray.dtype == "float64": + values = "[0.0, 1.0]" + elif str(nparray.dtype).startswith("int"): + values = "[0, 1]" + elif nparray.dtype == "complex128": + values = "[0j, (1+2j)]" + elif nparray.dtype == "bool": + values = "[True, False]" + elif nparray.dtype == "datetime64[ns]": + values = "[1970-01-01T00:00:00.000000000, 1970-01-01T00:00:00.000000001]" + elif nparray.dtype == "timedelta64[ns]": + values = "[0 nanoseconds, 1 nanoseconds]" + expected = f"\n{values}\nLength: 2, dtype: {nparray.dtype}" + result = repr(arr) + assert result == expected, f"{result} vs {expected}" diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 883d6ea3959ff..b2a570b14df3c 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -4,6 +4,7 @@ import pytest from pandas._libs.sparse import IntIndex +from pandas.compat.numpy import np_version_gt2 import pandas as pd from pandas import ( @@ -478,3 +479,33 @@ def test_zero_sparse_column(): expected = pd.DataFrame({"A": SparseArray([0, 0]), "B": [1, 3]}, index=[0, 2]) tm.assert_frame_equal(result, expected) + + +def test_array_interface(arr_data, arr): + # https://github.com/pandas-dev/pandas/pull/60046 + result = np.asarray(arr) + tm.assert_numpy_array_equal(result, arr_data) + + # it always gives a copy by default + result_copy1 = np.asarray(arr) + result_copy2 = np.asarray(arr) + assert not np.may_share_memory(result_copy1, result_copy2) + + # or with explicit copy=True + result_copy1 = np.array(arr, copy=True) + result_copy2 = np.array(arr, copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. 
+ return + + msg = "Starting with NumPy 2.0, the behavior of the 'copy' keyword has changed" + with tm.assert_produces_warning(FutureWarning, match=msg): + np.array(arr, copy=False) + + # except when there are actually no sparse filled values + arr2 = SparseArray(np.array([1, 2, 3])) + result_nocopy1 = np.array(arr2, copy=False) + result_nocopy2 = np.array(arr2, copy=False) + assert np.may_share_memory(result_nocopy1, result_nocopy2) diff --git a/pandas/tests/arrays/sparse/test_astype.py b/pandas/tests/arrays/sparse/test_astype.py index 83a507e679d46..e6e4a11a0f5ab 100644 --- a/pandas/tests/arrays/sparse/test_astype.py +++ b/pandas/tests/arrays/sparse/test_astype.py @@ -81,8 +81,8 @@ def test_astype_all(self, any_real_numpy_dtype): ), ( SparseArray([0, 1, 10]), - str, - SparseArray(["0", "1", "10"], dtype=SparseDtype(str, "0")), + np.str_, + SparseArray(["0", "1", "10"], dtype=SparseDtype(np.str_, "0")), ), (SparseArray(["10", "20"]), float, SparseArray([10.0, 20.0])), ( diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py index 234f4092421e5..149c28341ba3d 100644 --- a/pandas/tests/arrays/sparse/test_dtype.py +++ b/pandas/tests/arrays/sparse/test_dtype.py @@ -177,7 +177,7 @@ def test_construct_from_string_fill_value_raises(string): [ (SparseDtype(int, 0), float, SparseDtype(float, 0.0)), (SparseDtype(int, 1), float, SparseDtype(float, 1.0)), - (SparseDtype(int, 1), str, SparseDtype(object, "1")), + (SparseDtype(int, 1), np.str_, SparseDtype(object, "1")), (SparseDtype(float, 1.5), int, SparseDtype(int, 1)), ], ) diff --git a/pandas/tests/arrays/string_/test_concat.py b/pandas/tests/arrays/string_/test_concat.py new file mode 100644 index 0000000000000..320d700b2b6c3 --- /dev/null +++ b/pandas/tests/arrays/string_/test_concat.py @@ -0,0 +1,73 @@ +import numpy as np +import pytest + +from pandas.compat import HAS_PYARROW + +from pandas.core.dtypes.cast import find_common_type + +import pandas as pd +import pandas._testing as tm +from pandas.util.version import Version + + +@pytest.mark.parametrize( + "to_concat_dtypes, result_dtype", + [ + # same types + ([("pyarrow", pd.NA), ("pyarrow", pd.NA)], ("pyarrow", pd.NA)), + ([("pyarrow", np.nan), ("pyarrow", np.nan)], ("pyarrow", np.nan)), + ([("python", pd.NA), ("python", pd.NA)], ("python", pd.NA)), + ([("python", np.nan), ("python", np.nan)], ("python", np.nan)), + # pyarrow preference + ([("pyarrow", pd.NA), ("python", pd.NA)], ("pyarrow", pd.NA)), + # NA preference + ([("python", pd.NA), ("python", np.nan)], ("python", pd.NA)), + ], +) +def test_concat_series(request, to_concat_dtypes, result_dtype): + if any(storage == "pyarrow" for storage, _ in to_concat_dtypes) and not HAS_PYARROW: + pytest.skip("Could not import 'pyarrow'") + + ser_list = [ + pd.Series(["a", "b", None], dtype=pd.StringDtype(storage, na_value)) + for storage, na_value in to_concat_dtypes + ] + + result = pd.concat(ser_list, ignore_index=True) + expected = pd.Series( + ["a", "b", None, "a", "b", None], dtype=pd.StringDtype(*result_dtype) + ) + tm.assert_series_equal(result, expected) + + # order doesn't matter for result + result = pd.concat(ser_list[::1], ignore_index=True) + tm.assert_series_equal(result, expected) + + +def test_concat_with_object(string_dtype_arguments): + # _get_common_dtype cannot inspect values, so object dtype with strings still + # results in object dtype + result = pd.concat( + [ + pd.Series(["a", "b", None], dtype=pd.StringDtype(*string_dtype_arguments)), + pd.Series(["a", "b", None], dtype=object), + 
] + ) + assert result.dtype == np.dtype("object") + + +def test_concat_with_numpy(string_dtype_arguments): + # common type with a numpy string dtype always preserves the pandas string dtype + dtype = pd.StringDtype(*string_dtype_arguments) + assert find_common_type([dtype, np.dtype("U")]) == dtype + assert find_common_type([np.dtype("U"), dtype]) == dtype + assert find_common_type([dtype, np.dtype("U10")]) == dtype + assert find_common_type([np.dtype("U10"), dtype]) == dtype + + # with any other numpy dtype -> object + assert find_common_type([dtype, np.dtype("S")]) == np.dtype("object") + assert find_common_type([dtype, np.dtype("int64")]) == np.dtype("object") + + if Version(np.__version__) >= Version("2"): + assert find_common_type([dtype, np.dtypes.StringDType()]) == dtype + assert find_common_type([np.dtypes.StringDType(), dtype]) == dtype diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 320bdca60a932..c7f854c11f3dd 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -7,29 +7,35 @@ import numpy as np import pytest -from pandas.compat.pyarrow import pa_version_under12p0 +from pandas._config import using_string_dtype + +from pandas.compat.pyarrow import ( + pa_version_under12p0, + pa_version_under19p0, +) from pandas.core.dtypes.common import is_dtype_equal import pandas as pd import pandas._testing as tm +from pandas.core.arrays.string_ import StringArrayNumpySemantics from pandas.core.arrays.string_arrow import ( ArrowStringArray, ArrowStringArrayNumpySemantics, ) -def na_val(dtype): - if dtype.storage == "pyarrow_numpy": - return np.nan - else: - return pd.NA +@pytest.fixture +def dtype(string_dtype_arguments): + """Fixture giving StringDtype from parametrized storage and na_value arguments""" + storage, na_value = string_dtype_arguments + return pd.StringDtype(storage=storage, na_value=na_value) @pytest.fixture -def dtype(string_storage): - """Fixture giving StringDtype from parametrized 'string_storage'""" - return pd.StringDtype(storage=string_storage) +def dtype2(string_dtype_arguments2): + storage, na_value = string_dtype_arguments2 + return pd.StringDtype(storage=storage, na_value=na_value) @pytest.fixture @@ -38,26 +44,58 @@ def cls(dtype): return dtype.construct_array_type() +def test_dtype_constructor(): + pytest.importorskip("pyarrow") + + with tm.assert_produces_warning(FutureWarning): + dtype = pd.StringDtype("pyarrow_numpy") + assert dtype == pd.StringDtype("pyarrow", na_value=np.nan) + + +def test_dtype_equality(): + pytest.importorskip("pyarrow") + + dtype1 = pd.StringDtype("python") + dtype2 = pd.StringDtype("pyarrow") + dtype3 = pd.StringDtype("pyarrow", na_value=np.nan) + + assert dtype1 == pd.StringDtype("python", na_value=pd.NA) + assert dtype1 != dtype2 + assert dtype1 != dtype3 + + assert dtype2 == pd.StringDtype("pyarrow", na_value=pd.NA) + assert dtype2 != dtype1 + assert dtype2 != dtype3 + + assert dtype3 == pd.StringDtype("pyarrow", na_value=np.nan) + assert dtype3 == pd.StringDtype("pyarrow", na_value=float("nan")) + assert dtype3 != dtype1 + assert dtype3 != dtype2 + + def test_repr(dtype): df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)}) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected = " A\n0 a\n1 NaN\n2 b" else: expected = " A\n0 a\n1 \n2 b" assert repr(df) == expected - if dtype.storage == "pyarrow_numpy": - expected = "0 a\n1 NaN\n2 b\nName: A, dtype: string" + if dtype.na_value is np.nan: + 
expected = "0 a\n1 NaN\n2 b\nName: A, dtype: str" else: expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected - if dtype.storage == "pyarrow": + if dtype.storage == "pyarrow" and dtype.na_value is pd.NA: arr_name = "ArrowStringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" - elif dtype.storage == "pyarrow_numpy": + elif dtype.storage == "pyarrow" and dtype.na_value is np.nan: arr_name = "ArrowStringArrayNumpySemantics" - expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str" + elif dtype.storage == "python" and dtype.na_value is np.nan: + arr_name = "StringArrayNumpySemantics" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str" else: arr_name = "StringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" @@ -67,23 +105,17 @@ def test_repr(dtype): def test_none_to_nan(cls, dtype): a = cls._from_sequence(["a", None, "b"], dtype=dtype) assert a[1] is not None - assert a[1] is na_val(a.dtype) + assert a[1] is a.dtype.na_value def test_setitem_validates(cls, dtype): arr = cls._from_sequence(["a", "b"], dtype=dtype) - if cls is pd.arrays.StringArray: - msg = "Cannot set non-string value '10' into a StringArray." - else: - msg = "Scalar must be NA or str" + msg = "Invalid value '10' for dtype 'str" with pytest.raises(TypeError, match=msg): arr[0] = 10 - if cls is pd.arrays.StringArray: - msg = "Must provide strings." - else: - msg = "Scalar must be NA or str" + msg = "Invalid value for dtype 'str" with pytest.raises(TypeError, match=msg): arr[:] = np.array([1, 2]) @@ -149,8 +181,8 @@ def test_add(dtype): tm.assert_series_equal(result, expected) -def test_add_2d(dtype, request, arrow_string_storage): - if dtype.storage in arrow_string_storage: +def test_add_2d(dtype, request): + if dtype.storage == "pyarrow": reason = "Failed: DID NOT RAISE " mark = pytest.mark.xfail(raises=None, reason=reason) request.applymarker(mark) @@ -224,7 +256,7 @@ def test_comparison_methods_scalar(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = "a" result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected = np.array([getattr(item, op_name)(other) for item in a]) if comparison_op == operator.ne: expected[1] = True @@ -243,7 +275,7 @@ def test_comparison_methods_scalar_pd_na(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) result = getattr(a, op_name)(pd.NA) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: if operator.ne == comparison_op: expected = np.array([True, True, True]) else: @@ -270,7 +302,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected_data = { "__eq__": [False, False, False], "__ne__": [True, True, True], @@ -292,7 +324,7 @@ def test_comparison_methods_array(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = [None, None, "c"] result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: if operator.ne == comparison_op: expected = np.array([True, True, False]) else: @@ -322,6 +354,8 @@ def test_comparison_methods_array(comparison_op, dtype): def test_constructor_raises(cls): if cls is pd.arrays.StringArray: msg = "StringArray requires a sequence of strings or pandas.NA" + elif cls is StringArrayNumpySemantics: + msg 
= "StringArrayNumpySemantics requires a sequence of strings or NaN" else: msg = "Unsupported type '' for ArrowExtensionArray" @@ -331,7 +365,7 @@ def test_constructor_raises(cls): with pytest.raises(ValueError, match=msg): cls(np.array([])) - if cls is pd.arrays.StringArray: + if cls is pd.arrays.StringArray or cls is StringArrayNumpySemantics: # GH#45057 np.nan and None do NOT raise, as they are considered valid NAs # for string dtype cls(np.array(["a", np.nan], dtype=object)) @@ -372,6 +406,8 @@ def test_from_sequence_no_mutate(copy, cls, dtype): import pyarrow as pa expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True)) + elif cls is StringArrayNumpySemantics: + expected = cls(nan_arr) else: expected = cls(na_arr) @@ -386,7 +422,7 @@ def test_astype_int(dtype): tm.assert_numpy_array_equal(result, expected) arr = pd.array(["1", pd.NA, "3"], dtype=dtype) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: err = ValueError msg = "cannot convert float NaN to integer" else: @@ -416,7 +452,6 @@ def test_astype_float(dtype, any_float_dtype): @pytest.mark.parametrize("skipna", [True, False]) -@pytest.mark.xfail(reason="Not implemented StringArray.sum") def test_reduce(skipna, dtype): arr = pd.Series(["a", "b", "c"], dtype=dtype) result = arr.sum(skipna=skipna) @@ -424,7 +459,6 @@ def test_reduce(skipna, dtype): @pytest.mark.parametrize("skipna", [True, False]) -@pytest.mark.xfail(reason="Not implemented StringArray.sum") def test_reduce_missing(skipna, dtype): arr = pd.Series([None, "a", None, "b", "c", None], dtype=dtype) result = arr.sum(skipna=skipna) @@ -443,13 +477,13 @@ def test_min_max(method, skipna, dtype): expected = "a" if method == "min" else "c" assert result == expected else: - assert result is na_val(arr.dtype) + assert result is arr.dtype.na_value @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("box", [pd.Series, pd.array]) -def test_min_max_numpy(method, box, dtype, request, arrow_string_storage): - if dtype.storage in arrow_string_storage and box is pd.array: +def test_min_max_numpy(method, box, dtype, request): + if dtype.storage == "pyarrow" and box is pd.array: if box is pd.array: reason = "'<=' not supported between instances of 'str' and 'NoneType'" else: @@ -463,7 +497,7 @@ def test_min_max_numpy(method, box, dtype, request, arrow_string_storage): assert result == expected -def test_fillna_args(dtype, arrow_string_storage): +def test_fillna_args(dtype): # GH 37987 arr = pd.array(["a", pd.NA], dtype=dtype) @@ -476,10 +510,7 @@ def test_fillna_args(dtype, arrow_string_storage): expected = pd.array(["a", "b"], dtype=dtype) tm.assert_extension_array_equal(res, expected) - if dtype.storage in arrow_string_storage: - msg = "Invalid value '1' for dtype string" - else: - msg = "Cannot set non-string value '1' into a StringArray." 
+ msg = "Invalid value '1' for dtype 'str" with pytest.raises(TypeError, match=msg): arr.fillna(value=1) @@ -492,7 +523,7 @@ def test_arrow_array(dtype): data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) - if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: + if dtype.storage == "pyarrow" and pa_version_under12p0: expected = pa.chunked_array(expected) if dtype.storage == "python": expected = pc.cast(expected, pa.string()) @@ -500,17 +531,10 @@ def test_arrow_array(dtype): @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") -def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): +def test_arrow_roundtrip(dtype, string_storage, using_infer_string): # roundtrip possible from arrow 1.0.0 pa = pytest.importorskip("pyarrow") - if using_infer_string and string_storage2 != "pyarrow_numpy": - request.applymarker( - pytest.mark.xfail( - reason="infer_string takes precedence over string storage" - ) - ) - data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) @@ -518,29 +542,42 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" - with pd.option_context("string_storage", string_storage2): + with pd.option_context("string_storage", string_storage): result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage2}]") + if dtype.na_value is np.nan and not using_infer_string: + assert result["a"].dtype == "object" + else: + assert isinstance(result["a"].dtype, pd.StringDtype) + expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value)) + if using_infer_string: + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) + tm.assert_frame_equal(result, expected) + # ensure the missing value is represented by NA and not np.nan or None + assert result.loc[2, "a"] is result["a"].dtype.na_value + + +@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") +def test_arrow_from_string(using_infer_string): + # not roundtrip, but starting with pyarrow table without pandas metadata + pa = pytest.importorskip("pyarrow") + table = pa.table({"a": pa.array(["a", "b", None], type=pa.string())}) + + result = table.to_pandas() + + if using_infer_string and not pa_version_under19p0: + expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="str") + else: + expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="object") tm.assert_frame_equal(result, expected) - # ensure the missing value is represented by NA and not np.nan or None - assert result.loc[2, "a"] is na_val(result["a"].dtype) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") -def test_arrow_load_from_zero_chunks( - dtype, string_storage2, request, using_infer_string -): +def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): # GH-41040 pa = pytest.importorskip("pyarrow") - if using_infer_string and string_storage2 != "pyarrow_numpy": - request.applymarker( - pytest.mark.xfail( - reason="infer_string takes precedence over string storage" - ) - ) - data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) @@ -550,18 +587,26 @@ def test_arrow_load_from_zero_chunks( assert table.field("a").type == "large_string" 
# Instantiate the same table with no chunks at all table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) - with pd.option_context("string_storage", string_storage2): + with pd.option_context("string_storage", string_storage): result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage2}]") - tm.assert_frame_equal(result, expected) + + if dtype.na_value is np.nan and not using_string_dtype(): + assert result["a"].dtype == "object" + else: + assert isinstance(result["a"].dtype, pd.StringDtype) + expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value)) + if using_infer_string: + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) + tm.assert_frame_equal(result, expected) def test_value_counts_na(dtype): - if getattr(dtype, "storage", "") == "pyarrow": - exp_dtype = "int64[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + if dtype.na_value is np.nan: exp_dtype = "int64" + elif dtype.storage == "pyarrow": + exp_dtype = "int64[pyarrow]" else: exp_dtype = "Int64" arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) @@ -575,10 +620,10 @@ def test_value_counts_na(dtype): def test_value_counts_with_normalize(dtype): - if getattr(dtype, "storage", "") == "pyarrow": - exp_dtype = "double[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + if dtype.na_value is np.nan: exp_dtype = np.float64 + elif dtype.storage == "pyarrow": + exp_dtype = "double[pyarrow]" else: exp_dtype = "Float64" ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) @@ -612,10 +657,23 @@ def test_use_inf_as_na(values, expected, dtype): tm.assert_frame_equal(result, expected) -def test_memory_usage(dtype, arrow_string_storage): +def test_value_counts_sort_false(dtype): + if dtype.na_value is np.nan: + exp_dtype = "int64" + elif dtype.storage == "pyarrow": + exp_dtype = "int64[pyarrow]" + else: + exp_dtype = "Int64" + ser = pd.Series(["a", "b", "c", "b"], dtype=dtype) + result = ser.value_counts(sort=False) + expected = pd.Series([1, 2, 1], index=ser[:3], dtype=exp_dtype, name="count") + tm.assert_series_equal(result, expected) + + +def test_memory_usage(dtype): # GH 33963 - if dtype.storage in arrow_string_storage: + if dtype.storage == "pyarrow": pytest.skip(f"not applicable for {dtype.storage}") series = pd.Series(["a", "b", "c"], dtype=dtype) @@ -635,7 +693,7 @@ def test_astype_from_float_dtype(float_dtype, dtype): def test_to_numpy_returns_pdna_default(dtype): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = np.array(arr) - expected = np.array(["a", na_val(dtype), "b"], dtype=object) + expected = np.array(["a", dtype.na_value, "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -666,6 +724,35 @@ def test_isin(dtype, fixed_now_ts): expected = pd.Series([True, False, False]) tm.assert_series_equal(result, expected) + result = s.isin([fixed_now_ts]) + expected = pd.Series([False, False, False]) + tm.assert_series_equal(result, expected) + + +def test_isin_string_array(dtype, dtype2): + s = pd.Series(["a", "b", None], dtype=dtype) + + result = s.isin(pd.array(["a", "c"], dtype=dtype2)) + expected = pd.Series([True, False, False]) + tm.assert_series_equal(result, expected) + + result = s.isin(pd.array(["a", None], dtype=dtype2)) + expected = pd.Series([True, False, True]) + tm.assert_series_equal(result, expected) + + +def test_isin_arrow_string_array(dtype): + pa = pytest.importorskip("pyarrow") + s = 
pd.Series(["a", "b", None], dtype=dtype) + + result = s.isin(pd.array(["a", "c"], dtype=pd.ArrowDtype(pa.string()))) + expected = pd.Series([True, False, False]) + tm.assert_series_equal(result, expected) + + result = s.isin(pd.array(["a", None], dtype=pd.ArrowDtype(pa.string()))) + expected = pd.Series([True, False, True]) + tm.assert_series_equal(result, expected) + def test_setitem_scalar_with_mask_validation(dtype): # https://github.com/pandas-dev/pandas/issues/47628 @@ -675,14 +762,11 @@ def test_setitem_scalar_with_mask_validation(dtype): mask = np.array([False, True, False]) ser[mask] = None - assert ser.array[1] is na_val(ser.dtype) + assert ser.array[1] is ser.dtype.na_value # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) - if type(ser.array) is pd.arrays.StringArray: - msg = "Cannot set non-string value" - else: - msg = "Scalar must be NA or str" + msg = "Invalid value '1' for dtype 'str" with pytest.raises(TypeError, match=msg): ser[mask] = 1 diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index d7811b6fed883..aa87f5fc0f49a 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -26,15 +26,18 @@ def test_eq_all_na(): tm.assert_extension_array_equal(result, expected) -def test_config(string_storage, request, using_infer_string): - if using_infer_string and string_storage != "pyarrow_numpy": - request.applymarker(pytest.mark.xfail(reason="infer string takes precedence")) +def test_config(string_storage, using_infer_string): + # with the default string_storage setting + # always "python" at the moment + assert StringDtype().storage == "python" + with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage result = pd.array(["a", "b"]) assert result.dtype.storage == string_storage - dtype = StringDtype(string_storage) + # pd.array(..) 
by default always returns the NA-variant + dtype = StringDtype(string_storage, na_value=pd.NA) expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype) tm.assert_equal(result, expected) @@ -46,18 +49,18 @@ def test_config_bad_storage_raises(): @pytest.mark.parametrize("chunked", [True, False]) -@pytest.mark.parametrize("array", ["numpy", "pyarrow"]) +@pytest.mark.parametrize("array_lib", ["numpy", "pyarrow"]) -def test_constructor_not_string_type_raises(array, chunked, arrow_string_storage): +def test_constructor_not_string_type_raises(array_lib, chunked): pa = pytest.importorskip("pyarrow") - array = pa if array in arrow_string_storage else np + array_lib = pa if array_lib == "pyarrow" else np - arr = array.array([1, 2, 3]) + arr = array_lib.array([1, 2, 3]) if chunked: - if array is np: + if array_lib is np: pytest.skip("chunked not applicable to numpy array") arr = pa.chunked_array(arr) - if array is np: + if array_lib is np: msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowExtensionArray" else: msg = re.escape( @@ -82,19 +85,32 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked): ArrowStringArray(arr) -@pytest.mark.xfail( - reason="dict conversion does not seem to be implemented for large string in arrow" -) +@pytest.mark.parametrize("string_type", ["string", "large_string"]) @pytest.mark.parametrize("chunked", [True, False]) -def test_constructor_valid_string_type_value_dictionary(chunked): +def test_constructor_valid_string_type_value_dictionary(string_type, chunked): pa = pytest.importorskip("pyarrow") - arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode() + arr = pa.array(["1", "2", "3"], getattr(pa, string_type)()).dictionary_encode() + if chunked: + arr = pa.chunked_array(arr) + + arr = ArrowStringArray(arr) + # dictionary type get converted to dense large string array + assert pa.types.is_large_string(arr._pa_array.type) + + +@pytest.mark.parametrize("chunked", [True, False]) +def test_constructor_valid_string_view(chunked): + # requires pyarrow>=18 for casting string_view to string + pa = pytest.importorskip("pyarrow", minversion="18") + + arr = pa.array(["1", "2", "3"], pa.string_view()) if chunked: arr = pa.chunked_array(arr) arr = ArrowStringArray(arr) - assert pa.types.is_string(arr._pa_array.type.value_type) + # dictionary type get converted to dense large string array + assert pa.types.is_large_string(arr._pa_array.type) def test_constructor_from_list(): @@ -239,10 +255,11 @@ def test_setitem_invalid_indexer_raises(): arr[[0, 1]] = ["foo", "bar", "baz"] -@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"]) -def test_pickle_roundtrip(dtype): +@pytest.mark.parametrize("na_value", [pd.NA, np.nan]) +def test_pickle_roundtrip(na_value): # GH 42600 pytest.importorskip("pyarrow") + dtype = StringDtype("pyarrow", na_value=na_value) expected = pd.Series(range(10), dtype=dtype) expected_sliced = expected.head(2) full_pickled = pickle.dumps(expected) @@ -260,6 +277,6 @@ def test_pickle_roundtrip(dtype): def test_string_dtype_error_message(): # GH#55051 pytest.importorskip("pyarrow") - msg = "Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'." + msg = "Storage must be 'python' or 'pyarrow'."
with pytest.raises(ValueError, match=msg): StringDtype("bla") diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 96263f498935b..158a963845b06 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -6,6 +6,8 @@ import pytest import pytz +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm from pandas.api.extensions import register_extension_dtype @@ -216,6 +218,15 @@ def test_dt64_array(dtype_unit): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + ["a", None], + "str", + pd.StringDtype(na_value=np.nan) + .construct_array_type() + ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)) + if using_string_dtype() + else NumpyExtensionArray(np.array(["a", "None"])), + ), ( ["a", None], pd.StringDtype(), @@ -223,6 +234,29 @@ def test_dt64_array(dtype_unit): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + ["a", None], + pd.StringDtype(na_value=np.nan), + pd.StringDtype(na_value=np.nan) + .construct_array_type() + ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)), + ), + ( + # numpy array with string dtype + np.array(["a", "b"], dtype=str), + pd.StringDtype(), + pd.StringDtype() + .construct_array_type() + ._from_sequence(["a", "b"], dtype=pd.StringDtype()), + ), + ( + # numpy array with string dtype + np.array(["a", "b"], dtype=str), + pd.StringDtype(na_value=np.nan), + pd.StringDtype(na_value=np.nan) + .construct_array_type() + ._from_sequence(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), + ), # Boolean ( [True, None], @@ -367,6 +401,13 @@ def test_array_copy(): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + # numpy array with string dtype + np.array(["a", "b"], dtype=str), + pd.StringDtype() + .construct_array_type() + ._from_sequence(["a", "b"], dtype=pd.StringDtype()), + ), # Boolean ([True, False], BooleanArray._from_sequence([True, False], dtype="boolean")), ([True, None], BooleanArray._from_sequence([True, None], dtype="boolean")), diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 82524ea115019..0397913b69b26 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -12,6 +12,7 @@ Timestamp, ) from pandas._libs.tslibs.dtypes import freq_to_period_freqstr +from pandas.compat.numpy import np_version_gt2 import pandas as pd from pandas import ( @@ -638,13 +639,14 @@ def test_round(self, arr1d): def test_array_interface(self, datetime_index): arr = datetime_index._data + copy_false = None if np_version_gt2 else False # default asarray gives the same underlying data (for tz naive) result = np.asarray(arr) expected = arr._ndarray assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, copy=False) + result = np.array(arr, copy=copy_false) assert result is expected tm.assert_numpy_array_equal(result, expected) @@ -653,11 +655,13 @@ def test_array_interface(self, datetime_index): expected = arr._ndarray assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, dtype="datetime64[ns]", copy=False) + result = np.array(arr, dtype="datetime64[ns]", copy=copy_false) assert result is expected tm.assert_numpy_array_equal(result, expected) result = np.array(arr, dtype="datetime64[ns]") - assert result is not expected + if not np_version_gt2: + # TODO: GH 57739 + 
assert result is not expected tm.assert_numpy_array_equal(result, expected) # to object dtype @@ -696,6 +700,7 @@ def test_array_tz(self, arr1d): # GH#23524 arr = arr1d dti = self.index_cls(arr1d) + copy_false = None if np_version_gt2 else False expected = dti.asi8.view("M8[ns]") result = np.array(arr, dtype="M8[ns]") @@ -704,17 +709,18 @@ def test_array_tz(self, arr1d): result = np.array(arr, dtype="datetime64[ns]") tm.assert_numpy_array_equal(result, expected) - # check that we are not making copies when setting copy=False - result = np.array(arr, dtype="M8[ns]", copy=False) + # check that we are not making copies when setting copy=copy_false + result = np.array(arr, dtype="M8[ns]", copy=copy_false) assert result.base is expected.base assert result.base is not None - result = np.array(arr, dtype="datetime64[ns]", copy=False) + result = np.array(arr, dtype="datetime64[ns]", copy=copy_false) assert result.base is expected.base assert result.base is not None def test_array_i8_dtype(self, arr1d): arr = arr1d dti = self.index_cls(arr1d) + copy_false = None if np_version_gt2 else False expected = dti.asi8 result = np.array(arr, dtype="i8") @@ -723,8 +729,8 @@ def test_array_i8_dtype(self, arr1d): result = np.array(arr, dtype=np.int64) tm.assert_numpy_array_equal(result, expected) - # check that we are still making copies when setting copy=False - result = np.array(arr, dtype="i8", copy=False) + # check that we are still making copies when setting copy=copy_false + result = np.array(arr, dtype="i8", copy=copy_false) assert result.base is not expected.base assert result.base is None @@ -880,20 +886,24 @@ def test_concat_same_type_different_freq(self, unit): tm.assert_datetime_array_equal(result, expected) - def test_strftime(self, arr1d): + def test_strftime(self, arr1d, using_infer_string): arr = arr1d result = arr.strftime("%Y %b") expected = np.array([ts.strftime("%Y %b") for ts in arr], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) - def test_strftime_nat(self): + def test_strftime_nat(self, using_infer_string): # GH 29578 arr = DatetimeIndex(["2019-01-01", NaT])._data result = arr.strftime("%Y-%m-%d") expected = np.array(["2019-01-01", np.nan], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) class TestTimedeltaArray(SharedTests): @@ -950,13 +960,14 @@ def test_int_properties(self, timedelta_index, propname): def test_array_interface(self, timedelta_index): arr = timedelta_index._data + copy_false = None if np_version_gt2 else False # default asarray gives the same underlying data result = np.asarray(arr) expected = arr._ndarray assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, copy=False) + result = np.array(arr, copy=copy_false) assert result is expected tm.assert_numpy_array_equal(result, expected) @@ -965,11 +976,13 @@ def test_array_interface(self, timedelta_index): expected = arr._ndarray assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, dtype="timedelta64[ns]", copy=False) + result = np.array(arr, dtype="timedelta64[ns]", copy=copy_false) assert result is expected tm.assert_numpy_array_equal(result, expected) result = np.array(arr, dtype="timedelta64[ns]") - assert result is not expected + if not 
np_version_gt2: + # TODO: GH 57739 + assert result is not expected tm.assert_numpy_array_equal(result, expected) # to object dtype @@ -1135,9 +1148,17 @@ def test_array_interface(self, arr1d): result = np.asarray(arr, dtype=object) tm.assert_numpy_array_equal(result, expected) + # to int64 gives the underlying representation result = np.asarray(arr, dtype="int64") tm.assert_numpy_array_equal(result, arr.asi8) + result2 = np.asarray(arr, dtype="int64") + assert np.may_share_memory(result, result2) + + result_copy1 = np.array(arr, dtype="int64", copy=True) + result_copy2 = np.array(arr, dtype="int64", copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + # to other dtypes msg = r"float\(\) argument must be a string or a( real)? number, not 'Period'" with pytest.raises(TypeError, match=msg): @@ -1147,20 +1168,24 @@ def test_array_interface(self, arr1d): expected = np.asarray(arr).astype("S20") tm.assert_numpy_array_equal(result, expected) - def test_strftime(self, arr1d): + def test_strftime(self, arr1d, using_infer_string): arr = arr1d result = arr.strftime("%Y") expected = np.array([per.strftime("%Y") for per in arr], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) - def test_strftime_nat(self): + def test_strftime_nat(self, using_infer_string): # GH 29578 arr = PeriodArray(PeriodIndex(["2019-01-01", NaT], dtype="period[D]")) result = arr.strftime("%Y-%m-%d") expected = np.array(["2019-01-01", np.nan], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 9a576be10d5ca..8f0576cc65a27 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -766,12 +766,18 @@ def test_iter_zoneinfo_fold(self, tz): "freq, freq_depr", [ ("2ME", "2M"), + ("2SME", "2SM"), + ("2SME", "2sm"), ("2QE", "2Q"), ("2QE-SEP", "2Q-SEP"), ("1YE", "1Y"), ("2YE-MAR", "2Y-MAR"), ("1YE", "1A"), ("2YE-MAR", "2A-MAR"), + ("2ME", "2m"), + ("2QE-SEP", "2q-sep"), + ("2YE-MAR", "2a-mar"), + ("2YE", "2y"), ], ) def test_date_range_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): @@ -784,6 +790,42 @@ def test_date_range_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): result = pd.date_range("1/1/2000", periods=4, freq=freq_depr) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("freq_depr", ["2H", "2CBH", "2MIN", "2S", "2mS", "2Us"]) + def test_date_range_uppercase_frequency_deprecated(self, freq_depr): + # GH#9586, GH#54939 + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version. Please use '{freq_depr.lower()[1:]}' instead." 
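The frequency-alias deprecations exercised around this point all encode one rule: sub-daily aliases ("h", "min", "s", ...) are lowercase going forward, while month/quarter/year aliases ("ME", "QE", "YE", ...) keep their uppercase spellings. A quick illustrative check of the warning path (pandas >= 2.2; the specific alias is an example):

    import warnings
    import pandas as pd

    expected = pd.date_range("2000-01-01", periods=4, freq="2h")
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", FutureWarning)
        # the uppercase spelling still works, but emits a FutureWarning
        result = pd.date_range("2000-01-01", periods=4, freq="2H")
    assert result.equals(expected)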
+ + expected = pd.date_range("1/1/2000", periods=4, freq=freq_depr.lower()) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = pd.date_range("1/1/2000", periods=4, freq=freq_depr) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "freq_depr", + [ + "2ye-mar", + "2ys", + "2qe", + "2qs-feb", + "2bqs", + "2sms", + "2bms", + "2cbme", + "2me", + "2w", + ], + ) + def test_date_range_lowercase_frequency_deprecated(self, freq_depr): + # GH#9586, GH#54939 + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version, please use '{freq_depr.upper()[1:]}' instead." + + expected = pd.date_range("1/1/2000", periods=4, freq=freq_depr.upper()) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = pd.date_range("1/1/2000", periods=4, freq=freq_depr) + tm.assert_index_equal(result, expected) + def test_factorize_sort_without_freq(): dta = DatetimeArray._from_sequence([0, 2, 1], dtype="M8[ns]") diff --git a/pandas/tests/base/test_constructors.py b/pandas/tests/base/test_constructors.py index f3ac60f672ee1..3434c8110a79c 100644 --- a/pandas/tests/base/test_constructors.py +++ b/pandas/tests/base/test_constructors.py @@ -177,3 +177,14 @@ def test_constructor_datetime_nonns(self, constructor): arr.flags.writeable = False result = constructor(arr) tm.assert_equal(result, expected) + + def test_constructor_from_dict_keys(self, constructor, using_infer_string): + # https://github.com/pandas-dev/pandas/issues/60343 + d = {"a": 1, "b": 2} + result = constructor(d.keys(), dtype="str") + if using_infer_string: + assert result.dtype == "str" + else: + assert result.dtype == "object" + expected = constructor(list(d.keys()), dtype="str") + tm.assert_equal(result, expected) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index fe0f1f1454a55..4d0e2d1ce0e07 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -1,6 +1,9 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW +from pandas.compat.numpy import np_version_gt2 + from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd @@ -20,6 +23,7 @@ SparseArray, TimedeltaArray, ) +from pandas.core.arrays.string_ import StringArrayNumpySemantics from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics @@ -218,7 +222,9 @@ def test_iter_box_period(self): ) def test_values_consistent(arr, expected_type, dtype, using_infer_string): if using_infer_string and dtype == "object": - expected_type = ArrowStringArrayNumpySemantics + expected_type = ( + ArrowStringArrayNumpySemantics if HAS_PYARROW else StringArrayNumpySemantics + ) l_values = Series(arr)._values r_values = pd.Index(arr)._values assert type(l_values) is expected_type @@ -290,24 +296,27 @@ def test_array_multiindex_raises(): @pytest.mark.parametrize( - "arr, expected", + "arr, expected, zero_copy", [ - (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)), - (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object)), + (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64), True), + (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object), False), ( pd.core.arrays.period_array(["2000", "2001"], freq="D"), np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]), + False, ), - (pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan])), + (pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan]), False), ( 
IntervalArray.from_breaks([0, 1, 2]), np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object), + False, ), - (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)), + (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64), False), # tz-naive datetime ( DatetimeArray._from_sequence(np.array(["2000", "2001"], dtype="M8[ns]")), np.array(["2000", "2001"], dtype="M8[ns]"), + True, ), # tz-aware stays tz`-aware ( @@ -322,6 +331,7 @@ def test_array_multiindex_raises(): Timestamp("2000-01-02", tz="US/Central"), ] ), + False, ), # Timedelta ( @@ -329,6 +339,7 @@ def test_array_multiindex_raises(): np.array([0, 3600000000000], dtype="i8").view("m8[ns]") ), np.array([0, 3600000000000], dtype="m8[ns]"), + True, ), # GH#26406 tz is preserved in Categorical[dt64tz] ( @@ -339,10 +350,11 @@ def test_array_multiindex_raises(): Timestamp("2016-01-02", tz="US/Pacific"), ] ), + False, ), ], ) -def test_to_numpy(arr, expected, index_or_series_or_array, request): +def test_to_numpy(arr, expected, zero_copy, index_or_series_or_array): box = index_or_series_or_array with tm.assert_produces_warning(None): @@ -354,6 +366,28 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request): result = np.asarray(thing) tm.assert_numpy_array_equal(result, expected) + # Additionally, we check the `copy=` semantics for array/asarray + # (these are implemented by us via `__array__`). + result_cp1 = np.array(thing, copy=True) + result_cp2 = np.array(thing, copy=True) + # When called with `copy=True` NumPy/we should ensure a copy was made + assert not np.may_share_memory(result_cp1, result_cp2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. + return + + if not zero_copy: + msg = "Starting with NumPy 2.0, the behavior of the 'copy' keyword has changed" + with tm.assert_produces_warning(FutureWarning, match=msg): + np.array(thing, copy=False) + + else: + result_nocopy1 = np.array(thing, copy=False) + result_nocopy2 = np.array(thing, copy=False) + # If copy=False was given, these must share the same data + assert np.may_share_memory(result_nocopy1, result_nocopy2) + @pytest.mark.parametrize("as_series", [True, False]) @pytest.mark.parametrize( @@ -366,13 +400,13 @@ def test_to_numpy_copy(arr, as_series, using_infer_string): # no copy by default result = obj.to_numpy() - if using_infer_string and arr.dtype == object: + if using_infer_string and arr.dtype == object and obj.dtype.storage == "pyarrow": assert np.shares_memory(arr, result) is False else: assert np.shares_memory(arr, result) is True result = obj.to_numpy(copy=False) - if using_infer_string and arr.dtype == object: + if using_infer_string and arr.dtype == object and obj.dtype.storage == "pyarrow": assert np.shares_memory(arr, result) is False else: assert np.shares_memory(arr, result) is True diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index 65e234e799353..1bf0a8d75dd4f 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import PYPY @@ -83,7 +83,7 @@ def test_ndarray_compat_properties(index_or_series_obj): @pytest.mark.skipif( - PYPY or using_pyarrow_string_dtype(), + PYPY or using_string_dtype(), reason="not relevant for PyPy doesn't work properly for arrow strings", ) def test_memory_usage(index_or_series_memory_obj): @@ -165,6 +165,7 @@ def test_searchsorted(request, 
index_or_series_obj): assert 0 <= index <= len(obj) +@pytest.mark.filterwarnings(r"ignore:Dtype inference:FutureWarning") def test_access_by_position(index_flat): index = index_flat @@ -180,9 +181,7 @@ def test_access_by_position(index_flat): assert index[-1] == index[size - 1] msg = f"index {size} is out of bounds for axis 0 with size {size}" - if is_dtype_equal(index.dtype, "string[pyarrow]") or is_dtype_equal( - index.dtype, "string[pyarrow_numpy]" - ): + if isinstance(index.dtype, pd.StringDtype) and index.dtype.storage == "pyarrow": msg = "index out of bounds" with pytest.raises(IndexError, match=msg): index[size] diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index d3fe144f70cfc..1add56b47b363 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - import pandas as pd import pandas._testing as tm from pandas.tests.base.common import allow_na_ops @@ -100,12 +98,11 @@ def test_nunique_null(null_obj, index_or_series_obj): @pytest.mark.single_cpu -@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="decoding fails") def test_unique_bad_unicode(index_or_series): # regression test for #34550 uval = "\ud83d" # smiley emoji - obj = index_or_series([uval] * 2) + obj = index_or_series([uval] * 2, dtype=object) result = obj.unique() if isinstance(obj, pd.Index): diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 2729666398877..1f643f24ed5f7 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -127,7 +127,7 @@ def test_value_counts_inferred(index_or_series, using_infer_string): else: exp = np.unique(np.array(s_values, dtype=np.object_)) if using_infer_string: - exp = array(exp) + exp = array(exp, dtype="str") tm.assert_equal(s.unique(), exp) assert s.nunique() == 4 @@ -205,7 +205,7 @@ def test_value_counts_bins(index_or_series, using_infer_string): else: exp = np.array(["a", "b", np.nan, "d"], dtype=object) if using_infer_string: - exp = array(exp) + exp = array(exp, dtype="str") tm.assert_equal(s.unique(), exp) assert s.nunique() == 3 diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 17630f14b08c7..cf3e50094ac97 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -606,11 +606,10 @@ def test_unary_in_array(self): ) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("expr", ["x < -0.1", "-5 > x"]) - def test_float_comparison_bin_op(self, dtype, expr): + def test_float_comparison_bin_op(self, float_numpy_dtype, expr): # GH 16363 - df = DataFrame({"x": np.array([0], dtype=dtype)}) + df = DataFrame({"x": np.array([0], dtype=float_numpy_dtype)}) res = df.eval(expr) assert res.values == np.array([False]) @@ -747,16 +746,26 @@ class TestTypeCasting: @pytest.mark.parametrize("op", ["+", "-", "*", "**", "/"]) # maybe someday... 
numexpr has too many upcasting rules now # chain(*(np.core.sctypes[x] for x in ['uint', 'int', 'float'])) - @pytest.mark.parametrize("dt", [np.float32, np.float64]) @pytest.mark.parametrize("left_right", [("df", "3"), ("3", "df")]) - def test_binop_typecasting(self, engine, parser, op, dt, left_right): - df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)), dtype=dt) + def test_binop_typecasting( + self, engine, parser, op, complex_or_float_dtype, left_right, request + ): + # GH#21374 + dtype = complex_or_float_dtype + df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)), dtype=dtype) left, right = left_right s = f"{left} {op} {right}" res = pd.eval(s, engine=engine, parser=parser) - assert df.values.dtype == dt - assert res.values.dtype == dt - tm.assert_frame_equal(res, eval(s)) + if dtype == "complex64" and engine == "numexpr": + mark = pytest.mark.xfail( + reason="numexpr issue with complex that are upcast " + "to complex 128 " + "https://github.com/pydata/numexpr/issues/492" + ) + request.applymarker(mark) + assert df.values.dtype == dtype + assert res.values.dtype == dtype + tm.assert_frame_equal(res, eval(s), check_exact=False) # ------------------------------------- diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index 9a3f83e0293f5..0dabec6014b0d 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + from pandas import ( DataFrame, Series, @@ -15,8 +17,12 @@ @pytest.mark.parametrize( "method", - [lambda ser: ser.values, lambda ser: np.asarray(ser)], - ids=["values", "asarray"], + [ + lambda ser: ser.values, + lambda ser: np.asarray(ser), + lambda ser: np.array(ser, copy=False), + ], + ids=["values", "asarray", "array"], ) def test_series_values(using_copy_on_write, method): ser = Series([1, 2, 3], name="name") @@ -45,8 +51,12 @@ def test_series_values(using_copy_on_write, method): @pytest.mark.parametrize( "method", - [lambda df: df.values, lambda df: np.asarray(df)], - ids=["values", "asarray"], + [ + lambda df: df.values, + lambda df: np.asarray(df), + lambda ser: np.array(ser, copy=False), + ], + ids=["values", "asarray", "array"], ) def test_dataframe_values(using_copy_on_write, using_array_manager, method): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -100,7 +110,7 @@ def test_series_to_numpy(using_copy_on_write): arr[0] = 0 assert ser.iloc[0] == 0 - # specify copy=False gives a writeable array + # specify copy=True gives a writeable array ser = Series([1, 2, 3], name="name") arr = ser.to_numpy(copy=True) assert not np.shares_memory(arr, get_array(ser, "name")) @@ -174,6 +184,24 @@ def test_dataframe_multiple_numpy_dtypes(): assert not np.shares_memory(arr, get_array(df, "a")) assert arr.flags.writeable is True + if np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. 
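The copy-view hunk above encodes NumPy 2's tightened `copy` keyword: `np.array(obj, copy=True)` must return a fresh, writeable buffer, while `copy=False` must either be zero-copy or fail, which pandas currently softens to a FutureWarning. A minimal consumer-side sketch, assuming NumPy >= 2 and a pandas build carrying this patch; the two-dtype frame is illustrative, not taken from the tests:

    import warnings

    import numpy as np
    import pandas as pd

    # Two dtypes mean two blocks, so a single zero-copy ndarray is impossible.
    df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})

    arr = np.array(df, copy=True)  # always a fresh, writeable array
    assert arr.flags.writeable

    # copy=False cannot be honored for this frame: pandas emits a
    # FutureWarning today and intends to raise in pandas 3.0. The pandas
    # test suite turns warnings into errors, which this filter mimics.
    with warnings.catch_warnings():
        warnings.simplefilter("error", FutureWarning)
        try:
            np.array(df, copy=False)
        except FutureWarning:
            pass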
+ + msg = "Starting with NumPy 2.0, the behavior of the 'copy' keyword has changed" + with pytest.raises(FutureWarning, match=msg): + arr = np.array(df, copy=False) + + arr = np.array(df, copy=True) + assert arr.flags.writeable is True + + +def test_dataframe_single_block_copy_true(): + # the copy=False/None cases are tested above in test_dataframe_values + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + arr = np.array(df, copy=True) + assert not np.shares_memory(arr, get_array(df, "a")) + assert arr.flags.writeable is True + def test_values_is_ea(using_copy_on_write): df = DataFrame({"a": date_range("2012-01-01", periods=3)}) diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index d462ce3d3187d..45fc3333c49a7 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -3,6 +3,7 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW from pandas.compat.pyarrow import pa_version_under12p0 import pandas.util._test_decorators as td @@ -132,7 +133,8 @@ def test_astype_string_and_object_update_original( tm.assert_frame_equal(df2, df_orig) -def test_astype_string_copy_on_pickle_roundrip(): +def test_astype_str_copy_on_pickle_roundrip(): + # TODO(infer_string) this test can be removed after 3.0 (once str is the default) # https://github.com/pandas-dev/pandas/issues/54654 # ensure_string_array may alter array inplace base = Series(np.array([(1, 2), None, 1], dtype="object")) @@ -141,6 +143,25 @@ def test_astype_string_copy_on_pickle_roundrip(): tm.assert_series_equal(base, base_copy) +def test_astype_string_copy_on_pickle_roundrip(any_string_dtype): + # https://github.com/pandas-dev/pandas/issues/54654 + # ensure_string_array may alter array inplace + base = Series(np.array([(1, 2), None, 1], dtype="object")) + base_copy = pickle.loads(pickle.dumps(base)) + base_copy.astype(any_string_dtype) + tm.assert_series_equal(base, base_copy) + + +def test_astype_string_read_only_on_pickle_roundrip(any_string_dtype): + # https://github.com/pandas-dev/pandas/issues/54654 + # ensure_string_array may alter read-only array inplace + base = Series(np.array([(1, 2), None, 1], dtype="object")) + base_copy = pickle.loads(pickle.dumps(base)) + base_copy._values.flags.writeable = False + base_copy.astype(any_string_dtype) + tm.assert_series_equal(base, base_copy) + + def test_astype_dict_dtypes(using_copy_on_write): df = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")} @@ -232,7 +253,7 @@ def test_convert_dtypes_infer_objects(using_copy_on_write): ) if using_copy_on_write: - assert np.shares_memory(get_array(ser), get_array(result)) + assert tm.shares_memory(get_array(ser), get_array(result)) else: assert not np.shares_memory(get_array(ser), get_array(result)) @@ -240,16 +261,21 @@ def test_convert_dtypes_infer_objects(using_copy_on_write): tm.assert_series_equal(ser, ser_orig) -def test_convert_dtypes(using_copy_on_write): +def test_convert_dtypes(using_copy_on_write, using_infer_string): df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]}) df_orig = df.copy() df2 = df.convert_dtypes() if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - assert np.shares_memory(get_array(df2, "d"), get_array(df, "d")) - assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + if using_infer_string and HAS_PYARROW: + # TODO the default nullable 
string dtype still uses python storage + # this should be changed to pyarrow if installed + assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert tm.shares_memory(get_array(df2, "d"), get_array(df, "d")) + assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert tm.shares_memory(get_array(df2, "c"), get_array(df, "c")) else: assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) @@ -257,4 +283,5 @@ def test_convert_dtypes(using_copy_on_write): assert not np.shares_memory(get_array(df2, "d"), get_array(df, "d")) df2.iloc[0, 0] = "x" + df2.iloc[0, 1] = 10 tm.assert_frame_equal(df, df_orig) diff --git a/pandas/tests/copy_view/test_chained_assignment_deprecation.py b/pandas/tests/copy_view/test_chained_assignment_deprecation.py index 80e38380ed27c..0a37f6b813e55 100644 --- a/pandas/tests/copy_view/test_chained_assignment_deprecation.py +++ b/pandas/tests/copy_view/test_chained_assignment_deprecation.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.compat import PY311 from pandas.errors import ( ChainedAssignmentError, SettingWithCopyWarning, @@ -42,7 +43,9 @@ def test_methods_iloc_warn(using_copy_on_write): ("ffill", ()), ], ) -def test_methods_iloc_getitem_item_cache(func, args, using_copy_on_write): +def test_methods_iloc_getitem_item_cache( + func, args, using_copy_on_write, warn_copy_on_write +): # ensure we don't incorrectly raise chained assignment warning because # of the item cache / iloc not setting the item cache df_orig = DataFrame({"a": [1, 2, 3], "b": 1}) @@ -66,14 +69,74 @@ def test_methods_iloc_getitem_item_cache(func, args, using_copy_on_write): ser = df["a"] getattr(ser, func)(*args, inplace=True) + df = df_orig.copy() + df["a"] # populate the item_cache + # TODO(CoW-warn) because of the usage of *args, this doesn't warn on Py3.11+ + if using_copy_on_write: + with tm.raises_chained_assignment_error(not PY311): + getattr(df["a"], func)(*args, inplace=True) + else: + with tm.assert_cow_warning(not PY311, match="A value"): + getattr(df["a"], func)(*args, inplace=True) + + df = df_orig.copy() + ser = df["a"] # populate the item_cache and keep ref + if using_copy_on_write: + with tm.raises_chained_assignment_error(not PY311): + getattr(df["a"], func)(*args, inplace=True) + else: + # ideally also warns on the default mode, but the ser' _cacher + # messes up the refcount + even in warning mode this doesn't trigger + # the warning of Py3.1+ (see above) + with tm.assert_cow_warning(warn_copy_on_write and not PY311, match="A value"): + getattr(df["a"], func)(*args, inplace=True) + + +def test_methods_iloc_getitem_item_cache_fillna( + using_copy_on_write, warn_copy_on_write +): + # ensure we don't incorrectly raise chained assignment warning because + # of the item cache / iloc not setting the item cache + df_orig = DataFrame({"a": [1, 2, 3], "b": 1}) + + df = df_orig.copy() + ser = df.iloc[:, 0] + ser.fillna(1, inplace=True) + + # parent that holds item_cache is dead, so don't increase ref count + df = df_orig.copy() + ser = df.copy()["a"] + ser.fillna(1, inplace=True) + + df = df_orig.copy() + df["a"] # populate the item_cache + ser = df.iloc[:, 0] # iloc creates a new object + ser.fillna(1, inplace=True) + + df = df_orig.copy() + df["a"] # populate the item_cache + ser = df["a"] + ser.fillna(1, inplace=True) + df = df_orig.copy() df["a"] # populate the item_cache if 
using_copy_on_write: with tm.raises_chained_assignment_error(): - df["a"].fillna(0, inplace=True) + df["a"].fillna(1, inplace=True) else: with tm.assert_cow_warning(match="A value"): - df["a"].fillna(0, inplace=True) + df["a"].fillna(1, inplace=True) + + df = df_orig.copy() + ser = df["a"] # populate the item_cache and keep ref + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + df["a"].fillna(1, inplace=True) + else: + # TODO(CoW-warn) ideally also warns on the default mode, but the ser' _cacher + # messes up the refcount + with tm.assert_cow_warning(warn_copy_on_write, match="A value"): + df["a"].fillna(1, inplace=True) # TODO(CoW-warn) expand the cases diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 1aa458a625028..66c9b456f18ad 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -285,7 +285,7 @@ def test_dataframe_from_dict_of_series_with_reindex(dtype): @pytest.mark.parametrize("cons", [Series, Index]) @pytest.mark.parametrize( - "data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], None)] + "data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], object)] ) def test_dataframe_from_series_or_index( using_copy_on_write, warn_copy_on_write, data, dtype, cons diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index 56e4b186350f2..eefd27964e6ae 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -1,6 +1,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW + from pandas import ( DataFrame, Index, @@ -13,8 +17,8 @@ def test_concat_frames(using_copy_on_write): - df = DataFrame({"b": ["a"] * 3}) - df2 = DataFrame({"a": ["a"] * 3}) + df = DataFrame({"b": ["a"] * 3}, dtype=object) + df2 = DataFrame({"a": ["a"] * 3}, dtype=object) df_orig = df.copy() result = concat([df, df2], axis=1) @@ -37,8 +41,8 @@ def test_concat_frames(using_copy_on_write): def test_concat_frames_updating_input(using_copy_on_write): - df = DataFrame({"b": ["a"] * 3}) - df2 = DataFrame({"a": ["a"] * 3}) + df = DataFrame({"b": ["a"] * 3}, dtype=object) + df2 = DataFrame({"a": ["a"] * 3}, dtype=object) result = concat([df, df2], axis=1) if using_copy_on_write: @@ -205,8 +209,8 @@ def test_concat_copy_keyword(using_copy_on_write, copy): ], ) def test_merge_on_key(using_copy_on_write, func): - df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]}) - df2 = DataFrame({"key": ["a", "b", "c"], "b": [4, 5, 6]}) + df1 = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "a": [1, 2, 3]}) + df2 = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "b": [4, 5, 6]}) df1_orig = df1.copy() df2_orig = df2.copy() @@ -268,8 +272,8 @@ def test_merge_on_index(using_copy_on_write): ], ) def test_merge_on_key_enlarging_one(using_copy_on_write, func, how): - df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]}) - df2 = DataFrame({"key": ["a", "b"], "b": [4, 5]}) + df1 = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "a": [1, 2, 3]}) + df2 = DataFrame({"key": Series(["a", "b"], dtype=object), "b": [4, 5]}) df1_orig = df1.copy() df2_orig = df2.copy() @@ -313,8 +317,13 @@ def test_merge_copy_keyword(using_copy_on_write, copy): assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b")) +@pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, + reason="TODO(infer_string); result.index 
infers str dtype while both " + "df1 and df2 index are object.", +) def test_join_on_key(using_copy_on_write): - df_index = Index(["a", "b", "c"], name="key") + df_index = Index(["a", "b", "c"], name="key", dtype=object) df1 = DataFrame({"a": [1, 2, 3]}, index=df_index.copy(deep=True)) df2 = DataFrame({"b": [4, 5, 6]}, index=df_index.copy(deep=True)) @@ -347,7 +356,7 @@ def test_join_on_key(using_copy_on_write): def test_join_multiple_dataframes_on_key(using_copy_on_write): - df_index = Index(["a", "b", "c"], name="key") + df_index = Index(["a", "b", "c"], name="key", dtype=object) df1 = DataFrame({"a": [1, 2, 3]}, index=df_index.copy(deep=True)) dfs_list = [ diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 6f3850ab64daa..479fa148f994a 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -1144,11 +1144,16 @@ def test_set_value_copy_only_necessary_column( df_orig = df.copy() view = df[:] - if val == "a" and indexer[0] != slice(None): + if val == "a" and not warn_copy_on_write: with tm.assert_produces_warning( FutureWarning, match="Setting an item of incompatible dtype is deprecated" ): indexer_func(df)[indexer] = val + if val == "a" and warn_copy_on_write: + with tm.assert_produces_warning( + FutureWarning, match="incompatible dtype|Setting a value on a view" + ): + indexer_func(df)[indexer] = val else: with tm.assert_cow_warning(warn_copy_on_write and val == 100): indexer_func(df)[indexer] = val @@ -1224,6 +1229,27 @@ def test_series_midx_tuples_slice(using_copy_on_write, warn_copy_on_write): tm.assert_series_equal(ser, expected) +def test_midx_read_only_bool_indexer(): + # GH#56635 + def mklbl(prefix, n): + return [f"{prefix}{i}" for i in range(n)] + + idx = pd.MultiIndex.from_product( + [mklbl("A", 4), mklbl("B", 2), mklbl("C", 4), mklbl("D", 2)] + ) + cols = pd.MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], names=["lvl0", "lvl1"] + ) + df = DataFrame(1, index=idx, columns=cols).sort_index().sort_index(axis=1) + + mask = df[("a", "foo")] == 1 + expected_mask = mask.copy() + result = df.loc[pd.IndexSlice[mask, :, ["C1", "C3"]], :] + expected = df.loc[pd.IndexSlice[:, :, ["C1", "C3"]], :] + tm.assert_frame_equal(result, expected) + tm.assert_series_equal(mask, expected_mask) + + def test_loc_enlarging_with_dataframe(using_copy_on_write): df = DataFrame({"a": [1, 2, 3]}) rhs = DataFrame({"b": [1, 2, 3], "c": [4, 5, 6]}) diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index a727331307d7e..8526d38588897 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -4,7 +4,10 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import DataFrame +from pandas import ( + DataFrame, + Series, +) import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -102,7 +105,7 @@ def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype): "c": [7, 8, 9], "d": [10, 11, 12], "e": [13, 14, 15], - "f": ["a", "b", "c"], + "f": Series(["a", "b", "c"], dtype=object), }, ) arr = arr.astype(dtype) diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index ddc5879a56d54..d0c4fa53faab9 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -135,9 +135,9 @@ def test_interp_fill_functions_inplace( assert np.shares_memory(arr, 
get_array(df, "a")) is (dtype == "float64") -def test_interpolate_cleaned_fill_method(using_copy_on_write): - # Check that "method is set to None" case works correctly +def test_interpolate_cannot_with_object_dtype(using_copy_on_write): df = DataFrame({"a": ["a", np.nan, "c"], "b": 1}) + df["a"] = df["a"].astype(object) df_orig = df.copy() msg = "DataFrame.interpolate with object dtype" @@ -156,15 +156,16 @@ def test_interpolate_cleaned_fill_method(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -def test_interpolate_object_convert_no_op(using_copy_on_write): +def test_interpolate_object_convert_no_op(using_copy_on_write, using_infer_string): df = DataFrame({"a": ["a", "b", "c"], "b": 1}) + df["a"] = df["a"].astype(object) arr_a = get_array(df, "a") msg = "DataFrame.interpolate with method=pad is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): df.interpolate(method="pad", inplace=True) # Now CoW makes a copy, it should not! - if using_copy_on_write: + if using_copy_on_write and not using_infer_string: assert df._mgr._has_no_reference(0) assert np.shares_memory(arr_a, get_array(df, "a")) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 862aebdc70a9d..09738fe1023fb 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW from pandas.errors import SettingWithCopyWarning import pandas as pd @@ -280,6 +281,17 @@ def test_reset_index_series_drop(using_copy_on_write, index): tm.assert_series_equal(ser, ser_orig) +def test_groupby_column_index_in_references(): + df = DataFrame( + {"A": ["a", "b", "c", "d"], "B": [1, 2, 3, 4], "C": ["a", "a", "b", "b"]} + ) + df = df.set_index("A") + key = df["C"] + result = df.groupby(key, observed=True).sum() + expected = df.groupby("C", observed=True).sum() + tm.assert_frame_equal(result, expected) + + def test_rename_columns(using_copy_on_write): # Case: renaming columns returns a new dataframe # + afterwards modifying the result @@ -939,14 +951,19 @@ def test_head_tail(method, using_copy_on_write, warn_copy_on_write): tm.assert_frame_equal(df, df_orig) -def test_infer_objects(using_copy_on_write): - df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"}) +def test_infer_objects(using_copy_on_write, using_infer_string): + df = DataFrame( + {"a": [1, 2], "b": Series(["x", "y"], dtype=object), "c": 1, "d": "x"} + ) df_orig = df.copy() df2 = df.infer_objects() if using_copy_on_write: assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + if using_infer_string: + assert not tm.shares_memory(get_array(df2, "b"), get_array(df, "b")) + else: + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) else: assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) @@ -960,16 +977,16 @@ def test_infer_objects(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -def test_infer_objects_no_reference(using_copy_on_write): +def test_infer_objects_no_reference(using_copy_on_write, using_infer_string): df = DataFrame( { "a": [1, 2], - "b": "c", + "b": Series(["x", "y"], dtype=object), "c": 1, "d": Series( [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object" ), - "e": "b", + "e": Series(["z", "w"], dtype=object), } ) df = df.infer_objects() @@ -983,16 +1000,22 @@ def test_infer_objects_no_reference(using_copy_on_write): df.iloc[0, 3] = 
Timestamp("2018-12-31") if using_copy_on_write: assert np.shares_memory(arr_a, get_array(df, "a")) - # TODO(CoW): Block splitting causes references here - assert not np.shares_memory(arr_b, get_array(df, "b")) + if using_infer_string: + # note that the underlying memory of arr_b has been copied anyway + # because of the assignment, but the EA is updated inplace so still + # appears the share memory + assert tm.shares_memory(arr_b, get_array(df, "b")) + else: + # TODO(CoW): Block splitting causes references here + assert not np.shares_memory(arr_b, get_array(df, "b")) assert np.shares_memory(arr_d, get_array(df, "d")) -def test_infer_objects_reference(using_copy_on_write): +def test_infer_objects_reference(using_copy_on_write, using_infer_string): df = DataFrame( { "a": [1, 2], - "b": "c", + "b": Series(["x", "y"], dtype=object), "c": 1, "d": Series( [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object" @@ -1011,7 +1034,8 @@ def test_infer_objects_reference(using_copy_on_write): df.iloc[0, 3] = Timestamp("2018-12-31") if using_copy_on_write: assert not np.shares_memory(arr_a, get_array(df, "a")) - assert not np.shares_memory(arr_b, get_array(df, "b")) + if not using_infer_string or HAS_PYARROW: + assert not np.shares_memory(arr_b, get_array(df, "b")) assert np.shares_memory(arr_d, get_array(df, "d")) @@ -1173,7 +1197,7 @@ def test_round(using_copy_on_write, warn_copy_on_write, decimals): df2 = df.round(decimals=decimals) if using_copy_on_write: - assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b")) # TODO: Make inplace by using out parameter of ndarray.round? if decimals >= 0: # Ensure lazy copy if no-op diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 6d16bc3083883..c6c9eca47f3f4 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -26,7 +26,7 @@ ], ) def test_replace(using_copy_on_write, replace_kwargs): - df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() df_replaced = df.replace(**replace_kwargs) @@ -34,7 +34,7 @@ def test_replace(using_copy_on_write, replace_kwargs): if using_copy_on_write: if (df_replaced["b"] == df["b"]).all(): assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b")) - assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) + assert tm.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) # mutating squeezed df triggers a copy-on-write for that column/block df_replaced.loc[0, "c"] = -1 @@ -56,7 +56,7 @@ def test_replace_regex_inplace_refs(using_copy_on_write, warn_copy_on_write): with tm.assert_cow_warning(warn_copy_on_write): df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True) if using_copy_on_write: - assert not np.shares_memory(arr, get_array(df, "a")) + assert not tm.shares_memory(arr, get_array(df, "a")) assert df._mgr._has_no_reference(0) tm.assert_frame_equal(view, df_orig) else: @@ -69,12 +69,12 @@ def test_replace_regex_inplace(using_copy_on_write): df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True) if using_copy_on_write: assert df._mgr._has_no_reference(0) - assert np.shares_memory(arr, get_array(df, "a")) + assert tm.shares_memory(arr, get_array(df, "a")) df_orig = df.copy() df2 = df.replace(to_replace=r"^b.*$", value="new", regex=True) tm.assert_frame_equal(df_orig, df) - 
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) def test_replace_regex_inplace_no_op(using_copy_on_write): @@ -352,11 +352,11 @@ def test_replace_empty_list(using_copy_on_write): @pytest.mark.parametrize("value", ["d", None]) def test_replace_object_list_inplace(using_copy_on_write, value): - df = DataFrame({"a": ["a", "b", "c"]}) + df = DataFrame({"a": ["a", "b", "c"]}, dtype=object) arr = get_array(df, "a") df.replace(["c"], value, inplace=True) if using_copy_on_write or value is None: - assert np.shares_memory(arr, get_array(df, "a")) + assert tm.shares_memory(arr, get_array(df, "a")) else: # This could be inplace assert not np.shares_memory(arr, get_array(df, "a")) @@ -384,6 +384,15 @@ def test_replace_list_none(using_copy_on_write): assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + # replace multiple values that don't actually replace anything with None + # https://github.com/pandas-dev/pandas/issues/59770 + df3 = df.replace(["d", "e", "f"], value=None) + tm.assert_frame_equal(df3, df_orig) + if using_copy_on_write: + assert tm.shares_memory(get_array(df, "a"), get_array(df3, "a")) + else: + assert not tm.shares_memory(get_array(df, "a"), get_array(df3, "a")) + def test_replace_list_none_inplace_refs(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": ["a", "b", "c"]}) diff --git a/pandas/tests/dtypes/cast/test_construct_ndarray.py b/pandas/tests/dtypes/cast/test_construct_ndarray.py index ab468c81124bc..6b9b2dfda6e8b 100644 --- a/pandas/tests/dtypes/cast/test_construct_ndarray.py +++ b/pandas/tests/dtypes/cast/test_construct_ndarray.py @@ -21,7 +21,7 @@ def test_construct_1d_ndarray_preserving_na( ): result = sanitize_array(values, index=None, dtype=dtype) if using_infer_string and expected.dtype == object and dtype is None: - tm.assert_extension_array_equal(result, pd.array(expected)) + tm.assert_extension_array_equal(result, pd.array(expected, dtype="str")) else: tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index c34c97b6e4f04..579f5636922dc 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -3,6 +3,7 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td from pandas.core.dtypes.astype import astype_array @@ -21,6 +22,7 @@ import pandas._testing as tm from pandas.api.types import pandas_dtype from pandas.arrays import SparseArray +from pandas.util.version import Version # EA & Actual Dtypes @@ -787,11 +789,18 @@ def test_validate_allhashable(): def test_pandas_dtype_numpy_warning(): # GH#51523 - with tm.assert_produces_warning( - DeprecationWarning, - check_stacklevel=False, - match="Converting `np.integer` or `np.signedinteger` to a dtype is deprecated", - ): + if Version(np.__version__) < Version("2.3.0.dev0"): + ctx = tm.assert_produces_warning( + DeprecationWarning, + check_stacklevel=False, + match=( + "Converting `np.integer` or `np.signedinteger` to a dtype is deprecated" + ), + ) + else: + ctx = tm.external_error_raised(TypeError) + + with ctx: pandas_dtype(np.integer) @@ -799,3 +808,58 @@ def test_pandas_dtype_ea_not_instance(): # GH 31356 GH 54592 with tm.assert_produces_warning(UserWarning): assert pandas_dtype(CategoricalDtype) == CategoricalDtype() + + +def test_pandas_dtype_string_dtypes(string_storage): + with pd.option_context("future.infer_string", True): + 
# with the default string_storage setting + result = pandas_dtype("str") + assert result == pd.StringDtype( + "pyarrow" if HAS_PYARROW else "python", na_value=np.nan + ) + + with pd.option_context("future.infer_string", True): + # with the default string_storage setting + result = pandas_dtype(str) + assert result == pd.StringDtype( + "pyarrow" if HAS_PYARROW else "python", na_value=np.nan + ) + + with pd.option_context("future.infer_string", True): + with pd.option_context("string_storage", string_storage): + result = pandas_dtype("str") + assert result == pd.StringDtype(string_storage, na_value=np.nan) + + with pd.option_context("future.infer_string", True): + with pd.option_context("string_storage", string_storage): + result = pandas_dtype(str) + assert result == pd.StringDtype(string_storage, na_value=np.nan) + + with pd.option_context("future.infer_string", False): + with pd.option_context("string_storage", string_storage): + result = pandas_dtype("str") + assert result == np.dtype("U") + + with pd.option_context("string_storage", string_storage): + result = pandas_dtype("string") + assert result == pd.StringDtype(string_storage, na_value=pd.NA) + + +def test_pandas_dtype_string_dtype_alias_with_storage(): + with pytest.raises(TypeError, match="not understood"): + pandas_dtype("str[python]") + + with pytest.raises(TypeError, match="not understood"): + pandas_dtype("str[pyarrow]") + + result = pandas_dtype("string[python]") + assert result == pd.StringDtype("python", na_value=pd.NA) + + if HAS_PYARROW: + result = pandas_dtype("string[pyarrow]") + assert result == pd.StringDtype("pyarrow", na_value=pd.NA) + else: + with pytest.raises( + ImportError, match="required for PyArrow backed StringArray" + ): + pandas_dtype("string[pyarrow]") diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 0dad0b05303ad..a5666e169fb4c 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -445,12 +445,12 @@ def test_construction(self): def test_cannot_use_custom_businessday(self): # GH#52534 - msg = "CustomBusinessDay is not supported as period frequency" + msg = "C is not supported as period frequency" + msg1 = " is not supported as period frequency" msg2 = r"PeriodDtype\[B\] is deprecated" - with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=msg2): - PeriodDtype("C") - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): + PeriodDtype("C") + with pytest.raises(ValueError, match=msg1): with tm.assert_produces_warning(FutureWarning, match=msg2): PeriodDtype(pd.offsets.CustomBusinessDay()) @@ -1059,7 +1059,7 @@ def test_str_vs_repr(self, ordered, using_infer_string): c1 = CategoricalDtype(["a", "b"], ordered=ordered) assert str(c1) == "category" # Py2 will have unicode prefixes - dtype = "string" if using_infer_string else "object" + dtype = "str" if using_infer_string else "object" pat = ( r"CategoricalDtype\(categories=\[.*\], ordered={ordered}, " rf"categories_dtype={dtype}\)" diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 49eb06c299886..79b7e6ff092b6 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -112,8 +112,8 @@ def it_outer(): def __len__(self) -> int: return len(self._values) - def __array__(self, t=None): - return np.asarray(self._values, dtype=t) + def __array__(self, dtype=None, copy=None): + return np.asarray(self._values, dtype=dtype) @property 
def ndim(self): @@ -1585,6 +1585,31 @@ def test_is_string_array(self): ) assert not lib.is_string_array(np.array([1, 2])) + @pytest.mark.parametrize( + "func", + [ + "is_bool_array", + "is_date_array", + "is_datetime_array", + "is_datetime64_array", + "is_float_array", + "is_integer_array", + "is_interval_array", + "is_string_array", + "is_time_array", + "is_timedelta_or_timedelta64_array", + ], + ) + def test_is_dtype_array_empty_obj(self, func): + # https://github.com/pandas-dev/pandas/pull/60796 + func = getattr(lib, func) + + arr = np.empty((2, 0), dtype=object) + assert not func(arr) + + arr = np.empty((0, 2), dtype=object) + assert not func(arr) + def test_to_object_array_tuples(self): r = (5, 6) values = [r] diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index e1f8d8eca2537..e3d3e98ae2b93 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -131,7 +131,7 @@ def test_isna_isnull(self, isna_f): [ np.arange(4, dtype=float), [0.0, 1.0, 0.0, 1.0], - Series(list("abcd"), dtype=object), + Series(list("abcd")), date_range("2020-01-01", periods=4), ], ) diff --git a/pandas/tests/extension/array_with_attr/array.py b/pandas/tests/extension/array_with_attr/array.py index d0249d9af8098..2789d51ec2ce3 100644 --- a/pandas/tests/extension/array_with_attr/array.py +++ b/pandas/tests/extension/array_with_attr/array.py @@ -49,7 +49,10 @@ def __init__(self, values, attr=None) -> None: @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy=False): - data = np.array(scalars, dtype="float64", copy=copy) + if not copy: + data = np.asarray(scalars, dtype="float64") + else: + data = np.array(scalars, dtype="float64", copy=copy) return cls(data) def __getitem__(self, item): diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 9a41a3a582c4a..9a2f186c2a00b 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -18,8 +18,9 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): try: alt = ser.astype("float64") - except TypeError: - # e.g. Period can't be cast to float64 + except (TypeError, ValueError): + # e.g. 
Period can't be cast to float64 (TypeError) + # String can't be cast to float64 (ValueError) alt = ser.astype(object) result = getattr(ser, op_name)(skipna=skipna) diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 2bfe801c48a77..56879129c3a28 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -43,8 +43,8 @@ def test_tolist(self, data): assert result == expected def test_astype_str(self, data): - result = pd.Series(data[:5]).astype(str) - expected = pd.Series([str(x) for x in data[:5]], dtype=str) + result = pd.Series(data[:2]).astype(str) + expected = pd.Series([str(x) for x in data[:2]], dtype=str) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 75628ea177fc2..6947e672f3d44 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -115,12 +115,12 @@ def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(FutureWarning, match=msg): - df.groupby("B", group_keys=False).apply(groupby_apply_op) - df.groupby("B", group_keys=False).A.apply(groupby_apply_op) + df.groupby("B", group_keys=False, observed=False).apply(groupby_apply_op) + df.groupby("B", group_keys=False, observed=False).A.apply(groupby_apply_op) msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(FutureWarning, match=msg): - df.groupby("A", group_keys=False).apply(groupby_apply_op) - df.groupby("A", group_keys=False).B.apply(groupby_apply_op) + df.groupby("A", group_keys=False, observed=False).apply(groupby_apply_op) + df.groupby("A", group_keys=False, observed=False).B.apply(groupby_apply_op) def test_groupby_apply_identity(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 6683c87e2b8fc..38cece7da3308 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -1,6 +1,10 @@ +import warnings + import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import is_extension_array_dtype from pandas.core.dtypes.dtypes import ExtensionDtype @@ -71,6 +75,37 @@ def test_array_interface(self, data): expected = construct_1d_object_array_from_listlike(list(data)) tm.assert_numpy_array_equal(result, expected) + def test_array_interface_copy(self, data): + result_copy1 = np.array(data, copy=True) + result_copy2 = np.array(data, copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. 
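The interface test above probes, from the outside, whether an ExtensionArray's `__array__` honors the new keyword. For orientation, a minimal producer-side sketch of the NumPy >= 2 protocol; `MyArray` is a hypothetical container, not an API from this patch:

    import numpy as np

    class MyArray:
        """Hypothetical container illustrating __array__(dtype, copy)."""

        def __init__(self, values):
            self._data = np.asarray(values)

        def __array__(self, dtype=None, copy=None):
            needs_cast = dtype is not None and np.dtype(dtype) != self._data.dtype
            if copy is False and needs_cast:
                # NumPy >= 2 expects an error when zero-copy is impossible
                raise ValueError("cannot satisfy copy=False: a cast is required")
            if copy:
                return np.array(self._data, dtype=dtype, copy=True)
            # copy=None (and copy=False without a cast): zero-copy view
            return self._data if not needs_cast else self._data.astype(dtype)

    arr = MyArray([1, 2, 3])
    assert np.may_share_memory(np.asarray(arr), np.asarray(arr))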
+ return + + warning_raised = False + msg = "Starting with NumPy 2.0, the behavior of the 'copy' keyword has changed" + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result_nocopy1 = np.array(data, copy=False) + assert len(w) <= 1 + if len(w): + warning_raised = True + assert msg in str(w[0].message) + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result_nocopy2 = np.array(data, copy=False) + assert len(w) <= 1 + if len(w): + warning_raised = True + assert msg in str(w[0].message) + + if not warning_raised: + # If copy=False was given and did not raise, these must share the same data + assert np.may_share_memory(result_nocopy1, result_nocopy2) + def test_is_extension_array_dtype(self, data): assert is_extension_array_dtype(data) assert is_extension_array_dtype(data.dtype) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index c803a8113b4a4..5cb2c14e4c841 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -66,14 +66,14 @@ def test_value_counts_with_normalize(self, data): expected = pd.Series(0.0, index=result.index, name="proportion") expected[result > 0] = 1 / len(values) - if getattr(data.dtype, "storage", "") == "pyarrow" or isinstance( + if isinstance(data.dtype, pd.StringDtype) and data.dtype.na_value is np.nan: + # TODO: avoid special-casing + expected = expected.astype("float64") + elif getattr(data.dtype, "storage", "") == "pyarrow" or isinstance( data.dtype, pd.ArrowDtype ): # TODO: avoid special-casing expected = expected.astype("double[pyarrow]") - elif getattr(data.dtype, "storage", "") == "pyarrow_numpy": - # TODO: avoid special-casing - expected = expected.astype("float64") elif na_value_for_dtype(data.dtype) is pd.NA: # TODO(GH#44692): avoid special-casing expected = expected.astype("Float64") diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index ffb7a24b4b390..fb15b2dec869c 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -27,7 +27,9 @@ def test_isna_returns_copy(self, data_missing, na_func): expected = result.copy() mask = getattr(result, na_func)() if isinstance(mask.dtype, pd.SparseDtype): + # TODO: GH 57739 mask = np.array(mask) + mask.flags.writeable = True mask[:] = True tm.assert_series_equal(result, expected) @@ -77,6 +79,28 @@ def test_fillna_limit_pad(self, data_missing): expected = pd.Series(data_missing.take([1, 1, 1, 0, 1])) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "limit_area, input_ilocs, expected_ilocs", + [ + ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]), + ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]), + ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]), + ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]), + ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]), + ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]), + ], + ) + def test_ffill_limit_area( + self, data_missing, limit_area, input_ilocs, expected_ilocs + ): + # GH#56616 + arr = data_missing.take(input_ilocs) + result = pd.Series(arr).ffill(limit_area=limit_area) + expected = pd.Series(data_missing.take(expected_ilocs)) + tm.assert_series_equal(result, expected) + @pytest.mark.filterwarnings( "ignore:Series.fillna with 'method' is deprecated:FutureWarning" ) diff --git a/pandas/tests/extension/base/ops.py 
b/pandas/tests/extension/base/ops.py index 5cd66d8a874c7..222ff42d45052 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - from pandas.core.dtypes.common import is_string_dtype import pandas as pd @@ -22,7 +20,7 @@ class BaseOpsUtil: def _get_expected_exception( self, op_name: str, obj, other - ) -> type[Exception] | None: + ) -> type[Exception] | tuple[type[Exception], ...] | None: # Find the Exception, if any we expect to raise calling # obj.__op_name__(other) @@ -37,14 +35,6 @@ def _get_expected_exception( else: result = self.frame_scalar_exc - if using_pyarrow_string_dtype() and result is not None: - import pyarrow as pa - - result = ( # type: ignore[assignment] - result, - pa.lib.ArrowNotImplementedError, - NotImplementedError, - ) return result def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index b3c57ee49a724..8590cd7fdc235 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + import pandas as pd import pandas._testing as tm from pandas.tests.extension import base @@ -68,7 +70,7 @@ def data_for_grouping(): class TestDecimalArray(base.ExtensionTests): def _get_expected_exception( self, op_name: str, obj, other - ) -> type[Exception] | None: + ) -> type[Exception] | tuple[type[Exception], ...] | None: return None def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: @@ -156,6 +158,36 @@ def test_fillna_limit_pad(self, data_missing): ): super().test_fillna_limit_pad(data_missing) + @pytest.mark.parametrize( + "limit_area, input_ilocs, expected_ilocs", + [ + ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]), + ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]), + ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]), + ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]), + ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]), + ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]), + ], + ) + def test_ffill_limit_area( + self, data_missing, limit_area, input_ilocs, expected_ilocs + ): + # GH#56616 + msg = "ExtensionArray.fillna 'method' keyword is deprecated" + with tm.assert_produces_warning( + DeprecationWarning, + match=msg, + check_stacklevel=False, + raise_on_extra_warnings=False, + ): + msg = "DecimalArray does not implement limit_area" + with pytest.raises(NotImplementedError, match=msg): + super().test_ffill_limit_area( + data_missing, limit_area, input_ilocs, expected_ilocs + ) + def test_fillna_limit_backfill(self, data_missing): msg = "Series.fillna with 'method' is deprecated" with tm.assert_produces_warning( @@ -259,6 +291,24 @@ def test_series_repr(self, data): def test_unary_ufunc_dunder_equivalence(self, data, ufunc): super().test_unary_ufunc_dunder_equivalence(data, ufunc) + def test_array_interface_copy(self, data): + result_copy1 = np.array(data, copy=True) + result_copy2 = np.array(data, copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. 
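The try/except around `np.array(data, copy=False)` above encodes the contract these tests accept: under NumPy >= 2 a conversion may either hand back a shared buffer or refuse with ValueError. The same contract as a standalone probe; `probe_zero_copy` is a hypothetical helper, not part of the patch:

    from typing import Optional

    import numpy as np

    def probe_zero_copy(obj) -> Optional[bool]:
        # Assumes NumPy >= 2 semantics for the copy keyword.
        try:
            first = np.array(obj, copy=False)
            second = np.array(obj, copy=False)
        except ValueError:
            return None  # refusing copy=False outright is always acceptable
        # If copy=False succeeded, both conversions must share one buffer.
        return np.may_share_memory(first, second)

    print(probe_zero_copy(np.arange(3)))  # True: an ndarray is its own view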
+ return + + try: + result_nocopy1 = np.array(data, copy=False) + except ValueError: + # An error is always acceptable for `copy=False` + return + + result_nocopy2 = np.array(data, copy=False) + # If copy=False was given and did not raise, these must share the same data + assert np.may_share_memory(result_nocopy1, result_nocopy2) + def test_take_na_value_other_decimal(): arr = DecimalArray([decimal.Decimal("1.0"), decimal.Decimal("2.0")]) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index d3d9dcc4a4712..5ff99589a1961 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -25,9 +25,12 @@ TYPE_CHECKING, Any, ) +import warnings import numpy as np +from pandas.util._exceptions import find_stack_level + from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( is_bool_dtype, @@ -146,13 +149,28 @@ def __eq__(self, other): def __ne__(self, other): return NotImplemented - def __array__(self, dtype=None): + def __array__(self, dtype=None, copy=None): + if copy is False: + warnings.warn( + "Starting with NumPy 2.0, the behavior of the 'copy' keyword has " + "changed and passing 'copy=False' raises an error when returning " + "a zero-copy NumPy array is not possible. pandas will follow " + "this behavior starting with pandas 3.0.\nThis conversion to " + "NumPy requires a copy, but 'copy=False' was passed. Consider " + "using 'np.asarray(..)' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if dtype is None: dtype = object if dtype == object: # on py38 builds it looks like numpy is inferring to a non-1D array return construct_1d_object_array_from_listlike(list(self)) - return np.asarray(self.data, dtype=dtype) + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + return np.asarray(self.data, dtype=dtype) + return np.asarray(self.data, dtype=dtype, copy=copy) @property def nbytes(self) -> int: @@ -207,11 +225,12 @@ def astype(self, dtype, copy=True): return self.copy() return self elif isinstance(dtype, StringDtype): - value = self.astype(str) # numpy doesn't like nested dicts arr_cls = dtype.construct_array_type() - return arr_cls._from_sequence(value, dtype=dtype, copy=False) - - return np.array([dict(x) for x in self], dtype=dtype, copy=copy) + return arr_cls._from_sequence(self, dtype=dtype, copy=False) + elif not copy: + return np.asarray([dict(x) for x in self], dtype=dtype) + else: + return np.array([dict(x) for x in self], dtype=dtype, copy=copy) def unique(self): # Parent method doesn't work since np.array will try to infer @@ -235,6 +254,10 @@ def _values_for_argsort(self): frozen = [tuple(x.items()) for x in self] return construct_1d_object_array_from_listlike(frozen) + def _pad_or_backfill(self, *, method, limit=None, copy=True): + # GH#56616 - test EA method without limit_area argument + return super()._pad_or_backfill(method=method, limit=limit, copy=copy) + def make_data(): # TODO: Use a regular dict. 
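`JSONArray.__array__` and `astype` above branch on `copy` rather than forwarding it, because `np.array(..., copy=None)` only exists on NumPy >= 2. A minimal sketch of that compatibility pattern; `to_ndarray` is a hypothetical helper named here for illustration:

    import numpy as np

    def to_ndarray(data, dtype=None, copy=False):
        # Mirrors the branch added to JSONArray.astype: avoid passing
        # copy=None, which NumPy 1.x does not accept.
        if not copy:
            # np.asarray never forwards copy=, so it runs on NumPy 1.x and 2.x
            return np.asarray(data, dtype=dtype)
        return np.array(data, dtype=dtype, copy=True)

    to_ndarray([{"a": 1}, {"b": 2}], dtype=object)  # object array of dicts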
See _NDFrameIndexer._setitem_with_indexer diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 7686bc5abb44c..a18edac9aef93 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -149,6 +149,29 @@ def test_fillna_frame(self): """We treat dictionaries as a mapping in fillna, not a scalar.""" super().test_fillna_frame() + @pytest.mark.parametrize( + "limit_area, input_ilocs, expected_ilocs", + [ + ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]), + ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]), + ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]), + ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]), + ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]), + ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]), + ], + ) + def test_ffill_limit_area( + self, data_missing, limit_area, input_ilocs, expected_ilocs + ): + # GH#56616 + msg = "JSONArray does not implement limit_area" + with pytest.raises(NotImplementedError, match=msg): + super().test_ffill_limit_area( + data_missing, limit_area, input_ilocs, expected_ilocs + ) + @unhashable def test_value_counts(self, all_data, dropna): super().test_value_counts(all_data, dropna) diff --git a/pandas/tests/extension/list/array.py b/pandas/tests/extension/list/array.py index f07585c0aec10..b3bb35c9396f4 100644 --- a/pandas/tests/extension/list/array.py +++ b/pandas/tests/extension/list/array.py @@ -115,7 +115,10 @@ def astype(self, dtype, copy=True): elif is_string_dtype(dtype) and not is_object_dtype(dtype): # numpy has problems with astype(str) for nested elements return np.array([str(x) for x in self.data], dtype=dtype) - return np.array(self.data, dtype=dtype, copy=copy) + elif not copy: + return np.asarray(self.data, dtype=dtype) + else: + return np.array(self.data, dtype=dtype, copy=copy) @classmethod def _concat_same_type(cls, to_concat): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 3b03272f18203..17fe36c4b4469 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -40,8 +40,8 @@ pa_version_under11p0, pa_version_under13p0, pa_version_under14p0, + pa_version_under20p0, ) -import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -286,7 +286,7 @@ def test_map(self, data_missing, na_action): expected = data_missing.to_numpy() tm.assert_numpy_array_equal(result, expected) - def test_astype_str(self, data, request): + def test_astype_str(self, data, request, using_infer_string): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_binary(pa_dtype): request.applymarker( @@ -294,9 +294,10 @@ def test_astype_str(self, data, request): reason=f"For {pa_dtype} .astype(str) decodes.", ) ) - elif ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ) or pa.types.is_duration(pa_dtype): + elif not using_infer_string and ( + (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None) + or pa.types.is_duration(pa_dtype) + ): request.applymarker( pytest.mark.xfail( reason="pd.Timestamp/pd.Timedelta repr different from numpy repr", @@ -304,25 +305,6 @@ def test_astype_str(self, data, request): ) super().test_astype_str(data) - @pytest.mark.parametrize( - "nullable_string_dtype", - [ - "string[python]", - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], - ) - def test_astype_string(self, data, nullable_string_dtype, request): - pa_dtype = 
data.dtype.pyarrow_dtype - if ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ) or pa.types.is_duration(pa_dtype): - request.applymarker( - pytest.mark.xfail( - reason="pd.Timestamp/pd.Timedelta repr different from numpy repr", - ) - ) - super().test_astype_string(data, nullable_string_dtype) - def test_from_dtype(self, data, request): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_string(pa_dtype) or pa.types.is_decimal(pa_dtype): @@ -407,13 +389,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: # attribute "pyarrow_dtype" pa_type = ser.dtype.pyarrow_dtype # type: ignore[union-attr] - if ( - pa.types.is_string(pa_type) - or pa.types.is_binary(pa_type) - or pa.types.is_decimal(pa_type) - ): + if pa.types.is_binary(pa_type) or pa.types.is_decimal(pa_type): if op_name in ["cumsum", "cumprod", "cummax", "cummin"]: return False + elif pa.types.is_string(pa_type): + if op_name == "cumprod": + return False elif pa.types.is_boolean(pa_type): if op_name in ["cumprod", "cummax", "cummin"]: return False @@ -428,6 +409,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: def test_accumulate_series(self, data, all_numeric_accumulations, skipna, request): pa_type = data.dtype.pyarrow_dtype op_name = all_numeric_accumulations + + if pa.types.is_string(pa_type) and op_name in ["cumsum", "cummin", "cummax"]: + # https://github.com/pandas-dev/pandas/pull/60633 + # Doesn't fit test structure, tested in series/test_cumulative.py instead. + return + ser = pd.Series(data) if not self._supports_accumulation(ser, op_name): @@ -455,13 +442,16 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques request.applymarker( pytest.mark.xfail( reason=f"{all_numeric_accumulations} not implemented for {pa_type}", - raises=NotImplementedError, + raises=TypeError, ) ) self.check_accumulate(ser, op_name, skipna) def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: + if op_name == "kurt" or (pa_version_under20p0 and op_name == "skew"): + return False + dtype = ser.dtype # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has # no attribute "pyarrow_dtype" @@ -478,10 +468,11 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: pass else: return False + elif pa.types.is_binary(pa_dtype) and op_name in ["sum", "skew"]: + return False elif ( pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype) ) and op_name in [ - "sum", "mean", "median", "prod", @@ -538,18 +529,31 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, reque f"pyarrow={pa.__version__} for {pa_dtype}" ), ) - if all_numeric_reductions in {"skew", "kurt"} and ( - dtype._is_numeric or dtype.kind == "b" - ): - request.applymarker(xfail_mark) - - elif pa.types.is_boolean(pa_dtype) and all_numeric_reductions in { + if pa.types.is_boolean(pa_dtype) and all_numeric_reductions in { "sem", "std", "var", "median", }: request.applymarker(xfail_mark) + elif ( + not pa_version_under20p0 + and all_numeric_reductions == "skew" + and ( + pa.types.is_boolean(pa_dtype) + or ( + skipna + and ( + pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype) + ) + ) + ) + ): + request.applymarker( + pytest.mark.xfail( + reason="https://github.com/apache/arrow/issues/45733", + ) + ) super().test_reduce_series_numeric(data, all_numeric_reductions, skipna) @pytest.mark.parametrize("skipna", [True, False]) @@ -572,15 +576,18 @@ def test_reduce_series_boolean( return super().test_reduce_series_boolean(data, 
all_boolean_reductions, skipna) def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): + pa_type = arr._pa_array.type if op_name in ["max", "min"]: cmp_dtype = arr.dtype elif arr.dtype.name == "decimal128(7, 3)[pyarrow]": - if op_name not in ["median", "var", "std"]: + if op_name not in ["median", "var", "std", "skew"]: cmp_dtype = arr.dtype else: cmp_dtype = "float64[pyarrow]" elif op_name in ["median", "var", "std", "mean", "skew"]: cmp_dtype = "float64[pyarrow]" + elif op_name == "sum" and pa.types.is_string(pa_type): + cmp_dtype = arr.dtype else: cmp_dtype = { "i": "int64[pyarrow]", @@ -592,7 +599,7 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_frame(self, data, all_numeric_reductions, skipna, request): op_name = all_numeric_reductions - if op_name == "skew": + if op_name == "skew" and pa_version_under20p0: if data.dtype._is_numeric: mark = pytest.mark.xfail(reason="skew not implemented") request.applymarker(mark) @@ -604,26 +611,6 @@ def test_median_not_approximate(self, typ): result = pd.Series([1, 2], dtype=f"{typ}[pyarrow]").median() assert result == 1.5 - def test_in_numeric_groupby(self, data_for_grouping): - dtype = data_for_grouping.dtype - if is_string_dtype(dtype): - df = pd.DataFrame( - { - "A": [1, 1, 2, 2, 3, 3, 1, 4], - "B": data_for_grouping, - "C": [1, 1, 1, 1, 1, 1, 1, 1], - } - ) - - expected = pd.Index(["C"]) - msg = re.escape(f"agg function failed [how->sum,dtype->{dtype}") - with pytest.raises(TypeError, match=msg): - df.groupby("A").sum() - result = df.groupby("A").sum(numeric_only=True).columns - tm.assert_index_equal(result, expected) - else: - super().test_in_numeric_groupby(data_for_grouping) - def test_construct_from_string_own_name(self, dtype, request): pa_dtype = dtype.pyarrow_dtype if pa.types.is_decimal(pa_dtype): @@ -800,8 +787,6 @@ def test_value_counts_returns_pyarrow_int64(self, data): _combine_le_expected_dtype = "bool[pyarrow]" - divmod_exc = NotImplementedError - def get_op_from_name(self, op_name): short_opname = op_name.strip("_") if short_opname == "rtruediv": @@ -935,10 +920,11 @@ def _is_temporal_supported(self, opname, pa_dtype): def _get_expected_exception( self, op_name: str, obj, other - ) -> type[Exception] | None: + ) -> type[Exception] | tuple[type[Exception], ...] | None: if op_name in ("__divmod__", "__rdivmod__"): - return self.divmod_exc + return (NotImplementedError, TypeError) + exc: type[Exception] | tuple[type[Exception], ...] | None dtype = tm.get_dtype(obj) # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has no # attribute "pyarrow_dtype" @@ -949,7 +935,7 @@ def _get_expected_exception( "__mod__", "__rmod__", }: - exc = NotImplementedError + exc = (NotImplementedError, TypeError) elif arrow_temporal_supported: exc = None elif op_name in ["__add__", "__radd__"] and ( @@ -961,10 +947,7 @@ def _get_expected_exception( or pa.types.is_integer(pa_dtype) or pa.types.is_decimal(pa_dtype) ): - # TODO: in many of these cases, e.g. non-duration temporal, - # these will *never* be allowed. Would it make more sense to - # re-raise as TypeError, more consistent with non-pyarrow cases? 
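
# Note on the expectation change in this hunk: unsupported pyarrow arithmetic is
# now expected to surface as TypeError instead of pa.ArrowNotImplementedError
# (and __divmod__ may raise either NotImplementedError or TypeError). A minimal
# illustrative sketch of the asserted behavior, assuming pyarrow is installed:
import pandas as pd
import pyarrow as pa
import pytest

ser = pd.Series(["a", "b"], dtype=pd.ArrowDtype(pa.string()))
with pytest.raises(TypeError):
    ser - ser  # subtraction is undefined for strings; re-raised as TypeError
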
- exc = pa.ArrowNotImplementedError + exc = TypeError else: exc = None return exc @@ -1020,14 +1003,6 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request) if all_arithmetic_operators == "__rmod__" and pa.types.is_binary(pa_dtype): pytest.skip("Skip testing Python string formatting") - elif all_arithmetic_operators in ("__rmul__", "__mul__") and ( - pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype) - ): - request.applymarker( - pytest.mark.xfail( - raises=TypeError, reason="Can only string multiply by an integer." - ) - ) mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype) if mark is not None: @@ -1042,14 +1017,6 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype) ): pytest.skip("Skip testing Python string formatting") - elif all_arithmetic_operators in ("__rmul__", "__mul__") and ( - pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype) - ): - request.applymarker( - pytest.mark.xfail( - raises=TypeError, reason="Can only string multiply by an integer." - ) - ) mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype) if mark is not None: @@ -1073,14 +1040,6 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators, request): ), ) ) - elif all_arithmetic_operators in ("__rmul__", "__mul__") and ( - pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype) - ): - request.applymarker( - pytest.mark.xfail( - raises=TypeError, reason="Can only string multiply by an integer." - ) - ) mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype) if mark is not None: @@ -1700,7 +1659,7 @@ def test_from_arrow_respecting_given_dtype(): def test_from_arrow_respecting_given_dtype_unsafe(): array = pa.array([1.5, 2.5], type=pa.float64()) - with pytest.raises(pa.ArrowInvalid, match="Float value 1.5 was truncated"): + with tm.external_error_raised(pa.ArrowInvalid): array.to_pandas(types_mapper={pa.float64(): ArrowDtype(pa.int64())}.get) @@ -1868,6 +1827,17 @@ def test_str_replace_negative_n(): expected = pd.Series(["bc", ""], dtype=ArrowDtype(pa.string())) tm.assert_series_equal(expected, actual) + # Same bug for pyarrow-backed StringArray GH#59628 + ser2 = ser.astype(pd.StringDtype(storage="pyarrow")) + actual2 = ser2.str.replace("a", "", -3, True) + expected2 = expected.astype(ser2.dtype) + tm.assert_series_equal(expected2, actual2) + + ser3 = ser.astype(pd.StringDtype(storage="pyarrow", na_value=np.nan)) + actual3 = ser3.str.replace("a", "", -3, True) + expected3 = expected.astype(ser3.dtype) + tm.assert_series_equal(expected3, actual3) + def test_str_repeat_unsupported(): ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) @@ -1903,16 +1873,21 @@ def test_str_match(pat, case, na, exp): @pytest.mark.parametrize( "pat, case, na, exp", [ - ["abc", False, None, [True, None]], - ["Abc", True, None, [False, None]], - ["bc", True, None, [False, None]], - ["ab", False, True, [True, True]], - ["a[a-z]{2}", False, None, [True, None]], - ["A[a-z]{1}", True, None, [False, None]], + ["abc", False, None, [True, True, False, None]], + ["Abc", True, None, [False, False, False, None]], + ["bc", True, None, [False, False, False, None]], + ["ab", False, None, [True, True, False, None]], + ["a[a-z]{2}", False, None, [True, True, False, None]], + ["A[a-z]{1}", True, None, [False, False, False, None]], + # GH Issue: #56652 + ["abc$", False, None, [True, False, False, None]], + ["abc\\$", False, None, [False, 
True, False, None]], + ["Abc$", True, None, [False, False, False, None]], + ["Abc\\$", True, None, [False, False, False, None]], ], ) def test_str_fullmatch(pat, case, na, exp): - ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) + ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string())) result = ser.str.match(pat, case=case, na=na) expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_())) tm.assert_series_equal(result, expected) @@ -1937,10 +1912,56 @@ def test_str_find_negative_start(): tm.assert_series_equal(result, expected) -def test_str_find_notimplemented(): +def test_str_find_no_end(): ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) - with pytest.raises(NotImplementedError, match="find not implemented"): - ser.str.find("ab", start=1) + result = ser.str.find("ab", start=1) + expected = pd.Series([-1, None], dtype="int64[pyarrow]") + tm.assert_series_equal(result, expected) + + +def test_str_find_negative_start_negative_end(): + # GH 56791 + ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string())) + result = ser.str.find(sub="d", start=-6, end=-3) + expected = pd.Series([3, None], dtype=ArrowDtype(pa.int64())) + tm.assert_series_equal(result, expected) + + +def test_str_find_large_start(): + # GH 56791 + ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string())) + result = ser.str.find(sub="d", start=16) + expected = pd.Series([-1, None], dtype=ArrowDtype(pa.int64())) + tm.assert_series_equal(result, expected) + + +@pytest.mark.skipif( + pa_version_under13p0, reason="https://github.com/apache/arrow/issues/36311" +) +@pytest.mark.parametrize("start", [-15, -3, 0, 1, 15, None]) +@pytest.mark.parametrize("end", [-15, -1, 0, 3, 15, None]) +@pytest.mark.parametrize("sub", ["", "az", "abce", "a", "caa"]) +def test_str_find_e2e(start, end, sub): + s = pd.Series( + ["abcaadef", "abc", "abcdeddefgj8292", "ab", "a", ""], + dtype=ArrowDtype(pa.string()), + ) + object_series = s.astype(pd.StringDtype(storage="python")) + result = s.str.find(sub, start, end) + expected = object_series.str.find(sub, start, end).astype(result.dtype) + tm.assert_series_equal(result, expected) + + arrow_str_series = s.astype(pd.StringDtype(storage="pyarrow")) + result2 = arrow_str_series.str.find(sub, start, end).astype(result.dtype) + tm.assert_series_equal(result2, expected) + + +def test_str_find_negative_start_negative_end_no_match(): + # GH 56791 + ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string())) + result = ser.str.find(sub="d", start=-3, end=-6) + expected = pd.Series([-1, None], dtype=ArrowDtype(pa.int64())) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -1984,6 +2005,7 @@ def test_str_join_string_type(): [None, 2, None, ["ab", None]], [None, 2, 1, ["ab", None]], [1, 3, 1, ["bc", None]], + (None, None, -1, ["dcba", None]), ], ) def test_str_slice(start, stop, step, exp): @@ -2723,6 +2745,111 @@ def test_dt_tz_convert(unit): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("dtype", ["timestamp[ms][pyarrow]", "duration[ms][pyarrow]"]) +def test_as_unit(dtype): + # GH 52284 + ser = pd.Series([1000, None], dtype=dtype) + result = ser.dt.as_unit("ns") + expected = ser.astype(dtype.replace("ms", "ns")) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "prop, expected", + [ + ["days", 1], + ["seconds", 2], + ["microseconds", 3], + ["nanoseconds", 4], + ], +) +def test_dt_timedelta_properties(prop, expected): + # GH 52284 + ser = pd.Series( + [ + pd.Timedelta( + days=1, + 
seconds=2, + microseconds=3, + nanoseconds=4, + ), + None, + ], + dtype=ArrowDtype(pa.duration("ns")), + ) + result = getattr(ser.dt, prop) + expected = pd.Series( + ArrowExtensionArray(pa.array([expected, None], type=pa.int32())) + ) + tm.assert_series_equal(result, expected) + + +def test_dt_timedelta_total_seconds(): + # GH 52284 + ser = pd.Series( + [ + pd.Timedelta( + days=1, + seconds=2, + microseconds=3, + nanoseconds=4, + ), + None, + ], + dtype=ArrowDtype(pa.duration("ns")), + ) + result = ser.dt.total_seconds() + expected = pd.Series( + ArrowExtensionArray(pa.array([86402.000003, None], type=pa.float64())) + ) + tm.assert_series_equal(result, expected) + + +def test_dt_to_pytimedelta(): + # GH 52284 + data = [timedelta(1, 2, 3), timedelta(1, 2, 4)] + ser = pd.Series(data, dtype=ArrowDtype(pa.duration("ns"))) + + result = ser.dt.to_pytimedelta() + expected = np.array(data, dtype=object) + tm.assert_numpy_array_equal(result, expected) + assert all(type(res) is timedelta for res in result) + + expected = ser.astype("timedelta64[ns]").dt.to_pytimedelta() + tm.assert_numpy_array_equal(result, expected) + + +def test_dt_components(): + # GH 52284 + ser = pd.Series( + [ + pd.Timedelta( + days=1, + seconds=2, + microseconds=3, + nanoseconds=4, + ), + None, + ], + dtype=ArrowDtype(pa.duration("ns")), + ) + result = ser.dt.components + expected = pd.DataFrame( + [[1, 0, 0, 2, 0, 3, 4], [None, None, None, None, None, None, None]], + columns=[ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + ], + dtype="int32[pyarrow]", + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("skipna", [True, False]) def test_boolean_reduce_series_all_null(all_boolean_reductions, skipna): # GH51624 @@ -3124,6 +3251,22 @@ def test_factorize_chunked_dictionary(): tm.assert_index_equal(res_uniques, exp_uniques) +def test_dictionary_astype_categorical(): + # GH#56672 + arrs = [ + pa.array(np.array(["a", "x", "c", "a"])).dictionary_encode(), + pa.array(np.array(["a", "d", "c"])).dictionary_encode(), + ] + ser = pd.Series(ArrowExtensionArray(pa.chunked_array(arrs))) + result = ser.astype("category") + categories = pd.Index(["a", "x", "c", "d"], dtype=ArrowDtype(pa.string())) + expected = pd.Series( + ["a", "x", "c", "a", "a", "d", "c"], + dtype=pd.CategoricalDtype(categories=categories), + ) + tm.assert_series_equal(result, expected) + + def test_arrow_floordiv(): # GH 55561 a = pd.Series([-7], dtype="int64[pyarrow]") @@ -3133,6 +3276,92 @@ def test_arrow_floordiv(): tm.assert_series_equal(result, expected) +def test_arrow_floordiv_large_values(): + # GH 56645 + a = pd.Series([1425801600000000000], dtype="int64[pyarrow]") + expected = pd.Series([1425801600000], dtype="int64[pyarrow]") + result = a // 1_000_000 + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"]) +def test_arrow_floordiv_large_integral_result(dtype): + # GH 56676 + a = pd.Series([18014398509481983], dtype=dtype) + result = a // 1 + tm.assert_series_equal(result, a) + + +@pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES) +def test_arrow_floordiv_larger_divisor(pa_type): + # GH 56676 + dtype = ArrowDtype(pa_type) + a = pd.Series([-23], dtype=dtype) + result = a // 24 + expected = pd.Series([-1], dtype=dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES) +def test_arrow_floordiv_integral_invalid(pa_type): + # GH 56676 + min_value = 
np.iinfo(pa_type.to_pandas_dtype()).min + a = pd.Series([min_value], dtype=ArrowDtype(pa_type)) + with pytest.raises(pa.lib.ArrowInvalid, match="overflow|not in range"): + a // -1 + with pytest.raises(pa.lib.ArrowInvalid, match="divide by zero"): + a // 0 + + +@pytest.mark.parametrize("dtype", tm.FLOAT_PYARROW_DTYPES_STR_REPR) +def test_arrow_floordiv_floating_0_divisor(dtype): + # GH 56676 + a = pd.Series([2], dtype=dtype) + result = a // 0 + expected = pd.Series([float("inf")], dtype=dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["float64", "datetime64[ns]", "timedelta64[ns]"]) +def test_astype_int_with_null_to_numpy_dtype(dtype): + # GH 57093 + ser = pd.Series([1, None], dtype="int64[pyarrow]") + result = ser.astype(dtype) + expected = pd.Series([1, None], dtype=dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES) +def test_arrow_integral_floordiv_large_values(pa_type): + # GH 56676 + max_value = np.iinfo(pa_type.to_pandas_dtype()).max + dtype = ArrowDtype(pa_type) + a = pd.Series([max_value], dtype=dtype) + b = pd.Series([1], dtype=dtype) + result = a // b + tm.assert_series_equal(result, a) + + +@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"]) +def test_arrow_true_division_large_divisor(dtype): + # GH 56706 + a = pd.Series([0], dtype=dtype) + b = pd.Series([18014398509481983], dtype=dtype) + expected = pd.Series([0], dtype="float64[pyarrow]") + result = a / b + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"]) +def test_arrow_floor_division_large_divisor(dtype): + # GH 56706 + a = pd.Series([0], dtype=dtype) + b = pd.Series([18014398509481983], dtype=dtype) + expected = pd.Series([0], dtype=dtype) + result = a // b + tm.assert_series_equal(result, expected) + + def test_string_to_datetime_parsing_cast(): # GH 56266 string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"] @@ -3143,6 +3372,17 @@ def test_string_to_datetime_parsing_cast(): tm.assert_series_equal(result, expected) +@pytest.mark.skipif( + pa_version_under13p0, reason="pairwise_diff_checked not implemented in pyarrow" +) +def test_interpolate_not_numeric(data): + if not data.dtype._is_numeric: + ser = pd.Series(data) + msg = re.escape(f"Cannot interpolate with {ser.dtype} dtype") + with pytest.raises(TypeError, match=msg): + pd.Series(data).interpolate() + + def test_string_to_time_parsing_cast(): # GH 56463 string_times = ["11:41:43.076160"] @@ -3153,9 +3393,24 @@ def test_string_to_time_parsing_cast(): tm.assert_series_equal(result, expected) +def test_to_numpy_float(): + # GH#56267 + ser = pd.Series([32, 40, None], dtype="float[pyarrow]") + result = ser.astype("float64") + expected = pd.Series([32, 40, np.nan], dtype="float64") + tm.assert_series_equal(result, expected) + + def test_to_numpy_timestamp_to_int(): # GH 55997 ser = pd.Series(["2020-01-01 04:30:00"], dtype="timestamp[ns][pyarrow]") result = ser.to_numpy(dtype=np.int64) expected = np.array([1577853000000000000]) tm.assert_numpy_array_equal(result, expected) + + +def test_map_numeric_na_action(): + ser = pd.Series([32, 40, None], dtype="int64[pyarrow]") + result = ser.map(lambda x: 42, na_action="ignore") + expected = pd.Series([42.0, 42.0, np.nan], dtype="float64") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 
6f33b18b19c51..135ea67c924d0 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -18,7 +18,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd from pandas import Categorical @@ -103,7 +103,7 @@ def test_contains(self, data, data_missing): continue assert na_value_obj not in data # this section suffers from super method - if not using_pyarrow_string_dtype(): + if not using_string_dtype(): assert na_value_obj in data_missing def test_empty(self, dtype): diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py index 3d8523f344d46..5eda0f00f54ca 100644 --- a/pandas/tests/extension/test_common.py +++ b/pandas/tests/extension/test_common.py @@ -17,7 +17,7 @@ class DummyArray(ExtensionArray): def __init__(self, data) -> None: self.data = data - def __array__(self, dtype): + def __array__(self, dtype=None, copy=None): return self.data @property @@ -30,8 +30,10 @@ def astype(self, dtype, copy=True): if copy: return type(self)(self.data) return self - - return np.array(self, dtype=dtype, copy=copy) + elif not copy: + return np.asarray(self, dtype=dtype) + else: + return np.array(self, dtype=dtype, copy=copy) class TestExtensionArrayDtype: diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 98dd1c5cb615f..6292e6051aa90 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -90,6 +90,31 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: def test_fillna_length_mismatch(self, data_missing): super().test_fillna_length_mismatch(data_missing) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_hash_pandas_object(self, data): + super().test_hash_pandas_object(data) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_hash_pandas_object_works(self, data, as_frame): + super().test_hash_pandas_object_works(data, as_frame) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data, request): + super().test_EA_types(engine, data, request) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_astype_str(self, data): + super().test_astype_str(data) + # TODO: either belongs in tests.arrays.interval or move into base tests. 
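
# The DummyArray.astype/__array__ changes in test_common.py above (and the
# matching change in list/array.py earlier in this diff) follow NumPy 2's copy
# semantics: np.array(x, copy=False) raises when a copy cannot be avoided,
# whereas np.asarray copies only when necessary. A minimal sketch, assuming
# numpy >= 2:
import numpy as np

data = [1, 2, 3]  # a Python list can never be viewed zero-copy
arr = np.asarray(data, dtype=np.int64)  # fine: copies because it must
# np.array(data, dtype=np.int64, copy=False)  # NumPy 2: raises ValueError
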
def test_fillna_non_scalar_raises(data_missing): diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index 3efc561d6a125..651f783b44d1f 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -179,6 +179,15 @@ def test_map(self, data_missing, na_action): expected = data_missing.to_numpy() tm.assert_numpy_array_equal(result, expected) + def test_map_na_action_ignore(self, data_missing_for_sorting): + zero = data_missing_for_sorting[2] + result = data_missing_for_sorting.map(lambda x: zero, na_action="ignore") + if data_missing_for_sorting.dtype.kind == "b": + expected = np.array([False, pd.NA, False], dtype=object) + else: + expected = np.array([zero, np.nan, zero]) + tm.assert_numpy_array_equal(result, expected) + def _get_expected_exception(self, op_name, obj, other): try: dtype = tm.get_dtype(obj) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index aaf49f53ba02b..e38144f4c615b 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -421,16 +421,6 @@ def test_index_from_listlike_with_dtype(self, data): def test_EA_types(self, engine, data, request): super().test_EA_types(engine, data, request) - @pytest.mark.xfail(reason="Expect NumpyEA, get np.ndarray") - def test_compare_array(self, data, comparison_op): - super().test_compare_array(data, comparison_op) - - def test_compare_scalar(self, data, comparison_op, request): - if data.dtype.kind == "f" or comparison_op.__name__ in ["eq", "ne"]: - mark = pytest.mark.xfail(reason="Expect NumpyEA, get np.ndarray") - request.applymarker(mark) - super().test_compare_scalar(data, comparison_op) - class Test2DCompat(base.NDArrayBacked2DTests): pass diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 4039a5d01f372..2d5989a5b4f1d 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -348,11 +348,16 @@ def test_argmin_argmax_all_na(self, method, data, na_value): self._check_unsupported(data) super().test_argmin_argmax_all_na(method, data, na_value) + @pytest.mark.fails_arm_wheels @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame]) def test_equals(self, data, na_value, as_series, box): self._check_unsupported(data) super().test_equals(data, na_value, as_series, box) + @pytest.mark.fails_arm_wheels + def test_equals_same_data_different_object(self, data): + super().test_equals_same_data_different_object(data) + @pytest.mark.parametrize( "func, na_action, expected", [ diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 2d5a134f8560a..526cf426781ad 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -21,6 +21,10 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW + +from pandas.core.dtypes.base import StorageExtensionDtype + import pandas as pd import pandas._testing as tm from pandas.api.types import is_string_dtype @@ -52,8 +56,9 @@ def chunked(request): @pytest.fixture -def dtype(string_storage): - return StringDtype(storage=string_storage) +def dtype(string_dtype_arguments): + storage, na_value = string_dtype_arguments + return StringDtype(storage=storage, na_value=na_value) @pytest.fixture @@ -95,16 +100,36 @@ def data_for_grouping(dtype, chunked): class TestStringArray(base.ExtensionTests): def test_eq_with_str(self, dtype): - assert dtype == f"string[{dtype.storage}]" 
super().test_eq_with_str(dtype) + if dtype.na_value is pd.NA: + # only the NA-variant supports parametrized string alias + assert dtype == f"string[{dtype.storage}]" + elif dtype.storage == "pyarrow": + with tm.assert_produces_warning(FutureWarning): + assert dtype == "string[pyarrow_numpy]" + def test_is_not_string_type(self, dtype): # Different from BaseDtypeTests.test_is_not_string_type # because StringDtype is a string type assert is_string_dtype(dtype) - def test_view(self, data, request, arrow_string_storage): - if data.dtype.storage in arrow_string_storage: + def test_is_dtype_from_name(self, dtype, using_infer_string): + if dtype.na_value is np.nan and not using_infer_string: + result = type(dtype).is_dtype(dtype.name) + assert result is False + else: + super().test_is_dtype_from_name(dtype) + + def test_construct_from_string_own_name(self, dtype, using_infer_string): + if dtype.na_value is np.nan and not using_infer_string: + with pytest.raises(TypeError, match="Cannot construct a 'StringDtype'"): + dtype.construct_from_string(dtype.name) + else: + super().test_construct_from_string_own_name(dtype) + + def test_view(self, data): + if data.dtype.storage == "pyarrow": pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_view(data) @@ -112,13 +137,13 @@ def test_from_dtype(self, data): # base test uses string representation of dtype pass - def test_transpose(self, data, request, arrow_string_storage): - if data.dtype.storage in arrow_string_storage: + def test_transpose(self, data): + if data.dtype.storage == "pyarrow": pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_transpose(data) - def test_setitem_preserves_views(self, data, request, arrow_string_storage): - if data.dtype.storage in arrow_string_storage: + def test_setitem_preserves_views(self, data): + if data.dtype.storage == "pyarrow": pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_setitem_preserves_views(data) @@ -141,31 +166,15 @@ def test_fillna_no_op_returns_copy(self, data): def _get_expected_exception( self, op_name: str, obj, other - ) -> type[Exception] | None: - if op_name in ["__divmod__", "__rdivmod__"]: - if isinstance(obj, pd.Series) and cast( - StringDtype, tm.get_dtype(obj) - ).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: - # TODO: re-raise as TypeError? - return NotImplementedError - elif isinstance(other, pd.Series) and cast( - StringDtype, tm.get_dtype(other) - ).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: - # TODO: re-raise as TypeError? - return NotImplementedError - return TypeError - elif op_name in ["__mod__", "__rmod__", "__pow__", "__rpow__"]: - if cast(StringDtype, tm.get_dtype(obj)).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: - return NotImplementedError + ) -> type[Exception] | tuple[type[Exception], ...] | None: + if op_name in [ + "__mod__", + "__rmod__", + "__divmod__", + "__rdivmod__", + "__pow__", + "__rpow__", + ]: return TypeError elif op_name in ["__mul__", "__rmul__"]: # Can only multiply strings by integers @@ -178,33 +187,29 @@ def _get_expected_exception( "__sub__", "__rsub__", ]: - if cast(StringDtype, tm.get_dtype(obj)).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: - import pyarrow as pa - - # TODO: better to re-raise as TypeError? 
- return pa.ArrowNotImplementedError return TypeError return None def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return ( - op_name in ["min", "max"] - or ser.dtype.storage == "pyarrow_numpy" # type: ignore[union-attr] + op_name in ["min", "max", "sum"] + or ser.dtype.na_value is np.nan # type: ignore[union-attr] and op_name in ("any", "all") ) + def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: + assert isinstance(ser.dtype, StorageExtensionDtype) + return op_name in ["cummin", "cummax", "cumsum"] + def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): dtype = cast(StringDtype, tm.get_dtype(obj)) if op_name in ["__add__", "__radd__"]: cast_to = dtype + elif dtype.na_value is np.nan: + cast_to = np.bool_ # type: ignore[assignment] elif dtype.storage == "pyarrow": cast_to = "boolean[pyarrow]" # type: ignore[assignment] - elif dtype.storage == "pyarrow_numpy": - cast_to = np.bool_ # type: ignore[assignment] else: cast_to = "boolean" # type: ignore[assignment] return pointwise_result.astype(cast_to) @@ -213,9 +218,35 @@ def test_compare_scalar(self, data, comparison_op): ser = pd.Series(data) self._compare_other(ser, data, comparison_op, "abc") - @pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning") - def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): - super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) + def test_combine_add(self, data_repeated, using_infer_string, request): + dtype = next(data_repeated(1)).dtype + if using_infer_string and ( + (dtype.na_value is pd.NA) and dtype.storage == "python" + ): + mark = pytest.mark.xfail( + reason="The pointwise operation result will be inferred to " + "string[nan, pyarrow], which does not match the input dtype" + ) + request.applymarker(mark) + super().test_combine_add(data_repeated) + + def test_arith_series_with_array( + self, data, all_arithmetic_operators, using_infer_string, request + ): + dtype = data.dtype + if ( + using_infer_string + and all_arithmetic_operators == "__radd__" + and ( + (dtype.na_value is pd.NA) or (dtype.storage == "python" and HAS_PYARROW) + ) + ): + mark = pytest.mark.xfail( + reason="The pointwise operation result will be inferred to " + "string[nan, pyarrow], which does not match the input dtype" + ) + request.applymarker(mark) + super().test_arith_series_with_array(data, all_arithmetic_operators) class Test2DCompat(base.Dim2CompatTests): diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index e07024b2e2a09..b7293946d38c9 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -18,7 +18,7 @@ def datetime_frame() -> DataFrame: """ return DataFrame( np.random.default_rng(2).standard_normal((100, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100, freq="B"), ) @@ -33,7 +33,7 @@ def float_string_frame(): df = DataFrame( np.random.default_rng(2).standard_normal((30, 4)), index=Index([f"foo_{i}" for i in range(30)], dtype=object), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), ) df["foo"] = "bar" return df diff --git a/pandas/tests/frame/constructors/test_from_dict.py b/pandas/tests/frame/constructors/test_from_dict.py index 60a8e688b3b8a..1509c47ba65c7 100644 --- a/pandas/tests/frame/constructors/test_from_dict.py +++ b/pandas/tests/frame/constructors/test_from_dict.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from 
pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( DataFrame, @@ -44,9 +44,7 @@ def test_constructor_single_row(self): ) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif( - using_pyarrow_string_dtype(), reason="columns inferring logic broken" - ) + @pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken") def test_constructor_list_of_series(self): data = [ OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]), diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 3622571f1365d..58e47ba48f894 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -6,7 +6,7 @@ import pytest import pytz -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import is_platform_little_endian @@ -58,9 +58,7 @@ def test_from_records_with_datetimes(self): expected["EXPIRY"] = expected["EXPIRY"].astype("M8[s]") tm.assert_frame_equal(result, expected) - @pytest.mark.skipif( - using_pyarrow_string_dtype(), reason="dtype checking logic doesn't work" - ) + @pytest.mark.xfail(using_string_dtype(), reason="dtype checking logic doesn't work") def test_from_records_sequencelike(self): df = DataFrame( { diff --git a/pandas/tests/frame/indexing/test_coercion.py b/pandas/tests/frame/indexing/test_coercion.py index ba0d8613b6228..f7f7b2c7c872a 100644 --- a/pandas/tests/frame/indexing/test_coercion.py +++ b/pandas/tests/frame/indexing/test_coercion.py @@ -103,21 +103,36 @@ def test_26395(indexer_al): df["D"] = 0 indexer_al(df)["C", "D"] = 2 - expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64) + expected = DataFrame( + {"D": [0, 0, 2]}, + index=["A", "B", "C"], + columns=pd.Index(["D"], dtype=object), + dtype=np.int64, + ) tm.assert_frame_equal(df, expected) with tm.assert_produces_warning( FutureWarning, match="Setting an item of incompatible dtype" ): indexer_al(df)["C", "D"] = 44.5 - expected = DataFrame({"D": [0, 0, 44.5]}, index=["A", "B", "C"], dtype=np.float64) + expected = DataFrame( + {"D": [0, 0, 44.5]}, + index=["A", "B", "C"], + columns=pd.Index(["D"], dtype=object), + dtype=np.float64, + ) tm.assert_frame_equal(df, expected) with tm.assert_produces_warning( FutureWarning, match="Setting an item of incompatible dtype" ): indexer_al(df)["C", "D"] = "hello" - expected = DataFrame({"D": [0, 0, "hello"]}, index=["A", "B", "C"], dtype=object) + expected = DataFrame( + {"D": [0, 0, "hello"]}, + index=["A", "B", "C"], + columns=pd.Index(["D"], dtype=object), + dtype=object, + ) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 97e7ae15c6c63..a8249ed7f9828 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import iNaT from pandas.errors import ( InvalidIndexError, @@ -334,7 +336,7 @@ def test_setitem( smaller["col10"] = ["1", "2"] if using_infer_string: - assert smaller["col10"].dtype == "string" + assert smaller["col10"].dtype == "str" else: assert smaller["col10"].dtype == np.object_ assert (smaller["col10"] == ["1", "2"]).all() @@ -469,13 +471,13 @@ def test_setitem_corner(self, float_frame, using_infer_string): del 
dm["foo"] dm["foo"] = "bar" if using_infer_string: - assert dm["foo"].dtype == "string" + assert dm["foo"].dtype == "str" else: assert dm["foo"].dtype == np.object_ dm["coercible"] = ["1", "2", "3"] if using_infer_string: - assert dm["coercible"].dtype == "string" + assert dm["coercible"].dtype == "str" else: assert dm["coercible"].dtype == np.object_ @@ -511,21 +513,20 @@ def test_setitem_ambig(self, using_infer_string): dm[2] = uncoercable_series assert len(dm.columns) == 3 if using_infer_string: - assert dm[2].dtype == "string" + assert dm[2].dtype == "str" else: assert dm[2].dtype == np.object_ - def test_setitem_None(self, float_frame, using_infer_string): + def test_setitem_None(self, float_frame): # GH #766 float_frame[None] = float_frame["A"] - key = None if not using_infer_string else np.nan tm.assert_series_equal( float_frame.iloc[:, -1], float_frame["A"], check_names=False ) tm.assert_series_equal( - float_frame.loc[:, key], float_frame["A"], check_names=False + float_frame.loc[:, None], float_frame["A"], check_names=False ) - tm.assert_series_equal(float_frame[key], float_frame["A"], check_names=False) + tm.assert_series_equal(float_frame[None], float_frame["A"], check_names=False) def test_loc_setitem_boolean_mask_allfalse(self): # GH 9596 @@ -901,6 +902,8 @@ def test_setitem_frame_float(self, float_frame): expected = piece.values tm.assert_almost_equal(result, expected) + # dtype inference + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_frame_mixed(self, float_string_frame): # GH 3216 @@ -913,6 +916,8 @@ def test_setitem_frame_mixed(self, float_string_frame): f.loc[key] = piece tm.assert_almost_equal(f.loc[f.index[0:2], ["A", "B"]].values, piece.values) + # dtype inference + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_frame_mixed_rows_unaligned(self, float_string_frame): # GH#3216 rows unaligned f = float_string_frame.copy() @@ -927,6 +932,8 @@ def test_setitem_frame_mixed_rows_unaligned(self, float_string_frame): f.loc[f.index[0:2:], ["A", "B"]].values, piece.values[0:2] ) + # dtype inference + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_frame_mixed_key_unaligned(self, float_string_frame): # GH#3216 key is unaligned with values f = float_string_frame.copy() @@ -949,7 +956,8 @@ def test_setitem_frame_upcast(self): # needs upcasting df = DataFrame([[1, 2, "foo"], [3, 4, "bar"]], columns=["A", "B", "C"]) df2 = df.copy() - df2.loc[:, ["A", "B"]] = df.loc[:, ["A", "B"]] + 0.5 + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df2.loc[:, ["A", "B"]] = df.loc[:, ["A", "B"]] + 0.5 expected = df.reindex(columns=["A", "B"]) expected += 0.5 expected["C"] = df["C"] @@ -1198,7 +1206,7 @@ def test_loc_setitem_datetimelike_with_inference(self): result = df.dtypes expected = Series( [np.dtype("timedelta64[ns]")] * 6 + [np.dtype("datetime64[ns]")] * 2, - index=list("ABCDEFGH"), + index=Index(list("ABCDEFGH"), dtype=object), ) tm.assert_series_equal(result, expected) @@ -1243,7 +1251,7 @@ def test_getitem_boolean_indexing_mixed(self): tm.assert_frame_equal(df2, expected) df["foo"] = "test" - msg = "not supported between instances|unorderable types" + msg = "not supported between instances|unorderable types|Invalid comparison" with pytest.raises(TypeError, match=msg): df[df > 0.3] = 1 @@ -1331,7 +1339,7 @@ def test_setting_mismatched_na_into_nullable_fails( r"timedelta64\[ns\] cannot be converted to (Floating|Integer)Dtype", 
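
# The message alternatives joined around this point all guard the same rule: an
# NA of the wrong kind (NaT-like) cannot be written into a nullable numeric
# column. Illustrative sketch; the exact message depends on the code path taken:
import numpy as np
import pandas as pd
import pytest

ser = pd.Series([1, 2, 3], dtype="Int64")
with pytest.raises(TypeError):
    ser[0] = np.datetime64("NaT")  # only pd.NA/np.nan are valid NAs here
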
r"datetime64\[ns\] cannot be converted to (Floating|Integer)Dtype", "'values' contains non-numeric NA", - r"Invalid value '.*' for dtype (U?Int|Float)\d{1,2}", + r"Invalid value '.*' for dtype '(U?Int|Float)\d{1,2}'", ] ) with pytest.raises(TypeError, match=msg): @@ -1387,20 +1395,20 @@ def test_loc_expand_empty_frame_keep_midx_names(self): tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( - "val, idxr, warn", + "val, idxr", [ - ("x", "a", None), # TODO: this should warn as well - ("x", ["a"], None), # TODO: this should warn as well - (1, "a", None), # TODO: this should warn as well - (1, ["a"], FutureWarning), + ("x", "a"), + ("x", ["a"]), + (1, "a"), + (1, ["a"]), ], ) - def test_loc_setitem_rhs_frame(self, idxr, val, warn): + def test_loc_setitem_rhs_frame(self, idxr, val): # GH#47578 df = DataFrame({"a": [1, 2]}) with tm.assert_produces_warning( - warn, match="Setting an item of incompatible dtype" + FutureWarning, match="Setting an item of incompatible dtype" ): df.loc[:, idxr] = DataFrame({"a": [val, 11]}, index=[1, 2]) expected = DataFrame({"a": [np.nan, val]}) @@ -1939,13 +1947,11 @@ def test_adding_new_conditional_column() -> None: ("dtype", "infer_string"), [ (object, False), - ("string[pyarrow_numpy]", True), + (pd.StringDtype(na_value=np.nan), True), ], ) def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: # https://github.com/pandas-dev/pandas/issues/56204 - pytest.importorskip("pyarrow") - df = DataFrame({"a": [1, 2], "b": [3, 4]}) with pd.option_context("future.infer_string", infer_string): df.loc[df["a"] == 1, "c"] = "1" @@ -1957,13 +1963,12 @@ def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: def test_add_new_column_infer_string(): # GH#55366 - pytest.importorskip("pyarrow") df = DataFrame({"x": [1]}) with pd.option_context("future.infer_string", True): df.loc[df["x"] == 1, "y"] = "1" expected = DataFrame( - {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")}, - columns=Index(["x", "y"], dtype=object), + {"x": [1], "y": Series(["1"], dtype=pd.StringDtype(na_value=np.nan))}, + columns=Index(["x", "y"], dtype="str"), ) tm.assert_frame_equal(df, expected) @@ -1996,7 +2001,7 @@ def _check_setitem_invalid(self, df, invalid, indexer, warn): np.datetime64("NaT"), np.timedelta64("NaT"), ] - _indexers = [0, [0], slice(0, 1), [True, False, False]] + _indexers = [0, [0], slice(0, 1), [True, False, False], slice(None, None, None)] @pytest.mark.parametrize( "invalid", _invalid_scalars + [1, 1.0, np.int64(1), np.float64(1)] @@ -2010,7 +2015,7 @@ def test_setitem_validation_scalar_bool(self, invalid, indexer): @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): df = DataFrame({"a": [1, 2, 3]}, dtype=any_int_numpy_dtype) - if isna(invalid) and invalid is not pd.NaT: + if isna(invalid) and invalid is not pd.NaT and not np.isnat(invalid): warn = None else: warn = FutureWarning diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py index 7e702bdc993bd..4cf297b4c037d 100644 --- a/pandas/tests/frame/indexing/test_insert.py +++ b/pandas/tests/frame/indexing/test_insert.py @@ -67,7 +67,8 @@ def test_insert_with_columns_dups(self): df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True) df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True) exp = DataFrame( - [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"] + [["a", "d", "g"], ["b", "e", "h"], ["c", "f", 
"i"]], + columns=Index(["A", "A", "A"], dtype=object), ) tm.assert_frame_equal(df, exp) diff --git a/pandas/tests/frame/indexing/test_set_value.py b/pandas/tests/frame/indexing/test_set_value.py index ce771280bc264..3d23e13264911 100644 --- a/pandas/tests/frame/indexing/test_set_value.py +++ b/pandas/tests/frame/indexing/test_set_value.py @@ -28,7 +28,7 @@ def test_set_value_resize(self, float_frame, using_infer_string): res = float_frame.copy() res._set_value("foobar", "baz", "sam") if using_infer_string: - assert res["baz"].dtype == "string" + assert res["baz"].dtype == "str" else: assert res["baz"].dtype == np.object_ res = float_frame.copy() diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index e802a56ecbc81..190218a82d231 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -151,7 +151,11 @@ def test_setitem_empty_columns(self): df = DataFrame(index=["A", "B", "C"]) df["X"] = df.index df["X"] = ["x", "y", "z"] - exp = DataFrame(data={"X": ["x", "y", "z"]}, index=["A", "B", "C"]) + exp = DataFrame( + data={"X": ["x", "y", "z"]}, + index=["A", "B", "C"], + columns=Index(["X"], dtype=object), + ) tm.assert_frame_equal(df, exp) def test_setitem_dt64_index_empty_columns(self): @@ -167,7 +171,9 @@ def test_setitem_timestamp_empty_columns(self): df["now"] = Timestamp("20130101", tz="UTC").as_unit("ns") expected = DataFrame( - [[Timestamp("20130101", tz="UTC")]] * 3, index=[0, 1, 2], columns=["now"] + [[Timestamp("20130101", tz="UTC")]] * 3, + index=range(3), + columns=Index(["now"], dtype=object), ) tm.assert_frame_equal(df, expected) @@ -206,7 +212,7 @@ def test_setitem_period_preserves_dtype(self): result = DataFrame([]) result["a"] = data - expected = DataFrame({"a": data}) + expected = DataFrame({"a": data}, columns=Index(["a"], dtype=object)) tm.assert_frame_equal(result, expected) @@ -675,7 +681,7 @@ def test_setitem_iloc_two_dimensional_generator(self): def test_setitem_dtypes_bytes_type_to_object(self): # GH 20734 index = Series(name="id", dtype="S24") - df = DataFrame(index=index) + df = DataFrame(index=index, columns=Index([], dtype="str")) df["a"] = Series(name="a", index=index, dtype=np.uint32) df["b"] = Series(name="b", index=index, dtype="S64") df["c"] = Series(name="c", index=index, dtype="S64") @@ -714,7 +720,7 @@ def test_setitem_npmatrix_2d(self): ) a = np.ones((10, 1)) - df = DataFrame(index=np.arange(10)) + df = DataFrame(index=np.arange(10), columns=Index([], dtype="str")) df["np-array"] = a # Instantiation of `np.matrix` gives PendingDeprecationWarning @@ -933,7 +939,7 @@ def test_setitem_scalars_no_index(self): # GH#16823 / GH#17894 df = DataFrame() df["foo"] = 1 - expected = DataFrame(columns=["foo"]).astype(np.int64) + expected = DataFrame(columns=Index(["foo"], dtype=object)).astype(np.int64) tm.assert_frame_equal(df, expected) def test_setitem_newcol_tuple_key(self, float_frame): @@ -1381,3 +1387,39 @@ def test_frame_setitem_empty_dataframe(self): index=dti[:0], ) tm.assert_frame_equal(df, expected) + + +def test_full_setter_loc_incompatible_dtype(): + # https://github.com/pandas-dev/pandas/issues/55791 + df = DataFrame({"a": [1, 2]}) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "a"] = True + expected = DataFrame({"a": [True, True]}) + tm.assert_frame_equal(df, expected) + + df = DataFrame({"a": [1, 2]}) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "a"] = {0: 3.5, 1: 
4.5} + expected = DataFrame({"a": [3.5, 4.5]}) + tm.assert_frame_equal(df, expected) + + df = DataFrame({"a": [1, 2]}) + df.loc[:, "a"] = {0: 3, 1: 4} + expected = DataFrame({"a": [3, 4]}) + tm.assert_frame_equal(df, expected) + + +def test_setitem_partial_row_multiple_columns(): + # https://github.com/pandas-dev/pandas/issues/56503 + df = DataFrame({"A": [1, 2, 3], "B": [4.0, 5, 6]}) + # should not warn + df.loc[df.index <= 1, ["F", "G"]] = (1, "abc") + expected = DataFrame( + { + "A": [1, 2, 3], + "B": [4.0, 5, 6], + "F": [1.0, 1, float("nan")], + "G": ["abc", "abc", float("nan")], + } + ) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 3d36d0471f02f..356257bbfec98 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -63,7 +63,10 @@ def _check_get(df, cond, check_dtypes=True): # check getting df = where_frame if df is float_string_frame: - msg = "'>' not supported between instances of 'str' and 'int'" + msg = ( + "'>' not supported between instances of 'str' and 'int'" + "|Invalid comparison" + ) with pytest.raises(TypeError, match=msg): df > 0 return @@ -128,7 +131,10 @@ def _check_align(df, cond, other, check_dtypes=True): df = where_frame if df is float_string_frame: - msg = "'>' not supported between instances of 'str' and 'int'" + msg = ( + "'>' not supported between instances of 'str' and 'int'" + "|Invalid comparison" + ) with pytest.raises(TypeError, match=msg): df > 0 return @@ -193,7 +199,10 @@ def _check_set(df, cond, check_dtypes=True): df = where_frame if df is float_string_frame: - msg = "'>' not supported between instances of 'str' and 'int'" + msg = ( + "'>' not supported between instances of 'str' and 'int'" + "|Invalid comparison" + ) with pytest.raises(TypeError, match=msg): df > 0 return @@ -967,7 +976,7 @@ def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype): mask = np.array([True, True, False], ndmin=obj.ndim).T - msg = r"Invalid value '.*' for dtype (U?Int|Float)\d{1,2}" + msg = r"Invalid value '.*' for dtype '(U?Int|Float)\d{1,2}'" for null in tm.NP_NAT_OBJECTS + [pd.NaT]: # NaT is an NA value that we should *not* cast to pd.NA dtype @@ -1077,13 +1086,9 @@ def test_where_producing_ea_cond_for_np_dtype(): @pytest.mark.parametrize( "replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)] ) -def test_where_int_overflow(replacement, using_infer_string, request): +def test_where_int_overflow(replacement): # GH 31687 df = DataFrame([[1.0, 2e25, "nine"], [np.nan, 0.1, None]]) - if using_infer_string and replacement not in (None, "snake"): - request.node.add_marker( - pytest.mark.xfail(reason="Can't set non-string into string column") - ) result = df.where(pd.notnull(df), replacement) expected = DataFrame([[1.0, 2e25, "nine"], [replacement, 0.1, replacement]]) diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index be809e3a17c8e..2aa27d1d6a548 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -79,7 +79,7 @@ def test_xs( def test_xs_corner(self): # pathological mixed-type reordering case - df = DataFrame(index=[0]) + df = DataFrame(index=[0], columns=Index([], dtype="str")) df["A"] = 1.0 df["B"] = "foo" df["C"] = 2.0 diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 5a1e3cd786f84..938f9cfcde3f8 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ 
b/pandas/tests/frame/methods/test_astype.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -167,21 +169,21 @@ def test_astype_str(self): "d": list(map(str, d._values)), "e": list(map(str, e._values)), }, - dtype="object", + dtype="str", ) tm.assert_frame_equal(result, expected) - def test_astype_str_float(self): + def test_astype_str_float(self, using_infer_string): # see GH#11302 result = DataFrame([np.nan]).astype(str) - expected = DataFrame(["nan"], dtype="object") + expected = DataFrame([np.nan if using_infer_string else "nan"], dtype="str") tm.assert_frame_equal(result, expected) result = DataFrame([1.12345678901234567890]).astype(str) val = "1.1234567890123457" - expected = DataFrame([val], dtype="object") + expected = DataFrame([val], dtype="str") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype_class", [dict, Series]) @@ -200,7 +202,7 @@ def test_astype_dict_like(self, dtype_class): expected = DataFrame( { "a": a, - "b": Series(["0", "1", "2", "3", "4"], dtype="object"), + "b": Series(["0", "1", "2", "3", "4"], dtype="str"), "c": c, "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"), } @@ -261,9 +263,9 @@ def test_astype_duplicate_col(self): a2 = Series([0, 1, 2, 3, 4], name="a") df = concat([a1, b, a2], axis=1) - result = df.astype(str) + result = df.astype("str") a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a") - b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b") + b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype="str", name="b") a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a") expected = concat([a1_str, b_str, a2_str], axis=1) tm.assert_frame_equal(result, expected) @@ -283,7 +285,7 @@ def test_astype_duplicate_col_series_arg(self): result = df.astype(dtypes) expected = DataFrame( { - 0: Series(vals[:, 0].astype(str), dtype=object), + 0: Series(vals[:, 0].astype(str), dtype="str"), 1: vals[:, 1], 2: pd.array(vals[:, 2], dtype="Float64"), 3: vals[:, 3], @@ -664,9 +666,10 @@ def test_astype_dt64tz(self, timezone_frame): # dt64tz->dt64 deprecated timezone_frame.astype("datetime64[ns]") - def test_astype_dt64tz_to_str(self, timezone_frame): + def test_astype_dt64tz_to_str(self, timezone_frame, using_infer_string): # str formatting result = timezone_frame.astype(str) + na_value = np.nan if using_infer_string else "NaT" expected = DataFrame( [ [ @@ -674,7 +677,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame): "2013-01-01 00:00:00-05:00", "2013-01-01 00:00:00+01:00", ], - ["2013-01-02", "NaT", "NaT"], + ["2013-01-02", na_value, na_value], [ "2013-01-03", "2013-01-03 00:00:00-05:00", @@ -682,7 +685,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame): ], ], columns=timezone_frame.columns, - dtype="object", + dtype="str", ) tm.assert_frame_equal(result, expected) @@ -757,6 +760,7 @@ def test_astype_tz_object_conversion(self, tz): result = result.astype({"tz": "datetime64[ns, Europe/London]"}) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) GH#60639") def test_astype_dt64_to_string( self, frame_or_series, tz_naive_fixture, using_infer_string ): @@ -909,3 +913,12 @@ def test_astype_to_string_not_modifying_input(string_storage, val): with option_context("mode.string_storage", string_storage): df.astype("string", copy=False) tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("val", [None, 1, 1.5, 
np.nan, NaT]) +def test_astype_to_string_dtype_not_modifying_input(any_string_dtype, val): + # GH#51073 - variant of the above test with explicit dtype instances + df = DataFrame({"a": ["a", "b", val]}) + expected = df.copy() + df.astype(any_string_dtype) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 521d2cb14ac6a..e7f6e5d625d3e 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -11,13 +11,9 @@ class TestConvertDtypes: @pytest.mark.parametrize( "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] ) - def test_convert_dtypes( - self, convert_integer, expected, string_storage, using_infer_string - ): + def test_convert_dtypes(self, convert_integer, expected, string_storage): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here - if using_infer_string: - string_storage = "pyarrow_numpy" df = pd.DataFrame( { "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 04a08c8b9bc52..9abf1996c43e6 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -339,9 +339,8 @@ def test_corrwith_with_objects(self, using_infer_string): df2["obj"] = "bar" if using_infer_string: - import pyarrow as pa - - with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"): + msg = "Cannot perform reduction 'mean' with string dtype" + with pytest.raises(TypeError, match=msg): df1.corrwith(df2) else: with pytest.raises(TypeError, match="Could not convert"): diff --git a/pandas/tests/frame/methods/test_dropna.py b/pandas/tests/frame/methods/test_dropna.py index 7899b4aeac3fd..0d4a6a065111f 100644 --- a/pandas/tests/frame/methods/test_dropna.py +++ b/pandas/tests/frame/methods/test_dropna.py @@ -182,9 +182,12 @@ def test_dropna_multiple_axes(self): with pytest.raises(TypeError, match="supplying multiple axes"): inp.dropna(how="all", axis=(0, 1), inplace=True) - def test_dropna_tz_aware_datetime(self): + def test_dropna_tz_aware_datetime(self, using_infer_string): # GH13407 + df = DataFrame() + if using_infer_string: + df.columns = df.columns.astype("str") dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc()) dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc()) df["Time"] = [dt1] diff --git a/pandas/tests/frame/methods/test_dtypes.py b/pandas/tests/frame/methods/test_dtypes.py index ab632ac17318e..524a5587dce10 100644 --- a/pandas/tests/frame/methods/test_dtypes.py +++ b/pandas/tests/frame/methods/test_dtypes.py @@ -146,8 +146,5 @@ def test_frame_apply_np_array_return_type(self, using_infer_string): # GH 35517 df = DataFrame([["foo"]]) result = df.apply(lambda col: np.array("bar")) - if using_infer_string: - expected = Series([np.array(["bar"])]) - else: - expected = Series(["bar"]) + expected = Series(np.array("bar")) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 6757669351c5c..c0fc72768e27f 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - import pandas.util._test_decorators as td from pandas import ( @@ -91,7 +89,6 @@ def 
test_fillna_datetime(self, datetime_frame): with pytest.raises(ValueError, match=msg): datetime_frame.fillna(5, method="ffill") - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") def test_fillna_mixed_type(self, float_string_frame): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan @@ -125,27 +122,21 @@ def test_fillna_empty(self, using_copy_on_write): df.x.fillna(method=m, inplace=True) df.x.fillna(method=m) - def test_fillna_different_dtype(self, using_infer_string): + def test_fillna_different_dtype(self): # with different dtype (GH#3386) df = DataFrame( [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]] ) - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.fillna({2: "foo"}) - else: - result = df.fillna({2: "foo"}) + result = df.fillna({2: "foo"}) expected = DataFrame( [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]] ) + # column is originally float (all-NaN) -> filling with string gives object dtype + expected[2] = expected[2].astype("object") tm.assert_frame_equal(result, expected) - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - return_value = df.fillna({2: "foo"}, inplace=True) - else: - return_value = df.fillna({2: "foo"}, inplace=True) + return_value = df.fillna({2: "foo"}, inplace=True) tm.assert_frame_equal(df, expected) assert return_value is None @@ -384,12 +375,8 @@ def test_fillna_dtype_conversion(self, using_infer_string): # empty block df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64") - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.fillna("nan") - else: - result = df.fillna("nan") - expected = DataFrame("nan", index=range(3), columns=["A", "B"]) + result = df.fillna("nan") + expected = DataFrame("nan", index=range(3), columns=["A", "B"], dtype=object) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("val", ["", 1, np.nan, 1.0]) @@ -664,17 +651,10 @@ def test_fillna_col_reordering(self): filled = df.fillna(method="ffill") assert df.columns.tolist() == filled.columns.tolist() - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") - def test_fill_corner(self, float_frame, float_string_frame): - mf = float_string_frame - mf.loc[mf.index[5:20], "foo"] = np.nan - mf.loc[mf.index[-10:], "A"] = np.nan - - filled = float_string_frame.fillna(value=0) - assert (filled.loc[filled.index[5:20], "foo"] == 0).all() - del float_string_frame["foo"] - - float_frame.reindex(columns=[]).fillna(value=0) + def test_fill_empty(self, float_frame): + df = float_frame.reindex(columns=[]) + result = df.fillna(value=0) + tm.assert_frame_equal(result, df) def test_fillna_downcast_dict(self): # GH#40809 @@ -862,41 +842,29 @@ def test_pad_backfill_deprecated(func): @pytest.mark.parametrize( "data, expected_data, method, kwargs", ( - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, 3.0, 3.0, 3.0, 7.0, np.nan, np.nan], "ffill", {"limit_area": "inside"}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, 3.0, np.nan, np.nan, 7.0, np.nan, np.nan], "ffill", {"limit_area": "inside", "limit": 1}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), 
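
# A compact reading of the ffill cases above (the bfill cases follow):
# limit_area="inside" fills only gaps bounded by valid values on both sides,
# while "outside" only extends the leading/trailing edges. Sketch:
import numpy as np
import pandas as pd

s = pd.Series([np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan])
s.ffill(limit_area="inside")   # -> [nan, nan, 3, 3, 3, 3, 7, nan, nan]
s.ffill(limit_area="outside")  # -> [nan, nan, 3, nan, nan, nan, 7, 7, 7]
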
- pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, 7.0], "ffill", {"limit_area": "outside"}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan], "ffill", {"limit_area": "outside", "limit": 1}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), ( [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], @@ -910,41 +878,29 @@ def test_pad_backfill_deprecated(func): "ffill", {"limit_area": "outside", "limit": 1}, ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, 7.0, 7.0, 7.0, 7.0, np.nan, np.nan], "bfill", {"limit_area": "inside"}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, np.nan, np.nan, 7.0, 7.0, np.nan, np.nan], "bfill", {"limit_area": "inside", "limit": 1}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [3.0, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan], "bfill", {"limit_area": "outside"}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan], "bfill", {"limit_area": "outside", "limit": 1}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), ), ) diff --git a/pandas/tests/frame/methods/test_get_numeric_data.py b/pandas/tests/frame/methods/test_get_numeric_data.py index c5d32d56d03c1..6d097e75f6703 100644 --- a/pandas/tests/frame/methods/test_get_numeric_data.py +++ b/pandas/tests/frame/methods/test_get_numeric_data.py @@ -33,7 +33,9 @@ def test_get_numeric_data(self, using_infer_string): [ np.dtype("float64"), np.dtype("int64"), - np.dtype(objectname) if not using_infer_string else "string", + np.dtype(objectname) + if not using_infer_string + else pd.StringDtype(na_value=np.nan), np.dtype(datetime64name), ], index=["a", "b", "c", "f"], diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index fcb7677f03f27..c2d15e5ae88e8 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -7,20 +7,26 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import ( + HAS_PYARROW, IS64, PYPY, + is_platform_arm, ) from pandas import ( CategoricalIndex, DataFrame, + Index, MultiIndex, Series, date_range, option_context, ) import pandas._testing as tm +from pandas.util.version import Version @pytest.fixture @@ -360,7 +366,7 @@ def test_info_memory_usage(): df = DataFrame(data) df.columns = dtypes - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object)) df_with_object_index.info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() assert re.match(r"memory usage: [^+]+\+", res[-1]) @@ -398,25 +404,25 @@ def test_info_memory_usage(): @pytest.mark.skipif(PYPY, reason="on PyPy deep=True 
diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py
index fcb7677f03f27..c2d15e5ae88e8 100644
--- a/pandas/tests/frame/methods/test_info.py
+++ b/pandas/tests/frame/methods/test_info.py
@@ -7,20 +7,26 @@
 import numpy as np
 import pytest
+from pandas._config import using_string_dtype
+
 from pandas.compat import (
+    HAS_PYARROW,
     IS64,
     PYPY,
+    is_platform_arm,
 )
 from pandas import (
     CategoricalIndex,
     DataFrame,
+    Index,
     MultiIndex,
     Series,
     date_range,
     option_context,
 )
 import pandas._testing as tm
+from pandas.util.version import Version
 @pytest.fixture
@@ -360,7 +366,7 @@ def test_info_memory_usage():
     df = DataFrame(data)
     df.columns = dtypes
-    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
+    df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object))
     df_with_object_index.info(buf=buf, memory_usage=True)
     res = buf.getvalue().splitlines()
     assert re.match(r"memory usage: [^+]+\+", res[-1])
@@ -398,25 +404,25 @@ def test_info_memory_usage():
 @pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result")
 def test_info_memory_usage_deep_not_pypy():
-    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
+    df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object))
     assert (
         df_with_object_index.memory_usage(index=True, deep=True).sum()
         > df_with_object_index.memory_usage(index=True).sum()
     )
-    df_object = DataFrame({"a": ["a"]})
+    df_object = DataFrame({"a": Series(["a"], dtype=object)})
     assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum()
 @pytest.mark.xfail(not PYPY, reason="on PyPy deep=True does not change result")
 def test_info_memory_usage_deep_pypy():
-    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
+    df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object))
     assert (
         df_with_object_index.memory_usage(index=True, deep=True).sum()
         == df_with_object_index.memory_usage(index=True).sum()
     )
-    df_object = DataFrame({"a": ["a"]})
+    df_object = DataFrame({"a": Series(["a"], dtype=object)})
     assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum()
@@ -432,17 +438,25 @@ def test_usage_via_getsizeof():
     assert abs(diff) < 100
-def test_info_memory_usage_qualified():
+def test_info_memory_usage_qualified(using_infer_string):
     buf = StringIO()
     df = DataFrame(1, columns=list("ab"), index=[1, 2, 3])
     df.info(buf=buf)
     assert "+" not in buf.getvalue()
     buf = StringIO()
-    df = DataFrame(1, columns=list("ab"), index=list("ABC"))
+    df = DataFrame(1, columns=list("ab"), index=Index(list("ABC"), dtype=object))
     df.info(buf=buf)
     assert "+" in buf.getvalue()
+    buf = StringIO()
+    df = DataFrame(1, columns=list("ab"), index=Index(list("ABC"), dtype="str"))
+    df.info(buf=buf)
+    if using_infer_string and HAS_PYARROW:
+        assert "+" not in buf.getvalue()
+    else:
+        assert "+" in buf.getvalue()
+
     buf = StringIO()
     df = DataFrame(
         1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)])
     )
@@ -455,7 +469,10 @@ def test_info_memory_usage_qualified():
         1, columns=list("ab"), index=MultiIndex.from_product([range(3), ["foo", "bar"]])
     )
     df.info(buf=buf)
-    assert "+" in buf.getvalue()
+    if using_infer_string and HAS_PYARROW:
+        assert "+" not in buf.getvalue()
+    else:
+        assert "+" in buf.getvalue()
 def test_info_memory_usage_bug_on_multiindex():
@@ -493,14 +510,14 @@ def test_info_categorical():
 @pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system")
-def test_info_int_columns():
+def test_info_int_columns(using_infer_string):
     # GH#37245
     df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"])
     buf = StringIO()
     df.info(show_counts=True, buf=buf)
     result = buf.getvalue()
     expected = textwrap.dedent(
-        """\
+        f"""\
     Index: 2 entries, A to B
     Data columns (total 2 columns):
     0   1       2 non-null      int64
     1   2       2 non-null      int64
    dtypes: int64(2)
-    memory usage: 48.0+ bytes
+    memory usage: {'50.0' if using_infer_string and HAS_PYARROW else '48.0+'} bytes
     """
     )
     assert result == expected
-def test_memory_usage_empty_no_warning():
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+def test_memory_usage_empty_no_warning(using_infer_string):
     # GH#50066
     df = DataFrame(index=["a", "b"])
     with tm.assert_produces_warning(None):
         result = df.memory_usage()
-    expected = Series(16 if IS64 else 8, index=["Index"])
+    if using_infer_string and HAS_PYARROW:
+        value = 18
+    else:
+        value = 16 if IS64 else 8
+    expected = Series(value, index=["Index"])
     tm.assert_series_equal(result, expected)
 @pytest.mark.single_cpu
 def test_info_compute_numba():
     # GH#51922
-    pytest.importorskip("numba")
+    numba = pytest.importorskip("numba")
+    if Version(numba.__version__) == Version("0.61") and is_platform_arm():
+        pytest.skip(f"Segfaults on ARM platforms with numba {numba.__version__}")
     df = DataFrame([[1, 2], [3, 4]])
     with option_context("compute.use_numba", True):
diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py
index e0641fcb65bd3..ebee19e3de20a 100644
--- a/pandas/tests/frame/methods/test_interpolate.py
+++ b/pandas/tests/frame/methods/test_interpolate.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest
-from pandas._config import using_pyarrow_string_dtype
+from pandas._config import using_string_dtype
 from pandas.errors import ChainedAssignmentError
 import pandas.util._test_decorators as td
@@ -69,10 +69,7 @@ def test_interpolate_inplace(self, frame_or_series, using_array_manager, request
         assert np.shares_memory(orig, obj.values)
         assert orig.squeeze()[1] == 1.5
-    @pytest.mark.xfail(
-        using_pyarrow_string_dtype(), reason="interpolate doesn't work for string"
-    )
-    def test_interp_basic(self, using_copy_on_write):
+    def test_interp_basic(self, using_copy_on_write, using_infer_string):
         df = DataFrame(
             {
                 "A": [1, 2, np.nan, 4],
@@ -89,6 +86,13 @@ def test_interp_basic(self, using_copy_on_write):
                 "D": list("abcd"),
             }
         )
+        if using_infer_string:
+            dtype = "str" if using_infer_string else "object"
+            msg = f"[Cc]annot interpolate with {dtype} dtype"
+            with pytest.raises(TypeError, match=msg):
+                df.interpolate()
+            return
+
         msg = "DataFrame.interpolate with object dtype"
         with tm.assert_produces_warning(FutureWarning, match=msg):
             result = df.interpolate()
@@ -110,11 +114,11 @@ def test_interp_basic(self, using_copy_on_write):
         tm.assert_frame_equal(df, expected)
         # check we DID operate inplace
-        assert np.shares_memory(df["C"]._values, cvalues)
-        assert np.shares_memory(df["D"]._values, dvalues)
+        assert tm.shares_memory(df["C"]._values, cvalues)
+        assert tm.shares_memory(df["D"]._values, dvalues)
     @pytest.mark.xfail(
-        using_pyarrow_string_dtype(), reason="interpolate doesn't work for string"
+        using_string_dtype(), reason="interpolate doesn't work for string"
     )
     def test_interp_basic_with_non_range_index(self, using_infer_string):
         df = DataFrame(
@@ -508,8 +512,41 @@ def test_interpolate_empty_df(self):
         assert result is None
         tm.assert_frame_equal(df, expected)
-    def test_interpolate_ea_raise(self):
+    def test_interpolate_ea(self, any_int_ea_dtype):
         # GH#55347
-        df = DataFrame({"a": [1, None, 2]}, dtype="Int64")
-        with pytest.raises(NotImplementedError, match="does not implement"):
-            df.interpolate()
+        df = DataFrame({"a": [1, None, None, None, 3]}, dtype=any_int_ea_dtype)
+        orig = df.copy()
+        result = df.interpolate(limit=2)
+        expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype="Float64")
+        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(df, orig)
+
+    @pytest.mark.parametrize(
+        "dtype",
+        [
+            "Float64",
+            "Float32",
+            pytest.param("float32[pyarrow]", marks=td.skip_if_no("pyarrow")),
+            pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")),
+        ],
+    )
+    def test_interpolate_ea_float(self, dtype):
+        # GH#55347
+        df = DataFrame({"a": [1, None, None, None, 3]}, dtype=dtype)
+        orig = df.copy()
+        result = df.interpolate(limit=2)
+        expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(df, orig)
+
+    @pytest.mark.parametrize(
+        "dtype",
+        ["int64", "uint64", "int32", "int16", "int8", "uint32", "uint16", "uint8"],
+    )
+    def test_interpolate_arrow(self, dtype):
+        # GH#55347
+        pytest.importorskip("pyarrow")
+        df = DataFrame({"a": [1, None, None, None, 3]}, dtype=dtype + "[pyarrow]")
+        result = df.interpolate(limit=2)
+        expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype="float64[pyarrow]")
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py
index 3ba893501914a..54f2e45488b78 100644
--- a/pandas/tests/frame/methods/test_nlargest.py
+++ b/pandas/tests/frame/methods/test_nlargest.py
@@ -86,7 +86,7 @@ def test_nlargest_n(self, df_strings, nselect_method, n, order):
         df = df_strings
         if "b" in order:
             error_msg = (
-                f"Column 'b' has dtype (object|string), "
+                f"Column 'b' has dtype (object|str), "
                 f"cannot use method '{nselect_method}' with this dtype"
             )
             with pytest.raises(TypeError, match=error_msg):
diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py
index 0f27eae1a3bfc..15af2a14a042e 100644
--- a/pandas/tests/frame/methods/test_quantile.py
+++ b/pandas/tests/frame/methods/test_quantile.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
+from pandas._config import using_string_dtype
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -913,6 +915,7 @@ def test_quantile_ea_scalar(self, request, obj, index):
         else:
             tm.assert_series_equal(result, expected)
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize(
         "dtype, expected_data, expected_index, axis",
         [
@@ -931,6 +934,7 @@ def test_empty_numeric(self, dtype, expected_data, expected_index, axis):
         )
         tm.assert_series_equal(result, expected)
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize(
         "dtype, expected_data, expected_index, axis, expected_dtype",
         [
@@ -949,6 +953,7 @@ def test_empty_datelike(
         )
         tm.assert_series_equal(result, expected)
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize(
         "expected_data, expected_index, axis",
         [
diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py
index 8d7a0b373f5f8..37bed2da05743 100644
--- a/pandas/tests/frame/methods/test_rank.py
+++ b/pandas/tests/frame/methods/test_rank.py
@@ -470,14 +470,10 @@ def test_rank_inf_nans_na_option(
             ("top", False, [2.0, 3.0, 1.0, 4.0]),
         ],
     )
-    def test_rank_object_first(
-        self, frame_or_series, na_option, ascending, expected, using_infer_string
-    ):
+    def test_rank_object_first(self, frame_or_series, na_option, ascending, expected):
         obj = frame_or_series(["foo", "foo", None, "foo"])
         result = obj.rank(method="first", na_option=na_option, ascending=ascending)
         expected = frame_or_series(expected)
-        if using_infer_string and isinstance(obj, Series):
-            expected = expected.astype("uint64")
         tm.assert_equal(result, expected)
     @pytest.mark.parametrize(
@@ -497,14 +493,15 @@ def test_rank_mixed_axis_zero(self, data, expected):
             result = df.rank(numeric_only=True)
         tm.assert_frame_equal(result, expected)
-    @pytest.mark.parametrize(
-        "dtype, exp_dtype",
-        [("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")],
-    )
-    def test_rank_string_dtype(self, dtype, exp_dtype):
+    def test_rank_string_dtype(self, string_dtype_no_object):
         # GH#55362
-        pytest.importorskip("pyarrow")
-        obj = Series(["foo", "foo", None, "foo"], dtype=dtype)
+        obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object)
         result = obj.rank(method="first")
+        exp_dtype = (
+            "Float64" if string_dtype_no_object == "string[pyarrow]" else "float64"
+        )
+        if string_dtype_no_object.storage == "python":
+            # TODO nullable string[python] should also return nullable Int64
+            exp_dtype = "float64"
         expected = Series([1, 2, None, 3], dtype=exp_dtype)
         tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py
index 8bfa98042eb07..0971fb7e604c0 100644
--- a/pandas/tests/frame/methods/test_replace.py
+++ b/pandas/tests/frame/methods/test_replace.py
@@ -6,8 +6,6 @@
 import numpy as np
 import pytest
-from pandas._config import using_pyarrow_string_dtype
-
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -30,9 +28,6 @@ def mix_abc() -> dict[str, list[float | str]]:
 class TestDataFrameReplace:
-    @pytest.mark.xfail(
-        using_pyarrow_string_dtype(), reason="can't set float into string"
-    )
     def test_replace_inplace(self, datetime_frame, float_string_frame):
         datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan
         datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan
@@ -48,7 +43,9 @@ def test_replace_inplace(self, datetime_frame, float_string_frame):
         mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan
         result = float_string_frame.replace(np.nan, 0)
-        expected = float_string_frame.fillna(value=0)
+        expected = float_string_frame.copy()
+        expected["foo"] = expected["foo"].astype(object)
+        expected = expected.fillna(value=0)
         tm.assert_frame_equal(result, expected)
         tsframe = datetime_frame.copy()
@@ -283,56 +280,48 @@ def test_regex_replace_dict_nested(self, mix_abc):
         tm.assert_frame_equal(res3, expec)
         tm.assert_frame_equal(res4, expec)
-    def test_regex_replace_dict_nested_non_first_character(
-        self, any_string_dtype, using_infer_string
-    ):
+    def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype):
         # GH 25259
         dtype = any_string_dtype
         df = DataFrame({"first": ["abc", "bca", "cab"]}, dtype=dtype)
-        if using_infer_string and any_string_dtype == "object":
-            with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
-                result = df.replace({"a": "."}, regex=True)
-            expected = DataFrame({"first": [".bc", "bc.", "c.b"]})
-
-        else:
-            result = df.replace({"a": "."}, regex=True)
-            expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype)
+        result = df.replace({"a": "."}, regex=True)
+        expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype)
         tm.assert_frame_equal(result, expected)
-    @pytest.mark.xfail(
-        using_pyarrow_string_dtype(), reason="can't set float into string"
-    )
     def test_regex_replace_dict_nested_gh4115(self):
-        df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2})
+        df = DataFrame(
+            {"Type": Series(["Q", "T", "Q", "Q", "T"], dtype=object), "tmp": 2}
+        )
         expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2})
         msg = "Downcasting behavior in `replace`"
         with tm.assert_produces_warning(FutureWarning, match=msg):
             result = df.replace({"Type": {"Q": 0, "T": 1}})
+
         tm.assert_frame_equal(result, expected)
-    @pytest.mark.xfail(
-        using_pyarrow_string_dtype(), reason="can't set float into string"
-    )
-    def test_regex_replace_list_to_scalar(self, mix_abc):
+    def test_regex_replace_list_to_scalar(self, mix_abc, using_infer_string):
         df = DataFrame(mix_abc)
         expec = DataFrame(
             {
                 "a": mix_abc["a"],
-                "b": np.array([np.nan] * 4),
+                "b": [np.nan] * 4,
                 "c": [np.nan, np.nan, np.nan, "d"],
             }
         )
+        if using_infer_string:
+            expec["b"] = expec["b"].astype("str")
         msg = "Downcasting behavior in `replace`"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        warn = None if using_infer_string else FutureWarning
+        with tm.assert_produces_warning(warn, match=msg):
             res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True)
         res2 = df.copy()
         res3 = df.copy()
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(warn, match=msg):
             return_value = res2.replace(
                 [r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True
             )
         assert return_value is None
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(warn, match=msg):
             return_value = res3.replace(
                 regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True
             )
@@ -341,9 +330,6 @@ def test_regex_replace_list_to_scalar(self, mix_abc):
         tm.assert_frame_equal(res2, expec)
         tm.assert_frame_equal(res3, expec)
-    @pytest.mark.xfail(
-        using_pyarrow_string_dtype(), reason="can't set float into string"
-    )
     def test_regex_replace_str_to_numeric(self, mix_abc):
         # what happens when you try to replace a numeric value with a regex?
         df = DataFrame(mix_abc)
@@ -359,9 +345,6 @@ def test_regex_replace_str_to_numeric(self, mix_abc):
         tm.assert_frame_equal(res2, expec)
         tm.assert_frame_equal(res3, expec)
-    @pytest.mark.xfail(
-        using_pyarrow_string_dtype(), reason="can't set float into string"
-    )
     def test_regex_replace_regex_list_to_numeric(self, mix_abc):
         df = DataFrame(mix_abc)
         res = df.replace([r"\s*\.\s*", "b"], 0, regex=True)
@@ -440,31 +423,12 @@ def test_replace_regex_metachar(self, metachar):
         ],
     )
     def test_regex_replace_string_types(
-        self,
-        data,
-        to_replace,
-        expected,
-        frame_or_series,
-        any_string_dtype,
-        using_infer_string,
-        request,
+        self, data, to_replace, expected, frame_or_series, any_string_dtype
     ):
         # GH-41333, GH-35977
         dtype = any_string_dtype
         obj = frame_or_series(data, dtype=dtype)
-        if using_infer_string and any_string_dtype == "object":
-            if len(to_replace) > 1 and isinstance(obj, DataFrame):
-                request.node.add_marker(
-                    pytest.mark.xfail(
-                        reason="object input array that gets downcasted raises on "
-                        "second pass"
-                    )
-                )
-            with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
-                result = obj.replace(to_replace, regex=True)
-            dtype = "string[pyarrow_numpy]"
-        else:
-            result = obj.replace(to_replace, regex=True)
+        result = obj.replace(to_replace, regex=True)
         expected = frame_or_series(expected, dtype=dtype)
         tm.assert_equal(result, expected)
@@ -566,9 +530,6 @@ def test_replace_series_dict(self):
         result = df.replace(s, df.mean())
         tm.assert_frame_equal(result, expected)
-    @pytest.mark.xfail(
-        using_pyarrow_string_dtype(), reason="can't set float into string"
-    )
     def test_replace_convert(self):
         # gh 3907
         df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]])
@@ -580,23 +541,28 @@ def test_replace_convert(self):
         res = rep.dtypes
         tm.assert_series_equal(expec, res)
-    @pytest.mark.xfail(
-        using_pyarrow_string_dtype(), reason="can't set float into string"
-    )
     def test_replace_mixed(self, float_string_frame):
         mf = float_string_frame
         mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan
         mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan
         result = float_string_frame.replace(np.nan, -18)
-        expected = float_string_frame.fillna(value=-18)
+        expected = float_string_frame.copy()
+        expected["foo"] = expected["foo"].astype(object)
+        expected = expected.fillna(value=-18)
         tm.assert_frame_equal(result, expected)
-        tm.assert_frame_equal(result.replace(-18, np.nan), float_string_frame)
+        expected2 = float_string_frame.copy()
+        expected2["foo"] = expected2["foo"].astype(object)
+        tm.assert_frame_equal(result.replace(-18, np.nan), expected2)
         result = float_string_frame.replace(np.nan, -1e8)
-        expected = float_string_frame.fillna(value=-1e8)
+        expected = float_string_frame.copy()
+        expected["foo"] = expected["foo"].astype(object)
+        expected = expected.fillna(value=-1e8)
         tm.assert_frame_equal(result, expected)
-        tm.assert_frame_equal(result.replace(-1e8, np.nan), float_string_frame)
+        expected2 = float_string_frame.copy()
+        expected2["foo"] = expected2["foo"].astype(object)
+        tm.assert_frame_equal(result.replace(-1e8, np.nan), expected2)
     def test_replace_mixed_int_block_upcasting(self):
         # int block upcasting
@@ -657,15 +623,11 @@ def test_replace_mixed2(self, using_infer_string):
         expected = DataFrame(
             {
-                "A": Series(["foo", "bar"]),
+                "A": Series(["foo", "bar"], dtype="object"),
                 "B": Series([0, "foo"], dtype="object"),
             }
         )
-        if using_infer_string:
-            with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
-                result = df.replace([1, 2], ["foo", "bar"])
-        else:
-            result = df.replace([1, 2], ["foo", "bar"])
+        result = df.replace([1, 2], ["foo", "bar"])
         tm.assert_frame_equal(result, expected)
     def test_replace_mixed3(self):
@@ -946,9 +908,6 @@ def test_replace_input_formats_listlike(self):
         with pytest.raises(ValueError, match=msg):
             df.replace(to_rep, values[1:])
-    @pytest.mark.xfail(
-        using_pyarrow_string_dtype(), reason="can't set float into string"
-    )
     def test_replace_input_formats_scalar(self):
         df = DataFrame(
             {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]}
         )
@@ -977,10 +936,7 @@ def test_replace_limit(self):
         # TODO
         pass
-    @pytest.mark.xfail(
-        using_pyarrow_string_dtype(), reason="can't set float into string"
-    )
-    def test_replace_dict_no_regex(self):
+    def test_replace_dict_no_regex(self, any_string_dtype):
         answer = Series(
             {
                 0: "Strongly Agree",
@@ -988,7 +944,8 @@
                 2: "Neutral",
                 3: "Disagree",
                 4: "Strongly Disagree",
-            }
+            },
+            dtype=any_string_dtype,
         )
         weights = {
             "Agree": 4,
@@ -1003,10 +960,7 @@
         result = answer.replace(weights)
         tm.assert_series_equal(result, expected)
-    @pytest.mark.xfail(
-        using_pyarrow_string_dtype(), reason="can't set float into string"
-    )
-    def test_replace_series_no_regex(self):
+    def test_replace_series_no_regex(self, any_string_dtype):
         answer = Series(
             {
                 0: "Strongly Agree",
@@ -1014,7 +968,8 @@
                 2: "Neutral",
                 3: "Disagree",
                 4: "Strongly Disagree",
-            }
+            },
+            dtype=any_string_dtype,
         )
         weights = Series(
             {
@@ -1112,23 +1067,17 @@ def test_nested_dict_overlapping_keys_replace_str(self):
         expected = df.replace({"a": dict(zip(astr, bstr))})
         tm.assert_frame_equal(result, expected)
-    @pytest.mark.xfail(
-        using_pyarrow_string_dtype(), reason="can't set float into string"
-    )
-    def test_replace_swapping_bug(self, using_infer_string):
+    def test_replace_swapping_bug(self):
         df = DataFrame({"a": [True, False, True]})
         res = df.replace({"a": {True: "Y", False: "N"}})
-        expect = DataFrame({"a": ["Y", "N", "Y"]})
+        expect = DataFrame({"a": ["Y", "N", "Y"]}, dtype=object)
         tm.assert_frame_equal(res, expect)
         df = DataFrame({"a": [0, 1, 0]})
         res = df.replace({"a": {0: "Y", 1: "N"}})
-        expect = DataFrame({"a": ["Y", "N", "Y"]})
+        expect = DataFrame({"a": ["Y", "N", "Y"]}, dtype=object)
         tm.assert_frame_equal(res, expect)
-    @pytest.mark.xfail(
-        using_pyarrow_string_dtype(), reason="can't set float into string"
-    )
     def test_replace_period(self):
         d = {
             "fname": {
@@ -1165,9 +1114,6 @@ def test_replace_period(self):
         result = df.replace(d)
         tm.assert_frame_equal(result, expected)
-    @pytest.mark.xfail(
-        using_pyarrow_string_dtype(), reason="can't set float into string"
-    )
     def test_replace_datetime(self):
         d = {
             "fname": {
@@ -1393,9 +1339,6 @@ def test_replace_commutative(self, df, to_replace, exp):
         result = df.replace(to_replace)
         tm.assert_frame_equal(result, expected)
-    @pytest.mark.xfail(
-        using_pyarrow_string_dtype(), reason="can't set float into string"
-    )
     @pytest.mark.parametrize(
         "replacer",
         [
@@ -1408,7 +1351,7 @@ def test_replace_commutative(self, df, to_replace, exp):
     )
     def test_replace_replacer_dtype(self, replacer):
         # GH26632
-        df = DataFrame(["a"])
+        df = DataFrame(["a"], dtype=object)
         msg = "Downcasting behavior in `replace` "
         with tm.assert_produces_warning(FutureWarning, match=msg):
             result = df.replace({"a": replacer, "b": replacer})
@@ -1525,6 +1468,7 @@ def test_replace_value_category_type(self):
         input_df = input_df.replace("obj1", "obj9")
         result = input_df.replace("cat2", "catX")
+        result = result.astype({"col1": "int64", "col3": "float64", "col5": "str"})
         tm.assert_frame_equal(result, expected)
     def test_replace_dict_category_type(self):
@@ -1566,13 +1510,11 @@ def test_replace_with_compiled_regex(self):
         expected = DataFrame(["z", "b", "c"])
         tm.assert_frame_equal(result, expected)
-    def test_replace_intervals(self, using_infer_string):
+    def test_replace_intervals(self):
         # https://github.com/pandas-dev/pandas/issues/35931
         df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]})
-        warning = FutureWarning if using_infer_string else None
-        with tm.assert_produces_warning(warning, match="Downcasting"):
-            result = df.replace({"a": {pd.Interval(0, 1): "x"}})
-        expected = DataFrame({"a": ["x", "x"]})
+        result = df.replace({"a": {pd.Interval(0, 1): "x"}})
+        expected = DataFrame({"a": ["x", "x"]}, dtype=object)
         tm.assert_frame_equal(result, expected)
     def test_replace_unicode(self):
@@ -1672,9 +1614,6 @@ def test_regex_replace_scalar(
         expected.loc[expected["a"] == ".", "a"] = expected_replace_val
         tm.assert_frame_equal(result, expected)
-    @pytest.mark.xfail(
-        using_pyarrow_string_dtype(), reason="can't set float into string"
-    )
     @pytest.mark.parametrize("regex", [False, True])
     def test_replace_regex_dtype_frame(self, regex):
         # GH-48644
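A minimal sketch (not part of the patch) of the replace() behavior these tests converge on: results that introduce strings stay object dtype, and the silent downcast is deprecated. Assumes pandas 2.x; the FutureWarning depends on the version and options.

import pandas as pd

df = pd.DataFrame({"a": [True, False, True]})
res = df.replace({"a": {True: "Y", False: "N"}})
print(res.dtypes)  # object - the boolean column now holds strings, no downcast
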
expected["c3"].astype("datetime64[ns]") expected["c1"] = expected["c1"].astype("float64") if using_infer_string: - expected["c2"] = expected["c2"].astype("string[pyarrow_numpy]") + expected["c2"] = expected["c2"].astype("str") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 47c479faed1ef..0354e9df3d168 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -32,7 +32,7 @@ def __init__(self, data, dtype) -> None: self.data = data self._dtype = dtype - def __array__(self, dtype): + def __array__(self, dtype=None, copy=None): return self.data @property @@ -50,7 +50,7 @@ def copy(self): class TestSelectDtypes: - def test_select_dtypes_include_using_list_like(self): + def test_select_dtypes_include_using_list_like(self, using_infer_string): df = DataFrame( { "a": list("abc"), @@ -94,6 +94,14 @@ def test_select_dtypes_include_using_list_like(self): with pytest.raises(NotImplementedError, match=r"^$"): df.select_dtypes(include=["period"]) + if using_infer_string: + ri = df.select_dtypes(include=["str"]) + ei = df[["a"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include=[str]) + tm.assert_frame_equal(ri, ei) + def test_select_dtypes_exclude_using_list_like(self): df = DataFrame( { @@ -151,7 +159,7 @@ def test_select_dtypes_exclude_include_int(self, include): expected = df[["b", "c", "e"]] tm.assert_frame_equal(result, expected) - def test_select_dtypes_include_using_scalars(self): + def test_select_dtypes_include_using_scalars(self, using_infer_string): df = DataFrame( { "a": list("abc"), @@ -187,6 +195,11 @@ def test_select_dtypes_include_using_scalars(self): with pytest.raises(NotImplementedError, match=r"^$"): df.select_dtypes(include="period") + if using_infer_string: + ri = df.select_dtypes(include="str") + ei = df[["a"]] + tm.assert_frame_equal(ri, ei) + def test_select_dtypes_exclude_using_scalars(self): df = DataFrame( { @@ -347,7 +360,10 @@ def test_select_dtypes_datetime_with_tz(self): @pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"]) @pytest.mark.parametrize("arg", ["include", "exclude"]) - def test_select_dtypes_str_raises(self, dtype, arg): + def test_select_dtypes_str_raises(self, dtype, arg, using_infer_string): + if using_infer_string and (dtype == "str" or dtype is str): + # this is tested below + pytest.skip("Selecting string columns works with future strings") df = DataFrame( { "a": list("abc"), diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index 5724f79b82578..1c8d365f0d6c0 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py @@ -158,8 +158,8 @@ def test_set_index(self, float_string_frame): def test_set_index_names(self): df = DataFrame( np.ones((10, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(10)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(10)]), ) df.index.name = "name" diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index b21aa2d687682..abb30595fdcb8 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -756,3 +756,9 @@ def test_shift_with_iterable_check_other_arguments(self): msg = "Cannot specify `suffix` if `periods` is an int." 
with pytest.raises(ValueError, match=msg): df.shift(1, suffix="fails") + + def test_shift_axis_one_empty(self): + # GH#57301 + df = DataFrame() + result = df.shift(1, axis=1) + tm.assert_frame_equal(result, df) diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 49e292057e4dc..830561a1349ee 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -1002,3 +1002,27 @@ def test_axis_columns_ignore_index(): result = df.sort_index(axis="columns", ignore_index=True) expected = DataFrame([[2, 1]]) tm.assert_frame_equal(result, expected) + + +def test_sort_index_stable_sort(): + # GH 57151 + df = DataFrame( + data=[ + (Timestamp("2024-01-30 13:00:00"), 13.0), + (Timestamp("2024-01-30 13:00:00"), 13.1), + (Timestamp("2024-01-30 12:00:00"), 12.0), + (Timestamp("2024-01-30 12:00:00"), 12.1), + ], + columns=["dt", "value"], + ).set_index(["dt"]) + result = df.sort_index(level="dt", kind="stable") + expected = DataFrame( + data=[ + (Timestamp("2024-01-30 12:00:00"), 12.0), + (Timestamp("2024-01-30 12:00:00"), 12.1), + (Timestamp("2024-01-30 13:00:00"), 13.0), + (Timestamp("2024-01-30 13:00:00"), 13.1), + ], + columns=["dt", "value"], + ).set_index(["dt"]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 250567eafc670..3b6a54698b5b6 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -426,7 +426,7 @@ def test_to_csv_chunksize(self): rows = chunksize // 2 + 1 df = DataFrame( np.ones((rows, 2)), - columns=Index(list("ab"), dtype=object), + columns=Index(list("ab")), index=MultiIndex.from_arrays([range(rows) for _ in range(2)]), ) result, expected = self._return_result_expected(df, chunksize, rnlvl=2) @@ -460,7 +460,7 @@ def test_to_csv_params(self, nrows, df_params, func_params, ncols): for _ in range(df_params["c_idx_nlevels"]) ) else: - columns = Index([f"i-{i}" for i in range(ncols)], dtype=object) + columns = Index([f"i-{i}" for i in range(ncols)]) df = DataFrame(np.ones((nrows, ncols)), index=index, columns=columns) result, expected = self._return_result_expected(df, 1000, **func_params) tm.assert_frame_equal(result, expected, check_names=False) @@ -692,10 +692,7 @@ def test_to_csv_interval_index(self, using_infer_string): # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) expected = df.copy() - if using_infer_string: - expected.index = expected.index.astype("string[pyarrow_numpy]") - else: - expected.index = expected.index.astype(str) + expected.index = expected.index.astype("str") tm.assert_frame_equal(result, expected) @@ -737,7 +734,7 @@ def create_cols(name): ) df_bool = DataFrame(True, index=df_float.index, columns=create_cols("bool")) df_object = DataFrame( - "foo", index=df_float.index, columns=create_cols("object") + "foo", index=df_float.index, columns=create_cols("object"), dtype="object" ) df_dt = DataFrame( Timestamp("20010101").as_unit("ns"), @@ -815,7 +812,7 @@ def test_to_csv_dups_cols2(self): df = DataFrame( np.ones((5, 3)), index=Index([f"i-{i}" for i in range(5)], name="foo"), - columns=Index(["a", "a", "b"], dtype=object), + columns=Index(["a", "a", "b"]), ) with tm.ensure_clean() as filename: diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index 61f0ad30b4519..570f85a4a31ee 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ 
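A minimal sketch (not part of the patch) of the stable sort covered by test_sort_index_stable_sort (GH 57151): with kind="stable", rows that tie on the index keep their original relative order. Assumes pandas >= 2.2.

import pandas as pd

df = pd.DataFrame(
    {"value": [13.0, 13.1, 12.0, 12.1]},
    index=pd.to_datetime(
        ["2024-01-30 13:00"] * 2 + ["2024-01-30 12:00"] * 2
    ).rename("dt"),
)
print(df.sort_index(kind="stable"))  # 12.0, 12.1, 13.0, 13.1 - ties preserved
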
diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py
index 61f0ad30b4519..570f85a4a31ee 100644
--- a/pandas/tests/frame/methods/test_to_dict.py
+++ b/pandas/tests/frame/methods/test_to_dict.py
@@ -12,8 +12,11 @@
     NA,
     DataFrame,
     Index,
+    Interval,
     MultiIndex,
+    Period,
     Series,
+    Timedelta,
     Timestamp,
 )
 import pandas._testing as tm
@@ -519,3 +522,14 @@ def test_to_dict_pos_args_deprecation(self):
     )
     with tm.assert_produces_warning(FutureWarning, match=msg):
         df.to_dict("records", {})
+
+
+@pytest.mark.parametrize(
+    "val", [Timestamp(2020, 1, 1), Timedelta(1), Period("2020"), Interval(1, 2)]
+)
+def test_to_dict_list_pd_scalars(val):
+    # GH 54824
+    df = DataFrame({"a": [val]})
+    result = df.to_dict(orient="list")
+    expected = {"a": [val]}
+    assert result == expected
diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/pandas/tests/frame/methods/test_to_dict_of_blocks.py
index f64cfd5fe6a2d..42858aa412810 100644
--- a/pandas/tests/frame/methods/test_to_dict_of_blocks.py
+++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
+from pandas._config import using_string_dtype
+
 import pandas.util._test_decorators as td
 from pandas import (
@@ -35,6 +37,7 @@ def test_no_copy_blocks(self, float_frame, using_copy_on_write):
         assert _last_df is not None and not _last_df[column].equals(df[column])
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_to_dict_of_blocks_item_cache(using_copy_on_write, warn_copy_on_write):
     # Calling to_dict_of_blocks should not poison item_cache
     df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]})
diff --git a/pandas/tests/frame/methods/test_to_numpy.py b/pandas/tests/frame/methods/test_to_numpy.py
index bdb9b2c055061..0731750aed0cf 100644
--- a/pandas/tests/frame/methods/test_to_numpy.py
+++ b/pandas/tests/frame/methods/test_to_numpy.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pytest
 import pandas.util._test_decorators as td
@@ -41,6 +42,9 @@ def test_to_numpy_copy(self, using_copy_on_write):
         else:
             assert df.to_numpy(copy=False, na_value=np.nan).base is arr
+    @pytest.mark.filterwarnings(
+        "ignore:invalid value encountered in cast:RuntimeWarning"
+    )
     def test_to_numpy_mixed_dtype_to_str(self):
         # https://github.com/pandas-dev/pandas/issues/35455
         df = DataFrame([[Timestamp("2020-01-01 00:00:00"), 100.0]])
diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py
index d0caa071fae1c..3e74094f266d1 100644
--- a/pandas/tests/frame/methods/test_transpose.py
+++ b/pandas/tests/frame/methods/test_transpose.py
@@ -3,6 +3,7 @@
 import pandas.util._test_decorators as td
+import pandas as pd
 from pandas import (
     DataFrame,
     DatetimeIndex,
@@ -190,3 +191,19 @@ def test_transpose_not_inferring_dt_mixed_blocks(self):
             dtype=object,
         )
         tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("dtype1", ["Int64", "Float64"])
+    @pytest.mark.parametrize("dtype2", ["Int64", "Float64"])
+    def test_transpose(self, dtype1, dtype2):
+        # GH#57315 - transpose should have F contiguous blocks
+        df = DataFrame(
+            {
+                "a": pd.array([1, 1, 2], dtype=dtype1),
+                "b": pd.array([3, 4, 5], dtype=dtype2),
+            }
+        )
+        result = df.T
+        for blk in result._mgr.blocks:
+            # When dtypes are unequal, we get NumPy object array
+            data = blk.values._data if dtype1 == dtype2 else blk.values
+            assert data.flags["F_CONTIGUOUS"]
diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py
index 7c7a0d23ff75f..56700ab6bd1f7 100644
--- a/pandas/tests/frame/methods/test_update.py
+++ b/pandas/tests/frame/methods/test_update.py
@@ -48,16 +48,18 @@ def test_update(self):
     def test_update_dtypes(self):
         # gh 3016
         df = DataFrame(
-            [[1.0, 2.0, False, True], [4.0, 5.0, True, False]],
-            columns=["A", "B", "bool1", "bool2"],
+            [[1.0, 2.0, 1, False, True], [4.0, 5.0, 2, True, False]],
+            columns=["A", "B", "int", "bool1", "bool2"],
         )
-        other = DataFrame([[45, 45]], index=[0], columns=["A", "B"])
+        other = DataFrame(
+            [[45, 45, 3, True]], index=[0], columns=["A", "B", "int", "bool1"]
+        )
         df.update(other)
         expected = DataFrame(
-            [[45.0, 45.0, False, True], [4.0, 5.0, True, False]],
-            columns=["A", "B", "bool1", "bool2"],
+            [[45.0, 45.0, 3, True, True], [4.0, 5.0, 2, True, False]],
+            columns=["A", "B", "int", "bool1", "bool2"],
         )
         tm.assert_frame_equal(df, expected)
@@ -160,17 +162,14 @@ def test_update_with_different_dtype(self, using_copy_on_write):
         # GH#3217
         df = DataFrame({"a": [1, 3], "b": [np.nan, 2]})
         df["c"] = np.nan
-        if using_copy_on_write:
+        with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
             df.update({"c": Series(["foo"], index=[0])})
-        else:
-            with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
-                df["c"].update(Series(["foo"], index=[0]))
         expected = DataFrame(
             {
                 "a": [1, 3],
                 "b": [np.nan, 2],
-                "c": Series(["foo", np.nan], dtype="object"),
+                "c": Series(["foo", np.nan]),
             }
         )
         tm.assert_frame_equal(df, expected)
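A minimal sketch (not part of the patch) of the to_dict change pinned down by test_to_dict_list_pd_scalars (GH 54824): orient="list" now round-trips pandas scalars instead of coercing them to NumPy types. Assumes pandas >= 2.2.

import pandas as pd

df = pd.DataFrame({"a": [pd.Timestamp(2020, 1, 1)], "b": [pd.Interval(1, 2)]})
result = df.to_dict(orient="list")
print(type(result["a"][0]))  # pandas Timestamp, not a numpy datetime64
print(result["b"])           # the Interval scalar is preserved as-is
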
diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py
index c7b444045a0f2..6c6944f806a2a 100644
--- a/pandas/tests/frame/test_api.py
+++ b/pandas/tests/frame/test_api.py
@@ -5,9 +5,11 @@
 import numpy as np
 import pytest
-from pandas._config import using_pyarrow_string_dtype
+from pandas._config import using_string_dtype
 from pandas._config.config import option_context
+from pandas.compat import HAS_PYARROW
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -113,7 +115,9 @@ def test_not_hashable(self):
         with pytest.raises(TypeError, match=msg):
             hash(empty_frame)
-    @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="surrogates not allowed")
+    @pytest.mark.xfail(
+        using_string_dtype() and HAS_PYARROW, reason="surrogates not allowed"
+    )
     def test_column_name_contains_unicode_surrogate(self):
         # GH 25509
         colname = "\ud83d"
@@ -383,7 +387,6 @@ def test_constructor_expanddim(self):
     def test_inspect_getmembers(self):
         # GH38740
-        pytest.importorskip("jinja2")
         df = DataFrame()
         msg = "DataFrame._data is deprecated"
         with tm.assert_produces_warning(
diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py
index 42ce658701355..195126f1c5382 100644
--- a/pandas/tests/frame/test_arithmetic.py
+++ b/pandas/tests/frame/test_arithmetic.py
@@ -11,8 +11,7 @@
 import numpy as np
 import pytest
-from pandas._config import using_pyarrow_string_dtype
-
+from pandas.compat import HAS_PYARROW
 import pandas.util._test_decorators as td
 import pandas as pd
@@ -59,7 +58,7 @@ def __init__(self, value, dtype) -> None:
         self.value = value
         self.dtype = np.dtype(dtype)
-    def __array__(self):
+    def __array__(self, dtype=None, copy=None):
         return np.array(self.value, dtype=self.dtype)
     def __str__(self) -> str:
@@ -253,9 +252,6 @@ def test_timestamp_compare(self, left, right):
             with pytest.raises(TypeError, match=msg):
                 right_f(pd.Timestamp("nat"), df)
-    @pytest.mark.xfail(
-        using_pyarrow_string_dtype(), reason="can't compare string and int"
-    )
     def test_mixed_comparison(self):
         # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False,
         # not raise TypeError
@@ -1572,7 +1568,12 @@ def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne)
         )
         f = getattr(operator, compare_operators_no_eq_ne)
-        msg = "'[<>]=?' not supported between instances of 'str' and 'int'"
+        msg = "|".join(
+            [
+                "'[<>]=?' not supported between instances of 'str' and 'int'",
+                "Invalid comparison between dtype=str and int",
+            ]
+        )
         with pytest.raises(TypeError, match=msg):
             f(df, 0)
@@ -2126,11 +2127,19 @@ def test_enum_column_equality():
     tm.assert_series_equal(result, expected)
-def test_mixed_col_index_dtype():
+def test_mixed_col_index_dtype(using_infer_string):
     # GH 47382
     df1 = DataFrame(columns=list("abc"), data=1.0, index=[0])
     df2 = DataFrame(columns=list("abc"), data=0.0, index=[0])
     df1.columns = df2.columns.astype("string")
     result = df1 + df2
     expected = DataFrame(columns=list("abc"), data=1.0, index=[0])
+    if using_infer_string:
+        # df2.columns.dtype will be "str" instead of object,
+        # so the aligned result will be "string", not object
+        if HAS_PYARROW:
+            dtype = "string[pyarrow]"
+        else:
+            dtype = "string"
+        expected.columns = expected.columns.astype(dtype)
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/test_arrow_interface.py b/pandas/tests/frame/test_arrow_interface.py
new file mode 100644
index 0000000000000..b36b6b5ffe0cc
--- /dev/null
+++ b/pandas/tests/frame/test_arrow_interface.py
@@ -0,0 +1,47 @@
+import ctypes
+
+import pytest
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+
+pa = pytest.importorskip("pyarrow")
+
+
+@td.skip_if_no("pyarrow", min_version="14.0")
+def test_dataframe_arrow_interface(using_infer_string):
+    df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+
+    capsule = df.__arrow_c_stream__()
+    assert (
+        ctypes.pythonapi.PyCapsule_IsValid(
+            ctypes.py_object(capsule), b"arrow_array_stream"
+        )
+        == 1
+    )
+
+    table = pa.table(df)
+    string_type = pa.large_string() if using_infer_string else pa.string()
+    expected = pa.table({"a": [1, 2, 3], "b": pa.array(["a", "b", "c"], string_type)})
+    assert table.equals(expected)
+
+    schema = pa.schema([("a", pa.int8()), ("b", pa.string())])
+    table = pa.table(df, schema=schema)
+    expected = expected.cast(schema)
+    assert table.equals(expected)
+
+
+@td.skip_if_no("pyarrow", min_version="15.0")
+def test_dataframe_to_arrow(using_infer_string):
+    df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+
+    table = pa.RecordBatchReader.from_stream(df).read_all()
+    string_type = pa.large_string() if using_infer_string else pa.string()
+    expected = pa.table({"a": [1, 2, 3], "b": pa.array(["a", "b", "c"], string_type)})
+    assert table.equals(expected)
+
+    schema = pa.schema([("a", pa.int8()), ("b", pa.string())])
+    table = pa.RecordBatchReader.from_stream(df, schema=schema).read_all()
+    expected = expected.cast(schema)
+    assert table.equals(expected)
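A minimal sketch (not part of the patch) of the Arrow PyCapsule export the new test_arrow_interface.py covers: a DataFrame can be consumed directly by pyarrow through __arrow_c_stream__. Assumes pandas >= 2.2 and pyarrow >= 14.

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
capsule = df.__arrow_c_stream__()  # PyCapsule named "arrow_array_stream"
table = pa.table(df)               # pyarrow consumes the capsule protocol
print(table.schema)
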
float_string_frame["datetime"] = datetime.now() float_string_frame["timedelta"] = timedelta(days=1, seconds=1) @@ -206,7 +193,9 @@ def test_construction_with_mixed(self, float_string_frame, using_infer_string): expected = Series( [np.dtype("float64")] * 4 + [ - np.dtype("object") if not using_infer_string else "string", + np.dtype("object") + if not using_infer_string + else pd.StringDtype(na_value=np.nan), np.dtype("datetime64[us]"), np.dtype("timedelta64[us]"), ], @@ -218,8 +207,7 @@ def test_construction_with_conversions(self): # convert from a numpy array of non-ns timedelta64; as of 2.0 this does # *not* convert arr = np.array([1, 2, 3], dtype="timedelta64[s]") - df = DataFrame(index=range(3)) - df["A"] = arr + df = DataFrame({"A": arr}) expected = DataFrame( {"A": pd.timedelta_range("00:00:01", periods=3, freq="s")}, index=range(3) ) @@ -237,11 +225,11 @@ def test_construction_with_conversions(self): assert expected.dtypes["dt1"] == "M8[s]" assert expected.dtypes["dt2"] == "M8[s]" - df = DataFrame(index=range(3)) - df["dt1"] = np.datetime64("2013-01-01") - df["dt2"] = np.array( + dt1 = np.datetime64("2013-01-01") + dt2 = np.array( ["2013-01-01", "2013-01-02", "2013-01-03"], dtype="datetime64[D]" ) + df = DataFrame({"dt1": dt1, "dt2": dt2}) # df['dt3'] = np.array(['2013-01-01 00:00:01','2013-01-01 # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]') @@ -438,9 +426,13 @@ def test_nonconsolidated_item_cache_take(): # https://github.com/pandas-dev/pandas/issues/35521 # create non-consolidated dataframe with object dtype columns - df = DataFrame() - df["col1"] = Series(["a"], dtype=object) + df = DataFrame( + { + "col1": Series(["a"], dtype=object), + } + ) df["col2"] = Series([0], dtype=object) + assert not df._mgr.is_consolidated() # access column (item cache) df["col1"] == "A" diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 6e818d79d5ba8..f16068e0b6538 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -21,9 +21,10 @@ import pytest import pytz -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib +from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td @@ -81,7 +82,7 @@ def test_constructor_from_ndarray_with_str_dtype(self): # with an array of strings each of which is e.g. 
"[0 1 2]" arr = np.arange(12).reshape(4, 3) df = DataFrame(arr, dtype=str) - expected = DataFrame(arr.astype(str), dtype=object) + expected = DataFrame(arr.astype(str), dtype="str") tm.assert_frame_equal(df, expected) def test_constructor_from_2d_datetimearray(self, using_array_manager): @@ -264,7 +265,7 @@ def test_emptylike_constructor(self, emptylike, expected_index, expected_columns tm.assert_frame_equal(result, expected) def test_constructor_mixed(self, float_string_frame, using_infer_string): - dtype = "string" if using_infer_string else np.object_ + dtype = "str" if using_infer_string else np.object_ assert float_string_frame["foo"].dtype == dtype def test_constructor_cast_failure(self): @@ -326,19 +327,39 @@ def test_constructor_dtype_nocast_view_2d_array( assert df2._mgr.arrays[0].flags.c_contiguous @td.skip_array_manager_invalid_test - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") - def test_1d_object_array_does_not_copy(self): + def test_1d_object_array_does_not_copy(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") df = DataFrame(arr, copy=False) + if using_infer_string: + if df[0].dtype.storage == "pyarrow": + # object dtype strings are converted to arrow memory, + # no numpy arrays to compare + pass + else: + assert np.shares_memory(df[0].to_numpy(), arr) + else: + assert np.shares_memory(df.values, arr) + + df = DataFrame(arr, dtype=object, copy=False) assert np.shares_memory(df.values, arr) @td.skip_array_manager_invalid_test - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") - def test_2d_object_array_does_not_copy(self): + def test_2d_object_array_does_not_copy(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array([["a", "b"], ["c", "d"]], dtype="object") df = DataFrame(arr, copy=False) + if using_infer_string: + if df[0].dtype.storage == "pyarrow": + # object dtype strings are converted to arrow memory, + # no numpy arrays to compare + pass + else: + assert np.shares_memory(df[0].to_numpy(), arr) + else: + assert np.shares_memory(df.values, arr) + + df = DataFrame(arr, dtype=object, copy=False) assert np.shares_memory(df.values, arr) def test_constructor_dtype_list_data(self): @@ -788,7 +809,7 @@ def test_constructor_dict_cast(self, using_infer_string): frame = DataFrame(test_data) assert len(frame) == 3 - assert frame["B"].dtype == np.object_ if not using_infer_string else "string" + assert frame["B"].dtype == np.object_ if not using_infer_string else "str" assert frame["A"].dtype == np.float64 def test_constructor_dict_cast2(self): @@ -1208,7 +1229,7 @@ def test_constructor_scalar_inference(self, using_infer_string): assert df["bool"].dtype == np.bool_ assert df["float"].dtype == np.float64 assert df["complex"].dtype == np.complex128 - assert df["object"].dtype == np.object_ if not using_infer_string else "string" + assert df["object"].dtype == np.object_ if not using_infer_string else "str" def test_constructor_arrays_and_scalars(self): df = DataFrame({"a": np.random.default_rng(2).standard_normal(10), "b": True}) @@ -1291,7 +1312,7 @@ def test_constructor_list_of_lists(self, using_infer_string): # GH #484 df = DataFrame(data=[[1, "a"], [2, "b"]], columns=["num", "str"]) assert is_integer_dtype(df["num"]) - assert df["str"].dtype == np.object_ if not using_infer_string else "string" + assert df["str"].dtype == np.object_ if not using_infer_string else "str" # GH 4851 # list of 0-dim ndarrays @@ 
-1791,12 +1812,18 @@ def test_constructor_column_duplicates(self): tm.assert_frame_equal(idf, edf) - def test_constructor_empty_with_string_dtype(self): + def test_constructor_empty_with_string_dtype(self, using_infer_string): # GH 9428 expected = DataFrame(index=[0, 1], columns=[0, 1], dtype=object) + expected_str = DataFrame( + index=[0, 1], columns=[0, 1], dtype=pd.StringDtype(na_value=np.nan) + ) df = DataFrame(index=[0, 1], columns=[0, 1], dtype=str) - tm.assert_frame_equal(df, expected) + if using_infer_string: + tm.assert_frame_equal(df, expected_str) + else: + tm.assert_frame_equal(df, expected) df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.str_) tm.assert_frame_equal(df, expected) df = DataFrame(index=[0, 1], columns=[0, 1], dtype="U5") @@ -1859,7 +1886,12 @@ def test_constructor_with_datetimes(self, using_infer_string): result = df.dtypes expected = Series( [np.dtype("int64")] - + [np.dtype(objectname) if not using_infer_string else "string"] * 2 + + [ + np.dtype(objectname) + if not using_infer_string + else pd.StringDtype(na_value=np.nan) + ] + * 2 + [np.dtype("M8[s]"), np.dtype("M8[us]")], index=list("ABCDE"), ) @@ -1881,7 +1913,11 @@ def test_constructor_with_datetimes(self, using_infer_string): expected = Series( [np.dtype("float64")] + [np.dtype("int64")] - + [np.dtype("object") if not using_infer_string else "string"] + + [ + np.dtype("object") + if not using_infer_string + else pd.StringDtype(na_value=np.nan) + ] + [np.dtype("float64")] + [np.dtype(intname)], index=["a", "b", "c", floatname, intname], @@ -1903,7 +1939,11 @@ def test_constructor_with_datetimes(self, using_infer_string): expected = Series( [np.dtype("float64")] + [np.dtype("int64")] - + [np.dtype("object") if not using_infer_string else "string"] + + [ + np.dtype("object") + if not using_infer_string + else pd.StringDtype(na_value=np.nan) + ] + [np.dtype("float64")] + [np.dtype(intname)], index=["a", "b", "c", floatname, intname], @@ -1962,6 +2002,7 @@ def test_constructor_with_datetimes4(self): df = DataFrame({"value": dr}) assert str(df.iat[0, 0].tz) == "US/Eastern" + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_constructor_with_datetimes5(self): # GH 7822 # preserver an index with a tz on dict construction @@ -2122,7 +2163,9 @@ def test_constructor_for_list_with_dtypes(self, using_infer_string): [ np.dtype("int64"), np.dtype("float64"), - np.dtype("object") if not using_infer_string else "string", + np.dtype("object") + if not using_infer_string + else pd.StringDtype(na_value=np.nan), np.dtype("datetime64[ns]"), np.dtype("float64"), ], @@ -2407,6 +2450,9 @@ def test_construct_with_two_categoricalindex_series(self): ) tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_constructor_series_nonexact_categoricalindex(self): # GH 42424 ser = Series(range(100)) @@ -2703,8 +2749,7 @@ def test_construct_with_strings_and_none(self): def test_frame_string_inference(self): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) @@ -2738,8 +2783,7 @@ def test_frame_string_inference(self): def test_frame_string_inference_array_string_dtype(self): # GH#54496 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["a", "b"]}, dtype=dtype, 
columns=Index(["a"], dtype=dtype) ) @@ -2763,7 +2807,6 @@ def test_frame_string_inference_array_string_dtype(self): def test_frame_string_inference_block_dim(self): # GH#55363 - pytest.importorskip("pyarrow") with pd.option_context("future.infer_string", True): df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) assert df._mgr.blocks[0].ndim == 2 @@ -2857,7 +2900,7 @@ def test_dict_data_arrow_column_expansion(self, key_val, col_vals, col_type): ) result = DataFrame({key_val: [1, 2]}, columns=cols) expected = DataFrame([[1, np.nan], [2, np.nan]], columns=cols) - expected.iloc[:, 1] = expected.iloc[:, 1].astype(object) + expected.isetitem(1, expected.iloc[:, 1].astype(object)) tm.assert_frame_equal(result, expected) @@ -3118,6 +3161,24 @@ def test_columns_indexes_raise_on_sets(self): with pytest.raises(ValueError, match="columns cannot be a set"): DataFrame(data, columns={"a", "b", "c"}) + # TODO: make this not cast to object in pandas 3.0 + @pytest.mark.skipif( + not np_version_gt2, reason="StringDType only available in numpy 2 and above" + ) + @pytest.mark.parametrize( + "data", + [ + {"a": ["a", "b", "c"], "b": [1.0, 2.0, 3.0], "c": ["d", "e", "f"]}, + ], + ) + def test_np_string_array_object_cast(self, data): + from numpy.dtypes import StringDType + + data["a"] = np.array(data["a"], dtype=StringDType()) + res = DataFrame(data) + assert res["a"].dtype == np.object_ + assert (res["a"] == data["a"]).all() + def get1(obj): # TODO: make a helper in tm? if isinstance(obj, Series): diff --git a/pandas/tests/frame/test_logical_ops.py b/pandas/tests/frame/test_logical_ops.py index 16ca3a202f1e0..f1163e994557f 100644 --- a/pandas/tests/frame/test_logical_ops.py +++ b/pandas/tests/frame/test_logical_ops.py @@ -107,15 +107,12 @@ def test_logical_ops_invalid(self, using_infer_string): df1 = DataFrame("foo", index=[1], columns=["A"]) df2 = DataFrame(True, index=[1], columns=["A"]) - msg = re.escape("unsupported operand type(s) for |: 'str' and 'bool'") - if using_infer_string: - import pyarrow as pa - - with pytest.raises(pa.lib.ArrowNotImplementedError, match="|has no kernel"): - df1 | df2 + if using_infer_string and df1["A"].dtype.storage == "pyarrow": + msg = "operation 'or_' not supported for dtype 'str'" else: - with pytest.raises(TypeError, match=msg): - df1 | df2 + msg = re.escape("unsupported operand type(s) for |: 'str' and 'bool'") + with pytest.raises(TypeError, match=msg): + df1 | df2 def test_logical_operators(self): def _check_bin_op(op): diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index a498296e09c52..27848e4d18596 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -188,6 +188,25 @@ def test_eval_object_dtype_binop(self): expected = DataFrame({"a1": ["Y", "N"], "c": [True, False]}) tm.assert_frame_equal(res, expected) + def test_extension_array_eval(self, engine, parser, request): + # GH#58748 + if engine == "numexpr": + mark = pytest.mark.xfail( + reason="numexpr does not support extension array dtypes" + ) + request.applymarker(mark) + df = DataFrame({"a": pd.array([1, 2, 3]), "b": pd.array([4, 5, 6])}) + result = df.eval("a / b", engine=engine, parser=parser) + expected = Series(pd.array([0.25, 0.40, 0.50])) + tm.assert_series_equal(result, expected) + + def test_complex_eval(self, engine, parser): + # GH#21374 + df = DataFrame({"a": [1 + 2j], "b": [1 + 1j]}) + result = df.eval("a/b", engine=engine, parser=parser) + expected = Series([1.5 + 0.5j]) + 
tm.assert_series_equal(result, expected) + class TestDataFrameQueryWithMultiIndex: def test_query_with_named_multiindex(self, parser, engine): @@ -738,6 +757,7 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture): tm.assert_frame_equal(result, expected) expected = DataFrame(df_index) + expected.columns = expected.columns.astype(object) result = df.reset_index().query('"2018-01-03 00:00:00+00" < time') tm.assert_frame_equal(result, expected) @@ -1035,7 +1055,7 @@ def test_query_with_string_columns(self, parser, engine): with pytest.raises(NotImplementedError, match=msg): df.query("a in b and c < d", parser=parser, engine=engine) - def test_object_array_eq_ne(self, parser, engine, using_infer_string): + def test_object_array_eq_ne(self, parser, engine): df = DataFrame( { "a": list("aaaabbbbcccc"), @@ -1044,14 +1064,11 @@ def test_object_array_eq_ne(self, parser, engine, using_infer_string): "d": np.random.default_rng(2).integers(9, size=12), } ) - warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None - with tm.assert_produces_warning(warning): - res = df.query("a == b", parser=parser, engine=engine) + res = df.query("a == b", parser=parser, engine=engine) exp = df[df.a == df.b] tm.assert_frame_equal(res, exp) - with tm.assert_produces_warning(warning): - res = df.query("a != b", parser=parser, engine=engine) + res = df.query("a != b", parser=parser, engine=engine) exp = df[df.a != df.b] tm.assert_frame_equal(res, exp) @@ -1090,16 +1107,12 @@ def test_query_with_nested_special_character(self, parser, engine): [">=", operator.ge], ], ) - def test_query_lex_compare_strings( - self, parser, engine, op, func, using_infer_string - ): + def test_query_lex_compare_strings(self, parser, engine, op, func): a = Series(np.random.default_rng(2).choice(list("abcde"), 20)) b = Series(np.arange(a.size)) df = DataFrame({"X": a, "Y": b}) - warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None - with tm.assert_produces_warning(warning): - res = df.query(f'X {op} "d"', engine=engine, parser=parser) + res = df.query(f'X {op} "d"', engine=engine, parser=parser) expected = df[func(df.X, "d")] tm.assert_frame_equal(res, expected) @@ -1415,3 +1428,11 @@ def test_query_ea_equality_comparison(self, dtype, engine): } ) tm.assert_frame_equal(result, expected) + + def test_all_nat_in_object(self): + # GH#57068 + now = pd.Timestamp.now("UTC") # noqa: F841 + df = DataFrame({"a": pd.to_datetime([None, None], utc=True)}, dtype=object) + result = df.query("a > @now") + expected = DataFrame({"a": []}, dtype=object) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 66145c32c18d7..1b2e55c978071 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - from pandas.compat import ( IS64, is_platform_windows, @@ -245,17 +243,11 @@ class TestDataFrameAnalytics: pytest.param("kurt", marks=td.skip_if_no("scipy")), ], ) - def test_stat_op_api_float_string_frame( - self, float_string_frame, axis, opname, using_infer_string - ): - if ( - (opname in ("sum", "min", "max") and axis == 0) - or opname - in ( - "count", - "nunique", - ) - ) and not (using_infer_string and opname == "sum"): + def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname): + if (opname in ("sum", "min", "max") and axis == 0) or opname in ( + "count", + 
"nunique", + ): getattr(float_string_frame, opname)(axis=axis) else: if opname in ["var", "std", "sem", "skew", "kurt"]: @@ -282,10 +274,11 @@ def test_stat_op_api_float_string_frame( msg = "'[><]=' not supported between instances of 'float' and 'str'" elif opname == "median": msg = re.compile( - r"Cannot convert \[.*\] to numeric|does not support", flags=re.S + r"Cannot convert \[.*\] to numeric|does not support|Cannot perform", + flags=re.S, ) if not isinstance(msg, re.Pattern): - msg = msg + "|does not support" + msg = msg + "|does not support|Cannot perform reduction" with pytest.raises(TypeError, match=msg): getattr(float_string_frame, opname)(axis=axis) if opname != "nunique": @@ -447,26 +440,16 @@ def test_mixed_ops(self, op): "could not convert", "can't multiply sequence by non-int", "does not support", + "Cannot perform", ] ) with pytest.raises(TypeError, match=msg): getattr(df, op)() with pd.option_context("use_bottleneck", False): - msg = "|".join( - [ - "Could not convert", - "could not convert", - "can't multiply sequence by non-int", - "does not support", - ] - ) with pytest.raises(TypeError, match=msg): getattr(df, op)() - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="sum doesn't work for arrow strings" - ) def test_reduce_mixed_frame(self): # GH 6806 df = DataFrame( @@ -534,7 +517,7 @@ def test_mean_mixed_string_decimal(self): df = DataFrame(d) with pytest.raises( - TypeError, match="unsupported operand type|does not support" + TypeError, match="unsupported operand type|does not support|Cannot perform" ): df.mean() result = df[["A", "C"]].mean() @@ -629,7 +612,7 @@ def test_sem(self, datetime_frame): "A": [12], "B": [10.0], "C": [np.nan], - "D": np.array([np.nan], dtype=object), + "D": Series([np.nan], dtype="str"), "E": Categorical([np.nan], categories=["a"]), "F": DatetimeIndex([pd.NaT], dtype="M8[ns]"), "G": to_timedelta([pd.NaT]), @@ -671,7 +654,7 @@ def test_mode_dropna(self, dropna, expected): "A": [12, 12, 19, 11], "B": [10, 10, np.nan, 3], "C": [1, np.nan, np.nan, np.nan], - "D": Series([np.nan, np.nan, "a", np.nan], dtype=object), + "D": Series([np.nan, np.nan, "a", np.nan], dtype="str"), "E": Categorical([np.nan, np.nan, "a", np.nan]), "F": DatetimeIndex(["NaT", "2000-01-02", "NaT", "NaT"], dtype="M8[ns]"), "G": to_timedelta(["1 days", "nan", "nan", "nan"]), @@ -691,18 +674,10 @@ def test_mode_dropna(self, dropna, expected): expected = DataFrame(expected) tm.assert_frame_equal(result, expected) - def test_mode_sortwarning(self, using_infer_string): - # Check for the warning that is raised when the mode - # results cannot be sorted - + def test_mode_sort_with_na(self, using_infer_string): df = DataFrame({"A": [np.nan, np.nan, "a", "a"]}) expected = DataFrame({"A": ["a", np.nan]}) - - warning = None if using_infer_string else UserWarning - with tm.assert_produces_warning(warning): - result = df.mode(dropna=False) - result = result.sort_values(by="A").reset_index(drop=True) - + result = df.mode(dropna=False) tm.assert_frame_equal(result, expected) def test_mode_empty_df(self): @@ -989,7 +964,7 @@ def test_sum_mixed_datetime(self): def test_mean_corner(self, float_frame, float_string_frame): # unit test when have object data - msg = "Could not convert|does not support" + msg = "Could not convert|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): float_string_frame.mean(axis=0) @@ -1117,7 +1092,6 @@ def test_idxmin_axis_2(self, float_frame): with pytest.raises(ValueError, match=msg): frame.idxmin(axis=2) - 
@pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("axis", [0, 1]) def test_idxmax(self, float_frame, int_frame, skipna, axis): frame = float_frame @@ -1362,9 +1336,7 @@ def test_any_all_extra(self): @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) @pytest.mark.parametrize("skipna", [True, False]) - def test_any_all_object_dtype( - self, axis, bool_agg_func, skipna, using_infer_string - ): + def test_any_all_object_dtype(self, axis, bool_agg_func, skipna): # GH#35450 df = DataFrame( data=[ @@ -1374,13 +1346,8 @@ def test_any_all_object_dtype( [np.nan, np.nan, "5", np.nan], ] ) - if using_infer_string: - # na in object is True while in string pyarrow numpy it's false - val = not axis == 0 and not skipna and bool_agg_func == "all" - else: - val = True result = getattr(df, bool_agg_func)(axis=axis, skipna=skipna) - expected = Series([True, True, val, True]) + expected = Series([True, True, True, True]) tm.assert_series_equal(result, expected) # GH#50947 deprecates this but it is not emitting a warning in some builds. @@ -1960,9 +1927,6 @@ def test_sum_timedelta64_skipna_false(using_array_manager, request): tm.assert_series_equal(result, expected) -@pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="sum doesn't work with arrow strings" -) def test_mixed_frame_with_integer_sum(): # https://github.com/pandas-dev/pandas/issues/34520 df = DataFrame([["a", 1]], columns=list("ab")) @@ -1992,7 +1956,9 @@ def test_minmax_extensionarray(method, numeric_only): def test_frame_mixed_numeric_object_with_timestamp(ts_value): # GH 13912 df = DataFrame({"a": [1], "b": [1.1], "c": ["foo"], "d": [ts_value]}) - with pytest.raises(TypeError, match="does not support reduction"): + with pytest.raises( + TypeError, match="does not support (operation|reduction)|Cannot perform" + ): df.sum() diff --git a/pandas/tests/frame/test_repr.py b/pandas/tests/frame/test_repr.py index 776007fb9691d..6184e791cab5d 100644 --- a/pandas/tests/frame/test_repr.py +++ b/pandas/tests/frame/test_repr.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - from pandas import ( NA, Categorical, @@ -176,7 +174,6 @@ def test_repr_mixed_big(self): repr(biggie) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="/r in") def test_repr(self): # columns but no index no_index = DataFrame(columns=[0, 1, 3]) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 6e1e743eb60de..de470fcda18ed 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -655,7 +655,11 @@ def test_unstack_dtypes(self, using_infer_string): df2["D"] = "foo" df3 = df2.unstack("B") result = df3.dtypes - dtype = "string" if using_infer_string else np.dtype("object") + dtype = ( + pd.StringDtype(na_value=np.nan) + if using_infer_string + else np.dtype("object") + ) expected = Series( [np.dtype("float64")] * 2 + [dtype] * 2, index=MultiIndex.from_arrays( @@ -2075,7 +2079,7 @@ def test_unstack_period_frame(self): @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) - def test_stack_multiple_bug(self, future_stack): + def test_stack_multiple_bug(self, future_stack, using_infer_string): # bug when some uniques are not present in the data GH#3170 id_col = ([1] * 3) + ([2] * 3) name = (["a"] * 3) + (["b"] * 3) @@ -2087,6 +2091,8 @@ def test_stack_multiple_bug(self, future_stack): multi.columns.name = "Params" unst 
= multi.unstack("ID") msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): unst.resample("W-THU").mean() down = unst.resample("W-THU").mean(numeric_only=True) diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index ef78ae62cb4d6..855b58229cbdb 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -26,6 +26,17 @@ def _constructor(self): class TestDataFrameSubclassing: + def test_no_warning_on_mgr(self): + # GH#57032 + df = tm.SubclassedDataFrame( + {"X": [1, 2, 3], "Y": [1, 2, 3]}, index=["a", "b", "c"] + ) + with tm.assert_produces_warning(None): + # df.isna() goes through _constructor_from_mgr, which we want to + # *not* pass a Manager do __init__ + df.isna() + df["X"].isna() + def test_frame_subclassing_and_slicing(self): # Subclass frame and ensure it returns the right class on slicing it # In reference to PR 9632 diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py index 850c92013694f..a48b5c51f9ca7 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -51,22 +51,13 @@ def test_neg_object(self, df, expected): def test_neg_raises(self, df, using_infer_string): msg = ( "bad operand type for unary -: 'str'|" - r"bad operand type for unary -: 'DatetimeArray'" + r"bad operand type for unary -: 'DatetimeArray'|" + "unary '-' not supported for dtype" ) - if using_infer_string and df.dtypes.iloc[0] == "string": - import pyarrow as pa - - msg = "has no kernel" - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - (-df) - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - (-df["a"]) - - else: - with pytest.raises(TypeError, match=msg): - (-df) - with pytest.raises(TypeError, match=msg): - (-df["a"]) + with pytest.raises(TypeError, match=msg): + (-df) + with pytest.raises(TypeError, match=msg): + (-df["a"]) def test_invert(self, float_frame): df = float_frame diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py index e0d79c3f15282..9fe9bca8abdc9 100644 --- a/pandas/tests/generic/test_to_xarray.py +++ b/pandas/tests/generic/test_to_xarray.py @@ -41,7 +41,7 @@ def test_to_xarray_index_types(self, index_flat, df, using_infer_string): df.index.name = "foo" df.columns.name = "bar" result = df.to_xarray() - assert result.dims["foo"] == 4 + assert result.sizes["foo"] == 4 assert len(result.coords) == 1 assert len(result.data_vars) == 8 tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) @@ -52,7 +52,7 @@ def test_to_xarray_index_types(self, index_flat, df, using_infer_string): # column names are lost expected = df.copy() expected["f"] = expected["f"].astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) expected.columns.name = None tm.assert_frame_equal(result.to_dataframe(), expected) @@ -62,7 +62,7 @@ def test_to_xarray_empty(self, df): df.index.name = "foo" result = df[0:0].to_xarray() - assert result.dims["foo"] == 0 + assert result.sizes["foo"] == 0 assert isinstance(result, Dataset) def test_to_xarray_with_multiindex(self, df, using_infer_string): @@ -71,8 +71,8 @@ def test_to_xarray_with_multiindex(self, df, using_infer_string): # MultiIndex df.index = MultiIndex.from_product([["a"], range(4)], names=["one", "two"]) result = df.to_xarray() - assert result.dims["one"] == 1 - assert 
result.dims["two"] == 4 + assert result.sizes["one"] == 1 + assert result.sizes["two"] == 4 assert len(result.coords) == 2 assert len(result.data_vars) == 8 tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) @@ -81,7 +81,7 @@ def test_to_xarray_with_multiindex(self, df, using_infer_string): result = result.to_dataframe() expected = df.copy() expected["f"] = expected["f"].astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) expected.columns.name = None tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 6223a153df358..f02a828fe8d17 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -337,7 +337,7 @@ def test_wrap_agg_out(three_group): grouped = three_group.groupby(["A", "B"]) def func(ser): - if ser.dtype == object: + if ser.dtype in (object, "string"): raise TypeError("Test error message") return ser.sum() @@ -1109,7 +1109,7 @@ def test_aggregate_mixed_types(): expected = DataFrame( expected_data, index=Index([2, "group 1"], dtype="object", name="grouping"), - columns=Index(["X", "Y", "Z"], dtype="object"), + columns=Index(["X", "Y", "Z"]), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 5c99882cef6d2..0d04af3801dbe 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -108,7 +108,9 @@ def test_cython_agg_nothing_to_agg(): result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True) expected = DataFrame( - [], index=frame["a"].sort_values().drop_duplicates(), columns=[] + [], + index=frame["a"].sort_values().drop_duplicates(), + columns=Index([], dtype="str"), ) tm.assert_frame_equal(result, expected) @@ -163,14 +165,14 @@ def test_cython_agg_return_dict(): def test_cython_fail_agg(): dr = bdate_range("1/1/2000", periods=50) - ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr) + ts = Series(["A", "B", "C", "D", "E"] * 10, dtype=object, index=dr) grouped = ts.groupby(lambda x: x.month) summed = grouped.sum() msg = "using SeriesGroupBy.sum" with tm.assert_produces_warning(FutureWarning, match=msg): # GH#53425 - expected = grouped.agg(np.sum) + expected = grouped.agg(np.sum).astype(object) tm.assert_series_equal(summed, expected) diff --git a/pandas/tests/groupby/aggregate/test_numba.py b/pandas/tests/groupby/aggregate/test_numba.py index ee694129f7118..fcd34f793c584 100644 --- a/pandas/tests/groupby/aggregate/test_numba.py +++ b/pandas/tests/groupby/aggregate/test_numba.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm from pandas.errors import NumbaUtilError from pandas import ( @@ -11,8 +12,17 @@ option_context, ) import pandas._testing as tm +from pandas.util.version import Version -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] + +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) def test_correct_function_signature(): diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 0596193c137e1..213704f31aca5 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ 
b/pandas/tests/groupby/aggregate/test_other.py @@ -355,7 +355,8 @@ def test_series_agg_multi_pure_python(): ) def bad(x): - assert len(x.values.base) > 0 + if isinstance(x.values, np.ndarray): + assert len(x.values.base) > 0 return "foo" result = data.groupby(["A", "B"]).agg(bad) diff --git a/pandas/tests/groupby/methods/test_describe.py b/pandas/tests/groupby/methods/test_describe.py index a2440e09dfc02..c0889ab415e74 100644 --- a/pandas/tests/groupby/methods/test_describe.py +++ b/pandas/tests/groupby/methods/test_describe.py @@ -71,7 +71,7 @@ def test_series_describe_as_index(as_index, keys): tm.assert_frame_equal(result, expected) -def test_frame_describe_multikey(tsframe): +def test_frame_describe_multikey(tsframe, using_infer_string): grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.describe() desc_groups = [] @@ -79,7 +79,7 @@ def test_frame_describe_multikey(tsframe): group = grouped[col].describe() # GH 17464 - Remove duplicate MultiIndex levels group_col = MultiIndex( - levels=[[col], group.columns], + levels=[Index([col], dtype=tsframe.columns.dtype), group.columns], codes=[[0] * len(group.columns), range(len(group.columns))], ) group = DataFrame(group.values, columns=group_col, index=group.index) @@ -87,6 +87,10 @@ def test_frame_describe_multikey(tsframe): expected = pd.concat(desc_groups, axis=1) tm.assert_frame_equal(result, expected) + # remainder of the tests fails with string dtype but is testing deprecated behaviour + if using_infer_string: + return + msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) @@ -293,5 +297,5 @@ def test_groupby_empty_dataset(dtype, kwargs): result = df.iloc[:0].groupby("A").B.describe(**kwargs) expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0] - expected.index = Index([]) + expected.index = Index([], dtype=df.columns.dtype) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/methods/test_nth.py b/pandas/tests/groupby/methods/test_nth.py index a8ed9e9d52021..2722993ee5cdf 100644 --- a/pandas/tests/groupby/methods/test_nth.py +++ b/pandas/tests/groupby/methods/test_nth.py @@ -707,10 +707,11 @@ def test_first_multi_key_groupby_categorical(): @pytest.mark.parametrize("method", ["first", "last", "nth"]) def test_groupby_last_first_nth_with_none(method, nulls_fixture): # GH29645 - expected = Series(["y"]) + expected = Series(["y"], dtype=object) data = Series( [nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture], index=[0, 0, 0, 0, 0], + dtype=object, ).groupby(level=0) if method == "nth": diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index 361a8c27fbf9d..3943590b069ad 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -171,7 +171,8 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, def test_quantile_raises(): df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) - with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"): + msg = "dtype '(object|str)' does not support operation 'quantile'" + with pytest.raises(TypeError, match=msg): df.groupby("key").quantile() @@ -259,9 +260,8 @@ def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): expected = df.groupby("a")[["b"]].quantile(q) 
tm.assert_frame_equal(result, expected) else: - with pytest.raises( - TypeError, match="'quantile' cannot be performed against 'object' dtypes!" - ): + msg = "dtype '.*' does not support operation 'quantile'" + with pytest.raises(TypeError, match=msg): df.groupby("a").quantile(q, numeric_only=numeric_only) diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index 93a4e743d0d71..4e92fb22f840a 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas.core.dtypes.common import is_integer_dtype from pandas import ( @@ -108,22 +106,16 @@ def test_size_series_masked_type_returns_Int64(dtype): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], -) -def test_size_strings(dtype): +def test_size_strings(any_string_dtype, using_infer_string): # GH#55627 + dtype = any_string_dtype df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype) result = df.groupby("a")["b"].size() exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64" + exp_index_dtype = "str" if using_infer_string and dtype == "object" else dtype expected = Series( [2, 1], - index=Index(["a", "b"], name="a", dtype=dtype), + index=Index(["a", "b"], name="a", dtype=exp_index_dtype), name="b", dtype=exp_dtype, ) diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 2fa79c815d282..476ce1fe1b8cc 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -8,8 +8,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( Categorical, CategoricalIndex, @@ -298,7 +296,16 @@ def _frame_value_counts(df, keys, normalize, sort, ascending): @pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize("frame", [True, False]) def test_against_frame_and_seriesgroupby( - education_df, groupby, normalize, name, sort, ascending, as_index, frame, request + education_df, + groupby, + normalize, + name, + sort, + ascending, + as_index, + frame, + request, + using_infer_string, ): # test all parameters: # - Use column, array or function as by= parameter @@ -362,24 +369,24 @@ def test_against_frame_and_seriesgroupby( index_frame["gender"] = index_frame["both"].str.split("-").str.get(0) index_frame["education"] = index_frame["both"].str.split("-").str.get(1) del index_frame["both"] - index_frame = index_frame.rename({0: None}, axis=1) - expected.index = MultiIndex.from_frame(index_frame) + index_frame2 = index_frame.rename({0: None}, axis=1) + expected.index = MultiIndex.from_frame(index_frame2) + + if index_frame2.columns.isna()[0]: + # with using_infer_string, the columns in index_frame as string + # dtype, which makes the rename({0: None}) above use np.nan + # instead of None, so we need to set None more explicitly. 
+ expected.index.names = [None] + expected.index.names[1:] tm.assert_series_equal(result, expected) else: expected.insert(1, "gender", expected["both"].str.split("-").str.get(0)) expected.insert(2, "education", expected["both"].str.split("-").str.get(1)) + if using_infer_string: + expected = expected.astype({"gender": "str", "education": "str"}) del expected["both"] tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], -) @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize( "sort, ascending, expected_rows, expected_count, expected_group_size", @@ -397,8 +404,10 @@ def test_compound( expected_rows, expected_count, expected_group_size, - dtype, + any_string_dtype, + using_infer_string, ): + dtype = any_string_dtype education_df = education_df.astype(dtype) education_df.columns = education_df.columns.astype(dtype) # Multiple groupby keys and as_index=False @@ -415,11 +424,17 @@ def test_compound( expected["proportion"] = expected_count expected["proportion"] /= expected_group_size if dtype == "string[pyarrow]": + # TODO(nullable) also string[python] should return nullable dtypes expected["proportion"] = expected["proportion"].convert_dtypes() else: expected["count"] = expected_count if dtype == "string[pyarrow]": expected["count"] = expected["count"].convert_dtypes() + if using_infer_string and dtype == object: + expected = expected.astype( + {"country": "str", "gender": "str", "education": "str"} + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 34b6e7c4cde5f..8ee38a688a1a0 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -77,7 +79,7 @@ def test_apply_index_date(using_infer_string): tm.assert_frame_equal(result, expected) -def test_apply_index_date_object(using_infer_string): +def test_apply_index_date_object(): # GH 5789 # don't auto coerce dates ts = [ @@ -109,10 +111,7 @@ def test_apply_index_date_object(using_infer_string): 1.40750, 1.40649, ] - dtype = "string[pyarrow_numpy]" if using_infer_string else object - exp_idx = Index( - ["2011-05-16", "2011-05-17", "2011-05-18"], dtype=dtype, name="date" - ) + exp_idx = Index(["2011-05-16", "2011-05-17", "2011-05-18"], name="date") expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -129,7 +128,7 @@ def test_apply_trivial(using_infer_string): {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], ) - dtype = "string" if using_infer_string else "object" + dtype = "str" if using_infer_string else "object" expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", dtype]) msg = "DataFrame.groupby with axis=1 is deprecated" @@ -146,7 +145,7 @@ def test_apply_trivial_fail(using_infer_string): {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], ) - dtype = "string" if using_infer_string else "object" + dtype = "str" if using_infer_string else "object" expected = pd.concat([df, df], axis=1, keys=["float64", dtype]) msg = "DataFrame.groupby 
with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -944,7 +943,7 @@ def test_func_returns_object(): "group_column_dtlike", [datetime.today(), datetime.today().date(), datetime.today().time()], ) -def test_apply_datetime_issue(group_column_dtlike, using_infer_string): +def test_apply_datetime_issue(group_column_dtlike): # GH-28247 # groupby-apply throws an error if one of the columns in the DataFrame # is a datetime object and the column labels are different from @@ -955,8 +954,7 @@ def test_apply_datetime_issue(group_column_dtlike, using_infer_string): with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) - dtype = "string" if using_infer_string else "object" - expected = DataFrame(["spam"], Index(["foo"], dtype=dtype, name="a"), columns=[42]) + expected = DataFrame(["spam"], Index(["foo"], dtype="str", name="a"), columns=[42]) tm.assert_frame_equal(result, expected) @@ -1037,7 +1035,7 @@ def test_groupby_apply_datetime_result_dtypes(using_infer_string): msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(FutureWarning, match=msg): result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes - dtype = "string" if using_infer_string else object + dtype = pd.StringDtype(na_value=np.nan) if using_infer_string else object expected = Series( [np.dtype("datetime64[ns]"), dtype, dtype, np.int64, dtype], index=["observation", "color", "mood", "intensity", "score"], @@ -1205,7 +1203,7 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): ) expected = DataFrame( - {"a": [264, 297], "b": [15, 6], "c": [150, 60]}, + {"b": [15, 6], "c": [150, 60]}, index=Index([88, 99], name="a"), ) @@ -1213,7 +1211,7 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): grp = df.groupby(by="a") msg = "The behavior of DataFrame.sum with axis=None is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): - result = grp.apply(sum) + result = grp.apply(sum, include_groups=False) tm.assert_frame_equal(result, expected) # Check output when another method is called before .apply() @@ -1221,7 +1219,7 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): args = get_groupby_method_args(reduction_func, df) _ = getattr(grp, reduction_func)(*args) with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): - result = grp.apply(sum) + result = grp.apply(sum, include_groups=False) tm.assert_frame_equal(result, expected) @@ -1303,12 +1301,13 @@ def test_apply_dropna_with_indexed_same(dropna): @pytest.mark.parametrize( "as_index, expected", [ - [ + pytest.param( False, DataFrame( [[1, 1, 1], [2, 2, 1]], columns=Index(["a", "b", None], dtype=object) ), - ], + marks=pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)"), + ), [ True, Series( diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py index 09d5e06bf6ddd..130a29abf9443 100644 --- a/pandas/tests/groupby/test_apply_mutate.py +++ b/pandas/tests/groupby/test_apply_mutate.py @@ -75,7 +75,7 @@ def test_no_mutate_but_looks_like(): tm.assert_series_equal(result1, result2) -def test_apply_function_with_indexing(): +def test_apply_function_with_indexing(warn_copy_on_write): # GH: 33058 df = pd.DataFrame( {"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]} @@ -86,7 +86,9 @@ def fn(x): return x.col2 msg = 
"DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning( + FutureWarning, match=msg, raise_on_extra_warnings=not warn_copy_on_write + ): result = df.groupby(["col1"], as_index=False).apply(fn) expected = pd.Series( [1, 2, 0, 4, 5, 0], diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 7a91601bf688f..cba02ae869889 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -67,6 +67,7 @@ def f(a): } +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_apply_use_categorical_name(df): cats = qcut(df.C, 4) @@ -129,7 +130,7 @@ def f(x): result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name="person_id") - dtype = "string[pyarrow_numpy]" if using_infer_string else object + dtype = "str" if using_infer_string else object expected["person_name"] = expected["person_name"].astype(dtype) tm.assert_frame_equal(result, expected) @@ -338,7 +339,8 @@ def test_apply(ordered): tm.assert_series_equal(result, expected) -def test_observed(observed): +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") +def test_observed(request, using_infer_string, observed): # multiple groupers, don't re-expand the output space # of the grouper # gh-14942 (implement) @@ -346,6 +348,10 @@ def test_observed(observed): # gh-8138 (back-compat) # gh-8869 + if using_infer_string and not observed: + # TODO(infer_string) this fails with filling the string column with 0 + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) + cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) @@ -1552,6 +1558,7 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( assert (res.loc[unobserved_cats] == expected).all().all() +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_series_groupby_categorical_aggregation_getitem(): # GH 8870 d = {"foo": [10, 8, 4, 1], "bar": [10, 20, 30, 40], "baz": ["d", "c", "d", "c"]} diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4c903e691add1..07ddbc36b5ab0 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -12,8 +12,6 @@ ) import pandas.util._test_decorators as td -from pandas.core.dtypes.common import is_string_dtype - import pandas as pd from pandas import ( Categorical, @@ -640,7 +638,7 @@ def test_frame_multi_key_function_list(): tm.assert_frame_equal(agged, expected) -def test_frame_multi_key_function_list_partial_failure(): +def test_frame_multi_key_function_list_partial_failure(using_infer_string): data = DataFrame( { "A": [ @@ -691,6 +689,8 @@ def test_frame_multi_key_function_list_partial_failure(): grouped = data.groupby(["A", "B"]) funcs = ["mean", "std"] msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.agg(funcs) @@ -981,9 +981,11 @@ def test_groupby_multi_corner(df): tm.assert_frame_equal(agged, expected) -def test_raises_on_nuisance(df): +def test_raises_on_nuisance(df, using_infer_string): grouped = df.groupby("A") msg = 
re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -1002,7 +1004,7 @@ def test_raises_on_nuisance(df): depr_msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=depr_msg): grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1) - msg = "does not support reduction 'sum'" + msg = "does not support reduction 'sum'|Cannot perform reduction 'sum'" with pytest.raises(TypeError, match=msg): grouped.agg(lambda x: x.sum(0, numeric_only=False)) @@ -1026,7 +1028,7 @@ def test_keep_nuisance_agg(df, agg_function): ["sum", "mean", "prod", "std", "var", "sem", "median"], ) @pytest.mark.parametrize("numeric_only", [True, False]) -def test_omit_nuisance_agg(df, agg_function, numeric_only): +def test_omit_nuisance_agg(df, agg_function, numeric_only, using_infer_string): # GH 38774, GH 38815 grouped = df.groupby("A") @@ -1034,7 +1036,10 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): if agg_function in no_drop_nuisance and not numeric_only: # Added numeric_only as part of GH#46560; these do not drop nuisance # columns when numeric_only is False - if agg_function in ("std", "sem"): + if using_infer_string: + msg = f"dtype 'str' does not support operation '{agg_function}'" + klass = TypeError + elif agg_function in ("std", "sem"): klass = ValueError msg = "could not convert string to float: 'one'" else: @@ -1055,16 +1060,24 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): tm.assert_frame_equal(result, expected) -def test_raise_on_nuisance_python_single(df): +def test_raise_on_nuisance_python_single(df, using_infer_string): # GH 38815 grouped = df.groupby("A") - with pytest.raises(ValueError, match="could not convert"): + + err = ValueError + msg = "could not convert" + if using_infer_string: + err = TypeError + msg = "dtype 'str' does not support operation 'skew'" + with pytest.raises(err, match=msg): grouped.skew() -def test_raise_on_nuisance_python_multiple(three_group): +def test_raise_on_nuisance_python_multiple(three_group, using_infer_string): grouped = three_group.groupby(["A", "B"]) msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -1102,12 +1115,16 @@ def test_nonsense_func(): df.groupby(lambda x: x + "foo") -def test_wrap_aggregated_output_multindex(multiindex_dataframe_random_data): +def test_wrap_aggregated_output_multindex( + multiindex_dataframe_random_data, using_infer_string +): df = multiindex_dataframe_random_data.T df["baz", "two"] = "peekaboo" keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): df.groupby(keys).agg("mean") agged = df.drop(columns=("baz", "two")).groupby(keys).agg("mean") @@ -1214,7 +1231,7 @@ def test_groupby_complex_mean(): tm.assert_frame_equal(result, expected) -def test_groupby_complex_numbers(using_infer_string): +def test_groupby_complex_numbers(): # GH 17927 df = DataFrame( [ @@ -1223,11 +1240,10 @@ def test_groupby_complex_numbers(using_infer_string): {"a": 4, "b": 1}, ] ) - dtype = "string[pyarrow_numpy]" if 
using_infer_string else object expected = DataFrame( np.array([1, 1, 1], dtype=np.int64), index=Index([(1 + 1j), (1 + 2j), (1 + 0j)], name="b"), - columns=Index(["a"], dtype=dtype), + columns=Index(["a"]), ) result = df.groupby("b", sort=False).count() tm.assert_frame_equal(result, expected) @@ -1300,8 +1316,10 @@ def test_groupby_with_hier_columns(): def test_grouping_ndarray(df): grouped = df.groupby(df["A"].values) + grouped2 = df.groupby(df["A"].rename(None)) + result = grouped.sum() - expected = df.groupby(df["A"].rename(None)).sum() + expected = grouped2.sum() tm.assert_frame_equal(result, expected) @@ -1605,7 +1623,7 @@ def test_groupby_2d_malformed(): d["label"] = ["l1", "l2"] tmp = d.groupby(["group"]).mean(numeric_only=True) res_values = np.array([[0.0, 1.0], [0.0, 1.0]]) - tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"])) + tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"], dtype=object)) tm.assert_numpy_array_equal(tmp.values, res_values) @@ -1742,18 +1760,14 @@ def g(group): @pytest.mark.parametrize("grouper", ["A", ["A", "B"]]) -def test_set_group_name(df, grouper, using_infer_string): +def test_set_group_name(df, grouper): def f(group): assert group.name is not None return group def freduce(group): assert group.name is not None - if using_infer_string and grouper == "A" and is_string_dtype(group.dtype): - with pytest.raises(TypeError, match="does not support"): - group.sum() - else: - return group.sum() + return group.sum() def freducex(x): return freduce(x) @@ -1797,8 +1811,8 @@ def test_no_dummy_key_names(df): result = df.groupby(df["A"].values).sum() assert result.index.name is None - result = df.groupby([df["A"].values, df["B"].values]).sum() - assert result.index.names == (None, None) + result2 = df.groupby([df["A"].values, df["B"].values]).sum() + assert result2.index.names == (None, None) def test_groupby_sort_multiindex_series(): @@ -2094,7 +2108,7 @@ def get_categorical_invalid_expected(): idx = Index(lev, name=keys[0]) if using_infer_string: - columns = Index([], dtype="string[pyarrow_numpy]") + columns = Index([], dtype="str") else: columns = [] expected = DataFrame([], columns=columns, index=idx) @@ -2103,6 +2117,7 @@ def get_categorical_invalid_expected(): is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype) is_dt64 = df.dtypes.iloc[0].kind == "M" is_cat = isinstance(values, Categorical) + is_str = isinstance(df.dtypes.iloc[0], pd.StringDtype) if ( isinstance(values, Categorical) @@ -2127,13 +2142,15 @@ def get_categorical_invalid_expected(): if op in ["prod", "sum", "skew"]: # ops that require more than just ordered-ness - if is_dt64 or is_cat or is_per: + if is_dt64 or is_cat or is_per or (is_str and op != "sum"): # GH#41291 # datetime64 -> prod and sum are invalid if is_dt64: msg = "datetime64 type does not support" elif is_per: msg = "Period type does not support" + elif is_str: + msg = f"dtype 'str' does not support operation '{op}'" else: msg = "category type does not support" if op == "skew": @@ -2699,7 +2716,7 @@ def test_groupby_empty_multi_column(as_index, numeric_only): result = gb.sum(numeric_only=numeric_only) if as_index: index = MultiIndex([[], []], [[], []], names=["A", "B"]) - columns = ["C"] if not numeric_only else [] + columns = ["C"] if not numeric_only else Index([], dtype="str") else: index = RangeIndex(0) columns = ["A", "B", "C"] if not numeric_only else ["A", "B"] @@ -2717,7 +2734,7 @@ def test_groupby_aggregation_non_numeric_dtype(): { "v": [[1, 1], [10, 20]], }, - index=Index(["M", "W"], dtype="object", 
name="MW"), + index=Index(["M", "W"], name="MW"), ) gb = df.groupby(by=["MW"]) @@ -2816,25 +2833,20 @@ def test_rolling_wrong_param_min_period(): test_df = DataFrame([name_l, val_l]).T test_df.columns = ["name", "val"] - result_error_msg = r"__init__\(\) got an unexpected keyword argument 'min_period'" + result_error_msg = ( + r"^[a-zA-Z._]*\(\) got an unexpected keyword argument 'min_period'" + ) with pytest.raises(TypeError, match=result_error_msg): test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum() -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - ], -) -def test_by_column_values_with_same_starting_value(dtype): +def test_by_column_values_with_same_starting_value(any_string_dtype): # GH29635 df = DataFrame( { "Name": ["Thomas", "Thomas", "Thomas John"], "Credit": [1200, 1300, 900], - "Mood": Series(["sad", "happy", "happy"], dtype=dtype), + "Mood": Series(["sad", "happy", "happy"], dtype=any_string_dtype), } ) aggregate_details = {"Mood": Series.mode, "Credit": "sum"} @@ -2862,11 +2874,13 @@ def test_groupby_none_in_first_mi_level(): tm.assert_series_equal(result, expected) -def test_groupby_none_column_name(): +def test_groupby_none_column_name(using_infer_string): # GH#47348 df = DataFrame({None: [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [4, 5, 6, 7]}) - result = df.groupby(by=[None]).sum() - expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=None)) + by = [np.nan] if using_infer_string else [None] + gb = df.groupby(by=by) + result = gb.sum() + expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=by[0])) tm.assert_frame_equal(result, expected) @@ -3089,7 +3103,7 @@ def test_obj_with_exclusions_duplicate_columns(): def test_groupby_numeric_only_std_no_result(numeric_only): # GH 51080 dicts_non_numeric = [{"a": "foo", "b": "bar"}, {"a": "car", "b": "dar"}] - df = DataFrame(dicts_non_numeric) + df = DataFrame(dicts_non_numeric, dtype=object) dfgb = df.groupby("a", as_index=False, sort=False) if numeric_only: @@ -3103,6 +3117,7 @@ def test_groupby_numeric_only_std_no_result(numeric_only): dfgb.std(numeric_only=numeric_only) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_grouping_with_categorical_interval_columns(): # GH#34164 df = DataFrame({"x": [0.1, 0.2, 0.3, -0.4, 0.5], "w": ["a", "b", "a", "c", "a"]}) @@ -3148,10 +3163,14 @@ def test_grouping_with_categorical_interval_columns(): def test_groupby_sum_on_nan_should_return_nan(bug_var): # GH 24196 df = DataFrame({"A": [bug_var, bug_var, bug_var, np.nan]}) + if isinstance(bug_var, str): + df = df.astype(object) dfgb = df.groupby(lambda x: x) result = dfgb.sum(min_count=1) - expected_df = DataFrame([bug_var, bug_var, bug_var, None], columns=["A"]) + expected_df = DataFrame( + [bug_var, bug_var, bug_var, None], columns=["A"], dtype=df["A"].dtype + ) tm.assert_frame_equal(result, expected_df) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 73638eba0a3b3..2a9b61aa7ebf5 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -123,7 +123,7 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"]) grouped = df.groupby("a", dropna=dropna).sum() - expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a")) + expected = pd.DataFrame(outputs, index=pd.Index(idx, 
name="a")) tm.assert_frame_equal(grouped, expected) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index bf809bd5db437..b5523592c3c5c 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -69,16 +69,27 @@ def test_groupby_preserves_metadata(): def func(group): assert isinstance(group, tm.SubclassedDataFrame) assert hasattr(group, "testattr") + assert group.testattr == "hello" return group.testattr msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning( - FutureWarning, match=msg, raise_on_extra_warnings=False + FutureWarning, + match=msg, + raise_on_extra_warnings=False, + check_stacklevel=False, ): result = custom_df.groupby("c").apply(func) expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c")) tm.assert_series_equal(result, expected) + result = custom_df.groupby("c").apply(func, include_groups=False) + tm.assert_series_equal(result, expected) + + # https://github.com/pandas-dev/pandas/pull/56761 + result = custom_df.groupby("c")[["a", "b"]].apply(func) + tm.assert_series_equal(result, expected) + def func2(group): assert isinstance(group, tm.SubclassedSeries) assert hasattr(group, "testattr") @@ -98,7 +109,7 @@ def test_groupby_resample_preserves_subclass(obj): df = obj( { - "Buyer": "Carl Carl Carl Carl Joe Carl".split(), + "Buyer": Series("Carl Carl Carl Carl Joe Carl".split(), dtype=object), "Quantity": [18, 3, 5, 1, 9, 3], "Date": [ datetime(2013, 9, 1, 13, 0), @@ -115,7 +126,10 @@ def test_groupby_resample_preserves_subclass(obj): # Confirm groupby.resample() preserves dataframe type msg = "DataFrameGroupBy.resample operated on the grouping columns" with tm.assert_produces_warning( - FutureWarning, match=msg, raise_on_extra_warnings=False + FutureWarning, + match=msg, + raise_on_extra_warnings=False, + check_stacklevel=False, ): result = df.groupby("Buyer").resample("5D").sum() assert isinstance(result, obj) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 363ff883385db..9a0e67dea532b 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -851,7 +851,7 @@ def test_groupby_level_index_value_all_na(self): expected = DataFrame( data=[], index=MultiIndex( - levels=[Index(["x"], dtype="object"), Index([], dtype="float64")], + levels=[Index(["x"], dtype="str"), Index([], dtype="float64")], codes=[[], []], names=["A", "B"], ), @@ -990,7 +990,9 @@ def test_groupby_with_single_column(self): df = DataFrame({"a": list("abssbab")}) tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]]) # GH 13530 - exp = DataFrame(index=Index(["a", "b", "s"], name="a"), columns=[]) + exp = DataFrame( + index=Index(["a", "b", "s"], name="a"), columns=Index([], dtype="str") + ) tm.assert_frame_equal(df.groupby("a").count(), exp) tm.assert_frame_equal(df.groupby("a").sum(), exp) diff --git a/pandas/tests/groupby/test_numba.py b/pandas/tests/groupby/test_numba.py index ee7d342472493..f2c138c86a046 100644 --- a/pandas/tests/groupby/test_numba.py +++ b/pandas/tests/groupby/test_numba.py @@ -1,15 +1,24 @@ import pytest +from pandas.compat import is_platform_arm + from pandas import ( DataFrame, Series, option_context, ) import pandas._testing as tm +from pandas.util.version import Version -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] -pytest.importorskip("numba") +numba = pytest.importorskip("numba") 
+pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) @pytest.mark.filterwarnings("ignore") diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index ff4685b1e412d..3c1ed20ddcb16 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -29,7 +29,8 @@ def df(self): "group": [1, 1, 2], "int": [1, 2, 3], "float": [4.0, 5.0, 6.0], - "string": list("abc"), + "string": Series(["a", "b", "c"], dtype="str"), + "object": Series(["a", "b", "c"], dtype=object), "category_string": Series(list("abc")).astype("category"), "category_int": [7, 8, 9], "datetime": date_range("20130101", periods=3), @@ -41,6 +42,7 @@ def df(self): "int", "float", "string", + "object", "category_string", "category_int", "datetime", @@ -113,6 +115,7 @@ def test_first_last(self, df, method): "int", "float", "string", + "object", "category_string", "category_int", "datetime", @@ -160,7 +163,9 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): # object dtypes for transformations are not implemented in Cython and # have no Python fallback - exception = NotImplementedError if method.startswith("cum") else TypeError + exception = ( + (NotImplementedError, TypeError) if method.startswith("cum") else TypeError + ) if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"): # The methods default to numeric_only=False and raise TypeError @@ -171,6 +176,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): re.escape(f"agg function failed [how->{method},dtype->object]"), # cumsum/cummin/cummax/cumprod "function is not implemented for this dtype", + f"dtype 'str' does not support operation '{method}'", ] ) with pytest.raises(exception, match=msg): @@ -181,6 +187,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "category type does not support sum operations", re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), + f"dtype 'str' does not support operation '{method}'", ] ) with pytest.raises(exception, match=msg): @@ -198,6 +205,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): f"Cannot perform {method} with non-ordered Categorical", re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), + f"dtype 'str' does not support operation '{method}'", ] ) with pytest.raises(exception, match=msg): @@ -271,9 +279,10 @@ def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_str # cumsum, diff, pct_change "unsupported operand type", "has no kernel", + "operation 'sub' not supported for dtype 'str' with dtype 'float64'", ) if using_infer_string: - import pyarrow as pa + pa = pytest.importorskip("pyarrow") errs = (TypeError, pa.lib.ArrowNotImplementedError) else: @@ -381,7 +390,9 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): re.escape(f"agg function failed [how->{kernel},dtype->object]"), ] ) - if kernel == "idxmin": + if kernel == "quantile": + msg = "dtype 'object' does not support operation 'quantile'" + elif kernel == "idxmin": msg = "'<' not supported between instances of 'type' and 'type'" elif kernel == "idxmax": msg = "'>' not supported between instances of 'type' and 'type'" @@ -455,7 +466,7 @@ def 
test_deprecate_numeric_only_series(dtype, groupby_func, request): # that succeed should not be allowed to fail (without deprecation, at least) if groupby_func in fails_on_numeric_object and dtype is object: if groupby_func == "quantile": - msg = "cannot be performed against 'object' dtypes" + msg = "dtype 'object' does not support operation 'quantile'" else: msg = "is not supported for object dtype" warn = FutureWarning if groupby_func == "fillna" else None diff --git a/pandas/tests/groupby/test_pipe.py b/pandas/tests/groupby/test_pipe.py index 7d5c1625b8ab4..ee59a93695bcf 100644 --- a/pandas/tests/groupby/test_pipe.py +++ b/pandas/tests/groupby/test_pipe.py @@ -35,7 +35,7 @@ def square(srs): # NDFrame.pipe methods result = df.groupby("A").pipe(f).pipe(square) - index = Index(["bar", "foo"], dtype="object", name="A") + index = Index(["bar", "foo"], name="A") expected = pd.Series([3.749306591013693, 6.717707873081384], name="B", index=index) tm.assert_series_equal(expected, result) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 0b451ce73db89..5457f5ba050c6 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -119,7 +119,7 @@ def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""): @pytest.mark.parametrize("how", ["method", "agg", "transform"]) def test_groupby_raises_string( - how, by, groupby_series, groupby_func, df_with_string_col + how, by, groupby_series, groupby_func, df_with_string_col, using_infer_string ): df = df_with_string_col args = get_groupby_method_args(groupby_func, df) @@ -179,7 +179,7 @@ def test_groupby_raises_string( TypeError, re.escape("agg function failed [how->prod,dtype->object]"), ), - "quantile": (TypeError, "cannot be performed against 'object' dtypes!"), + "quantile": (TypeError, "dtype 'object' does not support operation 'quantile'"), "rank": (None, ""), "sem": (ValueError, "could not convert string to float"), "shift": (None, ""), @@ -193,6 +193,37 @@ def test_groupby_raises_string( ), }[groupby_func] + if using_infer_string: + if groupby_func in [ + "prod", + "mean", + "median", + "cumsum", + "cumprod", + "std", + "sem", + "var", + "skew", + "quantile", + ]: + msg = f"dtype 'str' does not support operation '{groupby_func}'" + if groupby_func in ["sem", "std", "skew"]: + # The object-dtype raises ValueError when trying to convert to numeric. + klass = TypeError + elif groupby_func == "pct_change" and df["d"].dtype.storage == "pyarrow": + # This doesn't go through EA._groupby_op so the message isn't controlled + # there. + msg = "operation 'truediv' not supported for dtype 'str' with dtype 'str'" + elif groupby_func == "diff" and df["d"].dtype.storage == "pyarrow": + # This doesn't go through EA._groupby_op so the message isn't controlled + # there. 
+ msg = "operation 'sub' not supported for dtype 'str' with dtype 'str'" + + elif groupby_func in ["cummin", "cummax"]: + msg = msg.replace("object", "str") + elif groupby_func == "corrwith": + msg = "Cannot perform reduction 'mean' with string dtype" + if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" @@ -219,7 +250,12 @@ def func(x): @pytest.mark.parametrize("how", ["agg", "transform"]) @pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean]) def test_groupby_raises_string_np( - how, by, groupby_series, groupby_func_np, df_with_string_col + how, + by, + groupby_series, + groupby_func_np, + df_with_string_col, + using_infer_string, ): # GH#50749 df = df_with_string_col @@ -232,10 +268,15 @@ def test_groupby_raises_string_np( np.sum: (None, ""), np.mean: ( TypeError, - re.escape("agg function failed [how->mean,dtype->object]"), + "agg function failed|Cannot perform reduction 'mean' with string dtype", ), }[groupby_func_np] + if using_infer_string: + if groupby_func_np is np.mean: + klass = TypeError + msg = "dtype 'str' does not support operation 'mean'" + if groupby_series: warn_msg = "using SeriesGroupBy.[sum|mean]" else: diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 425079f943aba..599b0aabf85d5 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -7,6 +7,9 @@ from pandas._libs.tslibs import iNaT +from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.missing import na_value_for_dtype + import pandas as pd from pandas import ( DataFrame, @@ -195,6 +198,68 @@ def test_empty(frame_or_series, bool_agg_func): tm.assert_equal(result, expected) +@pytest.mark.parametrize("how", ["idxmin", "idxmax"]) +def test_idxmin_idxmax_extremes(how, any_real_numpy_dtype): + # GH#57040 + if any_real_numpy_dtype is int or any_real_numpy_dtype is float: + # No need to test + return + info = np.iinfo if "int" in any_real_numpy_dtype else np.finfo + min_value = info(any_real_numpy_dtype).min + max_value = info(any_real_numpy_dtype).max + df = DataFrame( + {"a": [2, 1, 1, 2], "b": [min_value, max_value, max_value, min_value]}, + dtype=any_real_numpy_dtype, + ) + gb = df.groupby("a") + result = getattr(gb, how)() + expected = DataFrame( + {"b": [1, 0]}, index=pd.Index([1, 2], name="a", dtype=any_real_numpy_dtype) + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("how", ["idxmin", "idxmax"]) +def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype): + # GH#57040 + min_value = np.finfo(float_numpy_dtype).min + max_value = np.finfo(float_numpy_dtype).max + df = DataFrame( + { + "a": Series(np.repeat(range(1, 6), repeats=2), dtype="intp"), + "b": Series( + [ + np.nan, + min_value, + np.nan, + max_value, + min_value, + np.nan, + max_value, + np.nan, + np.nan, + np.nan, + ], + dtype=float_numpy_dtype, + ), + }, + ) + gb = df.groupby("a") + + warn = None if skipna else FutureWarning + msg = f"The behavior of DataFrameGroupBy.{how} with all-NA values" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(gb, how)(skipna=skipna) + if skipna: + values = [1, 3, 4, 6, np.nan] + else: + values = np.nan + expected = DataFrame( + {"b": values}, index=pd.Index(range(1, 6), name="a", dtype="intp") + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "func, values", [ @@ -265,6 +330,34 @@ def 
test_groupby_non_arithmetic_agg_int_like_precision(method, data):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize("how", ["first", "last"])
+def test_first_last_skipna(any_real_nullable_dtype, sort, skipna, how):
+    # GH#57019
+    na_value = na_value_for_dtype(pandas_dtype(any_real_nullable_dtype))
+    df = DataFrame(
+        {
+            "a": [2, 1, 1, 2, 3, 3],
+            "b": [na_value, 3.0, na_value, 4.0, np.nan, np.nan],
+            "c": [na_value, 3.0, na_value, 4.0, np.nan, np.nan],
+        },
+        dtype=any_real_nullable_dtype,
+    )
+    gb = df.groupby("a", sort=sort)
+    method = getattr(gb, how)
+    result = method(skipna=skipna)
+
+    ilocs = {
+        ("first", True): [3, 1, 4],
+        ("first", False): [0, 1, 4],
+        ("last", True): [3, 1, 5],
+        ("last", False): [3, 2, 5],
+    }[how, skipna]
+    expected = df.iloc[ilocs].set_index("a")
+    if sort:
+        expected = expected.sort_index()
+    tm.assert_frame_equal(result, expected)
+
+
 def test_idxmin_idxmax_axis1():
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"]
     )
@@ -362,7 +455,7 @@ def test_max_min_non_numeric():
     assert "ss" in result
 
 
-def test_max_min_object_multiple_columns(using_array_manager):
+def test_max_min_object_multiple_columns(using_array_manager, using_infer_string):
     # GH#41111 case where the aggregation is valid for some columns but not
     # others; we split object blocks column-wise, consistent with
     # DataFrame._reduce
     )
     df._consolidate_inplace()  # should already be consolidated, but double-check
     if not using_array_manager:
-        assert len(df._mgr.blocks) == 2
+        assert len(df._mgr.blocks) == (3 if using_infer_string else 2)
 
     gb = df.groupby("A")
 
@@ -606,10 +699,9 @@ def test_groupby_min_max_categorical(func):
 
 
 @pytest.mark.parametrize("func", ["min", "max"])
-def test_min_empty_string_dtype(func):
+def test_min_empty_string_dtype(func, string_dtype_no_object):
     # GH#55619
-    pytest.importorskip("pyarrow")
-    dtype = "string[pyarrow_numpy]"
+    dtype = string_dtype_no_object
     df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0]
     result = getattr(df.groupby("a"), func)()
     expected = DataFrame(
diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py
index d357a65e79796..3bae719e01b73 100644
--- a/pandas/tests/groupby/test_timegrouper.py
+++ b/pandas/tests/groupby/test_timegrouper.py
@@ -10,6 +10,8 @@
 import pytest
 import pytz
 
+from pandas._config import using_string_dtype
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -73,6 +75,9 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
 
 
 class TestGroupBy:
+    # TODO(infer_string) resample sum introduces 0's
+    # https://github.com/pandas-dev/pandas/issues/60229
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_groupby_with_timegrouper(self):
         # GH 4161
         # TimeGrouper requires a sorted index
diff --git a/pandas/tests/groupby/transform/test_numba.py b/pandas/tests/groupby/transform/test_numba.py
index 61fcc930f116a..5afc6f3bdcd3c 100644
--- a/pandas/tests/groupby/transform/test_numba.py
+++ b/pandas/tests/groupby/transform/test_numba.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pytest
 
+from pandas.compat import is_platform_arm
 from pandas.errors import NumbaUtilError
 
 from pandas import (
@@ -9,8 +10,17 @@
     option_context,
 )
 import pandas._testing as tm
+from pandas.util.version import Version
 
-pytestmark = pytest.mark.single_cpu
+pytestmark = [pytest.mark.single_cpu]
+
+numba = pytest.importorskip("numba")
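+# numba must be imported before the version-gated marker appended below: the
+# skip condition reads numba.__version__ at pytest collection time, and
+# importorskip turns a missing numba into a module-level skip rather than a
+# hard ImportError.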
+pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) def test_correct_function_signature(): diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index a2ecd6c65db60..18ce6e93de402 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -497,7 +497,7 @@ def test_transform_select_columns(df): tm.assert_frame_equal(result, expected) -def test_transform_nuisance_raises(df): +def test_transform_nuisance_raises(df, using_infer_string): # case that goes through _transform_item_by_item df.columns = ["A", "B", "B", "D"] @@ -507,10 +507,13 @@ def test_transform_nuisance_raises(df): grouped = df.groupby("A") gbc = grouped["B"] - with pytest.raises(TypeError, match="Could not convert"): + msg = "Could not convert" + if using_infer_string: + msg = "Cannot perform reduction 'mean' with string dtype" + with pytest.raises(TypeError, match=msg): gbc.transform(lambda x: np.mean(x)) - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match=msg): df.groupby("A").transform(lambda x: np.mean(x)) @@ -579,7 +582,7 @@ def test_transform_coercion(): tm.assert_frame_equal(result, expected) -def test_groupby_transform_with_int(): +def test_groupby_transform_with_int(using_infer_string): # GH 3740, make sure that we might upcast on item-by-item transform # floats @@ -609,8 +612,11 @@ def test_groupby_transform_with_int(): "D": "foo", } ) + msg = "Could not convert" + if using_infer_string: + msg = "Cannot perform reduction 'mean' with string dtype" with np.errstate(all="ignore"): - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match=msg): df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) result = df.groupby("A")[["B", "C"]].transform( lambda x: (x - x.mean()) / x.std() @@ -622,7 +628,7 @@ def test_groupby_transform_with_int(): s = Series([2, 3, 4, 10, 5, -1]) df = DataFrame({"A": [1, 1, 1, 2, 2, 2], "B": 1, "C": s, "D": "foo"}) with np.errstate(all="ignore"): - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match=msg): df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) result = df.groupby("A")[["B", "C"]].transform( lambda x: (x - x.mean()) / x.std() @@ -896,6 +902,8 @@ def test_cython_transform_frame_column( "does not support .* operations", ".* is not supported for object dtype", "is not implemented for this dtype", + ".* is not supported for str dtype", + "dtype 'str' does not support operation '.*'", ] ) with pytest.raises(TypeError, match=msg): @@ -1224,14 +1232,14 @@ def test_groupby_transform_dtype(): df = DataFrame({"a": [1], "val": [1.35]}) result = df["val"].transform(lambda x: x.map(lambda y: f"+{y}")) - expected1 = Series(["+1.35"], name="val", dtype="object") + expected1 = Series(["+1.35"], name="val") tm.assert_series_equal(result, expected1) result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+{y}")) tm.assert_series_equal(result, expected1) result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+({y})")) - expected2 = Series(["+(1.35)"], name="val", dtype="object") + expected2 = Series(["+(1.35)"], name="val") tm.assert_series_equal(result, expected2) df["val"] = df["val"].astype(object) diff --git a/pandas/tests/indexes/base_class/test_constructors.py 
b/pandas/tests/indexes/base_class/test_constructors.py index fd5176a28565e..dcf0165ead6c0 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -47,9 +47,7 @@ def test_construct_empty_tuples(self, tuple_list): def test_index_string_inference(self): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" - expected = Index(["a", "b"], dtype=dtype) + expected = Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)) with pd.option_context("future.infer_string", True): ser = Index(["a", "b"]) tm.assert_index_equal(ser, expected) @@ -71,3 +69,10 @@ def test_inference_on_pandas_objects(self): with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): result = Index(ser) assert result.dtype != np.object_ + + def test_constructor_not_read_only(self): + # GH#57130 + ser = Series([1, 2], dtype=object) + with pd.option_context("mode.copy_on_write", True): + idx = Index(ser) + assert idx._values.flags.writeable diff --git a/pandas/tests/indexes/base_class/test_formats.py b/pandas/tests/indexes/base_class/test_formats.py index f30b578cfcf56..955e3be107f75 100644 --- a/pandas/tests/indexes/base_class/test_formats.py +++ b/pandas/tests/indexes/base_class/test_formats.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas._config.config as cf from pandas import Index @@ -16,7 +16,7 @@ def test_repr_is_valid_construction_code(self): res = eval(repr(idx)) tm.assert_index_equal(res, idx) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") + @pytest.mark.xfail(using_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ @@ -81,7 +81,7 @@ def test_string_index_repr(self, index, expected): result = repr(index) assert result == expected - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") + @pytest.mark.xfail(using_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index 814a6a516904b..b1a6c30b52f68 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -4,6 +4,7 @@ import numpy as np import pytest +import pandas as pd from pandas import Index import pandas._testing as tm @@ -35,7 +36,9 @@ def test_insert(self): null_index = Index([]) tm.assert_index_equal(Index(["a"], dtype=object), null_index.insert(0, "a")) - def test_insert_missing(self, nulls_fixture, using_infer_string): + def test_insert_missing(self, request, nulls_fixture, using_infer_string): + if using_infer_string and nulls_fixture is pd.NA: + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) # GH#22295 # test there is no mangling of NA values expected = Index(["a", nulls_fixture, "b", "c"], dtype=object) @@ -56,12 +59,11 @@ def test_insert_datetime_into_object(self, loc, val): tm.assert_index_equal(result, expected) assert type(expected[2]) is type(val) - def test_insert_none_into_string_numpy(self): + def test_insert_none_into_string_numpy(self, string_dtype_no_object): # GH#55365 - pytest.importorskip("pyarrow") - index = Index(["a", "b", "c"], dtype="string[pyarrow_numpy]") + index = Index(["a", "b", "c"], dtype=string_dtype_no_object) result = index.insert(-1, None) - expected = Index(["a", "b", None, "c"], 
dtype="string[pyarrow_numpy]") + expected = Index(["a", "b", None, "c"], dtype=string_dtype_no_object) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index 3ef3f3ad4d3a2..a897e5aca058a 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -240,6 +240,7 @@ def test_tuple_union_bug(self, method, expected, sort): def test_union_name_preservation( self, first_list, second_list, first_name, second_name, expected_name, sort ): + expected_dtype = object if not first_list or not second_list else "str" first = Index(first_list, name=first_name) second = Index(second_list, name=second_name) union = first.union(second, sort=sort) @@ -250,7 +251,7 @@ def test_union_name_preservation( expected = Index(sorted(vals), name=expected_name) tm.assert_index_equal(union, expected) else: - expected = Index(vals, name=expected_name) + expected = Index(vals, name=expected_name, dtype=expected_dtype) tm.assert_index_equal(union.sort_values(), expected.sort_values()) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 03a298a13dc2b..166e628ae4b3e 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import index as libindex from pandas._libs.arrays import NDArrayBacked @@ -196,7 +196,7 @@ def test_unique(self, data, categories, expected_data, ordered): expected = CategoricalIndex(expected_data, dtype=dtype) tm.assert_index_equal(idx.unique(), expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr doesn't roundtrip") + @pytest.mark.xfail(using_string_dtype(), reason="repr doesn't roundtrip") def test_repr_roundtrip(self): ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) str(ci) diff --git a/pandas/tests/indexes/categorical/test_formats.py b/pandas/tests/indexes/categorical/test_formats.py index 522ca1bc2afde..e8489e4ad8161 100644 --- a/pandas/tests/indexes/categorical/test_formats.py +++ b/pandas/tests/indexes/categorical/test_formats.py @@ -3,7 +3,7 @@ """ import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas._config.config as cf from pandas import CategoricalIndex @@ -19,7 +19,7 @@ def test_format_different_scalar_lengths(self): with tm.assert_produces_warning(FutureWarning, match=msg): assert idx.format() == expected - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") + @pytest.mark.xfail(using_string_dtype(), reason="repr different") def test_string_categorical_index_repr(self): # short idx = CategoricalIndex(["a", "bb", "ccc"]) diff --git a/pandas/tests/indexes/categorical/test_setops.py b/pandas/tests/indexes/categorical/test_setops.py new file mode 100644 index 0000000000000..2e87b90efd54c --- /dev/null +++ b/pandas/tests/indexes/categorical/test_setops.py @@ -0,0 +1,18 @@ +import numpy as np +import pytest + +from pandas import ( + CategoricalIndex, + Index, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("na_value", [None, np.nan]) +def test_difference_with_na(na_value): + # GH 57318 + ci = CategoricalIndex(["a", "b", "c", None]) + other = 
Index(["c", na_value]) + result = ci.difference(other) + expected = CategoricalIndex(["a", "b"], categories=["a", "b", "c"]) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimelike_/test_indexing.py b/pandas/tests/indexes/datetimelike_/test_indexing.py index ee7128601256a..7b2c81aaf17de 100644 --- a/pandas/tests/indexes/datetimelike_/test_indexing.py +++ b/pandas/tests/indexes/datetimelike_/test_indexing.py @@ -19,7 +19,7 @@ @pytest.mark.parametrize("ldtype", dtlike_dtypes) @pytest.mark.parametrize("rdtype", dtlike_dtypes) def test_get_indexer_non_unique_wrong_dtype(ldtype, rdtype): - vals = np.tile(3600 * 10**9 * np.arange(3), 2) + vals = np.tile(3600 * 10**9 * np.arange(3, dtype=np.int64), 2) def construct(dtype): if dtype is dtlike_dtypes[-1]: diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py index c0bc6601769b1..a9bcae625e494 100644 --- a/pandas/tests/indexes/datetimes/methods/test_astype.py +++ b/pandas/tests/indexes/datetimes/methods/test_astype.py @@ -102,13 +102,16 @@ def test_astype_tznaive_to_tzaware(self): # dt64->dt64tz deprecated idx._data.astype("datetime64[ns, US/Eastern]") - def test_astype_str_nat(self): + def test_astype_str_nat(self, using_infer_string): # GH 13149, GH 13209 # verify that we are returning NaT as a string (and not unicode) idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan]) result = idx.astype(str) - expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) + if using_infer_string: + expected = Index(["2016-05-16", None, None, None], dtype="str") + else: + expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) tm.assert_index_equal(result, expected) def test_astype_str(self): @@ -118,7 +121,7 @@ def test_astype_str(self): expected = Index( ["2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04"], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -133,7 +136,7 @@ def test_astype_str_tz_and_name(self): "2012-01-03 00:00:00-05:00", ], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -144,7 +147,7 @@ def test_astype_str_freq_and_name(self): expected = Index( ["2011-01-01 00:00:00", "2011-01-01 01:00:00", "2011-01-01 02:00:00"], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -156,7 +159,7 @@ def test_astype_str_freq_and_tz(self): result = dti.astype(str) expected = Index( ["2012-03-06 00:00:00+00:00", "2012-03-06 01:00:00+00:00"], - dtype=object, + dtype="str", name="test_name", ) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py b/pandas/tests/indexes/datetimes/methods/test_to_period.py index 42a3f3b0f7b42..de8d32f64cde2 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -111,23 +111,6 @@ def test_to_period_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): with tm.assert_produces_warning(FutureWarning, match=msg): assert prng.freq == freq_depr - @pytest.mark.parametrize( - "freq, freq_depr", - [ - ("2BQE-SEP", "2BQ-SEP"), - ("2BYE-MAR", "2BY-MAR"), - ], - ) - def test_to_period_frequency_BQ_BY_deprecated(self, freq, freq_depr): - # GH#9586 - msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." 
- - rng = date_range("01-Jan-2012", periods=8, freq=freq) - prng = rng.to_period() - with tm.assert_produces_warning(FutureWarning, match=msg): - prng.freq == freq_depr - def test_to_period_infer(self): # https://github.com/pandas-dev/pandas/issues/33358 rng = date_range( @@ -238,5 +221,5 @@ def test_to_period_offsets_not_supported(self, freq): # GH#56243 msg = f"{freq[1:]} is not supported as period frequency" ts = date_range("1/1/2012", periods=4, freq=freq) - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): ts.to_period() diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 44dd64e162413..d26bee80003e9 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -822,6 +822,17 @@ def test_frequencies_A_deprecated_Y_renamed(self, freq, freq_depr): result = date_range("1/1/2000", periods=2, freq=freq_depr) tm.assert_index_equal(result, expected) + def test_to_offset_with_lowercase_deprecated_freq(self) -> None: + # https://github.com/pandas-dev/pandas/issues/56847 + msg = ( + "'m' is deprecated and will be removed in a future version, please use " + "'ME' instead." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = date_range("2010-01-01", periods=2, freq="m") + expected = DatetimeIndex(["2010-01-31", "2010-02-28"], freq="ME") + tm.assert_index_equal(result, expected) + def test_date_range_bday(self): sdate = datetime(1999, 12, 25) idx = date_range(start=sdate, freq="1B", periods=20) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 5db0aa5cf510f..bac9548b932c1 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -10,8 +10,6 @@ ) import pandas._testing as tm -START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) - class TestDatetimeIndexOps: def test_infer_freq(self, freq_sample): @@ -26,6 +24,7 @@ def test_infer_freq(self, freq_sample): class TestBusinessDatetimeIndex: @pytest.fixture def rng(self, freq): + START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) return bdate_range(START, END, freq=freq) def test_comparison(self, rng): diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 0ebb88afb6c86..8b493fc61cb58 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -236,7 +236,7 @@ def test_partial_slice_second_precision(self): rng = date_range( start=datetime(2005, 1, 1, 0, 0, 59, microsecond=999990), periods=20, - freq="US", + freq="us", ) s = Series(np.arange(20), rng) diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py index 59c555b9644a1..dde5f38074efb 100644 --- a/pandas/tests/indexes/interval/test_astype.py +++ b/pandas/tests/indexes/interval/test_astype.py @@ -186,6 +186,12 @@ def test_subtype_datetimelike(self, index, subtype): with pytest.raises(TypeError, match=msg): index.astype(dtype) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_astype_category(self, index): + super().test_astype_category(index) + class TestDatetimelikeSubtype(AstypeTests): """Tests specific to IntervalIndex with datetime-like subtype""" diff --git a/pandas/tests/indexes/interval/test_constructors.py 
b/pandas/tests/indexes/interval/test_constructors.py index 778c07b46e57c..e47a014f18045 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.common import is_unsigned_integer_dtype from pandas.core.dtypes.dtypes import IntervalDtype @@ -517,3 +519,17 @@ def test_dtype_closed_mismatch(): with pytest.raises(ValueError, match=msg): IntervalArray([], dtype=dtype, closed="neither") + + +@pytest.mark.parametrize( + "dtype", + ["Float64", pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow"))], +) +def test_ea_dtype(dtype): + # GH#56765 + bins = [(0.0, 0.4), (0.4, 0.6)] + interval_dtype = IntervalDtype(subtype=dtype, closed="left") + result = IntervalIndex.from_tuples(bins, closed="left", dtype=interval_dtype) + assert result.dtype == interval_dtype + expected = IntervalIndex.from_tuples(bins, closed="left").astype(interval_dtype) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index 3b8e18463160f..73bbfc91028b3 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - from pandas import ( DataFrame, DatetimeIndex, @@ -42,12 +40,11 @@ def test_repr_missing(self, constructor, expected, using_infer_string, request): result = repr(obj) assert result == expected - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") def test_repr_floats(self): # GH 32553 markers = Series( - ["foo", "bar"], + [1, 2], index=IntervalIndex( [ Interval(left, right) @@ -59,9 +56,12 @@ def test_repr_floats(self): ), ) result = str(markers) - expected = "(329.973, 345.137] foo\n(345.137, 360.191] bar\ndtype: object" + expected = "(329.973, 345.137] 1\n(345.137, 360.191] 2\ndtype: int64" assert result == expected + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) @pytest.mark.parametrize( "tuples, closed, expected_data", [ diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index fd03047b2c127..b5be7e0713cdf 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -341,6 +341,9 @@ def test_get_indexer_categorical(self, target, ordered): expected = index.get_indexer(target) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_get_indexer_categorical_with_nans(self): # GH#41934 nans in both index and in target ii = IntervalIndex.from_breaks(range(5)) diff --git a/pandas/tests/indexes/interval/test_interval_range.py b/pandas/tests/indexes/interval/test_interval_range.py index d4d4a09c44d13..e8de59f84bcc6 100644 --- a/pandas/tests/indexes/interval/test_interval_range.py +++ b/pandas/tests/indexes/interval/test_interval_range.py @@ -84,9 +84,7 @@ def test_constructor_timestamp(self, closed, name, freq, periods, tz): tm.assert_index_equal(result, expected) # GH 20976: linspace behavior defined from start/end/periods - if not breaks.freq.is_anchored() and tz is None: - # matches expected only for non-anchored offsets and tz naive - # (anchored/DST transitions cause unequal spacing in expected) + if not 
breaks.freq.n == 1 and tz is None: result = interval_range( start=start, end=end, periods=periods, name=name, closed=closed ) diff --git a/pandas/tests/indexes/interval/test_interval_tree.py b/pandas/tests/indexes/interval/test_interval_tree.py index 45b25f2533afd..78388e84fc6dc 100644 --- a/pandas/tests/indexes/interval/test_interval_tree.py +++ b/pandas/tests/indexes/interval/test_interval_tree.py @@ -190,7 +190,6 @@ def test_construction_overflow(self): expected = (50 + np.iinfo(np.int64).max) / 2 assert result == expected - @pytest.mark.xfail(not IS64, reason="GH 23440") @pytest.mark.parametrize( "left, right, expected", [ diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index 8456e6a7acba5..b1180f2d7af14 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -851,7 +851,7 @@ def test_dtype_representation(using_infer_string): # GH#46900 pmidx = MultiIndex.from_arrays([[1], ["a"]], names=[("a", "b"), ("c", "d")]) result = pmidx.dtypes - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = Series( ["int64", exp], index=MultiIndex.from_tuples([("a", "b"), ("c", "d")]), diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index 3c2ca045d6f99..d62bd5438a1e3 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + import pandas as pd from pandas import ( DataFrame, @@ -15,6 +17,41 @@ def test_to_numpy(idx): tm.assert_numpy_array_equal(result, exp) +def test_array_interface(idx): + # https://github.com/pandas-dev/pandas/pull/60046 + result = np.asarray(idx) + expected = np.empty((6,), dtype=object) + expected[:] = [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ] + tm.assert_numpy_array_equal(result, expected) + + # it always gives a copy by default, but the values are cached, so results + # are still sharing memory + result_copy1 = np.asarray(idx) + result_copy2 = np.asarray(idx) + assert np.may_share_memory(result_copy1, result_copy2) + + # with explicit copy=True, then it is an actual copy + result_copy1 = np.array(idx, copy=True) + result_copy2 = np.array(idx, copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. 
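+        # (older NumPy does not forward the copy keyword to __array__ and
+        # treats copy=False as best-effort, so there is nothing stricter
+        # to assert on those versions)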
+ return + + # for MultiIndex, copy=False is never allowed + msg = "Starting with NumPy 2.0, the behavior of the 'copy' keyword has changed" + with tm.assert_produces_warning(FutureWarning, match=msg): + np.array(idx, copy=False) + + def test_to_frame(): tuples = [(1, "one"), (1, "two"), (2, "one"), (2, "two")] diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index 6eeaeb6711d03..17ca876487330 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -41,7 +41,7 @@ def test_get_dtypes(using_infer_string): names=["int", "string", "dt"], ) - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = pd.Series( { "int": np.dtype("int64"), @@ -61,7 +61,7 @@ def test_get_dtypes_no_level_name(using_infer_string): pd.date_range("20200101", periods=2, tz="UTC"), ], ) - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = pd.Series( { "level_0": np.dtype("int64"), @@ -82,7 +82,7 @@ def test_get_dtypes_duplicate_level_names(using_infer_string): ], names=["A", "A", "A"], ).dtypes - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = pd.Series( [np.dtype("int64"), exp, DatetimeTZDtype(tz="utc")], index=["A", "A", "A"], diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 0abb56ecf9de7..801a813955b41 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -763,7 +763,7 @@ def test_union_with_na_when_constructing_dataframe(): series1 = Series( (1,), index=MultiIndex.from_arrays( - [Series([None], dtype="string"), Series([None], dtype="string")] + [Series([None], dtype="str"), Series([None], dtype="str")] ), ) series2 = Series((10, 20), index=MultiIndex.from_tuples(((None, None), ("a", "b")))) diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py index 9c1ef302c5b51..7e0de138aacfb 100644 --- a/pandas/tests/indexes/object/test_astype.py +++ b/pandas/tests/indexes/object/test_astype.py @@ -3,25 +3,7 @@ from pandas import ( Index, NaT, - Series, ) -import pandas._testing as tm - - -def test_astype_str_from_bytes(): - # https://github.com/pandas-dev/pandas/issues/38607 - # GH#49658 pre-2.0 Index called .values.astype(str) here, which effectively - # did a .decode() on the bytes object. 
In 2.0 we go through - # ensure_string_array which does f"{val}" - idx = Index(["あ", b"a"], dtype="object") - result = idx.astype(str) - expected = Index(["あ", "a"], dtype="object") - tm.assert_index_equal(result, expected) - - # while we're here, check that Series.astype behaves the same - result = Series(idx).astype(str) - expected = Series(expected, dtype=object) - tm.assert_series_equal(result, expected) def test_astype_invalid_nas_to_tdt64_raises(): diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index ebf9dac715f8d..42ef7e7a96f5e 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -3,13 +3,8 @@ import numpy as np import pytest -from pandas._libs.missing import ( - NA, - is_matching_na, -) -import pandas.util._test_decorators as td +from pandas._libs.missing import is_matching_na -import pandas as pd from pandas import Index import pandas._testing as tm @@ -23,41 +18,31 @@ class TestGetIndexer: ], ) def test_get_indexer_strings(self, method, expected): - index = Index(["b", "c"]) + expected = np.array(expected, dtype=np.intp) + index = Index(["b", "c"], dtype=object) actual = index.get_indexer(["a", "b", "c", "d"], method=method) tm.assert_numpy_array_equal(actual, expected) - def test_get_indexer_strings_raises(self, using_infer_string): - index = Index(["b", "c"]) + def test_get_indexer_strings_raises(self): + index = Index(["b", "c"], dtype=object) - if using_infer_string: - import pyarrow as pa - - msg = "has no kernel" - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - index.get_indexer(["a", "b", "c", "d"], method="nearest") - - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) - - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - index.get_indexer( - ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] - ) - - else: - msg = r"unsupported operand type\(s\) for -: 'str' and 'str'" - with pytest.raises(TypeError, match=msg): - index.get_indexer(["a", "b", "c", "d"], method="nearest") + msg = "|".join( + [ + "operation 'sub' not supported for dtype 'str'", + r"unsupported operand type\(s\) for -: 'str' and 'str'", + ] + ) + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="nearest") - with pytest.raises(TypeError, match=msg): - index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) - with pytest.raises(TypeError, match=msg): - index.get_indexer( - ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] - ) + with pytest.raises(TypeError, match=msg): + index.get_indexer( + ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] + ) def test_get_indexer_with_NA_values( self, unique_nulls_fixture, unique_nulls_fixture2 @@ -77,15 +62,20 @@ def test_get_indexer_with_NA_values( expected = np.array([0, 1, -1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_infer_string_missing_values(self): + # ensure the passed list is not cast to string but to object so that + # the None value is matched in the index + # https://github.com/pandas-dev/pandas/issues/55834 + idx = Index(["a", "b", None], dtype="object") + result = idx.get_indexer([None, "x"]) + expected = np.array([2, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + class 
TestGetIndexerNonUnique: - def test_get_indexer_non_unique_nas( - self, nulls_fixture, request, using_infer_string - ): + def test_get_indexer_non_unique_nas(self, nulls_fixture): # even though this isn't non-unique, this should still work - if using_infer_string and (nulls_fixture is None or nulls_fixture is NA): - request.applymarker(pytest.mark.xfail(reason="NAs are cast to NaN")) - index = Index(["a", "b", nulls_fixture]) + index = Index(["a", "b", nulls_fixture], dtype=object) indexer, missing = index.get_indexer_non_unique([nulls_fixture]) expected_indexer = np.array([2], dtype=np.intp) @@ -94,7 +84,7 @@ def test_get_indexer_non_unique_nas( tm.assert_numpy_array_equal(missing, expected_missing) # actually non-unique - index = Index(["a", nulls_fixture, "b", nulls_fixture]) + index = Index(["a", nulls_fixture, "b", nulls_fixture], dtype=object) indexer, missing = index.get_indexer_non_unique([nulls_fixture]) expected_indexer = np.array([1, 3], dtype=np.intp) @@ -103,10 +93,10 @@ def test_get_indexer_non_unique_nas( # matching-but-not-identical nans if is_matching_na(nulls_fixture, float("NaN")): - index = Index(["a", float("NaN"), "b", float("NaN")]) + index = Index(["a", float("NaN"), "b", float("NaN")], dtype=object) match_but_not_identical = True elif is_matching_na(nulls_fixture, Decimal("NaN")): - index = Index(["a", Decimal("NaN"), "b", Decimal("NaN")]) + index = Index(["a", Decimal("NaN"), "b", Decimal("NaN")], dtype=object) match_but_not_identical = True else: match_but_not_identical = False @@ -167,67 +157,3 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): expected_indexer = np.array([1, 3], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) - - -class TestSliceLocs: - @pytest.mark.parametrize( - "dtype", - [ - "object", - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - ], - ) - @pytest.mark.parametrize( - "in_slice,expected", - [ - # error: Slice index must be an integer or None - (pd.IndexSlice[::-1], "yxdcb"), - (pd.IndexSlice["b":"y":-1], ""), # type: ignore[misc] - (pd.IndexSlice["b"::-1], "b"), # type: ignore[misc] - (pd.IndexSlice[:"b":-1], "yxdcb"), # type: ignore[misc] - (pd.IndexSlice[:"y":-1], "y"), # type: ignore[misc] - (pd.IndexSlice["y"::-1], "yxdcb"), # type: ignore[misc] - (pd.IndexSlice["y"::-4], "yb"), # type: ignore[misc] - # absent labels - (pd.IndexSlice[:"a":-1], "yxdcb"), # type: ignore[misc] - (pd.IndexSlice[:"a":-2], "ydb"), # type: ignore[misc] - (pd.IndexSlice["z"::-1], "yxdcb"), # type: ignore[misc] - (pd.IndexSlice["z"::-3], "yc"), # type: ignore[misc] - (pd.IndexSlice["m"::-1], "dcb"), # type: ignore[misc] - (pd.IndexSlice[:"m":-1], "yx"), # type: ignore[misc] - (pd.IndexSlice["a":"a":-1], ""), # type: ignore[misc] - (pd.IndexSlice["z":"z":-1], ""), # type: ignore[misc] - (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] - ], - ) - def test_slice_locs_negative_step(self, in_slice, expected, dtype): - index = Index(list("bcdxy"), dtype=dtype) - - s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) - result = index[s_start : s_stop : in_slice.step] - expected = Index(list(expected), dtype=dtype) - tm.assert_index_equal(result, expected) - - @td.skip_if_no("pyarrow") - def test_slice_locs_negative_step_oob(self): - index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]") - - result = index[-10:5:1] - tm.assert_index_equal(result, index) - - result = index[4:-10:-1] - expected = 
Index(list("yxdcb"), dtype="string[pyarrow_numpy]") - tm.assert_index_equal(result, expected) - - def test_slice_locs_dup(self): - index = Index(["a", "a", "b", "c", "d", "d"]) - assert index.slice_locs("a", "d") == (0, 6) - assert index.slice_locs(end="d") == (0, 6) - assert index.slice_locs("a", "c") == (0, 4) - assert index.slice_locs("b", "d") == (2, 6) - - index2 = index[::-1] - assert index2.slice_locs("d", "a") == (0, 6) - assert index2.slice_locs(end="a") == (0, 6) - assert index2.slice_locs("d", "b") == (0, 4) - assert index2.slice_locs("c", "a") == (2, 6) diff --git a/pandas/tests/indexes/period/methods/test_asfreq.py b/pandas/tests/indexes/period/methods/test_asfreq.py index ed078a3e8fb8b..865bae69d91c7 100644 --- a/pandas/tests/indexes/period/methods/test_asfreq.py +++ b/pandas/tests/indexes/period/methods/test_asfreq.py @@ -1,3 +1,5 @@ +import re + import pytest from pandas import ( @@ -7,6 +9,8 @@ ) import pandas._testing as tm +from pandas.tseries import offsets + class TestPeriodIndex: def test_asfreq(self): @@ -136,3 +140,50 @@ def test_asfreq_with_different_n(self): excepted = Series([1, 2], index=PeriodIndex(["2020-02", "2020-04"], freq="M")) tm.assert_series_equal(result, excepted) + + @pytest.mark.parametrize( + "freq", + [ + "2BMS", + "2YS-MAR", + "2bh", + ], + ) + def test_pi_asfreq_not_supported_frequency(self, freq): + # GH#55785 + msg = f"{freq[1:]} is not supported as period frequency" + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) + + @pytest.mark.parametrize( + "freq", + [ + "2BME", + "2YE-MAR", + "2QE", + ], + ) + def test_pi_asfreq_invalid_frequency(self, freq): + # GH#55785 + msg = f"Invalid frequency: {freq}" + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) + + @pytest.mark.parametrize( + "freq", + [ + offsets.MonthBegin(2), + offsets.BusinessMonthEnd(2), + ], + ) + def test_pi_asfreq_invalid_baseoffset(self, freq): + # GH#56945 + msg = re.escape(f"{freq} is not supported as period frequency") + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) diff --git a/pandas/tests/indexes/period/methods/test_astype.py b/pandas/tests/indexes/period/methods/test_astype.py index d545bfd2fae0f..af3c2667f51b4 100644 --- a/pandas/tests/indexes/period/methods/test_astype.py +++ b/pandas/tests/indexes/period/methods/test_astype.py @@ -22,7 +22,7 @@ def test_astype_raises(self, dtype): with pytest.raises(TypeError, match=msg): idx.astype(dtype) - def test_astype_conversion(self): + def test_astype_conversion(self, using_infer_string): # GH#13149, GH#13209 idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq="D", name="idx") @@ -41,7 +41,12 @@ def test_astype_conversion(self): tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in idx], name="idx", dtype=object) + if using_infer_string: + expected = Index( + [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str" + ) + else: + expected = Index([str(x) for x in idx], name="idx", dtype=object) tm.assert_index_equal(result, expected) idx = period_range("1990", "2009", freq="Y", name="idx") diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index 387dc47c48d20..892eb7b4a00d1 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ 
b/pandas/tests/indexes/period/test_constructors.py @@ -26,9 +26,12 @@ class TestPeriodIndexDisallowedFreqs: ("2M", "2ME"), ("2Q-MAR", "2QE-MAR"), ("2Y-FEB", "2YE-FEB"), + ("2M", "2me"), + ("2Q-MAR", "2qe-MAR"), + ("2Y-FEB", "2yE-feb"), ], ) - def test_period_index_frequency_ME_error_message(self, freq, freq_depr): + def test_period_index_offsets_frequency_error_message(self, freq, freq_depr): # GH#52064 msg = f"for Period, please use '{freq[1:]}' instead of '{freq_depr[1:]}'" @@ -38,7 +41,7 @@ def test_period_index_frequency_ME_error_message(self, freq, freq_depr): with pytest.raises(ValueError, match=msg): period_range(start="2020-01-01", end="2020-01-02", freq=freq_depr) - @pytest.mark.parametrize("freq_depr", ["2SME", "2CBME", "2BYE"]) + @pytest.mark.parametrize("freq_depr", ["2SME", "2sme", "2CBME", "2BYE", "2Bye"]) def test_period_index_frequency_invalid_freq(self, freq_depr): # GH#9586 msg = f"Invalid frequency: {freq_depr[1:]}" @@ -48,6 +51,15 @@ def test_period_index_frequency_invalid_freq(self, freq_depr): with pytest.raises(ValueError, match=msg): PeriodIndex(["2020-01", "2020-05"], freq=freq_depr) + @pytest.mark.parametrize("freq", ["2BQE-SEP", "2BYE-MAR", "2BME"]) + def test_period_index_from_datetime_index_invalid_freq(self, freq): + # GH#56899 + msg = f"Invalid frequency: {freq[1:]}" + + rng = date_range("01-Jan-2012", periods=8, freq=freq) + with pytest.raises(ValueError, match=msg): + rng.to_period() + class TestPeriodIndex: def test_from_ordinals(self): @@ -538,7 +550,9 @@ def test_period_range_length(self): assert i1.freq == end_intv.freq assert i1[-1] == end_intv - end_intv = Period("2006-12-31", "1w") + msg = "'w' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + end_intv = Period("2006-12-31", "1w") i2 = period_range(end=end_intv, periods=10) assert len(i1) == len(i2) assert (i1 == i2).all() @@ -567,7 +581,9 @@ def test_mixed_freq_raises(self): with tm.assert_produces_warning(FutureWarning, match=msg): end_intv = Period("2005-05-01", "B") - vals = [end_intv, Period("2006-12-31", "w")] + msg = "'w' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + vals = [end_intv, Period("2006-12-31", "w")] msg = r"Input has different freq=W-SUN from PeriodIndex\(freq=B\)" depr_msg = r"PeriodDtype\[B\] is deprecated" with pytest.raises(IncompatibleFrequency, match=msg): diff --git a/pandas/tests/indexes/period/test_period_range.py b/pandas/tests/indexes/period/test_period_range.py index 2543b49089948..6f8e6d07da8bf 100644 --- a/pandas/tests/indexes/period/test_period_range.py +++ b/pandas/tests/indexes/period/test_period_range.py @@ -181,7 +181,9 @@ def test_construction_from_period(self): def test_mismatched_start_end_freq_raises(self): depr_msg = "Period with BDay freq is deprecated" - end_w = Period("2006-12-31", "1w") + msg = "'w' is deprecated and will be removed in a future version." 
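+        # (the lowercase-"w" deprecation fires while constructing the Period
+        # object itself, separately from the mismatched start/end frequency
+        # error this test goes on to assert)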
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            end_w = Period("2006-12-31", "1w")
 
         with tm.assert_produces_warning(FutureWarning, match=depr_msg):
             start_b = Period("02-Apr-2005", "B")
@@ -203,19 +205,37 @@ def test_constructor_U(self):
         with pytest.raises(ValueError, match="Invalid frequency: X"):
             period_range("2007-1-1", periods=500, freq="X")
 
-    def test_H_deprecated_from_time_series(self):
+    @pytest.mark.parametrize(
+        "freq,freq_depr",
+        [
+            ("2Y", "2A"),
+            ("2Y", "2a"),
+            ("2Y-AUG", "2A-AUG"),
+            ("2Y-AUG", "2A-aug"),
+        ],
+    )
+    def test_a_deprecated_from_time_series(self, freq, freq_depr):
         # GH#52536
-        msg = "'H' is deprecated and will be removed in a future version."
+        msg = (
+            f"'{freq_depr[1:]}' is deprecated and will be removed in a "
+            f"future version. Please use '{freq[1:]}' instead."
+        )
+
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            period_range(freq=freq_depr, start="1/1/2001", end="12/1/2009")
+
+    @pytest.mark.parametrize("freq_depr", ["2H", "2MIN", "2S", "2US", "2NS"])
+    def test_uppercase_freq_deprecated_from_time_series(self, freq_depr):
+        # GH#52536, GH#54939
+        msg = (
+            f"'{freq_depr[1:]}' is deprecated and will be removed in a "
+            f"future version. Please use '{freq_depr.lower()[1:]}' instead."
+        )
+
         with tm.assert_produces_warning(FutureWarning, match=msg):
-            period_range(freq="2H", start="1/1/2001", end="12/1/2009")
+            period_range("2020-01-01 00:00:00 00:00", periods=2, freq=freq_depr)
+
+    @pytest.mark.parametrize("freq_depr", ["2m", "2q-sep", "2y", "2w"])
+    def test_lowercase_freq_deprecated_from_time_series(self, freq_depr):
+        # GH#52536, GH#54939
+        msg = (
+            f"'{freq_depr[1:]}' is deprecated and will be removed in a "
+            f"future version. Please use '{freq_depr.upper()[1:]}' instead."
+        )
 
-    @pytest.mark.parametrize("freq_depr", ["2A", "A-DEC", "200A-AUG"])
-    def test_a_deprecated_from_time_series(self, freq_depr):
-        # GH#52536
-        freq_msg = freq_depr[freq_depr.index("A") :]
-        msg = (
-            f"'{freq_msg}' is deprecated and will be removed in a future version, "
-            f"please use 'Y{freq_msg[1:]}' instead."
-        )
         with tm.assert_produces_warning(FutureWarning, match=msg):
             period_range(freq=freq_depr, start="1/1/2001", end="12/1/2009")
diff --git a/pandas/tests/indexes/string/__init__.py b/pandas/tests/indexes/string/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/pandas/tests/indexes/string/test_astype.py b/pandas/tests/indexes/string/test_astype.py
new file mode 100644
index 0000000000000..0349d85f23167
--- /dev/null
+++ b/pandas/tests/indexes/string/test_astype.py
@@ -0,0 +1,21 @@
+from pandas import (
+    Index,
+    Series,
+)
+import pandas._testing as tm
+
+
+def test_astype_str_from_bytes():
+    # https://github.com/pandas-dev/pandas/issues/38607
+    # GH#49658 pre-2.0 Index called .values.astype(str) here, which effectively
+    # did a .decode() on the bytes object.
In 2.0 we go through + # ensure_string_array which does f"{val}" + idx = Index(["あ", b"a"], dtype="object") + result = idx.astype(str) + expected = Index(["あ", "a"], dtype="str") + tm.assert_index_equal(result, expected) + + # while we're here, check that Series.astype behaves the same + result = Series(idx).astype(str) + expected = Series(expected, dtype="str") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/string/test_indexing.py b/pandas/tests/indexes/string/test_indexing.py new file mode 100644 index 0000000000000..648ee47ddc34c --- /dev/null +++ b/pandas/tests/indexes/string/test_indexing.py @@ -0,0 +1,199 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import Index +import pandas._testing as tm + + +def _isnan(val): + try: + return val is not pd.NA and np.isnan(val) + except TypeError: + return False + + +def _equivalent_na(dtype, null): + if dtype.na_value is pd.NA and null is pd.NA: + return True + elif _isnan(dtype.na_value) and _isnan(null): + return True + else: + return False + + +class TestGetLoc: + def test_get_loc(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + assert index.get_loc("b") == 1 + + def test_get_loc_raises(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError, match="d"): + index.get_loc("d") + + def test_get_loc_invalid_value(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError, match="1"): + index.get_loc(1) + + def test_get_loc_non_unique(self, any_string_dtype): + index = Index(["a", "b", "a"], dtype=any_string_dtype) + result = index.get_loc("a") + expected = np.array([True, False, True]) + tm.assert_numpy_array_equal(result, expected) + + def test_get_loc_non_missing(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError): + index.get_loc(nulls_fixture) + + def test_get_loc_missing(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", nulls_fixture], dtype=any_string_dtype) + assert index.get_loc(nulls_fixture) == 2 + + +class TestGetIndexer: + @pytest.mark.parametrize( + "method,expected", + [ + ("pad", [-1, 0, 1, 1]), + ("backfill", [0, 0, 1, -1]), + ], + ) + def test_get_indexer_strings(self, any_string_dtype, method, expected): + expected = np.array(expected, dtype=np.intp) + index = Index(["b", "c"], dtype=any_string_dtype) + actual = index.get_indexer(["a", "b", "c", "d"], method=method) + + tm.assert_numpy_array_equal(actual, expected) + + def test_get_indexer_strings_raises(self, any_string_dtype): + index = Index(["b", "c"], dtype=any_string_dtype) + + msg = "|".join( + [ + "operation 'sub' not supported for dtype 'str", + r"unsupported operand type\(s\) for -: 'str' and 'str'", + ] + ) + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="nearest") + + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) + + with pytest.raises(TypeError, match=msg): + index.get_indexer( + ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] + ) + + @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) + def test_get_indexer_missing(self, any_string_dtype, null, using_infer_string): + # NaT and Decimal("NaN") from null_fixture are not supported for string dtype + index = Index(["a", "b", null], dtype=any_string_dtype) + result = index.get_indexer(["a", null, 
"c"]) + if using_infer_string: + expected = np.array([0, 2, -1], dtype=np.intp) + elif any_string_dtype == "string" and not _equivalent_na( + any_string_dtype, null + ): + expected = np.array([0, -1, -1], dtype=np.intp) + else: + expected = np.array([0, 2, -1], dtype=np.intp) + + tm.assert_numpy_array_equal(result, expected) + + +class TestGetIndexerNonUnique: + @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) + def test_get_indexer_non_unique_nas( + self, any_string_dtype, null, using_infer_string + ): + index = Index(["a", "b", null], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique(["a", null]) + + if using_infer_string: + expected_indexer = np.array([0, 2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) + elif any_string_dtype == "string" and not _equivalent_na( + any_string_dtype, null + ): + expected_indexer = np.array([0, -1], dtype=np.intp) + expected_missing = np.array([1], dtype=np.intp) + else: + expected_indexer = np.array([0, 2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected_indexer) + tm.assert_numpy_array_equal(missing, expected_missing) + + # actually non-unique + index = Index(["a", null, "b", null], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique(["a", null]) + + if using_infer_string: + expected_indexer = np.array([0, 1, 3], dtype=np.intp) + elif any_string_dtype == "string" and not _equivalent_na( + any_string_dtype, null + ): + pass + else: + expected_indexer = np.array([0, 1, 3], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected_indexer) + tm.assert_numpy_array_equal(missing, expected_missing) + + +class TestSliceLocs: + @pytest.mark.parametrize( + "in_slice,expected", + [ + # error: Slice index must be an integer or None + (pd.IndexSlice[::-1], "yxdcb"), + (pd.IndexSlice["b":"y":-1], ""), # type: ignore[misc] + (pd.IndexSlice["b"::-1], "b"), # type: ignore[misc] + (pd.IndexSlice[:"b":-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice[:"y":-1], "y"), # type: ignore[misc] + (pd.IndexSlice["y"::-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice["y"::-4], "yb"), # type: ignore[misc] + # absent labels + (pd.IndexSlice[:"a":-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice[:"a":-2], "ydb"), # type: ignore[misc] + (pd.IndexSlice["z"::-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice["z"::-3], "yc"), # type: ignore[misc] + (pd.IndexSlice["m"::-1], "dcb"), # type: ignore[misc] + (pd.IndexSlice[:"m":-1], "yx"), # type: ignore[misc] + (pd.IndexSlice["a":"a":-1], ""), # type: ignore[misc] + (pd.IndexSlice["z":"z":-1], ""), # type: ignore[misc] + (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] + ], + ) + def test_slice_locs_negative_step(self, in_slice, expected, any_string_dtype): + index = Index(list("bcdxy"), dtype=any_string_dtype) + + s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) + result = index[s_start : s_stop : in_slice.step] + expected = Index(list(expected), dtype=any_string_dtype) + tm.assert_index_equal(result, expected) + + def test_slice_locs_negative_step_oob(self, any_string_dtype): + index = Index(list("bcdxy"), dtype=any_string_dtype) + + result = index[-10:5:1] + tm.assert_index_equal(result, index) + + result = index[4:-10:-1] + expected = Index(list("yxdcb"), dtype=any_string_dtype) + tm.assert_index_equal(result, expected) + + def test_slice_locs_dup(self, any_string_dtype): + index = Index(["a", "a", "b", "c", "d", "d"], 
dtype=any_string_dtype) + assert index.slice_locs("a", "d") == (0, 6) + assert index.slice_locs(end="d") == (0, 6) + assert index.slice_locs("a", "c") == (0, 4) + assert index.slice_locs("b", "d") == (2, 6) + + index2 = index[::-1] + assert index2.slice_locs("d", "a") == (0, 6) + assert index2.slice_locs(end="a") == (0, 6) + assert index2.slice_locs("d", "b") == (0, 4) + assert index2.slice_locs("c", "a") == (2, 6) diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py index 10204cfb78e89..8edeaf9c16083 100644 --- a/pandas/tests/indexes/test_any_index.py +++ b/pandas/tests/indexes/test_any_index.py @@ -45,7 +45,7 @@ def test_map_identity_mapping(index, request): # GH#12766 result = index.map(lambda x: x) - if index.dtype == object and result.dtype == bool: + if index.dtype == object and result.dtype in [bool, "string"]: assert (index == result).all() # TODO: could work that into the 'exact="equiv"'? return # FIXME: doesn't belong in this file anymore! diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 666d92064c86c..a94e4728a9751 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -71,15 +71,15 @@ def test_constructor_casting(self, index): tm.assert_contains_all(arr, new_index) tm.assert_index_equal(index, new_index) - @pytest.mark.parametrize("index", ["string"], indirect=True) - def test_constructor_copy(self, index, using_infer_string): + def test_constructor_copy(self, using_infer_string): + index = Index(list("abc"), name="name") arr = np.array(index) new_index = Index(arr, copy=True, name="name") assert isinstance(new_index, Index) assert new_index.name == "name" if using_infer_string: tm.assert_extension_array_equal( - new_index.values, pd.array(arr, dtype="string[pyarrow_numpy]") + new_index.values, pd.array(arr, dtype="str") ) else: tm.assert_numpy_array_equal(arr, new_index.values) @@ -160,7 +160,7 @@ def test_constructor_from_frame_series_freq(self, using_infer_string): df = DataFrame(np.random.default_rng(2).random((5, 3))) df["date"] = dts result = DatetimeIndex(df["date"], freq="MS") - dtype = object if not using_infer_string else "string" + dtype = object if not using_infer_string else "str" assert df["date"].dtype == dtype expected.name = "date" tm.assert_index_equal(result, expected) @@ -354,11 +354,12 @@ def test_view_with_args_object_array_raises(self, index): msg = "When changing to a larger dtype" with pytest.raises(ValueError, match=msg): index.view("i8") - elif index.dtype == "string": - with pytest.raises(NotImplementedError, match="i8"): - index.view("i8") else: - msg = "Cannot change data-type for object array" + msg = ( + r"Cannot change data-type for array of references\.|" + r"Cannot change data-type for object array\.|" + r"Cannot change data-type for array of strings\.|" + ) with pytest.raises(TypeError, match=msg): index.view("i8") @@ -957,10 +958,9 @@ def test_isin_empty(self, empty): result = index.isin(empty) tm.assert_numpy_array_equal(expected, result) - @td.skip_if_no("pyarrow") - def test_isin_arrow_string_null(self): + def test_isin_string_null(self, string_dtype_no_object): # GH#55821 - index = Index(["a", "b"], dtype="string[pyarrow_numpy]") + index = Index(["a", "b"], dtype=string_dtype_no_object) result = index.isin([None]) expected = np.array([False, False]) tm.assert_numpy_array_equal(result, expected) @@ -1722,3 +1722,13 @@ def test_nan_comparison_same_object(op): result = op(idx, idx.copy()) tm.assert_numpy_array_equal(result, 
expected) + + +@td.skip_if_no("pyarrow") +def test_is_monotonic_pyarrow_list_type(): + # GH 57333 + import pyarrow as pa + + idx = Index([[1], [2, 3]], dtype=pd.ArrowDtype(pa.list_(pa.int64()))) + assert not idx.is_monotonic_increasing + assert not idx.is_monotonic_decreasing diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 412a59d15307d..c08fcdaedbefe 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -147,6 +147,7 @@ def test_copy_and_deepcopy(self, index_flat): new_copy = index.copy(deep=True, name="banana") assert new_copy.name == "banana" + @pytest.mark.filterwarnings(r"ignore:Dtype inference:FutureWarning") def test_copy_name(self, index_flat): # GH#12309: Check that the "name" argument # passed at initialization is honored. @@ -452,6 +453,7 @@ def test_sort_values_invalid_na_position(index_with_missing, na_position): index_with_missing.sort_values(na_position=na_position) +@pytest.mark.fails_arm_wheels @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") @pytest.mark.parametrize("na_position", ["first", "last"]) def test_sort_values_with_missing(index_with_missing, na_position, request): @@ -500,3 +502,12 @@ def test_ndarray_compat_properties(index): # test for validity idx.nbytes idx.values.nbytes + + +def test_compare_read_only_array(): + # GH#57130 + arr = np.array([], dtype=object) + arr.flags.writeable = False + idx = pd.Index(arr) + result = idx > 69 + assert result.dtype == bool diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index 72641077c90fe..6042e5b9cc679 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -413,7 +413,7 @@ class ArrayLike: def __init__(self, array) -> None: self.array = array - def __array__(self, dtype=None) -> np.ndarray: + def __array__(self, dtype=None, copy=None) -> np.ndarray: return self.array expected = Index(array) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 1787379b0faee..2f6bdb1fd8969 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - from pandas._libs.tslibs import Timestamp from pandas.core.dtypes.common import ( @@ -27,6 +25,7 @@ PeriodIndex, RangeIndex, Series, + StringDtype, TimedeltaIndex, isna, period_range, @@ -261,7 +260,7 @@ def test_ensure_copied_data(self, index): "RangeIndex cannot be initialized from data, " "MultiIndex and CategoricalIndex are tested separately" ) - elif index.dtype == object and index.inferred_type == "boolean": + elif index.dtype == object and index.inferred_type in ["boolean", "string"]: init_kwargs["dtype"] = index.dtype index_type = type(index) @@ -295,12 +294,17 @@ def test_ensure_copied_data(self, index): tm.assert_numpy_array_equal( index._values._mask, result._values._mask, check_same="same" ) - elif index.dtype == "string[python]": + elif ( + isinstance(index.dtype, StringDtype) and index.dtype.storage == "python" + ): assert np.shares_memory(index._values._ndarray, result._values._ndarray) tm.assert_numpy_array_equal( index._values._ndarray, result._values._ndarray, check_same="same" ) - elif index.dtype in ("string[pyarrow]", "string[pyarrow_numpy]"): + elif ( + isinstance(index.dtype, StringDtype) + and index.dtype.storage == "pyarrow" + ): assert tm.shares_memory(result._values, 
index._values) + else: + raise NotImplementedError(index.dtype) @@ -425,11 +429,7 @@ def test_insert_base(self, index): result = trimmed.insert(0, index[0]) assert index[0:4].equals(result) - @pytest.mark.skipif( - using_pyarrow_string_dtype(), - reason="completely different behavior, tested elsewhere", - ) - def test_insert_out_of_bounds(self, index): + def test_insert_out_of_bounds(self, index, using_infer_string): # TypeError/IndexError matches what np.insert raises in these cases if len(index) > 0: @@ -441,6 +441,12 @@ def test_insert_out_of_bounds(self, index): msg = "index (0|0.5) is out of bounds for axis 0 with size 0" else: msg = "slice indices must be integers or None or have an __index__ method" + + if using_infer_string and ( + index.dtype == "string" or index.dtype == "category" # noqa: PLR1714 + ): + msg = "loc must be an integer between" + with pytest.raises(err, match=msg): index.insert(0.5, "foo") @@ -479,6 +485,7 @@ def test_delete_base(self, index): with pytest.raises(IndexError, match=msg): index.delete(length) + @pytest.mark.filterwarnings(r"ignore:Dtype inference:FutureWarning") @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_equals(self, index): if isinstance(index, IntervalIndex): @@ -859,21 +866,14 @@ def test_inv(self, simple_index, using_infer_string): tm.assert_series_equal(res2, Series(expected)) else: if idx.dtype.kind == "f": - err = TypeError msg = "ufunc 'invert' not supported for the input types" - elif using_infer_string and idx.dtype == "string": - import pyarrow as pa - - err = pa.lib.ArrowNotImplementedError - msg = "has no kernel" else: - err = TypeError - msg = "bad operand" - with pytest.raises(err, match=msg): + msg = "bad operand|__invert__ is not supported for string dtype" + with pytest.raises(TypeError, match=msg): ~idx # check that we get the same behavior with Series - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): ~Series(idx) def test_is_boolean_is_deprecated(self, simple_index): diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 4a6982cf98670..f6a865ccbb3a0 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -240,9 +240,6 @@ def test_intersection_base(self, index): with pytest.raises(TypeError, match=msg): first.intersection([1, 2, 3]) - @pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" - ) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_union_base(self, index): index = index.unique() @@ -270,9 +267,6 @@ def test_union_base(self, index): first.union([1, 2, 3]) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") - @pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" - ) def test_difference_base(self, sort, index): first = index[2:] second = index[:4] @@ -299,10 +293,13 @@ def test_difference_base(self, sort, index): first.difference([1, 2, 3], sort) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") - @pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" - ) - def test_symmetric_difference(self, index): + def test_symmetric_difference(self, index, using_infer_string, request): + if ( + using_infer_string + and index.dtype == "object" + and index.inferred_type == "string" + ): + request.applymarker(pytest.mark.xfail(reason="TODO: 
infer_string")) if isinstance(index, CategoricalIndex): pytest.skip(f"Not relevant for {type(index).__name__}") if len(index) < 2: @@ -522,10 +519,8 @@ def test_intersection_difference_match_empty(self, index, sort): tm.assert_index_equal(inter, diff, exact=True) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") -@pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" -) @pytest.mark.parametrize( "method", ["intersection", "union", "difference", "symmetric_difference"] ) diff --git a/pandas/tests/indexes/timedeltas/methods/test_astype.py b/pandas/tests/indexes/timedeltas/methods/test_astype.py index 311f2b5c9aa59..5166cadae499e 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_astype.py +++ b/pandas/tests/indexes/timedeltas/methods/test_astype.py @@ -44,7 +44,7 @@ def test_astype_object_with_nat(self): tm.assert_index_equal(result, expected) assert idx.tolist() == expected_list - def test_astype(self): + def test_astype(self, using_infer_string): # GH 13149, GH 13209 idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan], name="idx") @@ -61,7 +61,12 @@ def test_astype(self): tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in idx], name="idx", dtype=object) + if using_infer_string: + expected = Index( + [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str" + ) + else: + expected = Index([str(x) for x in idx], name="idx", dtype=object) tm.assert_index_equal(result, expected) rng = timedelta_range("1 days", periods=10) diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index cabfee9aa040a..dd51917b85a59 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -2,7 +2,6 @@ import pytest from pandas._libs import index as libindex -from pandas.compat import IS64 import pandas as pd from pandas import ( @@ -210,7 +209,6 @@ def test_mi_intervalindex_slicing_with_scalar(self): expected = Series([1, 6, 2, 8, 7], index=expected_index, name="value") tm.assert_series_equal(result, expected) - @pytest.mark.xfail(not IS64, reason="GH 23440") @pytest.mark.parametrize( "base", [101, 1010], diff --git a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py index 283921a23e368..018db5846f4e2 100644 --- a/pandas/tests/indexing/interval/test_interval_new.py +++ b/pandas/tests/indexing/interval/test_interval_new.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.compat import IS64 - from pandas import ( Index, Interval, @@ -211,7 +209,6 @@ def test_loc_getitem_missing_key_error_message( obj.loc[[4, 5, 6]] -@pytest.mark.xfail(not IS64, reason="GH 23440") @pytest.mark.parametrize( "intervals", [ diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index 5508153322adb..fa5ec63dd32fe 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -588,7 +588,7 @@ def test_loc_nan_multiindex(using_infer_string): np.ones((1, 4)), index=Index( [np.nan], - dtype="object" if not using_infer_string else "string[pyarrow_numpy]", + dtype="object" if not using_infer_string else "str", name="u3", ), columns=Index(["d1", "d2", "d3", "d4"]), diff --git a/pandas/tests/indexing/test_coercion.py 
b/pandas/tests/indexing/test_coercion.py index 0e32399b131c3..ecc640cfd0571 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - from pandas.compat import ( IS64, is_platform_windows, @@ -833,11 +831,10 @@ def replacer(self, how, from_key, to_key): raise ValueError return replacer - # Expected needs adjustment for the infer string option, seems to work as expected - @pytest.mark.skipif(using_pyarrow_string_dtype(), reason="TODO: test is too complex") - def test_replace_series(self, how, to_key, from_key, replacer): + def test_replace_series(self, how, to_key, from_key, replacer, using_infer_string): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") + obj = obj.astype(from_key) assert obj.dtype == from_key if from_key.startswith("datetime") and to_key.startswith("datetime"): @@ -858,7 +855,10 @@ def test_replace_series(self, how, to_key, from_key, replacer): else: exp = pd.Series(self.rep[to_key], index=index, name="yyy") - assert exp.dtype == to_key + + if using_infer_string and exp.dtype == "string": + # with infer_string, we disable the deprecated downcasting behavior + exp = exp.astype(object) msg = "Downcasting behavior in `replace`" warn = FutureWarning @@ -889,8 +889,9 @@ def test_replace_series_datetime_tz( assert obj.dtype == from_key exp = pd.Series(self.rep[to_key], index=index, name="yyy") - if using_infer_string and to_key == "object": - assert exp.dtype == "string" + if using_infer_string and exp.dtype == "string": + # with infer_string, we disable the deprecated downcasting behavior + exp = exp.astype(object) else: assert exp.dtype == to_key diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 409eca42f404b..c2742f42e3a92 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -535,7 +535,8 @@ def test_iloc_setitem_frame_duplicate_columns_multiple_blocks( # if the assigned values cannot be held by existing integer arrays, # we cast - df.iloc[:, 0] = df.iloc[:, 0] + 0.5 + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.iloc[:, 0] = df.iloc[:, 0] + 0.5 if not using_array_manager: assert len(df._mgr.blocks) == 2 @@ -1215,21 +1216,27 @@ def test_iloc_getitem_int_single_ea_block_view(self): arr[2] = arr[-1] assert ser[0] == arr[-1] - def test_iloc_setitem_multicolumn_to_datetime(self): + def test_iloc_setitem_multicolumn_to_datetime(self, using_infer_string): # GH#20511 df = DataFrame({"A": ["2022-01-01", "2022-01-02"], "B": ["2021", "2022"]}) - df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) - expected = DataFrame( - { - "A": [ - Timestamp("2021-01-01 00:00:00"), - Timestamp("2022-01-01 00:00:00"), - ], - "B": ["2021", "2022"], - } - ) - tm.assert_frame_equal(df, expected, check_dtype=False) + if using_infer_string: + with tm.assert_produces_warning( + FutureWarning, match="Setting an item of incompatible dtype" + ): + df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) + else: + df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) + expected = DataFrame( + { + "A": [ + Timestamp("2021-01-01 00:00:00"), + Timestamp("2022-01-01 00:00:00"), + ], + "B": ["2021", "2022"], + } + ) + tm.assert_frame_equal(df, expected, check_dtype=False) class TestILocErrors: @@ -1471,6 +1478,7 @@ def test_iloc_setitem_pure_position_based(self): def 
test_iloc_nullable_int64_size_1_nan(self): # GH 31861 result = DataFrame({"a": ["test"], "b": [np.nan]}) - result.loc[:, "b"] = result.loc[:, "b"].astype("Int64") + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + result.loc[:, "b"] = result.loc[:, "b"].astype("Int64") expected = DataFrame({"a": ["test"], "b": array([NA], dtype="Int64")}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 57f45f867254d..07275302dcf9f 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - from pandas.errors import IndexingError from pandas.core.dtypes.common import ( @@ -294,7 +292,7 @@ def test_dups_fancy_indexing_only_missing_label(self, using_infer_string): with pytest.raises( KeyError, match=re.escape( - "\"None of [Index(['E'], dtype='string')] are in the [index]\"" + "\"None of [Index(['E'], dtype='str')] are in the [index]\"" ), ): dfnu.loc[["E"]] @@ -461,9 +459,6 @@ def test_set_index_nan(self): ) tm.assert_frame_equal(result, df) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't multiply arrow strings" - ) def test_multi_assign(self): # GH 3626, an assignment of a sub-df to a df # set float64 to avoid upcast when setting nan @@ -571,6 +566,7 @@ def test_astype_assignment(self, using_infer_string): df_orig = DataFrame( [["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) + df_orig[list("ABCDG")] = df_orig[list("ABCDG")].astype(object) df = df_orig.copy() @@ -580,9 +576,9 @@ def test_astype_assignment(self, using_infer_string): expected = DataFrame( [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - if not using_infer_string: - expected["A"] = expected["A"].astype(object) - expected["B"] = expected["B"].astype(object) + expected[list("CDG")] = expected[list("CDG")].astype(object) + expected["A"] = expected["A"].astype(object) + expected["B"] = expected["B"].astype(object) tm.assert_frame_equal(df, expected) # GH5702 (loc) @@ -591,18 +587,16 @@ def test_astype_assignment(self, using_infer_string): expected = DataFrame( [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - if not using_infer_string: - expected["A"] = expected["A"].astype(object) + expected[list("ABCDG")] = expected[list("ABCDG")].astype(object) tm.assert_frame_equal(df, expected) df = df_orig.copy() + df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64) expected = DataFrame( [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - if not using_infer_string: - expected["B"] = expected["B"].astype(object) - expected["C"] = expected["C"].astype(object) + expected[list("ABCDG")] = expected[list("ABCDG")].astype(object) tm.assert_frame_equal(df, expected) def test_astype_assignment_full_replacements(self): @@ -689,8 +683,7 @@ def test_loc_setitem_fullindex_views(self): df.loc[df.index] = df.loc[df.index] tm.assert_frame_equal(df, df2) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set int into string") - def test_rhs_alignment(self): + def test_rhs_alignment(self, using_infer_string): # GH8258, tests that both rows & columns are aligned to what is # assigned to. 
covers both uniform data-type & multi-type cases def run_tests(df, rhs, right_loc, right_iloc): @@ -734,8 +727,15 @@ def run_tests(df, rhs, right_loc, right_iloc): frame["jolie"] = frame["jolie"].map(lambda x: f"@{x}") right_iloc["joe"] = [1.0, "@-28", "@-20", "@-12", 17.0] right_iloc["jolie"] = ["@2", -26.0, -18.0, -10.0, "@18"] - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): - run_tests(df, rhs, right_loc, right_iloc) + if using_infer_string: + with pytest.raises(TypeError, match="Invalid value"): + with tm.assert_produces_warning( + FutureWarning, match="incompatible dtype" + ): + run_tests(df, rhs, right_loc, right_iloc) + else: + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + run_tests(df, rhs, right_loc, right_iloc) @pytest.mark.parametrize( "idx", [_mklbl("A", 20), np.arange(20) + 100, np.linspace(100, 150, 20)] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index fb0adc56c401b..dc4f159cfd3c3 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1,5 +1,6 @@ """ test label based indexing with loc """ from collections import namedtuple +import contextlib from datetime import ( date, datetime, @@ -12,7 +13,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import index as libindex from pandas.compat.numpy import np_version_gt2 @@ -63,12 +64,17 @@ def test_not_change_nan_loc(series, new_series, expected_ser): class TestLoc: - def test_none_values_on_string_columns(self): + def test_none_values_on_string_columns(self, using_infer_string): # Issue #32218 - df = DataFrame(["1", "2", None], columns=["a"], dtype="str") - + df = DataFrame(["1", "2", None], columns=["a"], dtype=object) assert df.loc[2, "a"] is None + df = DataFrame(["1", "2", None], columns=["a"], dtype="str") + if using_infer_string: + assert np.isnan(df.loc[2, "a"]) + else: + assert df.loc[2, "a"] is None + @pytest.mark.parametrize("kind", ["series", "frame"]) def test_loc_getitem_int(self, kind, request): # int label @@ -584,7 +590,8 @@ def test_loc_setitem_consistency(self, frame_for_consistency, val): } ) df = frame_for_consistency.copy() - df.loc[:, "date"] = val + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "date"] = val tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_dt64_to_str(self, frame_for_consistency): @@ -598,7 +605,8 @@ def test_loc_setitem_consistency_dt64_to_str(self, frame_for_consistency): } ) df = frame_for_consistency.copy() - df.loc[:, "date"] = "foo" + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "date"] = "foo" tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_dt64_to_float(self, frame_for_consistency): @@ -611,14 +619,16 @@ def test_loc_setitem_consistency_dt64_to_float(self, frame_for_consistency): } ) df = frame_for_consistency.copy() - df.loc[:, "date"] = 1.0 + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "date"] = 1.0 tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_single_row(self): # GH 15494 # setting on frame with single row df = DataFrame({"date": Series([Timestamp("20180101")])}) - df.loc[:, "date"] = "string" + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "date"] = "string" expected = DataFrame({"date": Series(["string"])}) 
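# --- editor's aside (hedged sketch, not part of the patch) -------------------
# The test_loc_setitem_consistency hunks above all wrap whole-column .loc
# assignments in assert_produces_warning because pandas now emits a
# FutureWarning before upcasting a column to hold a value its dtype cannot
# represent (here: a string written into a datetime64 column). A minimal
# standalone reproduction of that behavior:
import warnings
import pandas as pd

frame = pd.DataFrame({"date": [pd.Timestamp("2018-01-01")]})
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    frame.loc[:, "date"] = "string"  # incompatible value triggers the upcast
assert any("incompatible dtype" in str(w.message) for w in caught)
assert frame["date"].dtype == object  # column was upcast to object
# -----------------------------------------------------------------------------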
tm.assert_frame_equal(df, expected) @@ -638,7 +648,9 @@ def test_loc_setitem_consistency_empty(self): expected["x"] = expected["x"].astype(np.int64) tm.assert_frame_equal(df, expected) - def test_loc_setitem_consistency_slice_column_len(self): + # incompatible dtype warning + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + def test_loc_setitem_consistency_slice_column_len(self, using_infer_string): # .loc[:,column] setting with slice == len of the column # GH10408 levels = [ @@ -662,13 +674,24 @@ def test_loc_setitem_consistency_slice_column_len(self): ] df = DataFrame(values, index=mi, columns=cols) - df.loc[:, ("Respondent", "StartDate")] = to_datetime( - df.loc[:, ("Respondent", "StartDate")] - ) - df.loc[:, ("Respondent", "EndDate")] = to_datetime( - df.loc[:, ("Respondent", "EndDate")] - ) - df = df.infer_objects(copy=False) + ctx = contextlib.nullcontext() + if using_infer_string: + ctx = pytest.raises(TypeError, match="Invalid value") + + with ctx: + df.loc[:, ("Respondent", "StartDate")] = to_datetime( + df.loc[:, ("Respondent", "StartDate")] + ) + with ctx: + df.loc[:, ("Respondent", "EndDate")] = to_datetime( + df.loc[:, ("Respondent", "EndDate")] + ) + + if using_infer_string: + # infer-objects won't infer stuff anymore + return + + df = df.infer_objects() # Adding a new key df.loc[:, ("Respondent", "Duration")] = ( @@ -678,9 +701,10 @@ def test_loc_setitem_consistency_slice_column_len(self): # timedelta64[m] -> float, so this cannot be done inplace, so # no warning - df.loc[:, ("Respondent", "Duration")] = df.loc[ - :, ("Respondent", "Duration") - ] / Timedelta(60_000_000_000) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, ("Respondent", "Duration")] = df.loc[ + :, ("Respondent", "Duration") + ] / Timedelta(60_000_000_000) expected = Series( [23.0, 12.0, 14.0, 36.0], index=df.index, name=("Respondent", "Duration") @@ -1230,13 +1254,7 @@ def test_loc_setitem_empty_append_raises(self): with pytest.raises(KeyError, match=msg): df.loc[[0, 1], "x"] = data - msg = "|".join( - [ - "cannot copy sequence with size 2 to array axis with dimension 0", - r"could not broadcast input array from shape \(2,\) into shape \(0,\)", - "Must have equal len keys and value when setting with an iterable", - ] - ) + msg = "setting an array element with a sequence." 
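# --- editor's aside (hedged) --------------------------------------------------
# "setting an array element with a sequence." is numpy's own ValueError text,
# raised when a sequence is coerced into a single array slot, which is why the
# union of broadcast-message variants above can collapse to this one pattern:
import numpy as np

try:
    np.zeros(3)[0] = [1.0, 2.0]  # a sequence cannot fill a scalar element
except ValueError as exc:
    assert "setting an array element with a sequence" in str(exc)
# -------------------------------------------------------------------------------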
with pytest.raises(ValueError, match=msg): df.loc[0:2, "x"] = data @@ -1263,20 +1281,23 @@ def test_loc_reverse_assignment(self): tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set int into string") - def test_loc_setitem_str_to_small_float_conversion_type(self): + def test_loc_setitem_str_to_small_float_conversion_type(self, using_infer_string): # GH#20388 col_data = [str(np.random.default_rng(2).random() * 1e-12) for _ in range(5)] result = DataFrame(col_data, columns=["A"]) - expected = DataFrame(col_data, columns=["A"], dtype=object) + expected = DataFrame(col_data, columns=["A"]) tm.assert_frame_equal(result, expected) # assigning with loc/iloc attempts to set the values inplace, which # in this case is successful - result.loc[result.index, "A"] = [float(x) for x in col_data] - expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object) - tm.assert_frame_equal(result, expected) + if using_infer_string: + with pytest.raises(TypeError, match="Invalid value"): + result.loc[result.index, "A"] = [float(x) for x in col_data] + else: + result.loc[result.index, "A"] = [float(x) for x in col_data] + expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object) + tm.assert_frame_equal(result, expected) # assigning the entire column using __setitem__ swaps in the new array # GH#??? @@ -1460,7 +1481,7 @@ def test_loc_setitem_single_row_categorical(self, using_infer_string): result = df["Alpha"] expected = Series(categories, index=df.index, name="Alpha").astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) tm.assert_series_equal(result, expected) @@ -1487,7 +1508,11 @@ def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): # if result started off with object dtype, then the .loc.__setitem__ # below would retain object dtype result = DataFrame(index=idx, columns=["var"], dtype=np.float64) - result.loc[:, idxer] = expected + with tm.assert_produces_warning( + FutureWarning if idxer == "var" else None, match="incompatible dtype" + ): + # See https://github.com/pandas-dev/pandas/issues/56223 + result.loc[:, idxer] = expected tm.assert_frame_equal(result, expected) def test_loc_setitem_time_key(self, using_array_manager): @@ -1566,16 +1591,10 @@ def test_loc_setitem_2d_to_1d_raises(self): # float64 dtype to avoid upcast when trying to set float data ser = Series(range(2), dtype="float64") - msg = "|".join( - [ - r"shape mismatch: value array of shape \(2,2\)", - r"cannot reshape array of size 4 into shape \(2,\)", - ] - ) + msg = "setting an array element with a sequence." 
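# --- editor's aside (hedged sketch) --------------------------------------------
# Several hunks above expect TypeError("Invalid value ...") under
# using_infer_string: a column inferred as "str" dtype rejects non-string
# values on in-place assignment instead of upcasting. Assuming a pandas build
# where the future.infer_string option is available:
import pandas as pd

with pd.option_context("future.infer_string", True):
    frame = pd.DataFrame({"A": ["0.1", "0.2"]})  # inferred as str dtype
    try:
        frame.loc[frame.index, "A"] = [0.1, 0.2]  # floats into a str column
    except TypeError as exc:  # e.g. "Invalid value ... for dtype 'str'"
        print(f"rejected as expected: {exc}")
# ---------------------------------------------------------------------------------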
with pytest.raises(ValueError, match=msg): ser.loc[range(2)] = data - msg = r"could not broadcast input array from shape \(2,2\) into shape \(2,?\)" with pytest.raises(ValueError, match=msg): ser.loc[:] = data @@ -1637,7 +1656,7 @@ def test_loc_setitem_single_column_mixed(self, using_infer_string): df.loc[df.index[::2], "str"] = np.nan expected = Series( [np.nan, "qux", np.nan, "qux", np.nan], - dtype=object if not using_infer_string else "string[pyarrow_numpy]", + dtype=object if not using_infer_string else "str", ).values tm.assert_almost_equal(df["str"].values, expected) @@ -3355,3 +3374,15 @@ def test_getitem_loc_str_periodindex(self): index = pd.period_range(start="2000", periods=20, freq="B") series = Series(range(20), index=index) assert series.loc["2000-01-14"] == 9 + + def test_loc_nonunique_masked_index(self): + # GH 57027 + ids = list(range(11)) + index = Index(ids * 1000, dtype="Int64") + df = DataFrame({"val": np.arange(len(index), dtype=np.intp)}, index=index) + result = df.loc[ids] + expected = DataFrame( + {"val": index.argsort(kind="stable").astype(np.intp)}, + index=Index(np.array(ids).repeat(1000), dtype="Int64"), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index ca551024b4c1f..5fcb71d0186a6 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -227,7 +227,7 @@ def test_partial_set_empty_frame_empty_consistencies(self, using_infer_string): { "x": Series( ["1", "2"], - dtype=object if not using_infer_string else "string[pyarrow_numpy]", + dtype=object if not using_infer_string else "str", ), "y": Series([np.nan, np.nan], dtype=object), } diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 15c2b8d000b37..c32b31c297c5d 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -1,4 +1,7 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import numpy as np import pytest @@ -179,8 +182,6 @@ def test_missing_from_masked(): } ) - df2 = df.__dataframe__() - rng = np.random.default_rng(2) dict_null = {col: rng.integers(low=0, high=len(df)) for col in df.columns} for col, num_nulls in dict_null.items(): @@ -303,6 +304,51 @@ def test_multi_chunk_pyarrow() -> None: pd.api.interchange.from_dataframe(table, allow_copy=False) +def test_multi_chunk_column() -> None: + pytest.importorskip("pyarrow", "11.0.0") + ser = pd.Series([1, 2, None], dtype="Int64[pyarrow]") + df = pd.concat([ser, ser], ignore_index=True).to_frame("a") + df_orig = df.copy() + with pytest.raises( + RuntimeError, match="Found multi-chunk pyarrow array, but `allow_copy` is False" + ): + pd.api.interchange.from_dataframe(df.__dataframe__(allow_copy=False)) + result = pd.api.interchange.from_dataframe(df.__dataframe__(allow_copy=True)) + # Interchange protocol defaults to creating numpy-backed columns, so currently this + # is 'float64'. + expected = pd.DataFrame({"a": [1.0, 2.0, None, 1.0, 2.0, None]}, dtype="float64") + tm.assert_frame_equal(result, expected) + + # Check that the rechunking we did didn't modify the original DataFrame. 
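# (Editor's gloss, hedged: from_dataframe(allow_copy=True) materializes the
# multi-chunk pyarrow column into a fresh numpy-backed copy, so the original
# frame should still report its two arrow chunks afterwards -- which is what
# the assertions below verify.)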
+ tm.assert_frame_equal(df, df_orig) + assert len(df["a"].array._pa_array.chunks) == 2 + assert len(df_orig["a"].array._pa_array.chunks) == 2 + + +def test_timestamp_ns_pyarrow(): + # GH 56712 + pytest.importorskip("pyarrow", "11.0.0") + timestamp_args = { + "year": 2000, + "month": 1, + "day": 1, + "hour": 1, + "minute": 1, + "second": 1, + } + df = pd.Series( + [datetime(**timestamp_args)], + dtype="timestamp[ns][pyarrow]", + name="col0", + ).to_frame() + + dfi = df.__dataframe__() + result = pd.api.interchange.from_dataframe(dfi)["col0"].item() + + expected = pd.Timestamp(**timestamp_args) + assert result == expected + + @pytest.mark.parametrize("tz", ["UTC", "US/Pacific"]) @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) def test_datetimetzdtype(tz, unit): @@ -362,3 +408,201 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: interchange.get_column_by_name = lambda _: column monkeypatch.setattr(df, "__dataframe__", lambda allow_copy: interchange) pd.api.interchange.from_dataframe(df) + + +def test_empty_string_column(): + # https://github.com/pandas-dev/pandas/issues/56703 + df = pd.DataFrame({"a": []}, dtype=str) + df2 = df.__dataframe__() + result = pd.api.interchange.from_dataframe(df2) + tm.assert_frame_equal(df, result) + + +def test_large_string(): + # GH#56702 + pytest.importorskip("pyarrow") + df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]") + result = pd.api.interchange.from_dataframe(df.__dataframe__()) + expected = pd.DataFrame({"a": ["x"]}, dtype="str") + tm.assert_frame_equal(result, expected) + + +def test_non_str_names(): + # https://github.com/pandas-dev/pandas/issues/56701 + df = pd.Series([1, 2, 3], name=0).to_frame() + names = df.__dataframe__().column_names() + assert names == ["0"] + + +def test_non_str_names_w_duplicates(): + # https://github.com/pandas-dev/pandas/issues/56701 + df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]}) + dfi = df.__dataframe__() + with pytest.raises( + TypeError, + match=( + "Expected a Series, got a DataFrame. This likely happened because you " + "called __dataframe__ on a DataFrame which, after converting column " + r"names to string, resulted in duplicated names: Index\(\['0', '0'\], " + r"dtype='(str|object)'\). Please rename these columns before using the " + "interchange protocol." 
+ ), + ): + pd.api.interchange.from_dataframe(dfi, allow_copy=False) + + +@pytest.mark.parametrize( + ("data", "dtype", "expected_dtype"), + [ + ([1, 2, None], "Int64", "int64"), + ([1, 2, None], "Int64[pyarrow]", "int64"), + ([1, 2, None], "Int8", "int8"), + ([1, 2, None], "Int8[pyarrow]", "int8"), + ( + [1, 2, None], + "UInt64", + "uint64", + ), + ( + [1, 2, None], + "UInt64[pyarrow]", + "uint64", + ), + ([1.0, 2.25, None], "Float32", "float32"), + ([1.0, 2.25, None], "Float32[pyarrow]", "float32"), + ([True, False, None], "boolean", "bool"), + ([True, False, None], "boolean[pyarrow]", "bool"), + (["much ado", "about", None], pd.StringDtype(na_value=np.nan), "large_string"), + (["much ado", "about", None], "string[pyarrow]", "large_string"), + ( + [datetime(2020, 1, 1), datetime(2020, 1, 2), None], + "timestamp[ns][pyarrow]", + "timestamp[ns]", + ), + ( + [datetime(2020, 1, 1), datetime(2020, 1, 2), None], + "timestamp[us][pyarrow]", + "timestamp[us]", + ), + ( + [ + datetime(2020, 1, 1, tzinfo=timezone.utc), + datetime(2020, 1, 2, tzinfo=timezone.utc), + None, + ], + "timestamp[us, Asia/Kathmandu][pyarrow]", + "timestamp[us, tz=Asia/Kathmandu]", + ), + ], +) +def test_pandas_nullable_with_missing_values( + data: list, dtype: str, expected_dtype: str +) -> None: + # https://github.com/pandas-dev/pandas/issues/57643 + # https://github.com/pandas-dev/pandas/issues/57664 + pa = pytest.importorskip("pyarrow", "11.0.0") + import pyarrow.interchange as pai + + if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]": + expected_dtype = pa.timestamp("us", "Asia/Kathmandu") + + df = pd.DataFrame({"a": data}, dtype=dtype) + result = pai.from_dataframe(df.__dataframe__())["a"] + assert result.type == expected_dtype + assert result[0].as_py() == data[0] + assert result[1].as_py() == data[1] + assert result[2].as_py() is None + + +@pytest.mark.parametrize( + ("data", "dtype", "expected_dtype"), + [ + ([1, 2, 3], "Int64", "int64"), + ([1, 2, 3], "Int64[pyarrow]", "int64"), + ([1, 2, 3], "Int8", "int8"), + ([1, 2, 3], "Int8[pyarrow]", "int8"), + ( + [1, 2, 3], + "UInt64", + "uint64", + ), + ( + [1, 2, 3], + "UInt64[pyarrow]", + "uint64", + ), + ([1.0, 2.25, 5.0], "Float32", "float32"), + ([1.0, 2.25, 5.0], "Float32[pyarrow]", "float32"), + ([True, False, False], "boolean", "bool"), + ([True, False, False], "boolean[pyarrow]", "bool"), + ( + ["much ado", "about", "nothing"], + pd.StringDtype(na_value=np.nan), + "large_string", + ), + (["much ado", "about", "nothing"], "string[pyarrow]", "large_string"), + ( + [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)], + "timestamp[ns][pyarrow]", + "timestamp[ns]", + ), + ( + [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)], + "timestamp[us][pyarrow]", + "timestamp[us]", + ), + ( + [ + datetime(2020, 1, 1, tzinfo=timezone.utc), + datetime(2020, 1, 2, tzinfo=timezone.utc), + datetime(2020, 1, 3, tzinfo=timezone.utc), + ], + "timestamp[us, Asia/Kathmandu][pyarrow]", + "timestamp[us, tz=Asia/Kathmandu]", + ), + ], +) +def test_pandas_nullable_without_missing_values( + data: list, dtype: str, expected_dtype: str +) -> None: + # https://github.com/pandas-dev/pandas/issues/57643 + pa = pytest.importorskip("pyarrow", "11.0.0") + import pyarrow.interchange as pai + + if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]": + expected_dtype = pa.timestamp("us", "Asia/Kathmandu") + + df = pd.DataFrame({"a": data}, dtype=dtype) + result = pai.from_dataframe(df.__dataframe__())["a"] + assert result.type == expected_dtype + assert 
result[0].as_py() == data[0] + assert result[1].as_py() == data[1] + assert result[2].as_py() == data[2] + + +def test_string_validity_buffer() -> None: + # https://github.com/pandas-dev/pandas/issues/57761 + pytest.importorskip("pyarrow", "11.0.0") + df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]") + result = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"] + assert result is None + + +def test_string_validity_buffer_no_missing() -> None: + # https://github.com/pandas-dev/pandas/issues/57762 + pytest.importorskip("pyarrow", "11.0.0") + df = pd.DataFrame({"a": ["x", None]}, dtype="large_string[pyarrow]") + validity = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"] + assert validity is not None + result = validity[1] + expected = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, "=") + assert result == expected + + +def test_empty_dataframe(): + # https://github.com/pandas-dev/pandas/issues/56700 + df = pd.DataFrame({"a": []}, dtype="int8") + dfi = df.__dataframe__() + result = pd.api.interchange.from_dataframe(dfi, allow_copy=False) + expected = pd.DataFrame({"a": []}, dtype="int8") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index f816cef38b9ab..1251a6ae97a1c 100644 --- a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -68,9 +68,7 @@ def test_deprecations(name): def test_make_block_2d_with_dti(): # GH#41168 dti = pd.date_range("2012", periods=3, tz="UTC") - msg = "make_block is deprecated" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - blk = api.make_block(dti, placement=[0]) + blk = api.make_block(dti, placement=[0]) assert blk.shape == (1, 3) assert blk.values.shape == (1, 3) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 2265522bc7ecb..30c5d3177c5a5 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -626,7 +626,7 @@ def _compare(old_mgr, new_mgr): mgr.iset(1, np.array(["2."] * N, dtype=np.object_)) mgr.iset(2, np.array(["foo."] * N, dtype=np.object_)) new_mgr = mgr.convert(copy=True) - dtype = "string[pyarrow_numpy]" if using_infer_string else np.object_ + dtype = "str" if using_infer_string else np.object_ assert new_mgr.iget(0).dtype == dtype assert new_mgr.iget(1).dtype == dtype assert new_mgr.iget(2).dtype == dtype @@ -1383,11 +1383,9 @@ def test_validate_ndim(): values = np.array([1.0, 2.0]) placement = BlockPlacement(slice(2)) msg = r"Wrong number of dimensions. 
values.ndim != ndim \[1 != 2\]" - depr_msg = "make_block is deprecated" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(DeprecationWarning, match=depr_msg): - make_block(values, placement, ndim=2) + make_block(values, placement, ndim=2) def test_block_shape(): @@ -1402,12 +1400,8 @@ def test_make_block_no_pandas_array(block_maker): # https://github.com/pandas-dev/pandas/pull/24866 arr = pd.arrays.NumpyExtensionArray(np.array([1, 2])) - warn = None if block_maker is not make_block else DeprecationWarning - msg = "make_block is deprecated and will be removed in a future version" - # NumpyExtensionArray, no dtype - with tm.assert_produces_warning(warn, match=msg): - result = block_maker(arr, BlockPlacement(slice(len(arr))), ndim=arr.ndim) + result = block_maker(arr, BlockPlacement(slice(len(arr))), ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] if block_maker is make_block: @@ -1415,16 +1409,14 @@ def test_make_block_no_pandas_array(block_maker): assert result.is_extension is False # NumpyExtensionArray, NumpyEADtype - with tm.assert_produces_warning(warn, match=msg): - result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) + result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False # new_block no longer takes dtype keyword # ndarray, NumpyEADtype - with tm.assert_produces_warning(warn, match=msg): - result = block_maker( - arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim - ) + result = block_maker( + arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim + ) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index ab6cacc4cc860..a5ddda9d66e7a 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -67,14 +67,13 @@ def s3_base(worker_id, monkeypatch): monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") if is_ci_environment(): if is_platform_arm() or is_platform_mac() or is_platform_windows(): - # NOT RUN on Windows/macOS/ARM, only Ubuntu + # NOT RUN on Windows/macOS, only Ubuntu # - subprocess in CI can cause timeouts # - GitHub Actions do not support # container services for the above OSs - # - CircleCI will probably hit the Docker rate pull limit pytest.skip( - "S3 tests do not have a corresponding service in " - "Windows, macOS or ARM platforms" + "S3 tests do not have a corresponding service on " + "Windows or macOS platforms" ) else: # set in .github/workflows/unit-tests.yml @@ -224,19 +223,3 @@ def compression_format(request): @pytest.fixture(params=_compression_formats_params) def compression_ext(request): return request.param - - -@pytest.fixture( - params=[ - "python", - pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), - ] -) -def string_storage(request): - """ - Parametrized fixture for pd.options.mode.string_storage. 
- - * 'python' - * 'pyarrow' - """ - return request.param diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py index f01827fa4ca2f..b5bb9b27258d8 100644 --- a/pandas/tests/io/excel/test_odf.py +++ b/pandas/tests/io/excel/test_odf.py @@ -3,11 +3,16 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows + import pandas as pd import pandas._testing as tm pytest.importorskip("odf") +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + @pytest.fixture(autouse=True) def cd_and_set_engine(monkeypatch, datapath): diff --git a/pandas/tests/io/excel/test_odswriter.py b/pandas/tests/io/excel/test_odswriter.py index 271353a173d2a..1c728ad801bc1 100644 --- a/pandas/tests/io/excel/test_odswriter.py +++ b/pandas/tests/io/excel/test_odswriter.py @@ -6,6 +6,8 @@ import pytest +from pandas.compat import is_platform_windows + import pandas as pd import pandas._testing as tm @@ -13,6 +15,9 @@ odf = pytest.importorskip("odf") +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + @pytest.fixture def ext(): diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 2df9ec9e53516..e53b5830ec6a4 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows + import pandas as pd from pandas import DataFrame import pandas._testing as tm @@ -17,6 +19,9 @@ openpyxl = pytest.importorskip("openpyxl") +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + @pytest.fixture def ext(): diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 15712f36da4ca..c62144adbaecb 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -16,8 +16,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - +from pandas.compat import is_platform_windows import pandas.util._test_decorators as td import pandas as pd @@ -29,10 +28,9 @@ read_csv, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) + +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"] engine_params = [ @@ -550,7 +548,7 @@ def test_reader_dtype(self, read_ext): expected["a"] = expected["a"].astype("float64") expected["b"] = expected["b"].astype("float32") - expected["c"] = Series(["001", "002", "003", "004"], dtype=object) + expected["c"] = Series(["001", "002", "003", "004"], dtype="str") tm.assert_frame_equal(actual, expected) msg = "Unable to convert column d to type int64" @@ -577,9 +575,9 @@ def test_reader_dtype(self, read_ext): { "a": Series([1, 2, 3, 4], dtype="float64"), "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"), - "c": Series(["001", "002", "003", "004"], dtype=object), - "d": Series(["1", "2", np.nan, "4"], dtype=object), - } + "c": Series(["001", "002", "003", "004"], dtype="str"), + "d": Series(["1", "2", np.nan, "4"], dtype="str"), + }, ), ), ], @@ -655,16 +653,11 @@ def test_dtype_backend_and_dtype(self, read_ext): ) tm.assert_frame_equal(result, df) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="infer_string takes precedence" - ) def test_dtype_backend_string(self, read_ext, string_storage): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") - pa = pytest.importorskip("pyarrow") - with 
pd.option_context("mode.string_storage", string_storage): df = DataFrame( { @@ -672,27 +665,22 @@ def test_dtype_backend_string(self, read_ext, string_storage): "b": np.array(["x", pd.NA], dtype=np.object_), } ) + with tm.ensure_clean(read_ext) as file_path: df.to_excel(file_path, sheet_name="test", index=False) result = pd.read_excel( file_path, sheet_name="test", dtype_backend="numpy_nullable" ) - if string_storage == "python": - expected = DataFrame( - { - "a": StringArray(np.array(["a", "b"], dtype=np.object_)), - "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)), - } - ) - else: - expected = DataFrame( - { - "a": ArrowStringArray(pa.array(["a", "b"])), - "b": ArrowStringArray(pa.array(["x", None])), - } - ) - tm.assert_frame_equal(result, expected) + expected = DataFrame( + { + "a": Series(["a", "b"], dtype=pd.StringDtype(string_storage)), + "b": Series(["x", None], dtype=pd.StringDtype(string_storage)), + } + ) + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) @pytest.mark.parametrize("dtypes, exp_value", [({}, 1), ({"a.1": "int64"}, 1)]) def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value): diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index 3ca8637885639..89615172688d7 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -4,6 +4,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows import pandas.util._test_decorators as td from pandas import ( @@ -20,6 +21,9 @@ # could compute styles and render to excel without jinja2, since there is no # 'template' file, but this needs the import error to be delayed until render time. +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + def assert_equal_cell_styles(cell1, cell2): # TODO: should find a better way to check equality diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 8c003723c1c71..d6e99de4f9d91 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -11,6 +11,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows from pandas.compat._constants import PY310 from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td @@ -34,6 +35,9 @@ ) from pandas.io.excel._util import _writers +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + def get_exp_unit(path: str) -> str: return "ns" @@ -749,6 +753,9 @@ def test_excel_date_datetime_format(self, ext, path): # we need to use df_expected to check the result. 
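# --- editor's aside (hedged; refers to the test_dtype_backend_string hunk
# above) ------------------------------------------------------------------------
# The rewritten expectation builds the dtype directly from the parametrized
# storage string instead of branching on concrete array classes:
import pandas as pd

assert pd.StringDtype("python").storage == "python"
# pd.StringDtype("pyarrow").storage == "pyarrow"  # needs pyarrow installed
# ---------------------------------------------------------------------------------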
tm.assert_frame_equal(rs2, df_expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_to_excel_interval_no_labels(self, path, using_infer_string): # see gh-19242 # @@ -760,7 +767,7 @@ def test_to_excel_interval_no_labels(self, path, using_infer_string): df["new"] = pd.cut(df[0], 10) expected["new"] = pd.cut(expected[0], 10).astype( - str if not using_infer_string else "string[pyarrow_numpy]" + str if not using_infer_string else "str" ) df.to_excel(path, sheet_name="test1") @@ -1311,7 +1318,7 @@ def test_path_path_lib(self, engine, ext): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), columns=Index(list("ABCD")), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + index=Index([f"i-{i}" for i in range(30)]), ) writer = partial(df.to_excel, engine=engine) diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 6d5008ca9ee68..066393d91eead 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows + import pandas as pd import pandas._testing as tm @@ -11,6 +13,9 @@ xlrd = pytest.importorskip("xlrd") +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + @pytest.fixture(params=[".xls"]) def read_ext_xlrd(request): diff --git a/pandas/tests/io/excel/test_xlsxwriter.py b/pandas/tests/io/excel/test_xlsxwriter.py index 94f6bdfaf069c..529367761fc02 100644 --- a/pandas/tests/io/excel/test_xlsxwriter.py +++ b/pandas/tests/io/excel/test_xlsxwriter.py @@ -2,6 +2,8 @@ import pytest +from pandas.compat import is_platform_windows + from pandas import DataFrame import pandas._testing as tm @@ -9,6 +11,9 @@ xlsxwriter = pytest.importorskip("xlsxwriter") +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + @pytest.fixture def ext(): diff --git a/pandas/tests/io/formats/style/test_bar.py b/pandas/tests/io/formats/style/test_bar.py index b0e4712e8bb3d..d28c7c566d851 100644 --- a/pandas/tests/io/formats/style/test_bar.py +++ b/pandas/tests/io/formats/style/test_bar.py @@ -347,6 +347,7 @@ def test_styler_bar_with_NA_values(): def test_style_bar_with_pyarrow_NA_values(): + pytest.importorskip("pyarrow") data = """name,age,test1,test2,teacher Adam,15,95.0,80,Ashby Bob,16,81.0,82,Ashby diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 0ca29c219b55b..535ef76cb12f4 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -11,7 +11,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -1396,9 +1396,7 @@ def test_unicode_name_in_footer(self): sf = fmt.SeriesFormatter(s, name="\u05e2\u05d1\u05e8\u05d9\u05ea") sf._get_footer() # should not raise exception - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="Fixup when arrow is default" - ) + @pytest.mark.xfail(using_string_dtype(), reason="Fixup when arrow is default") def test_east_asian_unicode_series(self): # not aligned properly because of east asian width @@ -1773,9 +1771,7 @@ def chck_ncols(self, s): ncolsizes = len({len(line.strip()) for line in lines}) assert ncolsizes == 1 - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="change when arrow is default" - ) + @pytest.mark.xfail(using_string_dtype(), reason="change when arrow is default") def test_format_explicit(self): 
         test_sers = gen_series_formatting()
         with option_context("display.max_rows", 4, "display.show_dimensions", False):
diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py
index 2e5a5005cb076..164e514262603 100644
--- a/pandas/tests/io/formats/test_to_string.py
+++ b/pandas/tests/io/formats/test_to_string.py
@@ -10,7 +10,7 @@
 import numpy as np
 import pytest

-from pandas._config import using_pyarrow_string_dtype
+from pandas._config import using_string_dtype

 from pandas import (
     CategoricalIndex,
@@ -851,7 +851,7 @@ def test_to_string(self):
         frame.to_string()

     # TODO: split or simplify this test?
-    @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="fix when arrow is default")
+    @pytest.mark.xfail(using_string_dtype(), reason="fix when arrow is default")
     def test_to_string_index_with_nan(self):
         # GH#2850
         df = DataFrame(
diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py
index cc101bb9c8b6d..1c7320aa7a083 100644
--- a/pandas/tests/io/json/test_json_table_schema.py
+++ b/pandas/tests/io/json/test_json_table_schema.py
@@ -69,7 +69,7 @@ def test_build_table_schema(self, df_schema, using_infer_string):
             "primaryKey": ["idx"],
         }
         if using_infer_string:
-            expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "string"}
+            expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "str"}
         assert result == expected
         result = build_table_schema(df_schema)
         assert "pandas_version" in result
@@ -120,9 +120,9 @@ def test_multiindex(self, df_schema, using_infer_string):
             expected["fields"][0] = {
                 "name": "level_0",
                 "type": "any",
-                "extDtype": "string",
+                "extDtype": "str",
             }
-            expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "string"}
+            expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "str"}
         assert result == expected

         df.index.names = ["idx0", None]
@@ -305,7 +305,7 @@ def test_to_json(self, df_table, using_infer_string):
         ]

         if using_infer_string:
-            fields[2] = {"name": "B", "type": "any", "extDtype": "string"}
+            fields[2] = {"name": "B", "type": "any", "extDtype": "str"}

         schema = {"fields": fields, "primaryKey": ["idx"]}
         data = [
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 0eefb0b52c483..10f1e7df648f0 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -13,7 +13,7 @@
 import numpy as np
 import pytest

-from pandas._config import using_pyarrow_string_dtype
+from pandas._config import using_string_dtype

 from pandas.compat import IS64
 import pandas.util._test_decorators as td
@@ -24,17 +24,13 @@
     DataFrame,
     DatetimeIndex,
     Index,
+    RangeIndex,
     Series,
     Timestamp,
     date_range,
     read_json,
 )
 import pandas._testing as tm
-from pandas.core.arrays import (
-    ArrowStringArray,
-    StringArray,
-)
-from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics

 from pandas.io.json import ujson_dumps

@@ -122,7 +118,7 @@ def datetime_frame(self):
         # since that doesn't round-trip, see GH#33711
         df = DataFrame(
             np.random.default_rng(2).standard_normal((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=30, freq="B"),
         )
         df.index = df.index._with_freq(None)
@@ -179,7 +175,7 @@ def test_frame_non_unique_columns(self, orient, data):
             # in milliseconds; these are internally stored in nanosecond,
             # so divide to get where we need
             # TODO: a to_epoch method would also solve; see GH 14772
-            expected.iloc[:, 0] = expected.iloc[:, 0].astype(np.int64) // 1000000
+            expected.isetitem(0, expected.iloc[:, 0].astype(np.int64) // 1000000)
         elif orient == "split":
             expected = df
             expected.columns = ["x", "x.1"]
@@ -264,7 +260,7 @@ def test_roundtrip_categorical(
         expected = categorical_frame.copy()
         expected.index = expected.index.astype(
-            str if not using_infer_string else "string[pyarrow_numpy]"
+            str if not using_infer_string else "str"
         )  # Categorical not preserved
         expected.index.name = None  # index names aren't preserved in JSON
         assert_json_roundtrip_equal(result, expected, orient)
@@ -493,12 +489,12 @@ def test_frame_mixedtype_orient(self):  # GH10289
             left = read_json(inp, orient=orient, convert_axes=False)
             tm.assert_frame_equal(left, right)

-        right.index = pd.RangeIndex(len(df))
+        right.index = RangeIndex(len(df))
         inp = StringIO(df.to_json(orient="records"))
         left = read_json(inp, orient="records", convert_axes=False)
         tm.assert_frame_equal(left, right)

-        right.columns = pd.RangeIndex(df.shape[1])
+        right.columns = RangeIndex(df.shape[1])
         inp = StringIO(df.to_json(orient="values"))
         left = read_json(inp, orient="values", convert_axes=False)
         tm.assert_frame_equal(left, right)
@@ -618,7 +614,7 @@ def test_blocks_compat_GH9037(self, using_infer_string):

         # JSON deserialisation always creates unicode strings
         df_mixed.columns = df_mixed.columns.astype(
-            np.str_ if not using_infer_string else "string[pyarrow_numpy]"
+            np.str_ if not using_infer_string else "str"
         )
         data = StringIO(df_mixed.to_json(orient="split"))
         df_roundtrip = read_json(data, orient="split")
@@ -703,7 +699,7 @@ def test_series_roundtrip_simple(self, orient, string_series, using_infer_string
         expected = string_series
         if using_infer_string and orient in ("split", "index", "columns"):
             # These schemas don't contain dtypes, so we infer string
-            expected.index = expected.index.astype("string[pyarrow_numpy]")
+            expected.index = expected.index.astype("str")
         if orient in ("values", "records"):
             expected = expected.reset_index(drop=True)
         if orient != "split":
@@ -722,6 +718,9 @@ def test_series_roundtrip_object(self, orient, dtype, object_series):
         if orient != "split":
             expected.name = None

+        if using_string_dtype():
+            expected = expected.astype("str")
+
         tm.assert_series_equal(result, expected)

     def test_series_roundtrip_empty(self, orient):
@@ -1491,7 +1490,7 @@ def test_from_json_to_json_table_dtypes(self):

     # TODO: We are casting to string which coerces None to NaN before casting back
     # to object, ending up with incorrect na values
-    @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="incorrect na conversion")
+    @pytest.mark.xfail(using_string_dtype(), reason="incorrect na conversion")
     @pytest.mark.parametrize("orient", ["split", "records", "index", "columns"])
     def test_to_json_from_json_columns_dtypes(self, orient):
         # GH21892 GH33205
@@ -1750,7 +1749,7 @@ def test_to_json_indent(self, indent):
         assert result == expected

     @pytest.mark.skipif(
-        using_pyarrow_string_dtype(),
+        using_string_dtype(),
         reason="Adjust expected when infer_string is default, no bug here, "
         "just a complicated parametrization",
     )
@@ -2026,14 +2025,11 @@ def test_json_uint64(self):
         result = df.to_json(orient="split")
         assert result == expected

-    @pytest.mark.parametrize(
-        "orient", ["split", "records", "values", "index", "columns"]
-    )
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     def test_read_json_dtype_backend(
         self, string_storage, dtype_backend, orient, using_infer_string
     ):
         # GH#50750
-        pa = pytest.importorskip("pyarrow")
         df = DataFrame(
             {
                 "a": Series([1, np.nan, 3], dtype="Int64"),
@@ -2047,30 +2043,18 @@ def test_read_json_dtype_backend(
             }
         )

-        if using_infer_string:
-            string_array = ArrowStringArrayNumpySemantics(pa.array(["a", "b", "c"]))
-            string_array_na = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None]))
-        elif string_storage == "python":
-            string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
-            string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
-
-        elif dtype_backend == "pyarrow":
-            pa = pytest.importorskip("pyarrow")
-            from pandas.arrays import ArrowExtensionArray
-
-            string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
-            string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
-
-        else:
-            string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
-            string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
-
         out = df.to_json(orient=orient)
         with pd.option_context("mode.string_storage", string_storage):
             result = read_json(
                 StringIO(out), dtype_backend=dtype_backend, orient=orient
             )

+        if dtype_backend == "pyarrow":
+            pa = pytest.importorskip("pyarrow")
+            string_dtype = pd.ArrowDtype(pa.string())
+        else:
+            string_dtype = pd.StringDtype(string_storage)
+
         expected = DataFrame(
             {
                 "a": Series([1, np.nan, 3], dtype="Int64"),
@@ -2079,12 +2063,13 @@ def test_read_json_dtype_backend(
                 "d": Series([1.5, 2.0, 2.5], dtype="Float64"),
                 "e": Series([True, False, NA], dtype="boolean"),
                 "f": Series([True, False, True], dtype="boolean"),
-                "g": string_array,
-                "h": string_array_na,
+                "g": Series(["a", "b", "c"], dtype=string_dtype),
+                "h": Series(["a", "b", None], dtype=string_dtype),
             }
         )

         if dtype_backend == "pyarrow":
+            pa = pytest.importorskip("pyarrow")
             from pandas.arrays import ArrowExtensionArray

             expected = DataFrame(
@@ -2097,7 +2082,9 @@ def test_read_json_dtype_backend(
         if orient == "values":
             expected.columns = list(range(8))

-        tm.assert_frame_equal(result, expected)
+        # the storage of the str columns' Index is also affected by the
+        # string_storage setting -> ignore that for checking the result
+        tm.assert_frame_equal(result, expected, check_column_type=False)

     @pytest.mark.parametrize("orient", ["split", "records", "index"])
     def test_read_json_nullable_series(self, string_storage, dtype_backend, orient):
@@ -2146,18 +2133,18 @@ def test_pyarrow_engine_lines_false():


 def test_json_roundtrip_string_inference(orient):
-    pytest.importorskip("pyarrow")
     df = DataFrame(
         [["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"]
     )
     out = df.to_json()
     with pd.option_context("future.infer_string", True):
         result = read_json(StringIO(out))
+    dtype = pd.StringDtype(na_value=np.nan)
     expected = DataFrame(
         [["a", "b"], ["c", "d"]],
-        dtype="string[pyarrow_numpy]",
-        index=Index(["row 1", "row 2"], dtype="string[pyarrow_numpy]"),
-        columns=Index(["col 1", "col 2"], dtype="string[pyarrow_numpy]"),
+        dtype=dtype,
+        index=Index(["row 1", "row 2"], dtype=dtype),
+        columns=Index(["col 1", "col 2"], dtype=dtype),
     )
     tm.assert_frame_equal(result, expected)

@@ -2172,3 +2159,30 @@ def test_json_pos_args_deprecation():
     with tm.assert_produces_warning(FutureWarning, match=msg):
         buf = BytesIO()
         df.to_json(buf, "split")
+
+
+@td.skip_if_no("pyarrow")
+def test_to_json_ea_null():
+    # GH#57224
+    df = DataFrame(
+        {
+            "a": Series([1, NA], dtype="int64[pyarrow]"),
+            "b": Series([2, NA], dtype="Int64"),
+        }
+    )
+    result = df.to_json(orient="records", lines=True)
+    expected = """{"a":1,"b":2}
+{"a":null,"b":null}
+"""
+    assert result == expected
+
+
+def test_read_json_lines_rangeindex():
+    # GH 57429
+    data = """
+{"a": 1, "b": 2}
+{"a": 3, "b": 4}
+"""
+    result = read_json(StringIO(data), lines=True).index
+    expected = RangeIndex(2)
+    tm.assert_index_equal(result, expected, exact=True)
diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py
index 5e47bcc1c5b0e..5226476ef6eac 100644
--- a/pandas/tests/io/parser/common/test_chunksize.py
+++ b/pandas/tests/io/parser/common/test_chunksize.py
@@ -220,21 +220,15 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
     data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)

     # Coercions should work without warnings.
-    warn = None
-    if parser.engine == "pyarrow":
-        warn = DeprecationWarning
-    depr_msg = "Passing a BlockManager to DataFrame"
-    with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False):
-        with monkeypatch.context() as m:
-            m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic)
-            result = parser.read_csv(StringIO(data))
+    with monkeypatch.context() as m:
+        m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic)
+        result = parser.read_csv(StringIO(data))

     assert type(result.a[0]) is np.float64
     assert result.a.dtype == float


-@pytest.mark.filterwarnings("ignore:make_block is deprecated:FutureWarning")
-def test_warn_if_chunks_have_mismatched_type(all_parsers):
+def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string):
     warning_type = None
     parser = all_parsers
     size = 10000
@@ -252,11 +246,8 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers):
     buf = StringIO(data)

     if parser.engine == "pyarrow":
-        df = parser.read_csv_check_warnings(
-            DeprecationWarning,
-            "Passing a BlockManager to DataFrame is deprecated",
+        df = parser.read_csv(
             buf,
-            check_stacklevel=False,
         )
     else:
         df = parser.read_csv_check_warnings(
@@ -265,8 +256,12 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers):
             "Specify dtype option on import or set low_memory=False.",
             buf,
         )
-
-    assert df.a.dtype == object
+    if parser.engine == "c" and parser.low_memory:
+        assert df.a.dtype == object
+    elif using_infer_string:
+        assert df.a.dtype == "str"
+    else:
+        assert df.a.dtype == object


 @pytest.mark.parametrize("iterator", [True, False])
diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
index 7ffc49e941c14..2abca1bf52374 100644
--- a/pandas/tests/io/parser/common/test_common_basic.py
+++ b/pandas/tests/io/parser/common/test_common_basic.py
@@ -12,6 +12,9 @@
 import numpy as np
 import pytest

+from pandas._config import using_string_dtype
+
+from pandas.compat import HAS_PYARROW
 from pandas.errors import (
     EmptyDataError,
     ParserError,
@@ -915,6 +918,7 @@ def test_dict_keys_as_names(all_parsers):
     tm.assert_frame_equal(result, expected)


+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 @xfail_pyarrow  # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0
 def test_encoding_surrogatepass(all_parsers):
     # GH39017
diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py
index a7a8d031da215..d573b47bb3279 100644
--- a/pandas/tests/io/parser/common/test_file_buffer_url.py
+++ b/pandas/tests/io/parser/common/test_file_buffer_url.py
@@ -72,8 +72,8 @@ def test_path_path_lib(all_parsers):
     parser = all_parsers
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
     result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0))
     tm.assert_frame_equal(df, result)
@@ -84,8 +84,8 @@ def test_path_local_path(all_parsers):
     parser = all_parsers
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
     result = tm.round_trip_localpath(
         df.to_csv, lambda p: parser.read_csv(p, index_col=0)
diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py
index 038c684c90c9e..aaa14216bd6d6 100644
--- a/pandas/tests/io/parser/common/test_index.py
+++ b/pandas/tests/io/parser/common/test_index.py
@@ -86,7 +86,9 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected):


 @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
-def test_multi_index_no_level_names(all_parsers, index_col):
+def test_multi_index_no_level_names(
+    request, all_parsers, index_col, using_infer_string
+):
     data = """index1,index2,A,B,C,D
 foo,one,2,3,4,5
 foo,two,7,8,9,10
diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py
index 4a4ae2b259289..f5a724bad4fa2 100644
--- a/pandas/tests/io/parser/common/test_read_errors.py
+++ b/pandas/tests/io/parser/common/test_read_errors.py
@@ -130,14 +130,9 @@ def test_catch_too_many_names(all_parsers):
         else "Number of passed names did not match "
         "number of header fields in the file"
     )
-    depr_msg = "Passing a BlockManager to DataFrame is deprecated"
-    warn = None
-    if parser.engine == "pyarrow":
-        warn = DeprecationWarning

-    with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False):
-        with pytest.raises(ValueError, match=msg):
-            parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"])
+    with pytest.raises(ValueError, match=msg):
+        parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"])


 @skip_pyarrow  # CSV parse error: Empty CSV file or block
@@ -168,13 +163,7 @@ def test_suppress_error_output(all_parsers):
     data = "a\n1\n1,2,3\n4\n5,6,7"
     expected = DataFrame({"a": [1, 4]})

-    warn = None
-    if parser.engine == "pyarrow":
-        warn = DeprecationWarning
-    msg = "Passing a BlockManager to DataFrame"
-
-    with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False):
-        result = parser.read_csv(StringIO(data), on_bad_lines="skip")
+    result = parser.read_csv(StringIO(data), on_bad_lines="skip")
     tm.assert_frame_equal(result, expected)


diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py
index 6d5f870f07206..90f77a7024235 100644
--- a/pandas/tests/io/parser/conftest.py
+++ b/pandas/tests/io/parser/conftest.py
@@ -4,6 +4,7 @@

 import pytest

+from pandas.compat import HAS_PYARROW
 from pandas.compat._optional import VERSIONS

 from pandas import (
@@ -117,7 +118,15 @@ def csv1(datapath):

 _py_parsers_only = [_pythonParser]
 _c_parsers_only = [_cParserHighMemory, _cParserLowMemory]
-_pyarrow_parsers_only = [pytest.param(_pyarrowParser, marks=pytest.mark.single_cpu)]
+_pyarrow_parsers_only = [
+    pytest.param(
+        _pyarrowParser,
+        marks=[
+            pytest.mark.single_cpu,
+            pytest.mark.skipif(not HAS_PYARROW, reason="pyarrow is not installed"),
+        ],
+    )
+]

 _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only]

@@ -181,7 +190,16 @@ def _get_all_parser_float_precision_combinations():
             parser = parser.values[0]
         for precision in parser.float_precision_choices:
             # Re-wrap in pytest.param for pyarrow
-            mark = pytest.mark.single_cpu if parser.engine == "pyarrow" else ()
+            mark = (
+                [
+                    pytest.mark.single_cpu,
+                    pytest.mark.skipif(
+                        not HAS_PYARROW, reason="pyarrow is not installed"
+                    ),
+                ]
+                if parser.engine == "pyarrow"
+                else ()
+            )
             param = pytest.param((parser(), precision), marks=mark)
             params.append(param)
             ids.append(f"{parser_id}-{precision}")
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
index ce02e752fb90b..d28c43c45647a 100644
--- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
+++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -16,21 +16,19 @@
     Timestamp,
 )
 import pandas._testing as tm
-from pandas.core.arrays import (
-    ArrowStringArray,
-    IntegerArray,
-    StringArray,
-)
+from pandas.core.arrays import IntegerArray

 pytestmark = pytest.mark.filterwarnings(
     "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
 )

+xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
+

 @pytest.mark.parametrize("dtype", [str, object])
 @pytest.mark.parametrize("check_orig", [True, False])
 @pytest.mark.usefixtures("pyarrow_xfail")
-def test_dtype_all_columns(all_parsers, dtype, check_orig):
+def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string):
     # see gh-3795, gh-6607
     parser = all_parsers

@@ -48,8 +46,10 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig):
     if check_orig:
         expected = df.copy()
         result = result.astype(float)
-    else:
+    elif using_infer_string and dtype is str:
         expected = df.astype(str)
+    else:
+        expected = df.astype(str).astype(object)

     tm.assert_frame_equal(result, expected)

@@ -67,7 +67,6 @@ def test_dtype_per_column(all_parsers):
         [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"]
     )
     expected["one"] = expected["one"].astype(np.float64)
-    expected["two"] = expected["two"].astype(object)

     result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str})
     tm.assert_frame_equal(result, expected)
@@ -460,8 +459,6 @@ def test_dtype_backend_and_dtype(all_parsers):

 def test_dtype_backend_string(all_parsers, string_storage):
     # GH#36712
-    pa = pytest.importorskip("pyarrow")
-
     with pd.option_context("mode.string_storage", string_storage):
         parser = all_parsers

@@ -471,21 +468,13 @@ def test_dtype_backend_string(all_parsers, string_storage):
         """
         result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable")

-        if string_storage == "python":
-            expected = DataFrame(
-                {
-                    "a": StringArray(np.array(["a", "b"], dtype=np.object_)),
-                    "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)),
-                }
-            )
-        else:
-            expected = DataFrame(
-                {
-                    "a": ArrowStringArray(pa.array(["a", "b"])),
-                    "b": ArrowStringArray(pa.array(["x", None])),
-                }
-            )
-        tm.assert_frame_equal(result, expected)
+        expected = DataFrame(
+            {
+                "a": pd.array(["a", "b"], dtype=pd.StringDtype(string_storage)),
+                "b": pd.array(["x", pd.NA], dtype=pd.StringDtype(string_storage)),
+            },
+        )
+        tm.assert_frame_equal(result, expected)


 def test_dtype_backend_ea_dtype_specified(all_parsers):
@@ -556,8 +545,7 @@ def test_ea_int_avoid_overflow(all_parsers):

 def test_string_inference(all_parsers):
     # GH#54430
-    pytest.importorskip("pyarrow")
-    dtype = "string[pyarrow_numpy]"
+    dtype = pd.StringDtype(na_value=np.nan)

     data = """a,b
 x,1
@@ -575,10 +563,8 @@ def test_string_inference(all_parsers):


 @pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_])
-def test_string_inference_object_dtype(all_parsers, dtype):
+def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string):
     # GH#56047
-    pytest.importorskip("pyarrow")
-
     data = """a,b
 x,a
 y,a
@@ -587,12 +573,13 @@ def test_string_inference_object_dtype(all_parsers, dtype):
     with pd.option_context("future.infer_string", True):
         result = parser.read_csv(StringIO(data), dtype=dtype)

+    expected_dtype = pd.StringDtype(na_value=np.nan) if dtype is str else object
     expected = DataFrame(
         {
-            "a": pd.Series(["x", "y", "z"], dtype=object),
-            "b": pd.Series(["a", "a", "a"], dtype=object),
+            "a": pd.Series(["x", "y", "z"], dtype=expected_dtype),
+            "b": pd.Series(["a", "a", "a"], dtype=expected_dtype),
         },
-        columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"),
+        columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
     )
     tm.assert_frame_equal(result, expected)

@@ -601,14 +588,15 @@ def test_string_inference_object_dtype(all_parsers, dtype):

     expected = DataFrame(
         {
-            "a": pd.Series(["x", "y", "z"], dtype=object),
-            "b": pd.Series(["a", "a", "a"], dtype="string[pyarrow_numpy]"),
+            "a": pd.Series(["x", "y", "z"], dtype=expected_dtype),
+            "b": pd.Series(["a", "a", "a"], dtype=pd.StringDtype(na_value=np.nan)),
         },
-        columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"),
+        columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
     )
     tm.assert_frame_equal(result, expected)


+@xfail_pyarrow
 def test_accurate_parsing_of_large_integers(all_parsers):
     # GH#52505
     data = """SYMBOL,MOMENT,ID,ID_DEAL
@@ -619,7 +607,7 @@ def test_accurate_parsing_of_large_integers(all_parsers):
 AMZN,20230301181139587,2023552585717889759,2023552585717263360
 MSFT,20230301181139587,2023552585717889863,2023552585717263361
 NVDA,20230301181139587,2023552585717889827,2023552585717263361"""
-    orders = pd.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()})
+    orders = all_parsers.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()})
     assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1
     assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1
     assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2
@@ -641,3 +629,16 @@ def test_dtypes_with_usecols(all_parsers):
         values = ["1", "4"]
     expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]})
     tm.assert_frame_equal(result, expected)
+
+
+def test_index_col_with_dtype_no_rangeindex(all_parsers):
+    data = StringIO("345.5,519.5,0\n519.5,726.5,1")
+    result = all_parsers.read_csv(
+        data,
+        header=None,
+        names=["start", "stop", "bin_id"],
+        dtype={"start": np.float32, "stop": np.float32, "bin_id": np.uint32},
+        index_col="bin_id",
+    ).index
+    expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id")
+    tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py
index 27d7bc0bb6c07..5b72f76440349 100644
--- a/pandas/tests/io/parser/test_c_parser_only.py
+++ b/pandas/tests/io/parser/test_c_parser_only.py
@@ -183,7 +183,7 @@ def error(val: float, actual_val: Decimal) -> Decimal:
     assert max(precise_errors) <= max(normal_errors)


-def test_usecols_dtypes(c_parser_only):
+def test_usecols_dtypes(c_parser_only, using_infer_string):
     parser = c_parser_only
     data = """\
 1,2,3
@@ -208,8 +208,12 @@ def test_usecols_dtypes(c_parser_only):
         dtype={"b": int, "c": float},
     )

-    assert (result.dtypes == [object, int, float]).all()
-    assert (result2.dtypes == [object, float]).all()
+    if using_infer_string:
+        assert (result.dtypes == ["string", int, float]).all()
+        assert (result2.dtypes == ["string", float]).all()
+    else:
+        assert (result.dtypes == [object, int, float]).all()
+        assert (result2.dtypes == [object, float]).all()


 def test_disable_bool_parsing(c_parser_only):
diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py
index 7f3e45324dbd2..1848e1e571fc1 100644
--- a/pandas/tests/io/parser/test_converters.py
+++ b/pandas/tests/io/parser/test_converters.py
@@ -202,7 +202,7 @@ def test_converter_index_col_bug(all_parsers, conv_f):
         StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
     )

-    xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object"))
+    xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A"))
     tm.assert_frame_equal(rs, xp)

diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py
index 7a72e66996d43..803114723bc74 100644
--- a/pandas/tests/io/parser/test_dialect.py
+++ b/pandas/tests/io/parser/test_dialect.py
@@ -26,7 +26,7 @@ def custom_dialect():
         "escapechar": "~",
         "delimiter": ":",
         "skipinitialspace": False,
-        "quotechar": "~",
+        "quotechar": "`",
         "quoting": 3,
     }
     return dialect_name, dialect_kwargs
diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py
index ba15d061b2deb..9224b743b8917 100644
--- a/pandas/tests/io/parser/test_index_col.py
+++ b/pandas/tests/io/parser/test_index_col.py
@@ -352,7 +352,7 @@ def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
             pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine")
         )
     result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype})
-    expected = DataFrame({"b": [2]}, index=Index([val], name="a"))
+    expected = DataFrame({"b": [2]}, index=Index([val], name="a", dtype=dtype))
     tm.assert_frame_equal(result, expected)

diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py
index 1d245f81f027c..80c32d3a6262e 100644
--- a/pandas/tests/io/parser/test_mangle_dupes.py
+++ b/pandas/tests/io/parser/test_mangle_dupes.py
@@ -7,7 +7,10 @@

 import pytest

-from pandas import DataFrame
+from pandas import (
+    DataFrame,
+    Index,
+)
 import pandas._testing as tm

 xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
@@ -129,7 +132,7 @@ def test_mangled_unnamed_placeholders(all_parsers):

     # This test recursively updates `df`.
     for i in range(3):
-        expected = DataFrame()
+        expected = DataFrame(columns=Index([], dtype="str"))

         for j in range(i + 1):
             col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1)
diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py
index da9b9bddd30cd..704ca010f6506 100644
--- a/pandas/tests/io/parser/test_multi_thread.py
+++ b/pandas/tests/io/parser/test_multi_thread.py
@@ -12,6 +12,7 @@
 import pandas as pd
 from pandas import DataFrame
 import pandas._testing as tm
+from pandas.util.version import Version

 xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")

@@ -23,10 +24,16 @@
 ]


-@xfail_pyarrow  # ValueError: Found non-unique column index
-def test_multi_thread_string_io_read_csv(all_parsers):
+@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
+def test_multi_thread_string_io_read_csv(all_parsers, request):
     # see gh-11786
     parser = all_parsers
+    if parser.engine == "pyarrow":
+        pa = pytest.importorskip("pyarrow")
+        if Version(pa.__version__) < Version("16.0"):
+            request.applymarker(
+                pytest.mark.xfail(reason="# ValueError: Found non-unique column index")
+            )
     max_row_range = 100
     num_files = 10

diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
index ca106fa772e82..dd168aaa45808 100644
--- a/pandas/tests/io/parser/test_na_values.py
+++ b/pandas/tests/io/parser/test_na_values.py
@@ -303,7 +303,9 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected):
         ),
     ],
 )
-def test_na_values_keep_default(all_parsers, kwargs, expected, request):
+def test_na_values_keep_default(
+    all_parsers, kwargs, expected, request, using_infer_string
+):
     data = """\
 A,B,C
 a,1,one
@@ -321,8 +323,9 @@ def test_na_values_keep_default(all_parsers, kwargs, expected, request):
             with pytest.raises(ValueError, match=msg):
                 parser.read_csv(StringIO(data), **kwargs)
             return
-        mark = pytest.mark.xfail()
-        request.applymarker(mark)
+        if not using_infer_string or "na_values" in kwargs:
+            mark = pytest.mark.xfail()
+            request.applymarker(mark)

     result = parser.read_csv(StringIO(data), **kwargs)
     tm.assert_frame_equal(result, expected)
@@ -432,7 +435,6 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v
     tm.assert_frame_equal(result, expected)


-@xfail_pyarrow  # mismatched dtypes in both cases, FutureWarning in the True case
 @pytest.mark.parametrize(
     "na_filter,row_data",
     [
@@ -440,14 +442,21 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v
         (False, [["1", "A"], ["nan", "B"], ["3", "C"]]),
     ],
 )
-def test_na_values_na_filter_override(all_parsers, na_filter, row_data):
+def test_na_values_na_filter_override(
+    request, all_parsers, na_filter, row_data, using_infer_string
+):
+    parser = all_parsers
+    if parser.engine == "pyarrow":
+        # mismatched dtypes in both cases, FutureWarning in the True case
+        if not (using_infer_string and na_filter):
+            mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
+            request.applymarker(mark)
     data = """\
 A,B
 1,A
 nan,B
 3,C
 """
-    parser = all_parsers
     result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter)

     expected = DataFrame(row_data, columns=["A", "B"])
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index d8f362039ba13..616fcb81cf055 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -33,12 +33,9 @@

 from pandas.io.parsers import read_csv

-pytestmark = [
-    pytest.mark.filterwarnings(
-        "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
-    ),
-    pytest.mark.filterwarnings("ignore:make_block is deprecated:DeprecationWarning"),
-]
+pytestmark = pytest.mark.filterwarnings(
+    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
+)

 xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
 skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@@ -1807,7 +1804,7 @@ def test_parse_timezone(all_parsers):
 )
 def test_invalid_parse_delimited_date(all_parsers, date_string):
     parser = all_parsers
-    expected = DataFrame({0: [date_string]}, dtype="object")
+    expected = DataFrame({0: [date_string]}, dtype="str")
     result = parser.read_csv(
         StringIO(date_string),
         header=None,
@@ -2086,7 +2083,7 @@ def test_dayfirst_warnings():

     # first in DD/MM/YYYY, second in MM/DD/YYYY
     input = "date\n31/12/2014\n03/30/2011"
-    expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date")
+    expected = Index(["31/12/2014", "03/30/2011"], dtype="str", name="date")

     # A. use dayfirst=True
     res5 = read_csv(
@@ -2212,7 +2209,7 @@ def test_parse_dot_separated_dates(all_parsers):
     if parser.engine == "pyarrow":
         expected_index = Index(
             ["27.03.2003 14:55:00.000", "03.08.2003 15:20:00.000"],
-            dtype="object",
+            dtype="str",
             name="a",
         )
         warn = None
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
index dbd474c6ae0b9..5f2ddf7de9c6d 100644
--- a/pandas/tests/io/parser/test_python_parser_only.py
+++ b/pandas/tests/io/parser/test_python_parser_only.py
@@ -520,6 +520,8 @@ def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, d
             "c": [0, 4000, 131],
         }
     )
+    if dtype["a"] == object:
+        expected["a"] = expected["a"].astype(object)
     tm.assert_frame_equal(result, expected)


diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py
index bed2b5e10a6f7..d8fe168341ff1 100644
--- a/pandas/tests/io/parser/test_read_fwf.py
+++ b/pandas/tests/io/parser/test_read_fwf.py
@@ -22,10 +22,6 @@
     DatetimeIndex,
 )
 import pandas._testing as tm
-from pandas.core.arrays import (
-    ArrowStringArray,
-    StringArray,
-)

 from pandas.io.common import urlopen
 from pandas.io.parsers import (
@@ -968,36 +964,28 @@ def test_widths_and_usecols():

 def test_dtype_backend(string_storage, dtype_backend):
     # GH#50289
-    if string_storage == "python":
-        arr = StringArray(np.array(["a", "b"], dtype=np.object_))
-        arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_))
-    elif dtype_backend == "pyarrow":
-        pa = pytest.importorskip("pyarrow")
-        from pandas.arrays import ArrowExtensionArray
-
-        arr = ArrowExtensionArray(pa.array(["a", "b"]))
-        arr_na = ArrowExtensionArray(pa.array([None, "a"]))
-    else:
-        pa = pytest.importorskip("pyarrow")
-        arr = ArrowStringArray(pa.array(["a", "b"]))
-        arr_na = ArrowStringArray(pa.array([None, "a"]))
-
     data = """a  b    c      d  e     f  g    h  i
 1  2.5  True  a
 3  4.5  False b  True  6  7.5  a"""
     with pd.option_context("mode.string_storage", string_storage):
         result = read_fwf(StringIO(data), dtype_backend=dtype_backend)

+    if dtype_backend == "pyarrow":
+        pa = pytest.importorskip("pyarrow")
+        string_dtype = pd.ArrowDtype(pa.string())
+    else:
+        string_dtype = pd.StringDtype(string_storage)
+
     expected = DataFrame(
         {
             "a": pd.Series([1, 3], dtype="Int64"),
             "b": pd.Series([2.5, 4.5], dtype="Float64"),
             "c": pd.Series([True, False], dtype="boolean"),
-            "d": arr,
+            "d": pd.Series(["a", "b"], dtype=string_dtype),
             "e": pd.Series([pd.NA, True], dtype="boolean"),
             "f": pd.Series([pd.NA, 6], dtype="Int64"),
             "g": pd.Series([pd.NA, 7.5], dtype="Float64"),
-            "h": arr_na,
+            "h": pd.Series([None, "a"], dtype=string_dtype),
             "i": pd.Series([pd.NA, pd.NA], dtype="Int64"),
         }
     )
@@ -1013,7 +1001,9 @@ def test_dtype_backend(string_storage, dtype_backend):
         )
         expected["i"] = ArrowExtensionArray(pa.array([None, None]))

-    tm.assert_frame_equal(result, expected)
+    # the storage of the str columns' Index is also affected by the
+    # string_storage setting -> ignore that for checking the result
+    tm.assert_frame_equal(result, expected, check_column_type=False)


 def test_invalid_dtype_backend():
diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py
index 00a81a4f1f385..d0246c8f58d6a 100644
--- a/pandas/tests/io/pytables/test_append.py
+++ b/pandas/tests/io/pytables/test_append.py
@@ -23,7 +23,7 @@
     ensure_clean_store,
 )

-pytestmark = pytest.mark.single_cpu
+pytestmark = [pytest.mark.single_cpu]

 tables = pytest.importorskip("tables")

@@ -35,7 +35,7 @@ def test_append(setup_path):
         #     tables.NaturalNameWarning):
         df = DataFrame(
             np.random.default_rng(2).standard_normal((20, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=20, freq="B"),
         )
         _maybe_remove(store, "df1")
@@ -196,7 +196,7 @@ def test_append_some_nans(setup_path):
         tm.assert_frame_equal(store["df3"], df3, check_index_type=True)


-def test_append_all_nans(setup_path):
+def test_append_all_nans(setup_path, using_infer_string):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             {
@@ -248,7 +248,13 @@ def test_append_all_nans(setup_path):
         _maybe_remove(store, "df")
         store.append("df", df[:10], dropna=True)
         store.append("df", df[10:], dropna=True)
-        tm.assert_frame_equal(store["df"], df, check_index_type=True)
+        result = store["df"]
+        expected = df
+        if using_infer_string:
+            # TODO: Test is incorrect when not using_infer_string.
+            # Should take the last 4 rows unconditionally.
+            expected = expected[-4:]
+        tm.assert_frame_equal(result, expected, check_index_type=True)

         _maybe_remove(store, "df2")
         store.append("df2", df[:10], dropna=False)
@@ -287,7 +293,7 @@ def test_append_frame_column_oriented(setup_path):
         # column oriented
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df.index = df.index._with_freq(None)  # freq doesn't round-trip
@@ -412,7 +418,7 @@ def check_col(key, name, size):
             {
                 "A": [0.0, 1.0, 2.0, 3.0, 4.0],
                 "B": [0.0, 1.0, 0.0, 1.0, 0.0],
-                "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object),
+                "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]),
                 "D": date_range("20130101", periods=5),
             }
         ).set_index("C")
@@ -439,7 +445,7 @@ def check_col(key, name, size):
         _maybe_remove(store, "df")
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df["string"] = "foo"
@@ -503,7 +509,7 @@ def test_append_with_data_columns(setup_path):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df.iloc[0, df.columns.get_loc("B")] = 1.0
@@ -679,8 +685,8 @@ def test_append_misc(setup_path):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         store.append("df", df, chunksize=1)
         result = store.select("df")
@@ -696,8 +702,8 @@ def test_append_misc_chunksize(setup_path, chunksize):
     # more chunksize in append tests
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
     df["string"] = "foo"
     df["float322"] = 1.0
@@ -737,15 +743,15 @@ def test_append_misc_empty_frame(setup_path):
 # the conversion from AM->BM converts the invalid object dtype column into
 # a datetime64 column no longer raising an error
 @td.skip_array_manager_not_yet_implemented
-def test_append_raise(setup_path):
+def test_append_raise(setup_path, using_infer_string):
     with ensure_clean_store(setup_path) as store:
         # test append with invalid input to get good error messages

         # list in column
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         df["invalid"] = [["a"]] * len(df)
         assert df.dtypes["invalid"] == np.object_
@@ -765,8 +771,8 @@ def test_append_raise(setup_path):
         # datetime with embedded nans as object
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         s = Series(datetime.datetime(2001, 1, 2), index=df.index)
         s = s.astype(object)
@@ -793,8 +799,8 @@ def test_append_raise(setup_path):
         # appending an incompatible table
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         store.append("df", df)

@@ -813,12 +819,9 @@ def test_append_raise(setup_path):
             store.append("df", df)
         df["foo"] = "bar"
         msg = re.escape(
-            "invalid combination of [values_axes] on appending data "
-            "[name->values_block_1,cname->values_block_1,"
-            "dtype->bytes24,kind->string,shape->(1, 30)] "
-            "vs current table "
-            "[name->values_block_1,cname->values_block_1,"
-            "dtype->datetime64[s],kind->datetime64[s],shape->None]"
+            "Cannot serialize the column [foo] "
+            "because its data contents are not [string] "
+            "but [datetime64[s]] object dtype"
         )
         with pytest.raises(ValueError, match=msg):
             store.append("df", df)
@@ -874,7 +877,7 @@ def test_append_with_timedelta(setup_path):
 def test_append_to_multiple(setup_path):
     df1 = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     df2 = df1.copy().rename(columns="{}_2".format)
@@ -911,12 +914,12 @@ def test_append_to_multiple(setup_path):
 def test_append_to_multiple_dropna(setup_path):
     df1 = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     df2 = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     ).rename(columns="{}_2".format)
     df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
@@ -936,7 +939,7 @@ def test_append_to_multiple_dropna(setup_path):
 def test_append_to_multiple_dropna_false(setup_path):
     df1 = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     df2 = df1.copy().rename(columns="{}_2".format)
@@ -984,3 +987,29 @@ def test_append_to_multiple_min_itemsize(setup_path):
     )
     result = store.select_as_multiple(["index", "nums", "strs"])
     tm.assert_frame_equal(result, expected, check_index_type=True)
+
+
+def test_append_string_nan_rep(setup_path):
+    # GH 16300
+    df = DataFrame({"A": "a", "B": "foo"}, index=np.arange(10))
+    df_nan = df.copy()
+    df_nan.loc[0:4, :] = np.nan
+    msg = "NaN representation is too large for existing column size"
+
+    with ensure_clean_store(setup_path) as store:
+        # string column too small
+        store.append("sa", df["A"])
+        with pytest.raises(ValueError, match=msg):
+            store.append("sa", df_nan["A"])
+
+        # nan_rep too big
+        store.append("sb", df["B"], nan_rep="bars")
+        with pytest.raises(ValueError, match=msg):
+            store.append("sb", df_nan["B"])
+
+        # smaller modified nan_rep
+        store.append("sc", df["A"], nan_rep="n")
+        store.append("sc", df_nan["A"])
+        result = store["sc"]
+        expected = concat([df["A"], df_nan["A"]])
+        tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/io/pytables/test_categorical.py b/pandas/tests/io/pytables/test_categorical.py
index 58ebdfe7696b4..449bc5cf1fc57 100644
--- a/pandas/tests/io/pytables/test_categorical.py
+++ b/pandas/tests/io/pytables/test_categorical.py
@@ -14,7 +14,7 @@
     ensure_clean_store,
 )

-pytestmark = pytest.mark.single_cpu
+pytestmark = [pytest.mark.single_cpu]


 def test_categorical(setup_path):
diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py
index 2021101098892..b28101c09820f 100644
--- a/pandas/tests/io/pytables/test_errors.py
+++ b/pandas/tests/io/pytables/test_errors.py
@@ -22,7 +22,7 @@
     _maybe_adjust_name,
 )

-pytestmark = pytest.mark.single_cpu
+pytestmark = [pytest.mark.single_cpu]


 def test_pass_spec_to_storer(setup_path):
@@ -88,9 +88,14 @@ def test_unimplemented_dtypes_table_columns(setup_path):

     with ensure_clean_store(setup_path) as store:
         # this fails because we have a date in the object block......
-        msg = re.escape(
-            """Cannot serialize the column [datetime1]
-because its data contents are not [string] but [date] object dtype"""
+        msg = "|".join(
+            [
+                re.escape(
+                    "Cannot serialize the column [datetime1]\nbecause its data "
+                    "contents are not [string] but [date] object dtype"
+                ),
+                re.escape("[date] is not implemented as a table column"),
+            ]
         )
         with pytest.raises(TypeError, match=msg):
             store.append("df_unimplemented", df)
diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py
index d93de16816725..100a55e6e346d 100644
--- a/pandas/tests/io/pytables/test_file_handling.py
+++ b/pandas/tests/io/pytables/test_file_handling.py
@@ -32,11 +32,11 @@
 from pandas.io import pytables
 from pandas.io.pytables import Term

-pytestmark = pytest.mark.single_cpu
+pytestmark = [pytest.mark.single_cpu]


 @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"])
-def test_mode(setup_path, tmp_path, mode):
+def test_mode(setup_path, tmp_path, mode, using_infer_string):
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
         columns=Index(list("ABCD"), dtype=object),
@@ -85,10 +85,12 @@ def test_mode(setup_path, tmp_path, mode, using_infer_string):
                 read_hdf(path, "df", mode=mode)
         else:
             result = read_hdf(path, "df", mode=mode)
+            if using_infer_string:
+                df.columns = df.columns.astype("str")
             tm.assert_frame_equal(result, df)


-def test_default_mode(tmp_path, setup_path):
+def test_default_mode(tmp_path, setup_path, using_infer_string):
     # read_hdf uses default mode
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
@@ -98,7 +100,10 @@ def test_default_mode(tmp_path, setup_path, using_infer_string):
     path = tmp_path / setup_path
     df.to_hdf(path, key="df", mode="w")
     result = read_hdf(path, "df")
-    tm.assert_frame_equal(result, df)
+    expected = df.copy()
+    if using_infer_string:
+        expected.columns = expected.columns.astype("str")
+    tm.assert_frame_equal(result, expected)


 def test_reopen_handle(tmp_path, setup_path):
@@ -157,7 +162,7 @@ def test_reopen_handle(tmp_path, setup_path):
     assert not store.is_open


-def test_open_args(setup_path):
+def test_open_args(setup_path, using_infer_string):
     with tm.ensure_clean(setup_path) as path:
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
@@ -172,8 +177,13 @@ def test_open_args(setup_path, using_infer_string):
         store["df"] = df
         store.append("df2", df)

-        tm.assert_frame_equal(store["df"], df)
-        tm.assert_frame_equal(store["df2"], df)
+        expected = df.copy()
+        if using_infer_string:
+            expected.index = expected.index.astype("str")
+            expected.columns = expected.columns.astype("str")
+
+        tm.assert_frame_equal(store["df"], expected)
+        tm.assert_frame_equal(store["df2"], expected)

         store.close()

@@ -188,7 +198,7 @@ def test_flush(setup_path):
             store.flush(fsync=True)


-def test_complibs_default_settings(tmp_path, setup_path):
+def test_complibs_default_settings(tmp_path, setup_path, using_infer_string):
     # GH15943
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
@@ -201,7 +211,11 @@ def test_complibs_default_settings(tmp_path, setup_path):
     tmpfile = tmp_path / setup_path
     df.to_hdf(tmpfile, key="df", complevel=9)
     result = read_hdf(tmpfile, "df")
-    tm.assert_frame_equal(result, df)
+    expected = df.copy()
+    if using_infer_string:
+        expected.index = expected.index.astype("str")
+        expected.columns = expected.columns.astype("str")
+    tm.assert_frame_equal(result, expected)

     with tables.open_file(tmpfile, mode="r") as h5file:
         for node in h5file.walk_nodes(where="/df", classname="Leaf"):
@@ -212,7 +226,11 @@ def test_complibs_default_settings(tmp_path, setup_path):
     tmpfile = tmp_path / setup_path
     df.to_hdf(tmpfile, key="df", complib="zlib")
     result = read_hdf(tmpfile, "df")
-    tm.assert_frame_equal(result, df)
+    expected = df.copy()
+    if using_infer_string:
+        expected.index = expected.index.astype("str")
+        expected.columns = expected.columns.astype("str")
+    tm.assert_frame_equal(result, expected)

     with tables.open_file(tmpfile, mode="r") as h5file:
         for node in h5file.walk_nodes(where="/df", classname="Leaf"):
@@ -223,7 +241,11 @@ def test_complibs_default_settings(tmp_path, setup_path):
     tmpfile = tmp_path / setup_path
     df.to_hdf(tmpfile, key="df")
     result = read_hdf(tmpfile, "df")
-    tm.assert_frame_equal(result, df)
+    expected = df.copy()
+    if using_infer_string:
+        expected.index = expected.index.astype("str")
+        expected.columns = expected.columns.astype("str")
+    tm.assert_frame_equal(result, expected)

     with tables.open_file(tmpfile, mode="r") as h5file:
         for node in h5file.walk_nodes(where="/df", classname="Leaf"):
@@ -328,7 +350,7 @@ def test_encoding(setup_path):
         [b"A\xf8\xfc", np.nan, b"", b"b", b"c"],
     ],
 )
-@pytest.mark.parametrize("dtype", ["category", object])
+@pytest.mark.parametrize("dtype", ["category", None])
 def test_latin_encoding(tmp_path, setup_path, dtype, val):
     enc = "latin-1"
     nan_rep = ""
diff --git a/pandas/tests/io/pytables/test_keys.py b/pandas/tests/io/pytables/test_keys.py
index 55bd3f0d5a03a..9c5fc8786c7c6 100644
--- a/pandas/tests/io/pytables/test_keys.py
+++ b/pandas/tests/io/pytables/test_keys.py
@@ -13,7 +13,7 @@
     tables,
 )

-pytestmark = pytest.mark.single_cpu
+pytestmark = [pytest.mark.single_cpu]


 def test_keys(setup_path):
diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py
index bc5f046b7fa33..36ca68eb227a6 100644
--- a/pandas/tests/io/pytables/test_put.py
+++ b/pandas/tests/io/pytables/test_put.py
@@ -22,7 +22,7 @@
 )
 from pandas.util import _test_decorators as td

-pytestmark = pytest.mark.single_cpu
+pytestmark = [pytest.mark.single_cpu]


 def test_format_type(tmp_path, setup_path):
@@ -49,8 +49,8 @@ def test_api_default_format(tmp_path, setup_path):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )

         with pd.option_context("io.hdf.default_format", "fixed"):
@@ -74,8 +74,8 @@ def test_api_default_format(tmp_path, setup_path):
     path = tmp_path / setup_path
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )

     with pd.option_context("io.hdf.default_format", "fixed"):
@@ -101,7 +101,7 @@ def test_put(setup_path):
     )
     df = DataFrame(
         np.random.default_rng(2).standard_normal((20, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=20, freq="B"),
     )
     store["a"] = ts
@@ -161,7 +161,7 @@ def test_put_compression(setup_path):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )

@@ -178,7 +178,7 @@ def test_put_compression(setup_path):
 def test_put_compression_blosc(setup_path):
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )

@@ -192,10 +192,20 @@ def test_put_compression_blosc(setup_path):
     tm.assert_frame_equal(store["c"], df)


-def test_put_mixed_type(setup_path):
+def test_put_datetime_ser(setup_path):
+    # https://github.com/pandas-dev/pandas/pull/60663
+    ser = Series(3 * [Timestamp("20010102").as_unit("ns")])
+    with ensure_clean_store(setup_path) as store:
+        store.put("ser", ser)
+        expected = ser.copy()
+        result = store.get("ser")
+        tm.assert_series_equal(result, expected)
+
+
+def test_put_mixed_type(setup_path, using_infer_string):
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     df["obj1"] = "foo"
@@ -215,13 +225,42 @@ def test_put_mixed_type(setup_path):
     with ensure_clean_store(setup_path) as store:
         _maybe_remove(store, "df")

-        with tm.assert_produces_warning(pd.errors.PerformanceWarning):
+        warning = None if using_infer_string else pd.errors.PerformanceWarning
+        with tm.assert_produces_warning(warning):
             store.put("df", df)

         expected = store.get("df")
         tm.assert_frame_equal(expected, df)


+def test_put_str_frame(setup_path, string_dtype_arguments):
+    # https://github.com/pandas-dev/pandas/pull/60663
+    dtype = pd.StringDtype(*string_dtype_arguments)
+    df = DataFrame({"a": pd.array(["x", pd.NA, "y"], dtype=dtype)})
+    with ensure_clean_store(setup_path) as store:
+        _maybe_remove(store, "df")
+
+        store.put("df", df)
+        expected_dtype = "str" if dtype.na_value is np.nan else "string"
+        expected = df.astype(expected_dtype)
+        result = store.get("df")
+        tm.assert_frame_equal(result, expected)
+
+
+def test_put_str_series(setup_path, string_dtype_arguments):
+    # https://github.com/pandas-dev/pandas/pull/60663
+    dtype = pd.StringDtype(*string_dtype_arguments)
+    ser = Series(["x", pd.NA, "y"], dtype=dtype)
+    with ensure_clean_store(setup_path) as store:
+        _maybe_remove(store, "df")
+
+        store.put("ser", ser)
+        expected_dtype = "str" if dtype.na_value is np.nan else "string"
+        expected = ser.astype(expected_dtype)
+        result = store.get("ser")
+        tm.assert_series_equal(result, expected)
+
+
 @pytest.mark.parametrize("format", ["table", "fixed"])
 @pytest.mark.parametrize(
     "index",
@@ -248,7 +287,7 @@ def test_store_index_types(setup_path, format, index):
         tm.assert_frame_equal(df, store["df"])


-def test_column_multiindex(setup_path):
+def test_column_multiindex(setup_path, using_infer_string):
     # GH 4710
     # recreate multi-indexes properly

@@ -259,6 +298,12 @@ def test_column_multiindex(setup_path):
     expected = df.set_axis(df.index.to_numpy())

     with ensure_clean_store(setup_path) as store:
+        if using_infer_string:
+            # TODO(infer_string) make this work for string dtype
+            msg = "Saving a MultiIndex with an extension dtype is not supported."
+            with pytest.raises(NotImplementedError, match=msg):
+                store.put("df", df)
+            return
         store.put("df", df)
         tm.assert_frame_equal(
             store["df"], expected, check_index_type=True, check_column_type=True
diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py
index e4a3ea1fc9db8..5bec673ad3c70 100644
--- a/pandas/tests/io/pytables/test_read.py
+++ b/pandas/tests/io/pytables/test_read.py
@@ -26,7 +26,7 @@

 from pandas.io.pytables import TableIterator

-pytestmark = pytest.mark.single_cpu
+pytestmark = [pytest.mark.single_cpu]


 def test_read_missing_key_close_store(tmp_path, setup_path):
@@ -75,7 +75,7 @@ def test_read_missing_key_opened_store(tmp_path, setup_path):
 def test_read_column(setup_path):
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )

@@ -216,7 +216,7 @@ def test_legacy_table_read_py2(datapath):
     tm.assert_frame_equal(expected, result)


-def test_read_hdf_open_store(tmp_path, setup_path):
+def test_read_hdf_open_store(tmp_path, setup_path, using_infer_string):
     # GH10330
     # No check for non-string path_or-buf, and no test of open store
     df = DataFrame(
@@ -228,6 +228,12 @@ def test_read_hdf_open_store(tmp_path, setup_path):
     df = df.set_index(keys="E", append=True)

     path = tmp_path / setup_path
+    if using_infer_string:
+        # TODO(infer_string) make this work for string dtype
+        msg = "Saving a MultiIndex with an extension dtype is not supported."
+        with pytest.raises(NotImplementedError, match=msg):
+            df.to_hdf(path, key="df", mode="w")
+        return
     df.to_hdf(path, key="df", mode="w")
     direct = read_hdf(path, "df")
     with HDFStore(path, mode="r") as store:
@@ -398,7 +404,6 @@ def test_read_py2_hdf_file_in_py3(datapath):

 def test_read_infer_string(tmp_path, setup_path):
     # GH#54431
-    pytest.importorskip("pyarrow")
     df = DataFrame({"a": ["a", "b", None]})
     path = tmp_path / setup_path
     df.to_hdf(path, key="data", format="table")
@@ -406,7 +411,7 @@
     result = read_hdf(path, key="data", mode="r")
     expected = DataFrame(
         {"a": ["a", "b", None]},
-        dtype="string[pyarrow_numpy]",
-        columns=Index(["a"], dtype="string[pyarrow_numpy]"),
+        dtype=pd.StringDtype(na_value=np.nan),
+        columns=Index(["a"], dtype=pd.StringDtype(na_value=np.nan)),
     )
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py
index 4ba9787a5a6b9..040708c9cedd0 100644
--- a/pandas/tests/io/pytables/test_round_trip.py
+++ b/pandas/tests/io/pytables/test_round_trip.py
@@ -24,7 +24,7 @@
 )
 from pandas.util import _test_decorators as td

-pytestmark = pytest.mark.single_cpu
+pytestmark = [pytest.mark.single_cpu]


 def test_conv_read_write():
@@ -44,8 +44,8 @@ def roundtrip(key, obj, **kwargs):

     o = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
     tm.assert_frame_equal(o, roundtrip("frame", o))

@@ -145,8 +145,8 @@ def test_api_invalid(tmp_path, setup_path):
     # Invalid.
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )

     msg = "Can only append to Tables"
@@ -196,7 +196,7 @@ def test_put_integer(setup_path):
     _check_roundtrip(df, tm.assert_frame_equal, setup_path)


-def test_table_values_dtypes_roundtrip(setup_path):
+def test_table_values_dtypes_roundtrip(setup_path, using_infer_string):
     with ensure_clean_store(setup_path) as store:
         df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8")
         store.append("df_f8", df1)
@@ -208,12 +208,9 @@ def test_table_values_dtypes_roundtrip(setup_path):

         # incompatible dtype
         msg = re.escape(
-            "invalid combination of [values_axes] on appending data "
-            "[name->values_block_0,cname->values_block_0,"
-            "dtype->float64,kind->float,shape->(1, 3)] vs "
-            "current table [name->values_block_0,"
-            "cname->values_block_0,dtype->int64,kind->integer,"
-            "shape->None]"
+            "Cannot serialize the column [a] "
+            "because its data contents are not [float] "
+            "but [integer] object dtype"
         )
         with pytest.raises(ValueError, match=msg):
             store.append("df_i8", df1)
@@ -242,6 +239,7 @@ def test_table_values_dtypes_roundtrip(setup_path, using_infer_string):
         store.append("df_mixed_dtypes1", df1)
         result = store.select("df_mixed_dtypes1").dtypes.value_counts()
         result.index = [str(i) for i in result.index]
+        str_dtype = "str" if using_infer_string else "object"
         expected = Series(
             {
                 "float32": 2,
@@ -251,7 +249,7 @@ def test_table_values_dtypes_roundtrip(setup_path, using_infer_string):
                 "int16": 1,
                 "int8": 1,
                 "int64": 1,
-                "object": 1,
+                str_dtype: 1,
                 "datetime64[ns]": 2,
             },
             name="count",
@@ -271,10 +269,10 @@ def test_series(setup_path):
     )
     _check_roundtrip(ts, tm.assert_series_equal, path=setup_path)

-    ts2 = Series(ts.index, Index(ts.index, dtype=object))
+    ts2 = Series(ts.index, Index(ts.index))
    _check_roundtrip(ts2, tm.assert_series_equal, path=setup_path)

-    ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object))
+    ts3 = Series(ts.values, Index(np.asarray(ts.index)))
     _check_roundtrip(
         ts3, tm.assert_series_equal, path=setup_path, check_index_type=False
     )
@@ -364,8 +362,8 @@ def test_timeseries_preepoch(setup_path, request):
 def test_frame(compression, setup_path):
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )

     # put in some random NAs
@@ -381,7 +379,7 @@ def test_frame(compression, setup_path):

     tdf = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     _check_roundtrip(
@@ -396,7 +394,10 @@ def test_frame(compression, setup_path):
         assert recons._mgr.is_consolidated()

     # empty
-    _check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path)
+    df2 = df[:0]
+    # Prevent df2 from having index with inferred_type as string
+    df2.index = Index([])
+    _check_roundtrip(df2[:0], tm.assert_frame_equal, path=setup_path)


 def test_empty_series_frame(setup_path):
@@ -428,9 +429,17 @@ def test_can_serialize_dates(setup_path):
     _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)


-def test_store_hierarchical(setup_path, multiindex_dataframe_random_data):
+def test_store_hierarchical(
+    setup_path, using_infer_string, multiindex_dataframe_random_data
+):
     frame = multiindex_dataframe_random_data

+    if using_infer_string:
+        # TODO(infer_string) make this work for string dtype
+        msg = "Saving a MultiIndex with an extension dtype is not supported."
+        with pytest.raises(NotImplementedError, match=msg):
+            _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)
+        return
     _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)
     _check_roundtrip(frame.T, tm.assert_frame_equal, path=setup_path)
     _check_roundtrip(frame["A"], tm.assert_series_equal, path=setup_path)
@@ -449,8 +458,8 @@ def test_store_mixed(compression, setup_path):
     def _make_one():
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         df["obj1"] = "foo"
         df["obj2"] = "bar"
diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py
index 0e303d1c890c5..e76934745f004 100644
--- a/pandas/tests/io/pytables/test_select.py
+++ b/pandas/tests/io/pytables/test_select.py
@@ -24,7 +24,7 @@

 from pandas.io.pytables import Term

-pytestmark = pytest.mark.single_cpu
+pytestmark = [pytest.mark.single_cpu]


 def test_select_columns_in_where(setup_path):
@@ -132,7 +132,7 @@ def test_select(setup_path):
         # select with columns=
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         _maybe_remove(store, "df")
@@ -272,8 +272,8 @@ def test_select_dtypes(setup_path):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         expected = df[df["A"] > 0]

@@ -337,7 +337,7 @@ def test_select_iterator(tmp_path, setup_path):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         _maybe_remove(store, "df")
@@ -362,7 +362,7 @@ def test_select_iterator(tmp_path, setup_path):

         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df.to_hdf(path, key="df_non_table")
@@ -378,7 +378,7 @@ def test_select_iterator(tmp_path, setup_path):

         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df.to_hdf(path, key="df", format="table")
@@ -395,7 +395,7 @@ def test_select_iterator(tmp_path, setup_path):
     with ensure_clean_store(setup_path) as store:
         df1 = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         store.append("df1", df1, data_columns=True)
@@ -423,7 +423,7 @@ def test_select_iterator_complete_8014(setup_path):
     with ensure_clean_store(setup_path) as store:
         expected = DataFrame(
             np.random.default_rng(2).standard_normal((100064, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=100064, freq="s"),
         )
         _maybe_remove(store, "df")
@@ -458,7 +458,7 @@ def test_select_iterator_complete_8014(setup_path):
     with ensure_clean_store(setup_path) as store:
         expected = DataFrame(
             np.random.default_rng(2).standard_normal((100064, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=100064, freq="s"),
         )
         _maybe_remove(store, "df")
@@ -500,7 +500,7 @@ def test_select_iterator_non_complete_8014(setup_path):
     with ensure_clean_store(setup_path) as store:
         expected = DataFrame(
             np.random.default_rng(2).standard_normal((100064, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=100064, freq="s"),
         )
         _maybe_remove(store, "df")
@@ -534,7 +534,7 @@ def test_select_iterator_non_complete_8014(setup_path):
     with ensure_clean_store(setup_path) as store:
         expected = DataFrame(
             np.random.default_rng(2).standard_normal((100064, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=100064, freq="s"),
         )
         _maybe_remove(store, "df")
@@ -558,7 +558,7 @@ def test_select_iterator_many_empty_frames(setup_path):
     with ensure_clean_store(setup_path) as store:
         expected = DataFrame(
             np.random.default_rng(2).standard_normal((100064, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=100064, freq="s"),
         )
         _maybe_remove(store, "df")
@@ -610,7 +610,7 @@ def test_select_iterator_many_empty_frames(setup_path):
 def test_frame_select(setup_path):
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )

@@ -635,7 +635,7 @@ def test_frame_select(setup_path):
         # invalid terms
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         store.append("df_time", df)
@@ -654,7 +654,7 @@ def test_frame_select_complex(setup_path):

     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     df["string"] = "foo"
@@ -771,7 +771,7 @@ def test_invalid_filtering(setup_path):

     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )

@@ -793,7 +793,7 @@ def test_string_select(setup_path):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )

@@ -837,7 +837,7 @@ def test_string_select(setup_path):
 def test_select_as_multiple(setup_path):
     df1 = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     df2 = df1.copy().rename(columns="{}_2".format)
@@ -1038,7 +1038,6 @@ def test_select_large_integer(tmp_path):
         ),
         columns=["x", "y"],
     )
-    result = None
     with HDFStore(path) as s:
         s.append("data", df, data_columns=True, index=False)
         result = s.select("data", where="y==-9223372036854775801").get("y").get(0)
diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py
index 82d3052e7f5d6..f51d61e2d633c 100644
--- a/pandas/tests/io/pytables/test_store.py
+++ b/pandas/tests/io/pytables/test_store.py
@@ -7,6 +7,10 @@
 import numpy as np
 import pytest

+from pandas._config import using_string_dtype
+
+from pandas.compat import HAS_PYARROW
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -31,7 +35,7 @@
     read_hdf,
 )

-pytestmark = pytest.mark.single_cpu
+pytestmark = [pytest.mark.single_cpu]

 tables = pytest.importorskip("tables")

@@ -103,7 +107,7 @@ def test_iter_empty(setup_path):
         assert list(store) == []


-def test_repr(setup_path):
+def test_repr(setup_path, using_infer_string):
     with ensure_clean_store(setup_path) as store:
         repr(store)
         store.info()
@@ -138,7 +142,9 @@ def test_repr(setup_path):
         df.loc[df.index[3:6], ["obj1"]] = np.nan
         df = df._consolidate()

-        with tm.assert_produces_warning(pd.errors.PerformanceWarning):
+        warning = None if using_infer_string else pd.errors.PerformanceWarning
+        msg = "cannot\nmap directly to c-types .* dtype='object'"
+        with tm.assert_produces_warning(warning, match=msg):
             store["df"] = df

         # make a random group in hdf space
@@ -309,7 +315,7 @@ def test_getattr(setup_path):

         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         store["df"] = df
@@ -376,7 +382,7 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path):
         {
             "A": [0.0, 1.0, 2.0, 3.0, 4.0],
             "B": [0.0, 1.0, 0.0, 1.0, 0.0],
-            "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object),
+            "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]),
             "D": date_range("20130101", periods=5),
         }
     ).set_index("C")
@@ -392,6 +398,10 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path):
     tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]]))


+@pytest.mark.xfail(
+    using_string_dtype() and HAS_PYARROW,
+    reason="TODO(infer_string): can't encode '\ud800': surrogates not allowed",
+)
 @pytest.mark.parametrize("format", ["fixed", "table"])
 def test_to_hdf_errors(tmp_path, format, setup_path):
     data = ["\ud800foo"]
@@ -413,7 +423,7 @@ def col(t, column):
         # data columns
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df["string"] = "foo"
@@ -448,7 +458,7 @@ def col(t, column):
         # data columns
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df["string"] = "foo"
@@ -490,8 +500,8 @@ def test_table_mixed_dtypes(setup_path):
     # frame
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
     df["obj1"] = "foo"
     df["obj2"] = "bar"
@@ -546,8 +556,8 @@ def test_remove(setup_path):
     )
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
     store["a"] = ts
     store["b"] = df
@@ -610,8 +620,8 @@ def test_same_name_scoping(setup_path):
 def
test_store_index_name(setup_path): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df.index.name = "foo" @@ -653,8 +663,8 @@ def test_store_index_name_numpy_str(tmp_path, table_format, setup_path, unit, tz def test_store_series_name(setup_path): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) series = df["A"] @@ -668,7 +678,7 @@ def test_overwrite_node(setup_path): with ensure_clean_store(setup_path) as store: store["a"] = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) ts = Series( @@ -682,7 +692,7 @@ def test_overwrite_node(setup_path): def test_coordinates(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -717,7 +727,7 @@ def test_coordinates(setup_path): _maybe_remove(store, "df2") df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df2 = df1.copy().rename(columns="{}_2".format) @@ -873,8 +883,8 @@ def test_start_stop_fixed(setup_path): # sparse; not implemented df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df.iloc[3:5, 1:3] = np.nan df.iloc[8:10, -2] = np.nan @@ -900,8 +910,8 @@ def test_select_filter_corner(setup_path): def test_path_pathlib(): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) result = tm.round_trip_pathlib( @@ -930,8 +940,8 @@ def test_contiguous_mixed_data_table(start, stop, setup_path): def test_path_pathlib_hdfstore(): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) def writer(path): @@ -949,8 +959,8 @@ def reader(path): def test_pickle_path_localpath(): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) result = tm.round_trip_pathlib( lambda p: df.to_hdf(p, key="df"), lambda p: read_hdf(p, "df") @@ -961,8 +971,8 @@ def test_pickle_path_localpath(): def test_path_localpath_hdfstore(): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) def writer(path): @@ -981,8 +991,8 @@ def reader(path): def 
test_copy(propindexes): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 8c61830ebe038..c5613daf62207 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -104,7 +104,7 @@ def test_append_with_timezones(setup_path, gettz): msg = ( r"invalid info for \[values_block_1\] for \[tz\], " - r"existing_value \[(dateutil/.*)?US/Eastern\] " + r"existing_value \[(dateutil/.*)?(US/Eastern|America/New_York)\] " r"conflicts with new value \[(dateutil/.*)?EET\]" ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index b71896c77ffb5..96aaa1e9bcb21 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -248,11 +248,13 @@ def test_zero_variables(datapath): pd.read_sas(fname) -def test_zero_rows(datapath): +@pytest.mark.parametrize("encoding", [None, "utf8"]) +def test_zero_rows(datapath, encoding): # GH 18198 fname = datapath("io", "sas", "data", "zero_rows.sas7bdat") - result = pd.read_sas(fname) - expected = pd.DataFrame([{"char_field": "a", "num_field": 1.0}]).iloc[:0] + result = pd.read_sas(fname, encoding=encoding) + str_value = b"a" if encoding is None else "a" + expected = pd.DataFrame([{"char_field": str_value, "num_field": 1.0}]).iloc[:0] tm.assert_frame_equal(result, expected) @@ -408,7 +410,7 @@ def test_0x40_control_byte(datapath): fname = datapath("io", "sas", "data", "0x40controlbyte.sas7bdat") df = pd.read_sas(fname, encoding="ascii") fname = datapath("io", "sas", "data", "0x40controlbyte.csv") - df0 = pd.read_csv(fname, dtype="object") + df0 = pd.read_csv(fname, dtype="str") tm.assert_frame_equal(df, df0) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 3c0208fcc74ec..a16c63e8d3d65 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -17,10 +17,6 @@ read_clipboard, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.io.clipboard import ( CheckedCall, @@ -349,26 +345,18 @@ def test_raw_roundtrip(self, data): @pytest.mark.parametrize("engine", ["c", "python"]) def test_read_clipboard_dtype_backend( - self, clipboard, string_storage, dtype_backend, engine + self, clipboard, string_storage, dtype_backend, engine, using_infer_string ): # GH#50502 - if string_storage == "pyarrow" or dtype_backend == "pyarrow": - pa = pytest.importorskip("pyarrow") - - if string_storage == "python": - string_array = StringArray(np.array(["x", "y"], dtype=np.object_)) - string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) - - elif dtype_backend == "pyarrow" and engine != "c": + if dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["x", "y"])) - string_array_na = ArrowExtensionArray(pa.array(["x", None])) - + if engine == "c" and string_storage == "pyarrow": + # TODO avoid this exception? 
+ string_dtype = pd.ArrowDtype(pa.large_string()) + else: + string_dtype = pd.ArrowDtype(pa.string()) else: - string_array = ArrowStringArray(pa.array(["x", "y"])) - string_array_na = ArrowStringArray(pa.array(["x", None])) + string_dtype = pd.StringDtype(string_storage) text = """a,b,c,d,e,f,g,h,i x,1,4.0,x,2,4.0,,True,False @@ -380,10 +368,10 @@ def test_read_clipboard_dtype_backend( expected = DataFrame( { - "a": string_array, + "a": Series(["x", "y"], dtype=string_dtype), "b": Series([1, 2], dtype="Int64"), "c": Series([4.0, 5.0], dtype="Float64"), - "d": string_array_na, + "d": Series(["x", None], dtype=string_dtype), "e": Series([2, NA], dtype="Int64"), "f": Series([4.0, NA], dtype="Float64"), "g": Series([NA, NA], dtype="Int64"), @@ -402,6 +390,11 @@ def test_read_clipboard_dtype_backend( ) expected["g"] = ArrowExtensionArray(pa.array([None, None])) + if using_infer_string: + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) + tm.assert_frame_equal(result, expected) def test_invalid_dtype_backend(self): diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 074033868635a..a815ba9c1650a 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -19,6 +19,7 @@ import pytest from pandas.compat import is_platform_windows +from pandas.compat.pyarrow import pa_version_under19p0 import pandas.util._test_decorators as td import pandas as pd @@ -166,6 +167,8 @@ def test_get_handle_pyarrow_compat(self): s = StringIO(data) with icom.get_handle(s, "rb", is_text=False) as handles: df = pa_csv.read_csv(handles.handle).to_pandas() + if pa_version_under19p0: + expected = expected.astype("object") tm.assert_frame_equal(df, expected) assert not s.closed @@ -305,10 +308,12 @@ def test_read_expands_user_home_dir( "pyarrow", ("io", "data", "feather", "feather-0_3_1.feather"), ), - ( + pytest.param( pd.read_hdf, "tables", ("io", "data", "legacy_hdf", "datetimetz_object.h5"), + # cleaned-up in https://github.com/pandas-dev/pandas/pull/57387 on main + marks=pytest.mark.xfail(reason="TODO(infer_string)", strict=False), ), (pd.read_stata, "os", ("io", "data", "stata", "stata10_115.dta")), (pd.read_sas, "os", ("io", "sas", "data", "test1.sas7bdat")), @@ -443,8 +448,8 @@ def test_unknown_engine(self): with tm.ensure_clean() as path: df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.to_csv(path) with pytest.raises(ValueError, match="Unknown engine"): @@ -459,8 +464,8 @@ def test_binary_mode(self): with tm.ensure_clean() as path: df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.to_csv(path, mode="w+b") tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) @@ -477,15 +482,18 @@ def test_warning_missing_utf_bom(self, encoding, compression_): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: with tm.assert_produces_warning(UnicodeWarning): df.to_csv(path, 
compression=compression_, encoding=encoding) # reading should fail (otherwise we wouldn't need the warning) - msg = r"UTF-\d+ stream does not start with BOM" + msg = ( + r"UTF-\d+ stream does not start with BOM|" + r"'utf-\d+' codec can't decode byte" + ) with pytest.raises(UnicodeError, match=msg): pd.read_csv(path, compression=compression_, encoding=encoding) @@ -511,8 +519,8 @@ def test_codecs_encoding(encoding, format): # GH39247 expected = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: with codecs.open(path, mode="w", encoding=encoding) as handle: @@ -529,8 +537,8 @@ def test_codecs_get_writer_reader(): # GH39247 expected = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: with open(path, "wb") as handle: @@ -555,8 +563,8 @@ def test_explicit_encoding(io_class, mode, msg): # wrong mode is requested expected = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with io_class() as buffer: with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 3a58dda9e8dc4..af89f0916355e 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -145,8 +145,8 @@ def test_compression_binary(compression_only): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) # with a file @@ -177,8 +177,8 @@ def test_gzip_reproducibility_file_name(): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) compression_options = {"method": "gzip", "mtime": 1} @@ -200,8 +200,8 @@ def test_gzip_reproducibility_file_object(): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) compression_options = {"method": "gzip", "mtime": 1} diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 22a7d3b83a459..0ab23e3b51a03 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -2,12 +2,13 @@ import numpy as np import pytest +from pandas.compat.pyarrow import ( + pa_version_under18p0, + pa_version_under19p0, +) + import pandas as pd import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.io.feather_format import read_feather, to_feather # isort:skip @@ -15,6 +16,7 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) + pa = 
pytest.importorskip("pyarrow") @@ -134,8 +136,8 @@ def test_rw_use_threads(self): def test_path_pathlib(self): df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ).reset_index() result = tm.round_trip_pathlib(df.to_feather, read_feather) tm.assert_frame_equal(df, result) @@ -143,8 +145,8 @@ def test_path_pathlib(self): def test_path_localpath(self): df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ).reset_index() result = tm.round_trip_localpath(df.to_feather, read_feather) tm.assert_frame_equal(df, result) @@ -152,8 +154,8 @@ def test_path_localpath(self): def test_passthrough_keywords(self): df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ).reset_index() self.check_round_trip(df, write_kwargs={"version": 1}) @@ -167,7 +169,9 @@ def test_http_path(self, feather_file, httpserver): res = read_feather(httpserver.url) tm.assert_frame_equal(expected, res) - def test_read_feather_dtype_backend(self, string_storage, dtype_backend): + def test_read_feather_dtype_backend( + self, string_storage, dtype_backend, using_infer_string + ): # GH#50765 df = pd.DataFrame( { @@ -182,25 +186,20 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend): } ) - if string_storage == "python": - string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) - string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) - - elif dtype_backend == "pyarrow": - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) - - else: - string_array = ArrowStringArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowStringArray(pa.array(["a", "b", None])) - with tm.ensure_clean() as path: to_feather(df, path) with pd.option_context("mode.string_storage", string_storage): result = read_feather(path, dtype_backend=dtype_backend) + if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + if using_infer_string: + string_dtype = pd.ArrowDtype(pa.large_string()) + else: + string_dtype = pd.ArrowDtype(pa.string()) + else: + string_dtype = pd.StringDtype(string_storage) + expected = pd.DataFrame( { "a": pd.Series([1, np.nan, 3], dtype="Int64"), @@ -209,8 +208,8 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend): "d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"), "e": pd.Series([True, False, pd.NA], dtype="boolean"), "f": pd.Series([True, False, True], dtype="boolean"), - "g": string_array, - "h": string_array_na, + "g": pd.Series(["a", "b", "c"], dtype=string_dtype), + "h": pd.Series(["a", "b", None], dtype=string_dtype), } ) @@ -224,6 +223,10 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend): } ) + if using_infer_string: + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) tm.assert_frame_equal(result, expected) def test_int_columns_and_index(self): @@ -241,12 
+244,40 @@ def test_invalid_dtype_backend(self):
         with pytest.raises(ValueError, match=msg):
             read_feather(path, dtype_backend="numpy")

-    def test_string_inference(self, tmp_path):
+    def test_string_inference(self, tmp_path, using_infer_string):
         # GH#54431
         path = tmp_path / "test_string_inference.p"
         df = pd.DataFrame(data={"a": ["x", "y"]})
         df.to_feather(path)
         with pd.option_context("future.infer_string", True):
             result = read_feather(path)
-        expected = pd.DataFrame(data={"a": ["x", "y"]}, dtype="string[pyarrow_numpy]")
+        dtype = pd.StringDtype(na_value=np.nan)
+        expected = pd.DataFrame(
+            data={"a": ["x", "y"]},
+            dtype=dtype,
+            columns=pd.Index(
+                ["a"],
+                dtype=object
+                if pa_version_under19p0 and not using_infer_string
+                else dtype,
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0")
+    def test_string_inference_string_view_type(self, tmp_path):
+        # GH#54798
+        import pyarrow as pa
+        from pyarrow import feather
+
+        path = tmp_path / "string_view.parquet"
+        table = pa.table({"a": pa.array([None, "b", "c"], pa.string_view())})
+        feather.write_feather(table, path)
+
+        with pd.option_context("future.infer_string", True):
+            result = read_feather(path)
+
+        expected = pd.DataFrame(
+            data={"a": [None, "b", "c"]}, dtype=pd.StringDtype(na_value=np.nan)
+        )
         tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py
index a1dec8a2d05b4..dde85f9f8409d 100644
--- a/pandas/tests/io/test_fsspec.py
+++ b/pandas/tests/io/test_fsspec.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest

+from pandas._config import using_string_dtype
+
 from pandas import (
     DataFrame,
     date_range,
@@ -252,6 +254,7 @@ def test_s3_protocols(s3_public_bucket_with_data, tips_file, protocol, s3so):
 )


+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
 @pytest.mark.single_cpu
 @td.skip_array_manager_not_yet_implemented  # TODO(ArrayManager) fastparquet
 def test_s3_parquet(s3_public_bucket, s3so, df1):
diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py
index 0ce6a8bf82cd8..9fc0f6eb47766 100644
--- a/pandas/tests/io/test_gcs.py
+++ b/pandas/tests/io/test_gcs.py
@@ -7,6 +7,8 @@
 import numpy as np
 import pytest

+from pandas.compat.pyarrow import pa_version_under17p0
+
 from pandas import (
     DataFrame,
     Index,
@@ -52,7 +54,7 @@ def ls(self, path, **kwargs):
 # Patches pyarrow; other processes should not pick up change
 @pytest.mark.single_cpu
 @pytest.mark.parametrize("format", ["csv", "json", "parquet", "excel", "markdown"])
-def test_to_read_gcs(gcs_buffer, format, monkeypatch, capsys):
+def test_to_read_gcs(gcs_buffer, format, monkeypatch, capsys, request):
     """
     Test that many to/read functions support GCS.
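The feather changes above, like the ORC and parquet changes later in this patch, all assert the same contract: with the future.infer_string option enabled, string data read from disk comes back as the NaN-variant StringDtype rather than object dtype. A minimal sketch of that round-trip, assuming a pandas build with pyarrow installed; the file name is only a placeholder:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": ["x", "y"]})
df.to_feather("demo.feather")  # placeholder scratch path

with pd.option_context("future.infer_string", True):
    result = pd.read_feather("demo.feather")

# Strings round-trip to the NaN-variant string dtype, matching the
# `expected` frames constructed in the tests above.
assert result["a"].dtype == pd.StringDtype(na_value=np.nan)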
@@ -91,6 +93,13 @@ def from_uri(path): to_local = pathlib.Path(path.replace("gs://", "")).absolute().as_uri() return pa_fs.LocalFileSystem(to_local) + request.applymarker( + pytest.mark.xfail( + not pa_version_under17p0, + raises=TypeError, + reason="pyarrow 17 broke the mocked filesystem", + ) + ) with monkeypatch.context() as m: m.setattr(pa_fs, "FileSystem", MockFileSystem) df1.to_parquet(path) @@ -148,8 +157,8 @@ def test_to_csv_compression_encoding_gcs( """ df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) # reference of compressed and encoded file diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 607357e709b6e..b12098d4904c1 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -29,10 +29,6 @@ to_datetime, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.io.common import file_path_to_url @@ -154,7 +150,7 @@ def test_to_html_compat(self, flavor_read_html): df = ( DataFrame( np.random.default_rng(2).random((4, 3)), - columns=pd.Index(list("abc"), dtype=object), + columns=pd.Index(list("abc")), ) # pylint: disable-next=consider-using-f-string .map("{:.3f}".format).astype(float) @@ -180,24 +176,16 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html): } ) - if string_storage == "python": - string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) - string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) - elif dtype_backend == "pyarrow": - pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) - else: - pa = pytest.importorskip("pyarrow") - string_array = ArrowStringArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowStringArray(pa.array(["a", "b", None])) - out = df.to_html(index=False) with pd.option_context("mode.string_storage", string_storage): result = flavor_read_html(StringIO(out), dtype_backend=dtype_backend)[0] + if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + string_dtype = pd.ArrowDtype(pa.string()) + else: + string_dtype = pd.StringDtype(string_storage) + expected = DataFrame( { "a": Series([1, np.nan, 3], dtype="Int64"), @@ -206,8 +194,8 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html): "d": Series([1.5, 2.0, 2.5], dtype="Float64"), "e": Series([True, False, NA], dtype="boolean"), "f": Series([True, False, True], dtype="boolean"), - "g": string_array, - "h": string_array_na, + "g": Series(["a", "b", "c"], dtype=string_dtype), + "h": Series(["a", "b", None], dtype=string_dtype), } ) @@ -223,7 +211,9 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html): } ) - tm.assert_frame_equal(result, expected) + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) @pytest.mark.network @pytest.mark.single_cpu @@ -1391,6 +1381,7 @@ def test_displayed_only_with_many_elements(self, displayed_only, flavor_read_htm expected = DataFrame({"A": [1, 4], "B": [2, 5]}) tm.assert_frame_equal(result, expected) + @td.skip_if_windows() 
@pytest.mark.filterwarnings(
        "ignore:You provided Unicode markup but also provided a value for "
        "from_encoding.*:UserWarning"
diff --git a/pandas/tests/io/test_http_headers.py b/pandas/tests/io/test_http_headers.py
index 2ca11ad1f74e6..9918435cae15b 100644
--- a/pandas/tests/io/test_http_headers.py
+++ b/pandas/tests/io/test_http_headers.py
@@ -7,6 +7,8 @@

 import pytest

+from pandas._config import using_string_dtype
+
 import pandas.util._test_decorators as td

 import pandas as pd
@@ -105,6 +107,7 @@ def stata_responder(df):
             td.skip_if_no("fastparquet"),
             td.skip_if_no("fsspec"),
             td.skip_array_manager_not_yet_implemented,
+            pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)"),
         ],
     ),
     (pickle_respnder, pd.read_pickle),
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index a4021311fc963..4c4d7461e4ac5 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -42,7 +42,7 @@ def orc_writer_dtypes_not_supported(request):
     return pd.DataFrame({"unimpl": request.param})


-def test_orc_reader_empty(dirpath):
+def test_orc_reader_empty(dirpath, using_infer_string):
     columns = [
         "boolean1",
         "byte1",
@@ -63,11 +63,12 @@
         "float32",
         "float64",
         "object",
-        "object",
+        "str" if using_infer_string else "object",
     ]
     expected = pd.DataFrame(index=pd.RangeIndex(0))
     for colname, dtype in zip(columns, dtypes):
         expected[colname] = pd.Series(dtype=dtype)
+    expected.columns = expected.columns.astype("str")

     inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
     got = read_orc(inputfile, columns=columns)
@@ -304,7 +305,7 @@ def test_orc_writer_dtypes_not_supported(orc_writer_dtypes_not_supported):
         orc_writer_dtypes_not_supported.to_orc()


-def test_orc_dtype_backend_pyarrow():
+def test_orc_dtype_backend_pyarrow(using_infer_string):
     pytest.importorskip("pyarrow")
     df = pd.DataFrame(
         {
@@ -335,6 +336,13 @@
             for col in df.columns
         }
     )
+    if using_infer_string:
+        # ORC does not preserve distinction between string and large string
+        # -> the default large string comes back as string
+        string_dtype = pd.ArrowDtype(pa.string())
+        expected["string"] = expected["string"].astype(string_dtype)
+        expected["string_with_nan"] = expected["string_with_nan"].astype(string_dtype)
+        expected["string_with_none"] = expected["string_with_none"].astype(string_dtype)

     tm.assert_frame_equal(result, expected)

@@ -430,7 +438,7 @@ def test_string_inference(tmp_path):
     result = read_orc(path)
     expected = pd.DataFrame(
         data={"a": ["x", "y"]},
-        dtype="string[pyarrow_numpy]",
-        columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"),
+        dtype=pd.StringDtype(na_value=np.nan),
+        columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)),
     )
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index ad7cdad363e78..45aed8df6d416 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -8,7 +8,10 @@
 import numpy as np
 import pytest

-from pandas._config import using_copy_on_write
+from pandas._config import (
+    using_copy_on_write,
+    using_string_dtype,
+)
 from pandas._config.config import _get_option

 from pandas.compat import is_platform_windows
@@ -16,6 +19,8 @@
     pa_version_under11p0,
     pa_version_under13p0,
     pa_version_under15p0,
+    pa_version_under19p0,
+    pa_version_under20p0,
 )

 import pandas as pd
@@ -60,11 +65,18 @@
     params=[
         pytest.param(
             "fastparquet",
-            marks=pytest.mark.skipif(
-                not _HAVE_FASTPARQUET
-                or 
_get_option("mode.data_manager", silent=True) == "array", - reason="fastparquet is not installed or ArrayManager is used", - ), + marks=[ + pytest.mark.skipif( + not _HAVE_FASTPARQUET + or _get_option("mode.data_manager", silent=True) == "array", + reason="fastparquet is not installed or ArrayManager is used", + ), + pytest.mark.xfail( + using_string_dtype(), + reason="TODO(infer_string) fastparquet", + strict=False, + ), + ], ), pytest.param( "pyarrow", @@ -86,17 +98,21 @@ def pa(): @pytest.fixture -def fp(): +def fp(request): if not _HAVE_FASTPARQUET: pytest.skip("fastparquet is not installed") elif _get_option("mode.data_manager", silent=True) == "array": pytest.skip("ArrayManager is not supported with fastparquet") + if using_string_dtype(): + request.applymarker( + pytest.mark.xfail(reason="TODO(infer_string) fastparquet", strict=False) + ) return "fastparquet" @pytest.fixture def df_compat(): - return pd.DataFrame({"A": [1, 2, 3], "B": "foo"}) + return pd.DataFrame({"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"])) @pytest.fixture @@ -244,8 +260,10 @@ def test_invalid_engine(df_compat): check_round_trip(df_compat, "foo", "bar") -def test_options_py(df_compat, pa): +def test_options_py(df_compat, pa, using_infer_string): # use the set option + if using_infer_string and not pa_version_under19p0: + df_compat.columns = df_compat.columns.astype("str") with pd.option_context("io.parquet.engine", "pyarrow"): check_round_trip(df_compat) @@ -385,16 +403,6 @@ def check_external_error_on_write(self, df, engine, exc): with tm.external_error_raised(exc): to_parquet(df, path, engine, compression=None) - @pytest.mark.network - @pytest.mark.single_cpu - def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine): - if engine != "auto": - pytest.importorskip(engine) - with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f: - httpserver.serve_content(content=f.read()) - df = read_parquet(httpserver.url) - tm.assert_frame_equal(df, df_compat) - class TestBasic(Base): def test_error(self, engine): @@ -448,12 +456,8 @@ def test_read_filters(self, engine, tmp_path): repeat=1, ) - def test_write_index(self, engine, using_copy_on_write, request): + def test_write_index(self, engine): check_names = engine != "fastparquet" - if using_copy_on_write and engine == "fastparquet": - request.applymarker( - pytest.mark.xfail(reason="fastparquet write into index") - ) df = pd.DataFrame({"A": [1, 2, 3]}) check_round_trip(df, engine) @@ -696,6 +700,16 @@ def test_read_empty_array(self, pa, dtype): df, pa, read_kwargs={"dtype_backend": "numpy_nullable"}, expected=expected ) + @pytest.mark.network + @pytest.mark.single_cpu + def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine): + if engine != "auto": + pytest.importorskip(engine) + with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f: + httpserver.serve_content(content=f.read()) + df = read_parquet(httpserver.url, engine=engine) + tm.assert_frame_equal(df, df_compat) + class TestParquetPyArrow(Base): def test_basic(self, pa, df_full): @@ -785,18 +799,21 @@ def test_unsupported_float16_cleanup(self, pa, path_type): def test_categorical(self, pa): # supported in >= 0.7.0 - df = pd.DataFrame() - df["a"] = pd.Categorical(list("abcdef")) - - # test for null, out-of-order values, and unobserved category - df["b"] = pd.Categorical( - ["bar", "foo", "foo", "bar", None, "bar"], - dtype=pd.CategoricalDtype(["foo", "bar", "baz"]), - ) - - # test for ordered flag - df["c"] = 
pd.Categorical( - ["a", "b", "c", "a", "c", "b"], categories=["b", "c", "d"], ordered=True + df = pd.DataFrame( + { + "a": pd.Categorical(list("abcdef")), + # test for null, out-of-order values, and unobserved category + "b": pd.Categorical( + ["bar", "foo", "foo", "bar", None, "bar"], + dtype=pd.CategoricalDtype(["foo", "bar", "baz"]), + ), + # test for ordered flag + "c": pd.Categorical( + ["a", "b", "c", "a", "c", "b"], + categories=["b", "c", "d"], + ordered=True, + ), + } ) check_round_trip(df, pa) @@ -865,11 +882,13 @@ def test_s3_roundtrip_for_dir( repeat=1, ) - def test_read_file_like_obj_support(self, df_compat): + def test_read_file_like_obj_support(self, df_compat, using_infer_string): pytest.importorskip("pyarrow") buffer = BytesIO() df_compat.to_parquet(buffer) df_from_buf = read_parquet(buffer) + if using_infer_string and not pa_version_under19p0: + df_compat.columns = df_compat.columns.astype("str") tm.assert_frame_equal(df_compat, df_from_buf) def test_expand_user(self, df_compat, monkeypatch): @@ -925,7 +944,7 @@ def test_write_with_schema(self, pa): out_df = df.astype(bool) check_round_trip(df, pa, write_kwargs={"schema": schema}, expected=out_df) - def test_additional_extension_arrays(self, pa): + def test_additional_extension_arrays(self, pa, using_infer_string): # test additional ExtensionArrays that are supported through the # __arrow_array__ protocol pytest.importorskip("pyarrow") @@ -936,17 +955,28 @@ def test_additional_extension_arrays(self, pa): "c": pd.Series(["a", None, "c"], dtype="string"), } ) - check_round_trip(df, pa) + if using_infer_string and pa_version_under19p0: + check_round_trip(df, pa, expected=df.astype({"c": "str"})) + else: + check_round_trip(df, pa) df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")}) check_round_trip(df, pa) - def test_pyarrow_backed_string_array(self, pa, string_storage): + def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_string): # test ArrowStringArray supported through the __arrow_array__ protocol pytest.importorskip("pyarrow") df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")}) with pd.option_context("string_storage", string_storage): - check_round_trip(df, pa, expected=df.astype(f"string[{string_storage}]")) + if using_infer_string: + if pa_version_under19p0: + expected = df.astype("str") + else: + expected = df.astype(f"string[{string_storage}]") + expected.columns = expected.columns.astype("str") + else: + expected = df.astype(f"string[{string_storage}]") + check_round_trip(df, pa, expected=expected) def test_additional_extension_types(self, pa): # test additional ExtensionArrays that are supported through the @@ -972,14 +1002,9 @@ def test_timestamp_nanoseconds(self, pa): df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1ns", periods=10)}) check_round_trip(df, pa, write_kwargs={"version": ver}) - def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): - if timezone_aware_date_list.tzinfo != datetime.timezone.utc: - request.applymarker( - pytest.mark.xfail( - reason="temporary skip this test until it is properly resolved: " - "https://github.com/pandas-dev/pandas/issues/37286" - ) - ) + def test_timezone_aware_index(self, pa, timezone_aware_date_list): + pytest.importorskip("pyarrow", "11.0.0") + idx = 5 * [timezone_aware_date_list] df = pd.DataFrame(index=idx, data={"index_as_col": idx}) @@ -992,7 +1017,23 @@ def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): # they both implement datetime.tzinfo # 
they both wrap datetime.timedelta() # this use-case sets the resolution to 1 minute - check_round_trip(df, pa, check_dtype=False) + + expected = df[:] + if pa_version_under11p0: + expected.index = expected.index.as_unit("ns") + if timezone_aware_date_list.tzinfo != datetime.timezone.utc: + # pyarrow returns pytz.FixedOffset while pandas constructs datetime.timezone + # https://github.com/pandas-dev/pandas/issues/37286 + try: + import pytz + except ImportError: + pass + else: + offset = df.index.tz.utcoffset(timezone_aware_date_list) + tz = pytz.FixedOffset(offset.total_seconds() / 60) + expected.index = expected.index.tz_convert(tz) + expected["index_as_col"] = expected["index_as_col"].dt.tz_convert(tz) + check_round_trip(df, pa, check_dtype=False, expected=expected) def test_filter_row_groups(self, pa): # https://github.com/pandas-dev/pandas/issues/26551 @@ -1000,9 +1041,7 @@ def test_filter_row_groups(self, pa): df = pd.DataFrame({"a": list(range(3))}) with tm.ensure_clean() as path: df.to_parquet(path, engine=pa) - result = read_parquet( - path, pa, filters=[("a", "==", 0)], use_legacy_dataset=False - ) + result = read_parquet(path, pa, filters=[("a", "==", 0)]) assert len(result) == 1 def test_read_parquet_manager(self, pa, using_array_manager): @@ -1065,24 +1104,28 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa): expected=expected, ) - def test_columns_dtypes_not_invalid(self, pa): + @pytest.mark.parametrize( + "columns", + [ + [0, 1], + pytest.param( + [b"foo", b"bar"], + marks=pytest.mark.xfail( + pa_version_under20p0, + raises=NotImplementedError, + reason="https://github.com/apache/arrow/pull/44171", + ), + ), + [ + datetime.datetime(2011, 1, 1, 0, 0), + datetime.datetime(2011, 1, 1, 1, 1), + ], + ], + ) + def test_columns_dtypes_not_invalid(self, pa, columns): df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) - # numeric - df.columns = [0, 1] - check_round_trip(df, pa) - - # bytes - df.columns = [b"foo", b"bar"] - with pytest.raises(NotImplementedError, match="|S3"): - # Bytes fails on read_parquet - check_round_trip(df, pa) - - # python object - df.columns = [ - datetime.datetime(2011, 1, 1, 0, 0), - datetime.datetime(2011, 1, 1, 1, 1), - ] + df.columns = columns check_round_trip(df, pa) def test_empty_columns(self, pa): @@ -1098,17 +1141,24 @@ def test_df_attrs_persistence(self, tmp_path, pa): new_df = read_parquet(path, engine=pa) assert new_df.attrs == df.attrs - def test_string_inference(self, tmp_path, pa): + def test_string_inference(self, tmp_path, pa, using_infer_string): # GH#54431 path = tmp_path / "test_string_inference.p" df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"]) - df.to_parquet(path, engine="pyarrow") + df.to_parquet(path, engine=pa) with pd.option_context("future.infer_string", True): - result = read_parquet(path, engine="pyarrow") + result = read_parquet(path, engine=pa) + dtype = pd.StringDtype(na_value=np.nan) expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype="string[pyarrow_numpy]", - index=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + dtype=dtype, + index=pd.Index(["a", "b"], dtype=dtype), + columns=pd.Index( + ["a"], + dtype=object + if pa_version_under19p0 and not using_infer_string + else dtype, + ), ) tm.assert_frame_equal(result, expected) @@ -1121,7 +1171,10 @@ def test_roundtrip_decimal(self, tmp_path, pa): df = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="string[pyarrow]") df.to_parquet(path, schema=pa.schema([("a", pa.decimal128(5))])) result = read_parquet(path) - expected = 
pd.DataFrame({"a": ["123"]}, dtype="string[python]") + if pa_version_under19p0: + expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]") + else: + expected = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="object") tm.assert_frame_equal(result, expected) def test_infer_string_large_string_type(self, tmp_path, pa): @@ -1138,8 +1191,8 @@ def test_infer_string_large_string_type(self, tmp_path, pa): result = read_parquet(path) expected = pd.DataFrame( data={"a": [None, "b", "c"]}, - dtype="string[pyarrow_numpy]", - columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) @@ -1193,7 +1246,17 @@ def test_duplicate_columns(self, fp): msg = "Cannot create parquet dataset with duplicate column names" self.check_error_on_write(df, fp, ValueError, msg) - def test_bool_with_none(self, fp): + def test_bool_with_none(self, fp, request): + import fastparquet + + if Version(fastparquet.__version__) < Version("2024.11.0") and Version( + np.__version__ + ) >= Version("2.0.0"): + request.applymarker( + pytest.mark.xfail( + reason=("fastparquet uses np.float_ in numpy2"), + ) + ) df = pd.DataFrame({"a": [True, None, False]}) expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16") # Fastparquet bug in 0.7.1 makes it so that this dtype becomes @@ -1308,8 +1371,21 @@ def test_empty_dataframe(self, fp): expected = df.copy() check_round_trip(df, fp, expected=expected) - @pytest.mark.skipif(using_copy_on_write(), reason="fastparquet writes into Index") - def test_timezone_aware_index(self, fp, timezone_aware_date_list): + def test_timezone_aware_index(self, fp, timezone_aware_date_list, request): + import fastparquet + + if Version(fastparquet.__version__) > Version("2022.12") and Version( + fastparquet.__version__ + ) < Version("2024.11.0"): + request.applymarker( + pytest.mark.xfail( + reason=( + "fastparquet bug, see " + "https://github.com/dask/fastparquet/issues/929" + ), + ) + ) + idx = 5 * [timezone_aware_date_list] df = pd.DataFrame(index=idx, data={"index_as_col": idx}) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 4f3993a038197..05f4a20ee42d8 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -413,10 +413,16 @@ def test_read(self, protocol, get_random_path): @pytest.mark.parametrize( ["pickle_file", "excols"], [ - ("test_py27.pkl", Index(["a", "b", "c"])), + ("test_py27.pkl", Index(["a", "b", "c"], dtype=object)), ( "test_mi_py27.pkl", - pd.MultiIndex.from_arrays([["a", "b", "c"], ["A", "B", "C"]]), + pd.MultiIndex( + [ + Index(["a", "b", "c"], dtype=object), + Index(["A", "B", "C"], dtype=object), + ], + [np.array([0, 1, 2]), np.array([0, 1, 2])], + ), ), ], ) diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index e118c90d9bc02..82613b4e80725 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -161,4 +161,6 @@ def test_spss_metadata(datapath): "modification_time": datetime.datetime(2015, 2, 6, 14, 33, 36), } ) - assert df.attrs == metadata + if Version(pyreadstat.__version__) >= Version("1.2.8"): + metadata["mr_sets"] = {} + tm.assert_dict_equal(df.attrs, metadata) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 6645aefd4f0a7..89adf18545815 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -18,6 +18,8 @@ import numpy as np import pytest +from pandas._config 
import using_string_dtype + from pandas._libs import lib from pandas.compat import ( pa_version_under13p0, @@ -40,10 +42,6 @@ to_timedelta, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.util.version import Version from pandas.io import sql @@ -61,9 +59,12 @@ import sqlalchemy -pytestmark = pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" -) +pytestmark = [ + pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ), + pytest.mark.single_cpu, +] @pytest.fixture @@ -684,6 +685,7 @@ def postgresql_psycopg2_conn(postgresql_psycopg2_engine): @pytest.fixture def postgresql_adbc_conn(): + pytest.importorskip("pyarrow") pytest.importorskip("adbc_driver_postgresql") from adbc_driver_postgresql import dbapi @@ -816,6 +818,7 @@ def sqlite_conn_types(sqlite_engine_types): @pytest.fixture def sqlite_adbc_conn(): + pytest.importorskip("pyarrow") pytest.importorskip("adbc_driver_sqlite") from adbc_driver_sqlite import dbapi @@ -956,12 +959,12 @@ def sqlite_buildin_types(sqlite_buildin, types_data): adbc_connectable_iris = [ pytest.param("postgresql_adbc_iris", marks=pytest.mark.db), - pytest.param("sqlite_adbc_iris", marks=pytest.mark.db), + "sqlite_adbc_iris", ] adbc_connectable_types = [ pytest.param("postgresql_adbc_types", marks=pytest.mark.db), - pytest.param("sqlite_adbc_types", marks=pytest.mark.db), + "sqlite_adbc_types", ] @@ -985,13 +988,13 @@ def test_dataframe_to_sql(conn, test_frame1, request): @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql_empty(conn, test_frame1, request): - if conn == "postgresql_adbc_conn": + if conn == "postgresql_adbc_conn" and not using_string_dtype(): request.node.add_marker( pytest.mark.xfail( - reason="postgres ADBC driver cannot insert index with null type", - strict=True, + reason="postgres ADBC driver < 1.2 cannot insert index with null type", ) ) + # GH 51086 if conn is sqlite_engine conn = request.getfixturevalue(conn) empty_df = test_frame1.iloc[:0] @@ -1373,6 +1376,30 @@ def insert_on_conflict(table, conn, keys, data_iter): pandasSQL.drop_table("test_insert_conflict") +@pytest.mark.parametrize("conn", all_connectable) +def test_to_sql_on_public_schema(conn, request): + if "sqlite" in conn or "mysql" in conn: + request.applymarker( + pytest.mark.xfail( + reason="test for public schema only specific to postgresql" + ) + ) + + conn = request.getfixturevalue(conn) + + test_data = DataFrame([[1, 2.1, "a"], [2, 3.1, "b"]], columns=list("abc")) + test_data.to_sql( + name="test_public_schema", + con=conn, + if_exists="append", + index=False, + schema="public", + ) + + df_out = sql.read_sql_table("test_public_schema", conn, schema="public") + tm.assert_frame_equal(test_data, df_out) + + @pytest.mark.parametrize("conn", mysql_connectable) def test_insertion_method_on_conflict_update(conn, request): # GH 14553: Example in to_sql docstring @@ -2229,12 +2256,14 @@ def test_api_chunksize_read(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_categorical(conn, request): if conn == "postgresql_adbc_conn": - request.node.add_marker( - pytest.mark.xfail( - reason="categorical dtype not implemented for ADBC postgres driver", - strict=True, + adbc = import_optional_dependency("adbc_driver_postgresql", errors="ignore") + if adbc is not None and Version(adbc.__version__) < Version("0.9.0"): + request.node.add_marker( + pytest.mark.xfail( + reason="categorical dtype not 
implemented for ADBC postgres driver", + strict=True, + ) ) - ) # GH8624 # test that categorical gets written correctly as dense column conn = request.getfixturevalue(conn) @@ -2294,9 +2323,16 @@ def test_api_escaped_table_name(conn, request): def test_api_read_sql_duplicate_columns(conn, request): # GH#53117 if "adbc" in conn: - request.node.add_marker( - pytest.mark.xfail(reason="pyarrow->pandas throws ValueError", strict=True) - ) + pa = pytest.importorskip("pyarrow") + if not ( + Version(pa.__version__) >= Version("16.0") + and conn in ["sqlite_adbc_conn", "postgresql_adbc_conn"] + ): + request.node.add_marker( + pytest.mark.xfail( + reason="pyarrow->pandas throws ValueError", strict=True + ) + ) conn = request.getfixturevalue(conn) if sql.has_table("test_table", conn): with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: @@ -3537,7 +3573,8 @@ def test_read_sql_dtype_backend( result = getattr(pd, func)( f"Select * from {table}", conn, dtype_backend=dtype_backend ) - expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + tm.assert_frame_equal(result, expected) if "adbc" in conn_name: @@ -3587,7 +3624,7 @@ def test_read_sql_dtype_backend_table( with pd.option_context("mode.string_storage", string_storage): result = getattr(pd, func)(table, conn, dtype_backend=dtype_backend) - expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) tm.assert_frame_equal(result, expected) if "adbc" in conn_name: @@ -3640,24 +3677,13 @@ def dtype_backend_data() -> DataFrame: @pytest.fixture def dtype_backend_expected(): - def func(storage, dtype_backend, conn_name) -> DataFrame: - string_array: StringArray | ArrowStringArray - string_array_na: StringArray | ArrowStringArray - if storage == "python": - string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) - string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) - - elif dtype_backend == "pyarrow": + def func(string_storage, dtype_backend, conn_name) -> DataFrame: + string_dtype: pd.StringDtype | pd.ArrowDtype + if dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) # type: ignore[assignment] - string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) # type: ignore[assignment] - + string_dtype = pd.ArrowDtype(pa.string()) else: - pa = pytest.importorskip("pyarrow") - string_array = ArrowStringArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowStringArray(pa.array(["a", "b", None])) + string_dtype = pd.StringDtype(string_storage) df = DataFrame( { @@ -3667,8 +3693,8 @@ def func(storage, dtype_backend, conn_name) -> DataFrame: "d": Series([1.5, 2.0, 2.5], dtype="Float64"), "e": Series([True, False, pd.NA], dtype="boolean"), "f": Series([True, False, True], dtype="boolean"), - "g": string_array, - "h": string_array_na, + "g": Series(["a", "b", "c"], dtype=string_dtype), + "h": Series(["a", "b", None], dtype=string_dtype), } ) if dtype_backend == "pyarrow": @@ -3817,7 +3843,6 @@ class Test(BaseModel): def test_read_sql_string_inference(sqlite_engine): conn = sqlite_engine # GH#54430 - pytest.importorskip("pyarrow") table = "test" df = DataFrame({"a": ["x", "y"]}) df.to_sql(table, con=conn, index=False, if_exists="replace") @@ -3825,7 +3850,7 @@ def 
test_read_sql_string_inference(sqlite_engine): with pd.option_context("future.infer_string", True): result = read_sql_table(table, conn) - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) @@ -4128,7 +4153,7 @@ def tquery(query, con=None): def test_xsqlite_basic(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) assert sql.to_sql(frame, name="test_table", con=sqlite_buildin, index=False) == 10 @@ -4155,7 +4180,7 @@ def test_xsqlite_basic(sqlite_buildin): def test_xsqlite_write_row_by_row(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) frame.iloc[0, 0] = np.nan @@ -4178,7 +4203,7 @@ def test_xsqlite_write_row_by_row(sqlite_buildin): def test_xsqlite_execute(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) create_sql = sql.get_schema(frame, "test") @@ -4199,7 +4224,7 @@ def test_xsqlite_execute(sqlite_buildin): def test_xsqlite_schema(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) create_sql = sql.get_schema(frame, "test") @@ -4230,11 +4255,11 @@ def test_xsqlite_execute_fail(sqlite_buildin): cur.execute(create_sql) with sql.pandasSQL_builder(sqlite_buildin) as pandas_sql: - pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)') - pandas_sql.execute('INSERT INTO test VALUES("foo", "baz", 2.567)') + pandas_sql.execute("INSERT INTO test VALUES('foo', 'bar', 1.234)") + pandas_sql.execute("INSERT INTO test VALUES('foo', 'baz', 2.567)") with pytest.raises(sql.DatabaseError, match="Execution failed on sql"): - pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 7)') + pandas_sql.execute("INSERT INTO test VALUES('foo', 'bar', 7)") def test_xsqlite_execute_closed_connection(): @@ -4252,7 +4277,7 @@ def test_xsqlite_execute_closed_connection(): cur.execute(create_sql) with sql.pandasSQL_builder(conn) as pandas_sql: - pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)') + pandas_sql.execute("INSERT INTO test VALUES('foo', 'bar', 1.234)") msg = "Cannot operate on a closed database." 
with pytest.raises(sqlite3.ProgrammingError, match=msg): diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 3e4e1a107da9d..32f1c8d65271b 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -11,6 +11,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import CategoricalDtype import pandas._testing as tm @@ -344,7 +346,7 @@ def test_write_dta6(self, datapath): ) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) - def test_read_write_dta10(self, version): + def test_read_write_dta10(self, version, using_infer_string): original = DataFrame( data=[["string", "object", 1, 1.1, np.datetime64("2003-12-25")]], columns=["string", "object", "integer", "floating", "datetime"], @@ -357,12 +359,17 @@ def test_read_write_dta10(self, version): with tm.ensure_clean() as path: original.to_stata(path, convert_dates={"datetime": "tc"}, version=version) written_and_read_again = self.read_dta(path) - # original.index is np.int32, read index is np.int64 - tm.assert_frame_equal( - written_and_read_again.set_index("index"), - original, - check_index_type=False, - ) + + expected = original.copy() + if using_infer_string: + expected["object"] = expected["object"].astype("str") + + # original.index is np.int32, read index is np.int64 + tm.assert_frame_equal( + written_and_read_again.set_index("index"), + expected, + check_index_type=False, + ) def test_stata_doc_examples(self): with tm.ensure_clean() as path: @@ -1209,6 +1216,10 @@ def _convert_categorical(from_frame: DataFrame) -> DataFrame: if cat.categories.dtype == object: categories = pd.Index._with_infer(cat.categories._values) cat = cat.set_categories(categories) + elif cat.categories.dtype == "string" and len(cat.categories) == 0: + # if the read categories are empty, it comes back as object dtype + categories = cat.categories.astype(object) + cat = cat.set_categories(categories) from_frame[col] = cat return from_frame @@ -1544,8 +1555,8 @@ def test_inf(self, infval): def test_path_pathlib(self): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.index.name = "index" reader = lambda x: read_stata(x).set_index("index") @@ -1555,8 +1566,8 @@ def test_path_pathlib(self): def test_pickle_path_localpath(self): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.index.name = "index" reader = lambda x: read_stata(x).set_index("index") @@ -1580,8 +1591,8 @@ def test_set_index(self): # GH 17328 df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.index.name = "index" with tm.ensure_clean() as path: @@ -1609,7 +1620,7 @@ def test_date_parsing_ignores_format_details(self, column, datapath): formatted = df.loc[0, column + "_fmt"] assert unformatted == formatted - def test_writer_117(self): + def test_writer_117(self, using_infer_string): original = DataFrame( data=[ [ @@ -1672,13 +1683,17 @@ def test_writer_117(self): version=117, ) 
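
For context on the casts below: a minimal illustrative sketch, not part of the patch, of the string-dtype inference these Stata round-trip tests account for. It assumes a pandas build carrying this change, where the future.infer_string option infers string columns as the new NaN-backed "str" dtype (pandas 2.x builds print "string[pyarrow_numpy]" instead):

import pandas as pd

with pd.option_context("future.infer_string", True):
    # columns holding only strings/None are no longer inferred as object
    df = pd.DataFrame({"object": ["string", None]})
    print(df["object"].dtype)  # "str" on the branch this patch targets

    # an existing object-dtype column is aligned the same way the tests do it
    expected = pd.DataFrame({"object": pd.Series(["string", None], dtype=object)})
    expected["object"] = expected["object"].astype("str")
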
written_and_read_again = self.read_dta(path) - # original.index is np.int32, read index is np.int64 - tm.assert_frame_equal( - written_and_read_again.set_index("index"), - original, - check_index_type=False, - ) - tm.assert_frame_equal(original, copy) + + expected = original[:] + if using_infer_string: + # object dtype (with only strings/None) comes back as string dtype + expected["object"] = expected["object"].astype("str") + + tm.assert_frame_equal( + written_and_read_again.set_index("index"), + expected, + ) + tm.assert_frame_equal(original, copy) def test_convert_strl_name_swap(self): original = DataFrame( @@ -1721,8 +1736,8 @@ def test_nonfile_writing(self, version): bio = io.BytesIO() df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.index.name = "index" with tm.ensure_clean() as path: @@ -1737,8 +1752,8 @@ def test_gzip_writing(self): # writing version 117 requires seek and cannot be used with gzip df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.index.name = "index" with tm.ensure_clean() as path: @@ -1765,7 +1780,7 @@ def test_unicode_dta_118(self, datapath): tm.assert_frame_equal(unicode_df, expected) - def test_mixed_string_strl(self): + def test_mixed_string_strl(self, using_infer_string): # GH 23633 output = [{"mixed": "string" * 500, "number": 0}, {"mixed": None, "number": 1}] output = DataFrame(output) @@ -1783,7 +1798,10 @@ def test_mixed_string_strl(self): path, write_index=False, convert_strl=["mixed"], version=117 ) reread = read_stata(path) - expected = output.fillna("") + expected = output.copy() + if using_infer_string: + expected["mixed"] = expected["mixed"].astype("str") + expected = expected.fillna("") tm.assert_frame_equal(reread, expected) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) @@ -1862,6 +1880,7 @@ def test_stata_119(self, datapath): reader._ensure_open() assert reader._nvar == 32999 + @pytest.mark.filterwarnings("ignore:Downcasting behavior:FutureWarning") @pytest.mark.parametrize("version", [118, 119, None]) def test_utf8_writer(self, version): cat = pd.Categorical(["a", "β", "ĉ"], ordered=True) @@ -1921,6 +1940,41 @@ def test_writer_118_exceptions(self): with pytest.raises(ValueError, match="You must use version 119"): StataWriterUTF8(path, df, version=118) + @pytest.mark.parametrize( + "dtype_backend", + ["numpy_nullable", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow"))], + ) + def test_read_write_ea_dtypes(self, dtype_backend): + df = DataFrame( + { + "a": [1, 2, None], + "b": ["a", "b", "c"], + "c": [True, False, None], + "d": [1.5, 2.5, 3.5], + "e": pd.date_range("2020-12-31", periods=3, freq="D"), + }, + index=pd.Index([0, 1, 2], name="index"), + ) + df = df.convert_dtypes(dtype_backend=dtype_backend) + df.to_stata("test_stata.dta", version=118) + + with tm.ensure_clean() as path: + df.to_stata(path) + written_and_read_again = self.read_dta(path) + + expected = DataFrame( + { + "a": [1, 2, np.nan], + "b": ["a", "b", "c"], + "c": [1.0, 0, np.nan], + "d": [1.5, 2.5, 3.5], + "e": pd.date_range("2020-12-31", periods=3, freq="D"), + }, + index=pd.Index([0, 1, 2], name="index", dtype=np.int32), + ) + + 
tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) + @pytest.mark.parametrize("version", [105, 108, 111, 113, 114]) def test_backward_compat(version, datapath): @@ -2100,7 +2154,7 @@ def test_iterator_value_labels(): df = DataFrame({f"col{k}": pd.Categorical(values, ordered=True) for k in range(2)}) with tm.ensure_clean() as path: df.to_stata(path, write_index=False) - expected = pd.Index(["a_label", "b_label", "c_label"], dtype="object") + expected = pd.Index(["a_label", "b_label", "c_label"]) with read_stata(path, chunksize=100) as reader: for j, chunk in enumerate(reader): for i in range(2): diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 6f429c1ecbf8a..92e89ddbc8e80 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -28,11 +28,6 @@ Series, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) -from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics from pandas.io.common import get_handle from pandas.io.xml import read_xml @@ -1044,7 +1039,7 @@ def test_utf16_encoding(xml_baby_names, parser): UnicodeError, match=( "UTF-16 stream does not start with BOM|" - "'utf-16-le' codec can't decode byte" + "'utf-16(-le)?' codec can't decode byte" ), ): read_xml(xml_baby_names, encoding="UTF-16", parser=parser) @@ -2035,36 +2030,21 @@ def test_read_xml_nullable_dtypes( """ - if using_infer_string: - pa = pytest.importorskip("pyarrow") - string_array = ArrowStringArrayNumpySemantics(pa.array(["x", "y"])) - string_array_na = ArrowStringArrayNumpySemantics(pa.array(["x", None])) - - elif string_storage == "python": - string_array = StringArray(np.array(["x", "y"], dtype=np.object_)) - string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) + with pd.option_context("mode.string_storage", string_storage): + result = read_xml(StringIO(data), parser=parser, dtype_backend=dtype_backend) - elif dtype_backend == "pyarrow": + if dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["x", "y"])) - string_array_na = ArrowExtensionArray(pa.array(["x", None])) - + string_dtype = pd.ArrowDtype(pa.string()) else: - pa = pytest.importorskip("pyarrow") - string_array = ArrowStringArray(pa.array(["x", "y"])) - string_array_na = ArrowStringArray(pa.array(["x", None])) - - with pd.option_context("mode.string_storage", string_storage): - result = read_xml(StringIO(data), parser=parser, dtype_backend=dtype_backend) + string_dtype = pd.StringDtype(string_storage) expected = DataFrame( { - "a": string_array, + "a": Series(["x", "y"], dtype=string_dtype), "b": Series([1, 2], dtype="Int64"), "c": Series([4.0, 5.0], dtype="Float64"), - "d": string_array_na, + "d": Series(["x", None], dtype=string_dtype), "e": Series([2, NA], dtype="Int64"), "f": Series([4.0, NA], dtype="Float64"), "g": Series([NA, NA], dtype="Int64"), @@ -2085,7 +2065,9 @@ def test_read_xml_nullable_dtypes( ) expected["g"] = ArrowExtensionArray(pa.array([None, None])) - tm.assert_frame_equal(result, expected) + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) def test_invalid_dtype_backend(): diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py index 8583d8bcc052c..17dae1879f3b8 100644 --- 
a/pandas/tests/libs/test_lib.py +++ b/pandas/tests/libs/test_lib.py @@ -1,3 +1,5 @@ +import pickle + import numpy as np import pytest @@ -283,3 +285,15 @@ def test_no_default_pickle(): # GH#40397 obj = tm.round_trip_pickle(lib.no_default) assert obj is lib.no_default + + +def test_ensure_string_array_copy(): + # ensure the original array is not modified in case of copy=False with + # pickle-roundtripped object dtype array + # https://github.com/pandas-dev/pandas/issues/54654 + arr = np.array(["a", None], dtype=object) + arr = pickle.loads(pickle.dumps(arr)) + result = lib.ensure_string_array(arr, copy=False) + assert not np.shares_memory(arr, result) + assert arr[1] is None + assert result[1] is np.nan diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index 45dc612148f40..33366b4eabba5 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -44,6 +44,7 @@ _check_visible, get_y_axis, ) +from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing @@ -1058,28 +1059,43 @@ def test_boxplot_series_positions(self, hist_df): tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), positions) assert len(ax.lines) == 7 * len(numeric_cols) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") + @pytest.mark.xfail( + Version(mpl.__version__) >= Version("3.10"), + reason="Fails starting with matplotlib 3.10", + ) def test_boxplot_vertical(self, hist_df): df = hist_df numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] # if horizontal, yticklabels are rotated - ax = df.plot.box(rot=50, fontsize=8, vert=False) + kwargs = ( + {"vert": False} + if Version(mpl.__version__) < Version("3.10") + else {"orientation": "horizontal"} + ) + ax = df.plot.box(rot=50, fontsize=8, **kwargs) _check_ticks_props(ax, xrot=0, yrot=50, ylabelsize=8) _check_text_labels(ax.get_yticklabels(), labels) assert len(ax.lines) == 7 * len(numeric_cols) - @pytest.mark.filterwarnings("ignore:Attempt:UserWarning") + @pytest.mark.filterwarnings("ignore::UserWarning") + @pytest.mark.xfail( + Version(mpl.__version__) >= Version("3.10"), + reason="Fails starting with matplotlib version 3.10", + ) def test_boxplot_vertical_subplots(self, hist_df): df = hist_df numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] + kwargs = ( + {"vert": False} + if Version(mpl.__version__) < Version("3.10") + else {"orientation": "horizontal"} + ) axes = _check_plot_works( - df.plot.box, - default_axes=True, - subplots=True, - vert=False, - logx=True, + df.plot.box, default_axes=True, subplots=True, logx=True, **kwargs ) _check_axes_shape(axes, axes_num=3, layout=(1, 3)) _check_ax_scales(axes, xaxis="log") @@ -1087,12 +1103,22 @@ def test_boxplot_vertical_subplots(self, hist_df): _check_text_labels(ax.get_yticklabels(), [label]) assert len(ax.lines) == 7 + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") + @pytest.mark.xfail( + Version(mpl.__version__) >= Version("3.10"), + reason="Fails starting with matplotlib 3.10", + ) def test_boxplot_vertical_positions(self, hist_df): df = hist_df numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] positions = np.array([3, 2, 8]) - ax = df.plot.box(positions=positions, vert=False) + kwargs = ( + {"vert": False} + if Version(mpl.__version__) < Version("3.10") + else {"orientation": "horizontal"} + ) + ax = df.plot.box(positions=positions, 
**kwargs) _check_text_labels(ax.get_yticklabels(), labels) tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), positions) assert len(ax.lines) == 7 * len(numeric_cols) @@ -2487,8 +2513,14 @@ def test_group_subplot_invalid_column_name(self): d = {"a": np.arange(10), "b": np.arange(10)} df = DataFrame(d) - with pytest.raises(ValueError, match=r"Column label\(s\) \['bad_name'\]"): - df.plot(subplots=[("a", "bad_name")]) + if Version(np.__version__) < Version("2.0.0"): + with pytest.raises(ValueError, match=r"Column label\(s\) \['bad_name'\]"): + df.plot(subplots=[("a", "bad_name")]) + else: + with pytest.raises( + ValueError, match=r"Column label\(s\) \[np\.str\_\('bad_name'\)\]" + ): + df.plot(subplots=[("a", "bad_name")]) def test_group_subplot_duplicated_column(self): d = {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)} diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index 76f7fa1f22eec..e1b03a34086c0 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -1,5 +1,7 @@ """ Test cases for .boxplot method """ +from __future__ import annotations + import itertools import string @@ -22,6 +24,7 @@ _check_ticks_props, _check_visible, ) +from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing @@ -35,6 +38,17 @@ def _check_ax_limits(col, ax): assert y_max >= col.max() +if Version(mpl.__version__) < Version("3.10"): + verts: list[dict[str, bool | str]] = [{"vert": False}, {"vert": True}] +else: + verts = [{"orientation": "horizontal"}, {"orientation": "vertical"}] + + +@pytest.fixture(params=verts) +def vert(request): + return request.param + + class TestDataFramePlots: def test_stacked_boxplot_set_axis(self): # GH2980 @@ -315,7 +329,7 @@ def test_specified_props_kwd(self, props, expected): assert result[expected][0].get_color() == "C1" - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_plot_xlabel_ylabel(self, vert): df = DataFrame( { @@ -325,11 +339,11 @@ def test_plot_xlabel_ylabel(self, vert): } ) xlabel, ylabel = "x", "y" - ax = df.plot(kind="box", vert=vert, xlabel=xlabel, ylabel=ylabel) + ax = df.plot(kind="box", xlabel=xlabel, ylabel=ylabel, **vert) assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_plot_box(self, vert): # GH 54941 rng = np.random.default_rng(2) @@ -338,14 +352,14 @@ def test_plot_box(self, vert): xlabel, ylabel = "x", "y" _, axs = plt.subplots(ncols=2, figsize=(10, 7), sharey=True) - df1.plot.box(ax=axs[0], vert=vert, xlabel=xlabel, ylabel=ylabel) - df2.plot.box(ax=axs[1], vert=vert, xlabel=xlabel, ylabel=ylabel) + df1.plot.box(ax=axs[0], xlabel=xlabel, ylabel=ylabel, **vert) + df2.plot.box(ax=axs[1], xlabel=xlabel, ylabel=ylabel, **vert) for ax in axs: assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel mpl.pyplot.close() - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_boxplot_xlabel_ylabel(self, vert): df = DataFrame( { @@ -355,11 +369,11 @@ def test_boxplot_xlabel_ylabel(self, vert): } ) xlabel, ylabel = "x", "y" - ax = df.boxplot(vert=vert, xlabel=xlabel, ylabel=ylabel) + ax = df.boxplot(xlabel=xlabel, ylabel=ylabel, **vert) assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel - 
@pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_boxplot_group_xlabel_ylabel(self, vert): df = DataFrame( { @@ -369,14 +383,20 @@ def test_boxplot_group_xlabel_ylabel(self, vert): } ) xlabel, ylabel = "x", "y" - ax = df.boxplot(by="group", vert=vert, xlabel=xlabel, ylabel=ylabel) + ax = df.boxplot(by="group", xlabel=xlabel, ylabel=ylabel, **vert) for subplot in ax: assert subplot.get_xlabel() == xlabel assert subplot.get_ylabel() == ylabel mpl.pyplot.close() - @pytest.mark.parametrize("vert", [True, False]) - def test_boxplot_group_no_xlabel_ylabel(self, vert): + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") + def test_boxplot_group_no_xlabel_ylabel(self, vert, request): + if Version(mpl.__version__) >= Version("3.10") and vert == { + "orientation": "horizontal" + }: + request.applymarker( + pytest.mark.xfail(reason=f"{vert} fails starting with matplotlib 3.10") + ) df = DataFrame( { "a": np.random.default_rng(2).standard_normal(10), @@ -384,9 +404,14 @@ def test_boxplot_group_no_xlabel_ylabel(self, vert): "group": np.random.default_rng(2).choice(["group1", "group2"], 10), } ) - ax = df.boxplot(by="group", vert=vert) + ax = df.boxplot(by="group", **vert) for subplot in ax: - target_label = subplot.get_xlabel() if vert else subplot.get_ylabel() + target_label = ( + subplot.get_xlabel() + if vert == {"vert": True} # noqa: PLR1714 + or vert == {"orientation": "vertical"} + else subplot.get_ylabel() + ) assert target_label == pprint_thing(["group"]) mpl.pyplot.close() diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 112172656b6ec..6c318402ea226 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -1451,13 +1451,19 @@ def test_mpl_nopandas(self): values1 = np.arange(10.0, 11.0, 0.5) values2 = np.arange(11.0, 12.0, 0.5) - kw = {"fmt": "-", "lw": 4} - _, ax = mpl.pyplot.subplots() - ax.plot_date([x.toordinal() for x in dates], values1, **kw) - ax.plot_date([x.toordinal() for x in dates], values2, **kw) - - line1, line2 = ax.get_lines() + ( + line1, + line2, + ) = ax.plot( + [x.toordinal() for x in dates], + values1, + "-", + [x.toordinal() for x in dates], + values2, + "-", + linewidth=4, + ) exp = np.array([x.toordinal() for x in dates], dtype=np.float64) tm.assert_numpy_array_equal(line1.get_xydata()[:, 0], exp) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 30ec0d0affaa3..7ca1239286188 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1089,25 +1089,62 @@ def test_any_all_datetimelike(self): assert df.any().all() assert not df.all().any() - def test_any_all_pyarrow_string(self): + def test_any_all_string_dtype(self, any_string_dtype): # GH#54591 - pytest.importorskip("pyarrow") - ser = Series(["", "a"], dtype="string[pyarrow_numpy]") + if ( + isinstance(any_string_dtype, pd.StringDtype) + and any_string_dtype.na_value is pd.NA + ): + # the nullable string dtype currently still raise an error + # https://github.com/pandas-dev/pandas/issues/51939 + ser = Series(["a", "b"], dtype=any_string_dtype) + with pytest.raises(TypeError): + ser.any() + with pytest.raises(TypeError): + ser.all() + return + + ser = Series(["", "a"], dtype=any_string_dtype) assert ser.any() assert not ser.all() + assert ser.any(skipna=False) + assert not ser.all(skipna=False) - ser = Series([None, "a"], 
dtype="string[pyarrow_numpy]") + ser = Series([np.nan, "a"], dtype=any_string_dtype) assert ser.any() assert ser.all() - assert not ser.all(skipna=False) + assert ser.any(skipna=False) + assert ser.all(skipna=False) # NaN is considered truthy - ser = Series([None, ""], dtype="string[pyarrow_numpy]") + ser = Series([np.nan, ""], dtype=any_string_dtype) assert not ser.any() assert not ser.all() + assert ser.any(skipna=False) # NaN is considered truthy + assert not ser.all(skipna=False) - ser = Series(["a", "b"], dtype="string[pyarrow_numpy]") + ser = Series(["a", "b"], dtype=any_string_dtype) assert ser.any() assert ser.all() + assert ser.any(skipna=False) + assert ser.all(skipna=False) + + ser = Series([], dtype=any_string_dtype) + assert not ser.any() + assert ser.all() + assert not ser.any(skipna=False) + assert ser.all(skipna=False) + + ser = Series([""], dtype=any_string_dtype) + assert not ser.any() + assert not ser.all() + assert not ser.any(skipna=False) + assert not ser.all(skipna=False) + + ser = Series([np.nan], dtype=any_string_dtype) + assert not ser.any() + assert ser.all() + assert ser.any(skipna=False) # NaN is considered truthy + assert ser.all(skipna=False) # NaN is considered truthy def test_timedelta64_analytics(self): # index min/max @@ -1442,10 +1479,13 @@ def test_mode_numerical_nan(self, dropna, expected): tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "dropna, expected1, expected2, expected3", - [(True, ["b"], ["bar"], ["nan"]), (False, ["b"], [np.nan], ["nan"])], + "dropna, expected1, expected2", + [ + (True, ["b"], ["bar"]), + (False, ["b"], [np.nan]), + ], ) - def test_mode_str_obj(self, dropna, expected1, expected2, expected3): + def test_mode_object(self, dropna, expected1, expected2): # Test string and object types. data = ["a"] * 2 + ["b"] * 3 @@ -1458,15 +1498,31 @@ def test_mode_str_obj(self, dropna, expected1, expected2, expected3): s = Series(data, dtype=object) result = s.mode(dropna) - expected2 = Series(expected2, dtype=None if expected2 == ["bar"] else object) + expected2 = Series(expected2, dtype=object) tm.assert_series_equal(result, expected2) + @pytest.mark.parametrize( + "dropna, expected1, expected2", + [ + (True, ["b"], ["bar"]), + (False, ["b"], [np.nan]), + ], + ) + def test_mode_string(self, dropna, expected1, expected2, any_string_dtype): + # Test string and object types. 
+ data = ["a"] * 2 + ["b"] * 3 + + s = Series(data, dtype=any_string_dtype) + result = s.mode(dropna) + expected1 = Series(expected1, dtype=any_string_dtype) + tm.assert_series_equal(result, expected1) + data = ["foo", "bar", "bar", np.nan, np.nan, np.nan] - s = Series(data, dtype=object).astype(str) + s = Series(data, dtype=any_string_dtype) result = s.mode(dropna) - expected3 = Series(expected3) - tm.assert_series_equal(result, expected3) + expected2 = Series(expected2, dtype=any_string_dtype) + tm.assert_series_equal(result, expected2) @pytest.mark.parametrize( "dropna, expected1, expected2", @@ -1475,12 +1531,12 @@ def test_mode_str_obj(self, dropna, expected1, expected2, expected3): def test_mode_mixeddtype(self, dropna, expected1, expected2): s = Series([1, "foo", "foo"]) result = s.mode(dropna) - expected = Series(expected1) + expected = Series(expected1, dtype=object) tm.assert_series_equal(result, expected) s = Series([1, "foo", "foo", np.nan, np.nan, np.nan]) result = s.mode(dropna) - expected = Series(expected2, dtype=None if expected2 == ["foo"] else object) + expected = Series(expected2, dtype=object) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -1605,17 +1661,10 @@ def test_mode_intoverflow(self, dropna, expected1, expected2): expected2 = Series(expected2, dtype=np.uint64) tm.assert_series_equal(result, expected2) - def test_mode_sortwarning(self): - # Check for the warning that is raised when the mode - # results cannot be sorted - - expected = Series(["foo", np.nan]) + def test_mode_sort_with_na(self): s = Series([1, "foo", "foo", np.nan, np.nan]) - - with tm.assert_produces_warning(UserWarning): - result = s.mode(dropna=False) - result = result.sort_values().reset_index(drop=True) - + expected = Series(["foo", np.nan], dtype=object) + result = s.mode(dropna=False) tm.assert_series_equal(result, expected) def test_mode_boolean_with_na(self): diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 50644e33e45e1..dcf6c6099abab 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -3,6 +3,9 @@ import numpy as np import pytest +from pandas.core.dtypes.common import is_extension_array_dtype + +import pandas as pd from pandas import ( DataFrame, DatetimeIndex, @@ -429,3 +432,29 @@ def test_resample_quantile(series): result = ser.resample(freq).quantile(q) expected = ser.resample(freq).agg(lambda x: x.quantile(q)).rename(ser.name) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("how", ["first", "last"]) +def test_first_last_skipna(any_real_nullable_dtype, skipna, how): + # GH#57019 + if is_extension_array_dtype(any_real_nullable_dtype): + na_value = Series(dtype=any_real_nullable_dtype).dtype.na_value + else: + na_value = np.nan + df = DataFrame( + { + "a": [2, 1, 1, 2], + "b": [na_value, 3.0, na_value, 4.0], + "c": [na_value, 3.0, na_value, 4.0], + }, + index=date_range("2020-01-01", periods=4, freq="D"), + dtype=any_real_nullable_dtype, + ) + rs = df.resample("ME") + method = getattr(rs, how) + result = method(skipna=skipna) + + gb = df.groupby(df.shape[0] * [pd.to_datetime("2020-01-31")]) + expected = getattr(gb, how)(skipna=skipna) + expected.index.freq = "ME" + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index eb80f56dd7d4b..6b7cce7d15a5b 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -1006,6 +1006,32 
@@ def test_resample_t_l_deprecated(self): result = ser.resample("T").mean() tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "freq, freq_depr, freq_res, freq_depr_res, data", + [ + ("2Q", "2q", "2Y", "2y", [0.5]), + ("2M", "2m", "2Q", "2q", [1.0, 3.0]), + ], + ) + def test_resample_lowercase_frequency_deprecated( + self, freq, freq_depr, freq_res, freq_depr_res, data + ): + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version. Please use '{freq[1:]}' instead." + depr_msg_res = f"'{freq_depr_res[1:]}' is deprecated and will be removed in a " + f"future version. Please use '{freq_res[1:]}' instead." + + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + rng_l = period_range("2020-01-01", "2020-08-01", freq=freq_depr) + ser = Series(np.arange(len(rng_l)), index=rng_l) + + rng = period_range("2020-01-01", "2020-08-01", freq=freq_res) + expected = Series(data=data, index=rng) + + with tm.assert_produces_warning(FutureWarning, match=depr_msg_res): + result = ser.resample(freq_depr_res).mean() + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( "offset", [ @@ -1014,8 +1040,8 @@ def test_resample_t_l_deprecated(self): offsets.BusinessHour(2), ], ) - def test_asfreq_invalid_period_freq(self, offset, series_and_frame): - # GH#9586 + def test_asfreq_invalid_period_offset(self, offset, series_and_frame): + # GH#55785 msg = f"Invalid offset: '{offset.base}' for converting time series " df = series_and_frame @@ -1031,6 +1057,9 @@ def test_asfreq_invalid_period_freq(self, offset, series_and_frame): ("2Q-FEB", "2QE-FEB"), ("2Y", "2YE"), ("2Y-MAR", "2YE-MAR"), + ("2M", "2me"), + ("2Q", "2qe"), + ("2Y-MAR", "2ye-mar"), ], ) def test_resample_frequency_ME_QE_YE_error_message(series_and_frame, freq, freq_depr): diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 7e8779ab48b7e..74d06117cbb4a 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -188,7 +188,7 @@ def test_api_compat_before_use(attr): getattr(rs, attr) -def tests_raises_on_nuisance(test_frame): +def tests_raises_on_nuisance(test_frame, using_infer_string): df = test_frame df["D"] = "foo" r = df.resample("h") @@ -198,6 +198,8 @@ def tests_raises_on_nuisance(test_frame): expected = r[["A", "B", "C"]].mean() msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): r.mean() result = r.mean(numeric_only=True) @@ -932,7 +934,9 @@ def test_end_and_end_day_origin( ("sem", lib.no_default, "could not convert string to float"), ], ) -def test_frame_downsample_method(method, numeric_only, expected_data): +def test_frame_downsample_method( + method, numeric_only, expected_data, using_infer_string +): # GH#46442 test if `numeric_only` behave as expected for DataFrameGroupBy index = date_range("2018-01-01", periods=2, freq="D") @@ -949,6 +953,11 @@ def test_frame_downsample_method(method, numeric_only, expected_data): if method in ("var", "mean", "median", "prod"): klass = TypeError msg = re.escape(f"agg function failed [how->{method},dtype->") + if using_infer_string: + msg = f"dtype 'str' does not support operation '{method}'" + elif method in ["sum", "std", "sem"] and using_infer_string: + klass = TypeError + msg = f"dtype 'str' does not support operation '{method}'" else: klass = ValueError msg = expected_data @@ -983,7 +992,9 @@ def 
test_frame_downsample_method(method, numeric_only, expected_data): ("last", lib.no_default, ["cat_2"]), ], ) -def test_series_downsample_method(method, numeric_only, expected_data): +def test_series_downsample_method( + method, numeric_only, expected_data, using_infer_string +): # GH#46442 test if `numeric_only` behave as expected for SeriesGroupBy index = date_range("2018-01-01", periods=2, freq="D") @@ -999,8 +1010,11 @@ def test_series_downsample_method(method, numeric_only, expected_data): func(**kwargs) elif method == "prod": msg = re.escape("agg function failed [how->prod,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'prod'" with pytest.raises(TypeError, match=msg): func(**kwargs) + else: result = func(**kwargs) expected = Series(expected_data, index=expected_index) @@ -1040,11 +1054,11 @@ def test_args_kwargs_depr(method, raises): if raises: with tm.assert_produces_warning(FutureWarning, match=warn_msg): with pytest.raises(UnsupportedFunctionCall, match=error_msg): - func(*args, 1, 2, 3) + func(*args, 1, 2, 3, 4) else: with tm.assert_produces_warning(FutureWarning, match=warn_msg): with pytest.raises(TypeError, match=error_msg_type): - func(*args, 1, 2, 3) + func(*args, 1, 2, 3, 4) def test_df_axis_param_depr(): diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 337c5ff53bd14..e2d456fea2b23 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -503,7 +503,8 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate): if consolidate: df = df._consolidate() - with tm.assert_produces_warning(FutureWarning): + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby(["key"]).resample("W", on="date").min() idx = pd.MultiIndex.from_arrays( [ diff --git a/pandas/tests/reshape/concat/test_categorical.py b/pandas/tests/reshape/concat/test_categorical.py index bbaaf0abecfbd..8e6a14e6bfb8f 100644 --- a/pandas/tests/reshape/concat/test_categorical.py +++ b/pandas/tests/reshape/concat/test_categorical.py @@ -59,9 +59,7 @@ def test_categorical_concat_dtypes(self, using_infer_string): num = Series([1, 2, 3]) df = pd.concat([Series(cat), obj, num], axis=1, keys=index) - result = df.dtypes == ( - object if not using_infer_string else "string[pyarrow_numpy]" - ) + result = df.dtypes == (object if not using_infer_string else "str") expected = Series([False, True, False], index=index) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 9e34d02091e69..77c45cf36894b 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import InvalidIndexError import pandas.util._test_decorators as td @@ -44,6 +46,8 @@ def test_append_concat(self): assert isinstance(result.index, PeriodIndex) assert result.index[0] == s1.index[0] + # test is not written to work with string dtype (checks .base) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_concat_copy(self, using_array_manager, using_copy_on_write): df = DataFrame(np.random.default_rng(2).standard_normal((4, 3))) df2 = DataFrame(np.random.default_rng(2).integers(0, 10, size=4).reshape(4, 1)) @@ -77,6 +81,7 @@ def 
test_concat_copy(self, using_array_manager, using_copy_on_write): assert arr is df3._mgr.arrays[0] else: assert arr.base is not None + assert arr.base is not None # Float block was consolidated. df4 = DataFrame(np.random.default_rng(2).standard_normal((4, 1))) diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 71ddff7438254..4c94dc0d51f7e 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -73,23 +73,23 @@ def test_concat_datetime_timezone(self): exp_idx = DatetimeIndex( [ - "2010-12-31 23:00:00+00:00", - "2011-01-01 00:00:00+00:00", - "2011-01-01 01:00:00+00:00", "2010-12-31 15:00:00+00:00", "2010-12-31 16:00:00+00:00", "2010-12-31 17:00:00+00:00", + "2010-12-31 23:00:00+00:00", + "2011-01-01 00:00:00+00:00", + "2011-01-01 01:00:00+00:00", ] ).as_unit("ns") expected = DataFrame( [ - [1, np.nan], - [2, np.nan], - [3, np.nan], [np.nan, 1], [np.nan, 2], [np.nan, 3], + [1, np.nan], + [2, np.nan], + [3, np.nan], ], index=exp_idx, columns=["a", "b"], diff --git a/pandas/tests/reshape/concat/test_empty.py b/pandas/tests/reshape/concat/test_empty.py index 30ef0a934157b..8f7ea0c42f2c3 100644 --- a/pandas/tests/reshape/concat/test_empty.py +++ b/pandas/tests/reshape/concat/test_empty.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -27,7 +29,7 @@ def test_handle_empty_objects(self, sort, using_infer_string): expected = df.reindex(columns=["a", "b", "c", "d", "foo"]) expected["foo"] = expected["foo"].astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) expected.loc[0:4, "foo"] = "bar" @@ -238,6 +240,8 @@ def test_concat_empty_dataframe_dtypes(self): assert result["b"].dtype == np.float64 assert result["c"].dtype == np.float64 + # triggers warning about empty entries + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_concat_inner_join_empty(self): # GH 15328 df_empty = DataFrame() @@ -284,7 +288,7 @@ def test_concat_empty_dataframe_different_dtypes(self, using_infer_string): result = concat([df1[:0], df2[:0]]) assert result["a"].dtype == np.int64 - assert result["b"].dtype == np.object_ if not using_infer_string else "string" + assert result["b"].dtype == np.object_ if not using_infer_string else "str" def test_concat_to_empty_ea(self): """48510 `concat` to an empty EA should maintain type EA dtype.""" diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index 52bb9fa0f151b..49c94168d203e 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -452,9 +452,7 @@ def test_concat_axis_1_sort_false_rangeindex(self, using_infer_string): s1 = Series(["a", "b", "c"]) s2 = Series(["a", "b"]) s3 = Series(["a", "b", "c", "d"]) - s4 = Series( - [], dtype=object if not using_infer_string else "string[pyarrow_numpy]" - ) + s4 = Series([], dtype=object if not using_infer_string else "str") result = concat( [s1, s2, s3, s4], sort=False, join="outer", ignore_index=False, axis=1 ) @@ -465,7 +463,7 @@ def test_concat_axis_1_sort_false_rangeindex(self, using_infer_string): ["c", np.nan] * 2, [np.nan] * 2 + ["d"] + [np.nan], ], - dtype=object if not using_infer_string else "string[pyarrow_numpy]", + dtype=object if not using_infer_string else "str", ) tm.assert_frame_equal( result, expected, 
check_index_type=True, check_column_type=True diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 5a1f47e341222..4b79860437f72 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -16,6 +18,7 @@ bdate_range, concat, merge, + option_context, ) import pandas._testing as tm @@ -155,7 +158,7 @@ def test_join_on(self, target_source, infer_string): # overlap source_copy = source.copy() msg = ( - "You are trying to merge on float64 and object|string columns for key " + "You are trying to merge on float64 and object|str columns for key " "'A'. If you wish to proceed you should use pd.concat" ) with pytest.raises(ValueError, match=msg): @@ -340,6 +343,8 @@ def test_join_index_mixed_overlap(self): expected = _join_by_hand(df1, df2) tm.assert_frame_equal(joined, expected) + # triggers warning about empty entries + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_join_empty_bug(self): # generated an exception in 0.4.3 x = DataFrame() @@ -563,24 +568,30 @@ def test_join_many_non_unique_index(self): tm.assert_frame_equal(inner, left) tm.assert_frame_equal(inner, right) - def test_join_sort(self): - left = DataFrame({"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]}) - right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"]) - - joined = left.join(right, on="key", sort=True) - expected = DataFrame( - { - "key": ["bar", "baz", "foo", "foo"], - "value": [2, 3, 1, 4], - "value2": ["a", "b", "c", "c"], - }, - index=[1, 2, 0, 3], - ) - tm.assert_frame_equal(joined, expected) - - # smoke test - joined = left.join(right, on="key", sort=False) - tm.assert_index_equal(joined.index, Index(range(4)), exact=True) + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + def test_join_sort(self, infer_string): + with option_context("future.infer_string", infer_string): + left = DataFrame( + {"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]} + ) + right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"]) + + joined = left.join(right, on="key", sort=True) + expected = DataFrame( + { + "key": ["bar", "baz", "foo", "foo"], + "value": [2, 3, 1, 4], + "value2": ["a", "b", "c", "c"], + }, + index=[1, 2, 0, 3], + ) + tm.assert_frame_equal(joined, expected) + + # smoke test + joined = left.join(right, on="key", sort=False) + tm.assert_index_equal(joined.index, Index(range(4)), exact=True) def test_join_mixed_non_unique_index(self): # GH 12814, unorderable types in py3 with a non-unique index @@ -614,7 +625,7 @@ def test_join_non_unique_period_index(self): ) tm.assert_frame_equal(result, expected) - def test_mixed_type_join_with_suffix(self): + def test_mixed_type_join_with_suffix(self, using_infer_string): # GH #916 df = DataFrame( np.random.default_rng(2).standard_normal((20, 6)), @@ -624,7 +635,9 @@ def test_mixed_type_join_with_suffix(self): df.insert(5, "dt", "foo") grouped = df.groupby("id") - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.mean() mn = grouped.mean(numeric_only=True) @@ -769,7 +782,7 @@ def 
test_join_on_tz_aware_datetimeindex(self): ) result = df1.join(df2.set_index("date"), on="date") expected = df1.copy() - expected["vals_2"] = Series([np.nan] * 2 + list("tuv"), dtype=object) + expected["vals_2"] = Series([np.nan] * 2 + list("tuv")) tm.assert_frame_equal(result, expected) def test_join_datetime_string(self): @@ -1035,6 +1048,25 @@ def test_join_empty(left_empty, how, exp): tm.assert_frame_equal(result, expected) +def test_join_empty_uncomparable_columns(): + # GH 57048 + df1 = DataFrame() + df2 = DataFrame(columns=["test"]) + df3 = DataFrame(columns=["foo", ("bar", "baz")]) + + result = df1 + df2 + expected = DataFrame(columns=["test"]) + tm.assert_frame_equal(result, expected) + + result = df2 + df3 + expected = DataFrame(columns=[("bar", "baz"), "foo", "test"]) + tm.assert_frame_equal(result, expected) + + result = df1 + df3 + expected = DataFrame(columns=[("bar", "baz"), "foo"]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "how, values", [ diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d7a343ae9f152..8a9fe9f3e2cfd 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -8,7 +8,10 @@ import numpy as np import pytest -from pandas.core.dtypes.common import is_object_dtype +from pandas.core.dtypes.common import ( + is_object_dtype, + is_string_dtype, +) from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd @@ -316,14 +319,15 @@ def test_merge_copy(self): merged["d"] = "peekaboo" assert (right["d"] == "bar").all() - def test_merge_nocopy(self, using_array_manager): + def test_merge_nocopy(self, using_array_manager, using_infer_string): left = DataFrame({"a": 0, "b": 1}, index=range(10)) right = DataFrame({"c": "foo", "d": "bar"}, index=range(10)) merged = merge(left, right, left_index=True, right_index=True, copy=False) assert np.shares_memory(merged["a"]._values, left["a"]._values) - assert np.shares_memory(merged["d"]._values, right["d"]._values) + if not using_infer_string: + assert np.shares_memory(merged["d"]._values, right["d"]._values) def test_intelligently_handle_join_key(self): # #733, be a bit more 1337 about not returning unconsolidated DataFrame @@ -667,11 +671,13 @@ def test_merge_nan_right(self): "i1_": {0: 0, 1: np.nan}, "i3": {0: 0.0, 1: np.nan}, None: {0: 0, 1: 0}, - } + }, + columns=Index(["i1", "i2", "i1_", "i3", None], dtype=object), ) .set_index(None) .reset_index()[["i1", "i2", "i1_", "i3"]] ) + result.columns = result.columns.astype("object") tm.assert_frame_equal(result, expected, check_dtype=False) def test_merge_nan_right2(self): @@ -820,7 +826,7 @@ def test_overlapping_columns_error_message(self): # #2649, #10639 df2.columns = ["key1", "foo", "foo"] - msg = r"Data columns not unique: Index\(\['foo'\], dtype='object'\)" + msg = r"Data columns not unique: Index\(\['foo'\], dtype='object|str'\)" with pytest.raises(MergeError, match=msg): merge(df, df2) @@ -1498,7 +1504,7 @@ def test_different(self, right_vals): # We allow merging on object and categorical cols and cast # categorical cols to object result = merge(left, right, on="A") - assert is_object_dtype(result.A.dtype) + assert is_object_dtype(result.A.dtype) or is_string_dtype(result.A.dtype) @pytest.mark.parametrize( "d1", [np.int64, np.int32, np.intc, np.int16, np.int8, np.uint8] @@ -1637,7 +1643,7 @@ def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals): result = merge(df1, df2, on=["A"]) assert is_object_dtype(result.A.dtype) result 
= merge(df2, df1, on=["A"]) - assert is_object_dtype(result.A.dtype) + assert is_object_dtype(result.A.dtype) or is_string_dtype(result.A.dtype) @pytest.mark.parametrize( "df1_vals, df2_vals", @@ -1867,25 +1873,27 @@ def right(): class TestMergeCategorical: - def test_identical(self, left): + def test_identical(self, left, using_infer_string): # merging on the same, should preserve dtypes merged = merge(left, left, on="X") result = merged.dtypes.sort_index() + dtype = np.dtype("O") if not using_infer_string else "str" expected = Series( - [CategoricalDtype(categories=["foo", "bar"]), np.dtype("O"), np.dtype("O")], + [CategoricalDtype(categories=["foo", "bar"]), dtype, dtype], index=["X", "Y_x", "Y_y"], ) tm.assert_series_equal(result, expected) - def test_basic(self, left, right): + def test_basic(self, left, right, using_infer_string): # we have matching Categorical dtypes in X # so should preserve the merged column merged = merge(left, right, on="X") result = merged.dtypes.sort_index() + dtype = np.dtype("O") if not using_infer_string else "str" expected = Series( [ CategoricalDtype(categories=["foo", "bar"]), - np.dtype("O"), + dtype, np.dtype("int64"), ], index=["X", "Y", "Z"], @@ -1989,16 +1997,17 @@ def test_multiindex_merge_with_unordered_categoricalindex(self, ordered): ).set_index(["id", "p"]) tm.assert_frame_equal(result, expected) - def test_other_columns(self, left, right): + def test_other_columns(self, left, right, using_infer_string): # non-merge columns should preserve if possible right = right.assign(Z=right.Z.astype("category")) merged = merge(left, right, on="X") result = merged.dtypes.sort_index() + dtype = np.dtype("O") if not using_infer_string else "str" expected = Series( [ CategoricalDtype(categories=["foo", "bar"]), - np.dtype("O"), + dtype, CategoricalDtype(categories=[1, 2]), ], index=["X", "Y", "Z"], @@ -2017,7 +2026,9 @@ def test_other_columns(self, left, right): lambda x: x.astype(CategoricalDtype(ordered=True)), ], ) - def test_dtype_on_merged_different(self, change, join_type, left, right): + def test_dtype_on_merged_different( + self, change, join_type, left, right, using_infer_string + ): # our merging columns, X now has 2 different dtypes # so we must be object as a result @@ -2029,9 +2040,8 @@ def test_dtype_on_merged_different(self, change, join_type, left, right): merged = merge(left, right, on="X", how=join_type) result = merged.dtypes.sort_index() - expected = Series( - [np.dtype("O"), np.dtype("O"), np.dtype("int64")], index=["X", "Y", "Z"] - ) + dtype = np.dtype("O") if not using_infer_string else "str" + expected = Series([dtype, dtype, np.dtype("int64")], index=["X", "Y", "Z"]) tm.assert_series_equal(result, expected) def test_self_join_multiple_categories(self): @@ -2499,7 +2509,7 @@ def test_merge_multiindex_columns(): expected_index = MultiIndex.from_tuples(tuples, names=["outer", "inner"]) expected = DataFrame(columns=expected_index) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_dtype=False) def test_merge_datetime_upcast_dtype(): @@ -2984,7 +2994,27 @@ def test_merge_empty_frames_column_order(left_empty, right_empty): if left_empty and right_empty: expected = expected.iloc[:0] elif left_empty: - expected.loc[:, "B"] = np.nan + expected["B"] = np.nan elif right_empty: - expected.loc[:, ["C", "D"]] = np.nan + expected[["C", "D"]] = np.nan tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) +def test_merge_datetime_and_timedelta(how): + 
left = DataFrame({"key": Series([1, None], dtype="datetime64[ns]")}) + right = DataFrame({"key": Series([1], dtype="timedelta64[ns]")}) + + msg = ( + f"You are trying to merge on {left['key'].dtype} and {right['key'].dtype} " + "columns for key 'key'. If you wish to proceed you should use pd.concat" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + left.merge(right, on="key", how=how) + + msg = ( + f"You are trying to merge on {right['key'].dtype} and {left['key'].dtype} " + "columns for key 'key'. If you wish to proceed you should use pd.concat" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + right.merge(left, on="key", how=how) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index b656191cc739d..77a3d64415ace 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -3134,7 +3134,7 @@ def test_merge_on_nans(self, func, side): else: merge_asof(df, df_null, on="a") - def test_by_nullable(self, any_numeric_ea_dtype): + def test_by_nullable(self, any_numeric_ea_dtype, using_infer_string): # Note: this test passes if instead of using pd.array we use # np.array([np.nan, 1]). Other than that, I (@jbrockmendel) # have NO IDEA what the expected behavior is. @@ -3176,6 +3176,8 @@ def test_by_nullable(self, any_numeric_ea_dtype): } ) expected["value_y"] = np.array([np.nan, np.nan, np.nan], dtype=object) + if using_infer_string: + expected["value_y"] = expected["value_y"].astype("str") tm.assert_frame_equal(result, expected) def test_merge_by_col_tz_aware(self): @@ -3201,7 +3203,7 @@ def test_merge_by_col_tz_aware(self): ) tm.assert_frame_equal(result, expected) - def test_by_mixed_tz_aware(self): + def test_by_mixed_tz_aware(self, using_infer_string): # GH 26649 left = pd.DataFrame( { @@ -3225,6 +3227,8 @@ def test_by_mixed_tz_aware(self): columns=["by_col1", "by_col2", "on_col", "value_x"], ) expected["value_y"] = np.array([np.nan], dtype=object) + if using_infer_string: + expected["value_y"] = expected["value_y"].astype("str") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", ["float64", "int16", "m8[ns]", "M8[us]"]) diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index abd61026b4e37..0bd3ca3cf2c1b 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -219,3 +219,26 @@ def test_ffill_validate_fill_method(self, left, right, invalid_method): ValueError, match=re.escape("fill_method must be 'ffill' or None") ): merge_ordered(left, right, on="key", fill_method=invalid_method) + + def test_ffill_left_merge(self): + # GH 57010 + df1 = DataFrame( + { + "key": ["a", "c", "e", "a", "c", "e"], + "lvalue": [1, 2, 3, 1, 2, 3], + "group": ["a", "a", "a", "b", "b", "b"], + } + ) + df2 = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]}) + result = merge_ordered( + df1, df2, fill_method="ffill", left_by="group", how="left" + ) + expected = DataFrame( + { + "key": ["a", "c", "e", "a", "c", "e"], + "lvalue": [1, 2, 3, 1, 2, 3], + "group": ["a", "a", "a", "b", "b", "b"], + "rvalue": [np.nan, 2.0, 2.0, np.nan, 2.0, 2.0], + } + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 269d3a2b7078e..402ff049884ba 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -1,6 +1,8 @@ 
import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -9,6 +11,7 @@ RangeIndex, Series, Timestamp, + option_context, ) import pandas._testing as tm from pandas.core.reshape.concat import concat @@ -88,67 +91,71 @@ def test_merge_on_multikey(self, left, right, join_type): tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("sort", [False, True]) - def test_left_join_multi_index(self, sort): - icols = ["1st", "2nd", "3rd"] + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + @pytest.mark.parametrize("sort", [True, False]) + def test_left_join_multi_index(self, sort, infer_string): + with option_context("future.infer_string", infer_string): + icols = ["1st", "2nd", "3rd"] - def bind_cols(df): - iord = lambda a: 0 if a != a else ord(a) - f = lambda ts: ts.map(iord) - ord("a") - return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10 + def bind_cols(df): + iord = lambda a: 0 if a != a else ord(a) + f = lambda ts: ts.map(iord) - ord("a") + return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10 - def run_asserts(left, right, sort): - res = left.join(right, on=icols, how="left", sort=sort) + def run_asserts(left, right, sort): + res = left.join(right, on=icols, how="left", sort=sort) - assert len(left) < len(res) + 1 - assert not res["4th"].isna().any() - assert not res["5th"].isna().any() + assert len(left) < len(res) + 1 + assert not res["4th"].isna().any() + assert not res["5th"].isna().any() - tm.assert_series_equal(res["4th"], -res["5th"], check_names=False) - result = bind_cols(res.iloc[:, :-2]) - tm.assert_series_equal(res["4th"], result, check_names=False) - assert result.name is None + tm.assert_series_equal(res["4th"], -res["5th"], check_names=False) + result = bind_cols(res.iloc[:, :-2]) + tm.assert_series_equal(res["4th"], result, check_names=False) + assert result.name is None - if sort: - tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort")) + if sort: + tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort")) - out = merge(left, right.reset_index(), on=icols, sort=sort, how="left") + out = merge(left, right.reset_index(), on=icols, sort=sort, how="left") - res.index = RangeIndex(len(res)) - tm.assert_frame_equal(out, res) + res.index = RangeIndex(len(res)) + tm.assert_frame_equal(out, res) - lc = list(map(chr, np.arange(ord("a"), ord("z") + 1))) - left = DataFrame( - np.random.default_rng(2).choice(lc, (50, 2)), columns=["1st", "3rd"] - ) - # Explicit cast to float to avoid implicit cast when setting nan - left.insert( - 1, - "2nd", - np.random.default_rng(2).integers(0, 10, len(left)).astype("float"), - ) + lc = list(map(chr, np.arange(ord("a"), ord("z") + 1))) + left = DataFrame( + np.random.default_rng(2).choice(lc, (50, 2)), columns=["1st", "3rd"] + ) + # Explicit cast to float to avoid implicit cast when setting nan + left.insert( + 1, + "2nd", + np.random.default_rng(2).integers(0, 10, len(left)).astype("float"), + ) - i = np.random.default_rng(2).permutation(len(left)) - right = left.iloc[i].copy() + i = np.random.default_rng(2).permutation(len(left)) + right = left.iloc[i].copy() - left["4th"] = bind_cols(left) - right["5th"] = -bind_cols(right) - right.set_index(icols, inplace=True) + left["4th"] = bind_cols(left) + right["5th"] = -bind_cols(right) + right.set_index(icols, inplace=True) - run_asserts(left, right, sort) + run_asserts(left, right, sort) - # inject some nulls 
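
The wrapper pattern used throughout this file's hunks, shown as a standalone sketch (test_something is a placeholder name; td.skip_if_no and option_context are the helpers imported at the top of the modified module):

import pytest
import pandas.util._test_decorators as td
from pandas import option_context

@pytest.mark.parametrize(
    "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
)
def test_something(infer_string):
    # run the whole test body under both string-inference modes;
    # the True case needs pyarrow, hence the skip marker
    with option_context("future.infer_string", infer_string):
        ...  # build frames and assert as usual
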
- left.loc[1::4, "1st"] = np.nan - left.loc[2::5, "2nd"] = np.nan - left.loc[3::6, "3rd"] = np.nan - left["4th"] = bind_cols(left) + # inject some nulls + left.loc[1::4, "1st"] = np.nan + left.loc[2::5, "2nd"] = np.nan + left.loc[3::6, "3rd"] = np.nan + left["4th"] = bind_cols(left) - i = np.random.default_rng(2).permutation(len(left)) - right = left.iloc[i, :-1] - right["5th"] = -bind_cols(right) - right.set_index(icols, inplace=True) + i = np.random.default_rng(2).permutation(len(left)) + right = left.iloc[i, :-1] + right["5th"] = -bind_cols(right) + right.set_index(icols, inplace=True) - run_asserts(left, right, sort) + run_asserts(left, right, sort) @pytest.mark.parametrize("sort", [False, True]) def test_merge_right_vs_left(self, left, right, sort): @@ -632,7 +639,7 @@ def test_join_multi_levels_outer(self, portfolio, household, expected): axis=0, sort=True, ).reindex(columns=expected.columns) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_index_type=False) def test_join_multi_levels_invalid(self, portfolio, household): portfolio = portfolio.copy() diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 0811c69859c0d..cab2302b3d877 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -727,6 +727,7 @@ def test_cut_with_duplicated_index_lowest_included(): tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_cut_with_nonexact_categorical_indices(): # GH 42424 diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index f9a03222c8057..59c81c545697a 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -334,7 +334,7 @@ def test_no_prefix_string_cats_default_category( dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) result = from_dummies(dummies, default_category=default_category) if using_infer_string: - expected[""] = expected[""].astype("string[pyarrow_numpy]") + expected[""] = expected[""].astype("str") tm.assert_frame_equal(result, expected) @@ -397,11 +397,13 @@ def test_with_prefix_contains_get_dummies_NaN_column(): ], ) def test_with_prefix_default_category( - dummies_with_unassigned, default_category, expected + dummies_with_unassigned, default_category, expected, using_infer_string ): result = from_dummies( dummies_with_unassigned, sep="_", default_category=default_category ) + if using_infer_string: + expected = expected.astype("str") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 31260e4dcb7d2..637bce59e9e2c 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -120,7 +120,7 @@ def test_get_dummies_basic_types(self, sparse, dtype, using_infer_string): result = get_dummies(s_df, columns=["a"], sparse=sparse, dtype=dtype) - key = "string" if using_infer_string else "object" + key = "str" if using_infer_string else "object" expected_counts = {"int64": 1, key: 1} expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0) @@ -214,10 +214,10 @@ def test_dataframe_dummies_all_obj(self, df, sparse): tm.assert_frame_equal(result, expected) - def test_dataframe_dummies_string_dtype(self, df, using_infer_string): + def test_dataframe_dummies_string_dtype(self, df, any_string_dtype): # GH44965 df = df[["A", "B"]] - df = df.astype({"A": "object", 
"B": "string"}) + df = df.astype({"A": "str", "B": any_string_dtype}) result = get_dummies(df) expected = DataFrame( { @@ -228,8 +228,7 @@ def test_dataframe_dummies_string_dtype(self, df, using_infer_string): }, dtype=bool, ) - if not using_infer_string: - # infer_string returns numpy bools + if any_string_dtype == "string" and any_string_dtype.na_value is pd.NA: expected[["B_b", "B_c"]] = expected[["B_b", "B_c"]].astype("boolean") tm.assert_frame_equal(result, expected) @@ -708,19 +707,17 @@ def test_get_dummies_ea_dtype_dataframe(self, any_numeric_ea_and_arrow_dtype): ) tm.assert_frame_equal(result, expected) - @td.skip_if_no("pyarrow") - def test_get_dummies_ea_dtype(self): + @pytest.mark.parametrize("dtype_type", ["string", "category"]) + def test_get_dummies_ea_dtype(self, dtype_type, string_dtype_no_object): # GH#56273 - for dtype, exp_dtype in [ - ("string[pyarrow]", "boolean"), - ("string[pyarrow_numpy]", "bool"), - (CategoricalDtype(Index(["a"], dtype="string[pyarrow]")), "boolean"), - (CategoricalDtype(Index(["a"], dtype="string[pyarrow_numpy]")), "bool"), - ]: - df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1}) - result = get_dummies(df) - expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)}) - tm.assert_frame_equal(result, expected) + dtype = string_dtype_no_object + exp_dtype = "boolean" if dtype.na_value is pd.NA else "bool" + if dtype_type == "category": + dtype = CategoricalDtype(Index(["a"], dtype)) + df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1}) + result = get_dummies(df) + expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)}) + tm.assert_frame_equal(result, expected) @td.skip_if_no("pyarrow") def test_get_dummies_arrow_dtype(self): diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index ff9f927597956..72fd72df60761 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -19,7 +19,7 @@ def df(): res = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) res["id1"] = (res["A"] > 0).astype(np.int64) @@ -364,6 +364,8 @@ def test_melt_mixed_int_str_id_vars(self): expected = DataFrame( {0: ["foo"] * 2, "a": ["bar"] * 2, "variable": list("bd"), "value": [1, 2]} ) + # the df's columns are mixed type and thus object -> preserves object dtype + expected["variable"] = expected["variable"].astype(object) tm.assert_frame_equal(result, expected) def test_melt_mixed_int_str_value_vars(self): @@ -1197,11 +1199,13 @@ def test_raise_of_column_name_value(self): ): df.melt(id_vars="value", value_name="value") - @pytest.mark.parametrize("dtype", ["O", "string"]) - def test_missing_stubname(self, dtype): + def test_missing_stubname(self, request, any_string_dtype, using_infer_string): + if using_infer_string and any_string_dtype == "object": + # triggers object dtype inference warning of dtype=object + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) # GH46044 df = DataFrame({"id": ["1", "2"], "a-1": [100, 200], "a-2": [300, 400]}) - df = df.astype({"id": dtype}) + df = df.astype({"id": any_string_dtype}) result = wide_to_long( df, stubnames=["a", "b"], @@ -1217,6 +1221,38 @@ def test_missing_stubname(self, dtype): {"a": [100, 200, 300, 400], "b": [np.nan] * 4}, index=index, ) - new_level = expected.index.levels[0].astype(dtype) + new_level = expected.index.levels[0].astype(any_string_dtype) + if 
any_string_dtype == "object": + new_level = expected.index.levels[0].astype("str") expected.index = expected.index.set_levels(new_level, level=0) tm.assert_frame_equal(result, expected) + + +def test_wide_to_long_string_columns(string_storage): + # GH 57066 + string_dtype = pd.StringDtype(string_storage, na_value=np.nan) + df = DataFrame( + { + "ID": {0: 1}, + "R_test1": {0: 1}, + "R_test2": {0: 1}, + "R_test3": {0: 2}, + "D": {0: 1}, + } + ) + df.columns = df.columns.astype(string_dtype) + result = wide_to_long( + df, stubnames="R", i="ID", j="UNPIVOTED", sep="_", suffix=".*" + ) + expected = DataFrame( + [[1, 1], [1, 1], [1, 2]], + columns=Index(["D", "R"]), + index=pd.MultiIndex.from_arrays( + [ + [1, 1, 1], + Index(["test1", "test2", "test3"], dtype=string_dtype), + ], + names=["ID", "UNPIVOTED"], + ), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 18a449b4d0c67..519564a96aa7e 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.errors import PerformanceWarning @@ -948,12 +948,14 @@ def test_margins(self, data): for value_col in table.columns.levels[0]: self._check_output(table[value_col], value_col, data) - def test_no_col(self, data): + def test_no_col(self, data, using_infer_string): # no col # to help with a buglet data.columns = [k * 2 for k in data.columns] msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") table = data.drop(columns="CC").pivot_table( @@ -1003,7 +1005,7 @@ def test_no_col(self, data): ], ) def test_margin_with_only_columns_defined( - self, columns, aggfunc, values, expected_columns + self, columns, aggfunc, values, expected_columns, using_infer_string ): # GH 31016 df = DataFrame( @@ -1027,6 +1029,8 @@ def test_margin_with_only_columns_defined( ) if aggfunc != "sum": msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc) if "B" not in columns: @@ -1090,7 +1094,7 @@ def test_pivot_table_multiindex_only(self, cols): expected = DataFrame( [[4.0, 5.0, 6.0]], columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols), - index=Index(["v"], dtype=object), + index=Index(["v"], dtype="str" if cols == ("a", "b") else "object"), ) tm.assert_frame_equal(result, expected) @@ -2524,12 +2528,16 @@ def test_pivot_empty(self): expected = DataFrame(index=[], columns=[]) tm.assert_frame_equal(result, expected, check_names=False) - @pytest.mark.parametrize("dtype", [object, "string"]) - def test_pivot_integer_bug(self, dtype): - df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=dtype) + def test_pivot_integer_bug(self, any_string_dtype): + df = DataFrame( + data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=any_string_dtype + ) result = df.pivot(index=1, columns=0, values=2) - tm.assert_index_equal(result.columns, Index(["A", "B"], name=0, dtype=dtype)) + expected_columns = Index(["A", "B"], name=0, dtype=any_string_dtype) + if any_string_dtype == "object": + expected_columns = 
expected_columns.astype("str") + tm.assert_index_equal(result.columns, expected_columns) def test_pivot_index_none(self): # GH#3962 @@ -2611,7 +2619,11 @@ def test_pivot_columns_not_given(self): with pytest.raises(TypeError, match="missing 1 required keyword-only argument"): df.pivot() # pylint: disable=missing-kwoa - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="None is cast to NaN") + # this still fails because columns=None gets passed down to unstack as level=None, + # and by that point None has already been converted to NaN + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_columns_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2627,8 +2639,7 @@ def test_pivot_columns_is_none(self): expected = DataFrame({1: 3}, index=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="None is cast to NaN") - def test_pivot_index_is_none(self): + def test_pivot_index_is_none(self, using_infer_string): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2639,9 +2650,10 @@ def test_pivot_index_is_none(self): result = df.pivot(columns="b", index=None, values="c") expected = DataFrame(3, index=[1], columns=Index([2], name="b")) + if using_infer_string: + expected.index.name = np.nan tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="None is cast to NaN") def test_pivot_values_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py index 8d78d34e936f0..081feae6fc43f 100644 --- a/pandas/tests/reshape/test_union_categoricals.py +++ b/pandas/tests/reshape/test_union_categoricals.py @@ -126,7 +126,11 @@ def test_union_categoricals_nan(self): def test_union_categoricals_empty(self, val, request, using_infer_string): # GH 13759 if using_infer_string and val == ["1"]: - request.applymarker(pytest.mark.xfail("object and strings dont match")) + request.applymarker( + pytest.mark.xfail( + reason="TODO(infer_string) object and strings don't match" + ) + ) res = union_categoricals([Categorical([]), Categorical(val)]) exp = Categorical(val) tm.assert_categorical_equal(res, exp) diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index 4489c307172d7..73c4d8061c257 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -820,10 +820,9 @@ def test_asfreq_MS(self): assert initial.asfreq(freq="M", how="S") == Period("2013-01", "M") - msg = INVALID_FREQ_ERR_MSG + msg = "MS is not supported as period frequency" with pytest.raises(ValueError, match=msg): initial.asfreq(freq="MS", how="S") - msg = "MonthBegin is not supported as period frequency" - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): Period("2013-01", "MS") diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index aa4a8b152b19f..2c3a0816737fc 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -3,6 +3,7 @@ datetime, timedelta, ) +import re import numpy as np import pytest @@ -40,21 +41,22 @@ class TestPeriodDisallowedFreqs: ) def test_offsets_not_supported(self, freq, freq_msg): # GH#55785 - msg = f"{freq_msg} is not supported as period frequency" - with pytest.raises(TypeError, match=msg): + msg =
re.escape(f"{freq} is not supported as period frequency") + with pytest.raises(ValueError, match=msg): Period(year=2014, freq=freq) def test_custom_business_day_freq_raises(self): # GH#52534 - msg = "CustomBusinessDay is not supported as period frequency" - with pytest.raises(TypeError, match=msg): + msg = "C is not supported as period frequency" + with pytest.raises(ValueError, match=msg): Period("2023-04-10", freq="C") - with pytest.raises(TypeError, match=msg): + msg = f"{offsets.CustomBusinessDay().base} is not supported as period frequency" + with pytest.raises(ValueError, match=msg): Period("2023-04-10", freq=offsets.CustomBusinessDay()) def test_invalid_frequency_error_message(self): - msg = "WeekOfMonth is not supported as period frequency" - with pytest.raises(TypeError, match=msg): + msg = "WOM-1MON is not supported as period frequency" + with pytest.raises(ValueError, match=msg): Period("2012-01-02", freq="WOM-1MON") def test_invalid_frequency_period_error_message(self): @@ -106,7 +108,9 @@ def test_construction(self): assert i1 == i3 i1 = Period("1982", freq="min") - i2 = Period("1982", freq="MIN") + msg = "'MIN' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + i2 = Period("1982", freq="MIN") assert i1 == i2 i1 = Period(year=2005, month=3, day=1, freq="D") diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index d2fa0f722ca6f..a4d846f068d00 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -418,7 +418,7 @@ def test_td_mul_numeric_ndarray(self): def test_td_mul_numeric_ndarray_0d(self): td = Timedelta("1 day") - other = np.array(2) + other = np.array(2, dtype=np.int64) assert other.ndim == 0 expected = Timedelta("2 days") @@ -622,6 +622,7 @@ def test_td_floordiv_invalid_scalar(self): [ r"Invalid dtype datetime64\[D\] for __floordiv__", "'dtype' is an invalid keyword argument for this function", + "this function got an unexpected keyword argument 'dtype'", r"ufunc '?floor_divide'? 
cannot use operands with types", ] ) diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index d7160597ea6d6..e7ebcccef1c86 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -88,7 +88,7 @@ def test_isoformat(ts, timespec, expected_iso): class TestTimestampRendering: - timezones = ["UTC", "Asia/Tokyo", "US/Eastern", "dateutil/US/Pacific"] + timezones = ["UTC", "Asia/Tokyo", "US/Eastern", "dateutil/America/Los_Angeles"] @pytest.mark.parametrize("tz", timezones) @pytest.mark.parametrize("freq", ["D", "M", "S", "N"]) diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 34465a7c12c18..a06a3a0d40675 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -27,6 +27,7 @@ Period, PeriodIndex, Series, + StringDtype, TimedeltaIndex, date_range, period_range, @@ -582,7 +583,6 @@ def test_strftime_dt64_days(self): expected = Index( ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"], - dtype=np.object_, ) # dtype may be S10 or U10 depending on python version tm.assert_index_equal(result, expected) @@ -595,7 +595,7 @@ def test_strftime_period_days(self, using_infer_string): dtype="=U10", ) if using_infer_string: - expected = expected.astype("string[pyarrow_numpy]") + expected = expected.astype(StringDtype(na_value=np.nan)) tm.assert_index_equal(result, expected) def test_strftime_dt64_microsecond_resolution(self): @@ -652,7 +652,7 @@ def test_strftime_all_nat(self, data): ser = Series(data) with tm.assert_produces_warning(None): result = ser.dt.strftime("%Y-%m-%d") - expected = Series([np.nan], dtype=object) + expected = Series([np.nan], dtype="str") tm.assert_series_equal(result, expected) def test_valid_dt_with_missing_values(self): diff --git a/pandas/tests/series/accessors/test_struct_accessor.py b/pandas/tests/series/accessors/test_struct_accessor.py index 1ec5b3b726d17..80aea75fda406 100644 --- a/pandas/tests/series/accessors/test_struct_accessor.py +++ b/pandas/tests/series/accessors/test_struct_accessor.py @@ -2,6 +2,11 @@ import pytest +from pandas.compat.pyarrow import ( + pa_version_under11p0, + pa_version_under13p0, +) + from pandas import ( ArrowDtype, DataFrame, @@ -11,6 +16,7 @@ import pandas._testing as tm pa = pytest.importorskip("pyarrow") +pc = pytest.importorskip("pyarrow.compute") def test_struct_accessor_dtypes(): @@ -53,6 +59,7 @@ def test_struct_accessor_dtypes(): tm.assert_series_equal(actual, expected) +@pytest.mark.skipif(pa_version_under13p0, reason="pyarrow>=13.0.0 required") def test_struct_accessor_field(): index = Index([-100, 42, 123]) ser = Series( @@ -94,10 +101,11 @@ def test_struct_accessor_field(): def test_struct_accessor_field_with_invalid_name_or_index(): ser = Series([], dtype=ArrowDtype(pa.struct([("field", pa.int64())]))) - with pytest.raises(ValueError, match="name_or_index must be an int or str"): + with pytest.raises(ValueError, match="name_or_index must be an int, str,"): ser.struct.field(1.1) +@pytest.mark.skipif(pa_version_under11p0, reason="pyarrow>=11.0.0 required") def test_struct_accessor_explode(): index = Index([-100, 42, 123]) ser = Series( @@ -148,3 +156,41 @@ def test_struct_accessor_api_for_invalid(invalid): ), ): invalid.struct + + +@pytest.mark.parametrize( + ["indices", "name"], + [ + (0, "int_col"), + ([1, 2], "str_col"), + (pc.field("int_col"), "int_col"), + ("int_col", 
"int_col"), + (b"string_col", b"string_col"), + ([b"string_col"], "string_col"), + ], +) +@pytest.mark.skipif(pa_version_under13p0, reason="pyarrow>=13.0.0 required") +def test_struct_accessor_field_expanded(indices, name): + arrow_type = pa.struct( + [ + ("int_col", pa.int64()), + ( + "struct_col", + pa.struct( + [ + ("int_col", pa.int64()), + ("float_col", pa.float64()), + ("str_col", pa.string()), + ] + ), + ), + (b"string_col", pa.string()), + ] + ) + + data = pa.array([], type=arrow_type) + ser = Series(data, dtype=ArrowDtype(arrow_type)) + expected = pc.struct_field(data, indices) + result = ser.struct.field(indices) + tm.assert_equal(result.array._pa_array.combine_chunks(), expected) + assert result.name == name diff --git a/pandas/tests/series/indexing/test_delitem.py b/pandas/tests/series/indexing/test_delitem.py index 3d1082c3d040b..7440ef2692c47 100644 --- a/pandas/tests/series/indexing/test_delitem.py +++ b/pandas/tests/series/indexing/test_delitem.py @@ -31,16 +31,15 @@ def test_delitem(self): del s[0] tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64"))) - def test_delitem_object_index(self, using_infer_string): + def test_delitem_object_index(self): # Index(dtype=object) - dtype = "string[pyarrow_numpy]" if using_infer_string else object - s = Series(1, index=Index(["a"], dtype=dtype)) + s = Series(1, index=Index(["a"], dtype="str")) del s["a"] - tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype))) + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="str"))) s["a"] = 1 - tm.assert_series_equal(s, Series(1, index=Index(["a"], dtype=dtype))) + tm.assert_series_equal(s, Series(1, index=Index(["a"], dtype="str"))) del s["a"] - tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype))) + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="str"))) def test_delitem_missing_key(self): # empty diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index 596a225c288b8..9891684e9597c 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -360,12 +360,10 @@ def test_getitem_no_matches(self, box): # GH#33462 we expect the same behavior for list/ndarray/Index/Series ser = Series(["A", "B"]) - key = Series(["C"], dtype=object) + key = Series(["C"]) key = box(key) - msg = ( - r"None of \[Index\(\['C'\], dtype='object|string'\)\] are in the \[index\]" - ) + msg = r"None of \[Index\(\['C'\], dtype='object|str'\)\] are in the \[index\]" with pytest.raises(KeyError, match=msg): ser[key] diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index c52e47a812183..9ab7dff64b182 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -272,13 +272,25 @@ def test_timedelta_assignment(): # GH 8209 s = Series([], dtype=object) s.loc["B"] = timedelta(1) - tm.assert_series_equal(s, Series(Timedelta("1 days"), index=["B"])) + expected = Series( + Timedelta("1 days"), dtype="timedelta64[ns]", index=Index(["B"], dtype=object) + ) + tm.assert_series_equal(s, expected) s = s.reindex(s.index.insert(0, "A")) - tm.assert_series_equal(s, Series([np.nan, Timedelta("1 days")], index=["A", "B"])) + expected = Series( + [np.nan, Timedelta("1 days")], + dtype="timedelta64[ns]", + index=Index(["A", "B"], dtype=object), + ) + tm.assert_series_equal(s, expected) s.loc["A"] = timedelta(1) - expected 
= Series(Timedelta("1 days"), index=["A", "B"]) + expected = Series( + Timedelta("1 days"), + dtype="timedelta64[ns]", + index=Index(["A", "B"], dtype=object), + ) tm.assert_series_equal(s, expected) @@ -491,7 +503,7 @@ def _check_setitem_invalid(self, ser, invalid, indexer, warn): np.datetime64("NaT"), np.timedelta64("NaT"), ] - _indexers = [0, [0], slice(0, 1), [True, False, False]] + _indexers = [0, [0], slice(0, 1), [True, False, False], slice(None, None, None)] @pytest.mark.parametrize( "invalid", _invalid_scalars + [1, 1.0, np.int64(1), np.float64(1)] @@ -505,7 +517,7 @@ def test_setitem_validation_scalar_bool(self, invalid, indexer): @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): ser = Series([1, 2, 3], dtype=any_int_numpy_dtype) - if isna(invalid) and invalid is not NaT: + if isna(invalid) and invalid is not NaT and not np.isnat(invalid): warn = None else: warn = FutureWarning diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 23137f0975fb1..85558e85494eb 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -3,11 +3,15 @@ datetime, ) from decimal import Decimal +import os import numpy as np import pytest -from pandas.compat.numpy import np_version_gte1p24 +from pandas.compat.numpy import ( + np_version_gt2, + np_version_gte1p24, +) from pandas.errors import IndexingError from pandas.core.dtypes.common import is_list_like @@ -563,7 +567,10 @@ def test_setitem_with_expansion_type_promotion(self): ser["a"] = Timestamp("2016-01-01") ser["b"] = 3.0 ser["c"] = "foo" - expected = Series([Timestamp("2016-01-01"), 3.0, "foo"], index=["a", "b", "c"]) + expected = Series( + [Timestamp("2016-01-01"), 3.0, "foo"], + index=Index(["a", "b", "c"], dtype=object), + ) tm.assert_series_equal(ser, expected) def test_setitem_not_contained(self, string_series): @@ -616,7 +623,7 @@ def test_setitem_enlargement_object_none(self, nulls_fixture, using_infer_string ser = Series(["a", "b"]) ser[3] = nulls_fixture dtype = ( - "string[pyarrow_numpy]" + "str" if using_infer_string and not isinstance(nulls_fixture, Decimal) else object ) @@ -869,28 +876,20 @@ def test_series_where(self, obj, key, expected, warn, val, is_inplace): self._check_inplace(is_inplace, orig, arr, obj) - def test_index_where(self, obj, key, expected, warn, val, using_infer_string): + def test_index_where(self, obj, key, expected, warn, val): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - if using_infer_string and obj.dtype == object: - with pytest.raises(TypeError, match="Scalar must"): - Index(obj).where(~mask, val) - else: - res = Index(obj).where(~mask, val) - expected_idx = Index(expected, dtype=expected.dtype) - tm.assert_index_equal(res, expected_idx) + res = Index(obj, dtype=obj.dtype).where(~mask, val) + expected_idx = Index(expected, dtype=expected.dtype) + tm.assert_index_equal(res, expected_idx) - def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): + def test_index_putmask(self, obj, key, expected, warn, val): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - if using_infer_string and obj.dtype == object: - with pytest.raises(TypeError, match="Scalar must"): - Index(obj).putmask(mask, val) - else: - res = Index(obj).putmask(mask, val) - tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) + res = Index(obj, dtype=obj.dtype).putmask(mask, val) + 
tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) @pytest.mark.parametrize( @@ -1443,7 +1442,11 @@ def obj(self): marks=pytest.mark.xfail( ( not np_version_gte1p24 - or (np_version_gte1p24 and np._get_promotion_state() != "weak") + or ( + np_version_gte1p24 + and not np_version_gt2 + and os.environ.get("NPY_PROMOTION_STATE", "legacy") != "weak" + ) ), reason="np.float32(1.1) ends up as 1.100000023841858, so " "np_can_hold_element raises and we cast to float64", diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index c978481ca9988..0fa2f63e5fb36 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - from pandas.core.dtypes.common import is_integer import pandas as pd @@ -232,7 +230,6 @@ def test_where_ndframe_align(): tm.assert_series_equal(out, expected) -@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set ints into string") def test_where_setitem_invalid(): # GH 2702 # make sure correct exceptions are raised on invalid list assignment @@ -242,7 +239,7 @@ def test_where_setitem_invalid(): "different length than the value" ) # slice - s = Series(list("abc")) + s = Series(list("abc"), dtype=object) with pytest.raises(ValueError, match=msg("slice")): s[0:3] = list(range(27)) @@ -252,18 +249,18 @@ def test_where_setitem_invalid(): tm.assert_series_equal(s.astype(np.int64), expected) # slice with step - s = Series(list("abcdef")) + s = Series(list("abcdef"), dtype=object) with pytest.raises(ValueError, match=msg("slice")): s[0:4:2] = list(range(27)) - s = Series(list("abcdef")) + s = Series(list("abcdef"), dtype=object) s[0:4:2] = list(range(2)) expected = Series([0, "b", 1, "d", "e", "f"]) tm.assert_series_equal(s, expected) # neg slices - s = Series(list("abcdef")) + s = Series(list("abcdef"), dtype=object) with pytest.raises(ValueError, match=msg("slice")): s[:-1] = list(range(27)) @@ -273,18 +270,18 @@ def test_where_setitem_invalid(): tm.assert_series_equal(s, expected) # list - s = Series(list("abc")) + s = Series(list("abc"), dtype=object) with pytest.raises(ValueError, match=msg("list-like")): s[[0, 1, 2]] = list(range(27)) - s = Series(list("abc")) + s = Series(list("abc"), dtype=object) with pytest.raises(ValueError, match=msg("list-like")): s[[0, 1, 2]] = list(range(2)) # scalar - s = Series(list("abc")) + s = Series(list("abc"), dtype=object) s[0] = list(range(10)) expected = Series([list(range(10)), "b", "c"]) tm.assert_series_equal(s, expected) diff --git a/pandas/tests/series/methods/test_align.py b/pandas/tests/series/methods/test_align.py index cb60cd2e5bcf3..f332aad0c05f9 100644 --- a/pandas/tests/series/methods/test_align.py +++ b/pandas/tests/series/methods/test_align.py @@ -211,6 +211,19 @@ def test_align_periodindex(join_type): ts.align(ts[::2], join=join_type) +def test_align_stringindex(any_string_dtype): + left = Series(range(3), index=pd.Index(["a", "b", "d"], dtype=any_string_dtype)) + right = Series(range(3), index=pd.Index(["a", "b", "c"], dtype=any_string_dtype)) + result_left, result_right = left.align(right) + + expected_idx = pd.Index(["a", "b", "c", "d"], dtype=any_string_dtype) + expected_left = Series([0, 1, np.nan, 2], index=expected_idx) + expected_right = Series([0, 1, 2, np.nan], index=expected_idx) + + tm.assert_series_equal(result_left, expected_left) + tm.assert_series_equal(result_right, expected_right) + + def 
test_align_left_fewer_levels(): # GH#45224 left = Series([2], index=pd.MultiIndex.from_tuples([(1, 3)], names=["a", "c"])) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 46f55fff91e41..b9ba03d1e9f41 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -76,7 +76,7 @@ def test_astype_dict_like(self, dtype_class): dt1 = dtype_class({"abc": str}) result = ser.astype(dt1) - expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype=object) + expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype="str") tm.assert_series_equal(result, expected) dt2 = dtype_class({"abc": "float64"}) @@ -172,10 +172,14 @@ def test_astype_empty_constructor_equality(self, dtype): ) def test_astype_str_map(self, dtype, series, using_infer_string): # see GH#4405 + using_string_dtype = using_infer_string and dtype is str result = series.astype(dtype) - expected = series.map(str) - if using_infer_string: - expected = expected.astype(object) + if using_string_dtype: + expected = series.map(lambda val: str(val) if val is not np.nan else np.nan) + else: + expected = series.map(str) + if using_infer_string: + expected = expected.astype(object) tm.assert_series_equal(result, expected) def test_astype_float_to_period(self): @@ -212,7 +216,7 @@ def test_astype_dt64_to_str(self): # GH#10442 : testing astype(str) is correct for Series/DatetimeIndex dti = date_range("2012-01-01", periods=3) result = Series(dti).astype(str) - expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype=object) + expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype="str") tm.assert_series_equal(result, expected) def test_astype_dt64tz_to_str(self): @@ -225,7 +229,7 @@ def test_astype_dt64tz_to_str(self): "2012-01-02 00:00:00-05:00", "2012-01-03 00:00:00-05:00", ], - dtype=object, + dtype="str", ) tm.assert_series_equal(result, expected) @@ -285,13 +289,13 @@ def test_astype_str_cast_dt64(self): ts = Series([Timestamp("2010-01-04 00:00:00")]) res = ts.astype(str) - expected = Series(["2010-01-04"], dtype=object) + expected = Series(["2010-01-04"], dtype="str") tm.assert_series_equal(res, expected) ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")]) res = ts.astype(str) - expected = Series(["2010-01-04 00:00:00-05:00"], dtype=object) + expected = Series(["2010-01-04 00:00:00-05:00"], dtype="str") tm.assert_series_equal(res, expected) def test_astype_str_cast_td64(self): @@ -300,7 +304,7 @@ def test_astype_str_cast_td64(self): td = Series([Timedelta(1, unit="d")]) ser = td.astype(str) - expected = Series(["1 days"], dtype=object) + expected = Series(["1 days"], dtype="str") tm.assert_series_equal(ser, expected) def test_dt64_series_astype_object(self): @@ -347,7 +351,7 @@ def test_astype_from_float_to_str(self, dtype): # https://github.com/pandas-dev/pandas/issues/36451 ser = Series([0.1], dtype=dtype) result = ser.astype(str) - expected = Series(["0.1"], dtype=object) + expected = Series(["0.1"], dtype="str") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -358,11 +362,13 @@ def test_astype_from_float_to_str(self, dtype): (NA, ""), ], ) - def test_astype_to_str_preserves_na(self, value, string_value): + def test_astype_to_str_preserves_na(self, value, string_value, using_infer_string): # https://github.com/pandas-dev/pandas/issues/36904 ser = Series(["a", "b", value], dtype=object) result = ser.astype(str) - expected = Series(["a", "b", string_value], dtype=object) + 
expected = Series( + ["a", "b", None if using_infer_string else string_value], dtype="str" + ) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", ["float32", "float64", "int64", "int32"]) @@ -538,12 +544,12 @@ def test_astype_categorical_to_other(self): expected = ser tm.assert_series_equal(ser.astype("category"), expected) tm.assert_series_equal(ser.astype(CategoricalDtype()), expected) - msg = r"Cannot cast object|string dtype to float64" + msg = r"Cannot cast object|str dtype to float64" with pytest.raises(ValueError, match=msg): ser.astype("float64") cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])) - exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"], dtype=object) + exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"], dtype="str") tm.assert_series_equal(cat.astype("str"), exp) s2 = Series(Categorical(["1", "2", "3", "4"])) exp2 = Series([1, 2, 3, 4]).astype("int") @@ -673,3 +679,11 @@ def test_astype_timedelta64_with_np_nan(self): result = Series([Timedelta(1), np.nan], dtype="timedelta64[ns]") expected = Series([Timedelta(1), NaT], dtype="timedelta64[ns]") tm.assert_series_equal(result, expected) + + @td.skip_if_no("pyarrow") + def test_astype_int_na_string(self): + # GH#57418 + ser = Series([12, NA], dtype="Int64[pyarrow]") + result = ser.astype("string[pyarrow]") + expected = Series(["12", NA], dtype="string[pyarrow]") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_case_when.py b/pandas/tests/series/methods/test_case_when.py new file mode 100644 index 0000000000000..7cb60a11644a3 --- /dev/null +++ b/pandas/tests/series/methods/test_case_when.py @@ -0,0 +1,148 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, + array as pd_array, + date_range, +) +import pandas._testing as tm + + +@pytest.fixture +def df(): + """ + Base DataFrame for testing. + """ + return DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + + +def test_case_when_caselist_is_not_a_list(df): + """ + Raise TypeError if caselist is not a list. + """ + msg = "The caselist argument should be a list; " + msg += "instead got.+" + with pytest.raises(TypeError, match=msg): # GH39154 + df["a"].case_when(caselist=()) + + +def test_case_when_no_caselist(df): + """ + Raise ValueError if no caselist is provided. + """ + msg = "provide at least one boolean condition, " + msg += "with a corresponding replacement." + with pytest.raises(ValueError, match=msg): # GH39154 + df["a"].case_when([]) + + +def test_case_when_odd_caselist(df): + """ + Raise ValueError if an entry in caselist is not a (condition, replacement) pair. + """ + msg = "Argument 0 must have length 2; " + msg += "a condition and replacement; instead got length 3." + + with pytest.raises(ValueError, match=msg): + df["a"].case_when([(df["a"].eq(1), 1, df.a.gt(1))]) + + +def test_case_when_raise_error_from_mask(df): + """ + Raise the error coming from within Series.mask. + """ + msg = "Failed to apply condition0 and replacement0." + with pytest.raises(ValueError, match=msg): + df["a"].case_when([(df["a"].eq(1), [1, 2])]) + + +def test_case_when_single_condition(df): + """ + Test output on a single condition.
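+ Positions where no condition is True keep the calling Series' original + values (NaN here).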
+ """ + result = Series([np.nan, np.nan, np.nan]).case_when([(df.a.eq(1), 1)]) + expected = Series([1, np.nan, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions(df): + """ + Test output when the booleans are derived from a computation. + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [(df.a.eq(1), 1), (Series([False, True, False]), 2)] + ) + expected = Series([1, 2, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions_replacement_list(df): + """ + Test output when the replacement is a list. + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [([True, False, False], 1), (df["a"].gt(1) & df["b"].eq(5), [1, 2, 3])] + ) + expected = Series([1, 2, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions_replacement_extension_dtype(df): + """ + Test output when the replacement has an extension dtype. + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [ + ([True, False, False], 1), + (df["a"].gt(1) & df["b"].eq(5), pd_array([1, 2, 3], dtype="Int64")), + ], + ) + expected = Series([1, 2, np.nan], dtype="Float64") + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions_replacement_series(df): + """ + Test output when the replacement is a Series. + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [ + (np.array([True, False, False]), 1), + (df["a"].gt(1) & df["b"].eq(5), Series([1, 2, 3])), + ], + ) + expected = Series([1, 2, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_non_range_index(): + """ + Test output if the index is not a RangeIndex. + """ + rng = np.random.default_rng(seed=123) + dates = date_range("1/1/2000", periods=8) + df = DataFrame( + rng.standard_normal(size=(8, 4)), index=dates, columns=["A", "B", "C", "D"] + ) + result = Series(5, index=df.index, name="A").case_when([(df.A.gt(0), df.B)]) + expected = df.A.mask(df.A.gt(0), df.B).where(df.A.gt(0), 5) + tm.assert_series_equal(result, expected) + + +def test_case_when_callable(): + """ + Test output on a callable. + """ + # https://numpy.org/doc/stable/reference/generated/numpy.piecewise.html + x = np.linspace(-2.5, 2.5, 6) + ser = Series(x) + result = ser.case_when( + caselist=[ + (lambda df: df < 0, lambda df: -df), + (lambda df: df >= 0, lambda df: df), + ] + ) + expected = np.piecewise(x, [x < 0, x >= 0], [lambda x: -x, lambda x: x]) + tm.assert_series_equal(result, Series(expected)) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index b0a920ba02cad..c2cc838619790 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import lib import pandas as pd @@ -181,6 +183,7 @@ def test_cases(request): class TestSeriesConvertDtypes: + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("params", product(*[(True, False)] * 5)) def test_convert_dtypes( self, @@ -227,9 +230,9 @@ def test_convert_dtypes( and params[0] and not params[1] ): - # If we would convert with convert strings then infer_objects converts - # with the option - expected_dtype = "string[pyarrow_numpy]" + # If convert_string=False and infer_objects=True, we end up with the + # default string dtype instead of preserving object for string data + expected_dtype =
pd.StringDtype(na_value=np.nan) expected = pd.Series(data, dtype=expected_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_info.py b/pandas/tests/series/methods/test_info.py index 29dd704f6efa9..7defad8a463f3 100644 --- a/pandas/tests/series/methods/test_info.py +++ b/pandas/tests/series/methods/test_info.py @@ -5,10 +5,16 @@ import numpy as np import pytest -from pandas.compat import PYPY +from pandas._config import using_string_dtype + +from pandas.compat import ( + HAS_PYARROW, + PYPY, +) from pandas import ( CategoricalIndex, + Index, MultiIndex, Series, date_range, @@ -39,7 +45,9 @@ def test_info_categorical(): @pytest.mark.parametrize("verbose", [True, False]) -def test_info_series(lexsorted_two_level_string_multiindex, verbose): +def test_info_series( + lexsorted_two_level_string_multiindex, verbose, using_infer_string +): index = lexsorted_two_level_string_multiindex ser = Series(range(len(index)), index=index, name="sth") buf = StringIO() @@ -61,10 +69,11 @@ def test_info_series(lexsorted_two_level_string_multiindex, verbose): 10 non-null int64 """ ) + qualifier = "" if using_infer_string and HAS_PYARROW else "+" expected += textwrap.dedent( f"""\ dtypes: int64(1) - memory usage: {ser.memory_usage()}.0+ bytes + memory usage: {ser.memory_usage()}.0{qualifier} bytes """ ) assert result == expected @@ -141,18 +150,20 @@ def test_info_memory_usage_deep_pypy(): @pytest.mark.parametrize( - "series, plus", + "index, plus", [ - (Series(1, index=[1, 2, 3]), False), - (Series(1, index=list("ABC")), True), - (Series(1, index=MultiIndex.from_product([range(3), range(3)])), False), + ([1, 2, 3], False), + (Index(list("ABC"), dtype="str"), not (using_string_dtype() and HAS_PYARROW)), + (Index(list("ABC"), dtype=object), True), + (MultiIndex.from_product([range(3), range(3)]), False), ( - Series(1, index=MultiIndex.from_product([range(3), ["foo", "bar"]])), - True, + MultiIndex.from_product([range(3), ["foo", "bar"]]), + not (using_string_dtype() and HAS_PYARROW), ), ], ) -def test_info_memory_usage_qualified(series, plus): +def test_info_memory_usage_qualified(index, plus): + series = Series(1, index=index) buf = StringIO() series.info(buf=buf) if plus: diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 251d4063008b9..f33f5edb5ee66 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -101,16 +101,16 @@ def test_map_series_stringdtype(any_string_dtype, using_infer_string): expected = Series(data=["rabbit", "dog", "cat", item], dtype=any_string_dtype) if using_infer_string and any_string_dtype == "object": - expected = expected.astype("string[pyarrow_numpy]") + expected = expected.astype("str") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "data, expected_dtype", - [(["1-1", "1-1", np.nan], "category"), (["1-1", "1-2", np.nan], object)], + [(["1-1", "1-1", np.nan], "category"), (["1-1", "1-2", np.nan], "str")], ) -def test_map_categorical_with_nan_values(data, expected_dtype, using_infer_string): +def test_map_categorical_with_nan_values(data, expected_dtype): # GH 20714 bug fixed in: GH 24275 def func(val): return val.split("-")[0] @@ -118,8 +118,6 @@ def func(val): s = Series(data, dtype="category") result = s.map(func, na_action="ignore") - if using_infer_string and expected_dtype == object: - expected_dtype = "string[pyarrow_numpy]" expected = Series(["1", "1", np.nan], dtype=expected_dtype) tm.assert_series_equal(result, 
expected) @@ -145,9 +143,7 @@ def test_map_simple_str_callables_same_as_astype( # test that we are evaluating row-by-row first # before vectorized evaluation result = string_series.map(func) - expected = string_series.astype( - str if not using_infer_string else "string[pyarrow_numpy]" - ) + expected = string_series.astype(str if not using_infer_string else "str") tm.assert_series_equal(result, expected) @@ -225,6 +221,7 @@ def test_map_category_string(): tm.assert_series_equal(a.map(c), exp) +@pytest.mark.filterwarnings(r"ignore:Dtype inference:FutureWarning") def test_map_empty(request, index): if isinstance(index, MultiIndex): request.applymarker( @@ -497,7 +494,7 @@ def test_map_categorical(na_action, using_infer_string): result = s.map(lambda x: "A", na_action=na_action) exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) - assert result.dtype == object if not using_infer_string else "string" + assert result.dtype == (object if not using_infer_string else "str") @pytest.mark.parametrize( @@ -557,13 +554,11 @@ def f(x): (list(range(3)), {0: 42}, [42] + [np.nan] * 3), ], ) -def test_map_missing_mixed(vals, mapping, exp, using_infer_string): +def test_map_missing_mixed(vals, mapping, exp): # GH20495 s = Series(vals + [np.nan]) result = s.map(mapping) exp = Series(exp) - if using_infer_string and mapping == {np.nan: "not NaN"}: - exp.iloc[-1] = np.nan tm.assert_series_equal(result, exp) diff --git a/pandas/tests/series/methods/test_pct_change.py b/pandas/tests/series/methods/test_pct_change.py index 9727ef3d5c27c..6c80e711c3684 100644 --- a/pandas/tests/series/methods/test_pct_change.py +++ b/pandas/tests/series/methods/test_pct_change.py @@ -118,3 +118,11 @@ def test_pct_change_no_warning_na_beginning(): result = ser.pct_change() expected = Series([np.nan, np.nan, np.nan, 1, 0.5]) tm.assert_series_equal(result, expected) + + +def test_pct_change_empty(): + # GH 57056 + ser = Series([], dtype="float64") + expected = ser.copy() + result = ser.pct_change(periods=0) + tm.assert_series_equal(expected, result) diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 24cf97c05c0a8..1c3ebe5653ce3 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -33,7 +33,8 @@ def ser(): ["max", np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6])], ["first", np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6])], ["dense", np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])], - ] + ], + ids=lambda x: x[0], ) def results(request): return request.param @@ -48,12 +49,29 @@ def results(request): "Int64", pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + "string[python]", + "str", ] ) def dtype(request): return request.param + +def expected_dtype(dtype, method, pct=False): + # expected dtype of the Series.rank result for the given input dtype/method + exp_dtype = "float64" + if dtype == "string[pyarrow]": + exp_dtype = "Float64" + elif dtype in ["float64[pyarrow]", "int64[pyarrow]"]: + if method == "average" or pct: + exp_dtype = "double[pyarrow]" + else: + exp_dtype = "uint64[pyarrow]" + + return exp_dtype + + class TestSeriesRank: def test_rank(self, datetime_series): sp_stats = pytest.importorskip("scipy.stats") @@ -241,12 +259,18 @@ def test_rank_signature(self): with pytest.raises(ValueError, match=msg): s.rank("average") -
@pytest.mark.parametrize("dtype", [None, object]) - def test_rank_tie_methods(self, ser, results, dtype): + def test_rank_tie_methods(self, ser, results, dtype, using_infer_string): method, exp = results + if ( + dtype == "int64" + or dtype == "Int64" + or (not using_infer_string and dtype == "str") + ): + pytest.skip("int64/str does not support NaN") + ser = ser if dtype is None else ser.astype(dtype) result = ser.rank(method=method) - tm.assert_series_equal(result, Series(exp)) + tm.assert_series_equal(result, Series(exp, dtype=expected_dtype(dtype, method))) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"]) @@ -346,25 +370,35 @@ def test_rank_methods_series(self, method, op, value): ], ) def test_rank_dense_method(self, dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="dense") - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "dense")) tm.assert_series_equal(result, expected) - def test_rank_descending(self, ser, results, dtype): + def test_rank_descending(self, ser, results, dtype, using_infer_string): method, _ = results - if "i" in dtype: + if dtype == "int64" or (not using_infer_string and dtype == "str"): s = ser.dropna() else: s = ser.astype(dtype) res = s.rank(ascending=False) - expected = (s.max() - s).rank() - tm.assert_series_equal(res, expected) + if dtype.startswith("str"): + expected = (s.astype("float64").max() - s.astype("float64")).rank() + else: + expected = (s.max() - s).rank() + tm.assert_series_equal(res, expected.astype(expected_dtype(dtype, "average"))) - expected = (s.max() - s).rank(method=method) + if dtype.startswith("str"): + expected = (s.astype("float64").max() - s.astype("float64")).rank( + method=method + ) + else: + expected = (s.max() - s).rank(method=method) res2 = s.rank(method=method, ascending=False) - tm.assert_series_equal(res2, expected) + tm.assert_series_equal(res2, expected.astype(expected_dtype(dtype, method))) def test_rank_int(self, ser, results): method, exp = results @@ -421,9 +455,11 @@ def test_rank_ea_small_values(self): ], ) def test_rank_dense_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="dense", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "dense", pct=True)) tm.assert_series_equal(result, expected) @@ -442,9 +478,11 @@ def test_rank_dense_pct(dtype, ser, exp): ], ) def test_rank_min_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="min", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "min", pct=True)) tm.assert_series_equal(result, expected) @@ -463,9 +501,11 @@ def test_rank_min_pct(dtype, ser, exp): ], ) def test_rank_max_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="max", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "max", pct=True)) tm.assert_series_equal(result, expected) @@ -484,9 +524,11 @@ def test_rank_max_pct(dtype, ser, exp): ], ) def test_rank_average_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = 
Series(ser).astype(dtype) result = s.rank(method="average", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "average", pct=True)) tm.assert_series_equal(result, expected) @@ -505,9 +547,11 @@ def test_rank_average_pct(dtype, ser, exp): ], ) def test_rank_first_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="first", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "first", pct=True)) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index 6f0c8d751a92a..ecfbecf12bdd3 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - import pandas.util._test_decorators as td from pandas import ( @@ -24,13 +22,10 @@ import pandas._testing as tm -@pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="share memory doesn't work for arrow" -) def test_reindex(datetime_series, string_series): identity = string_series.reindex(string_series.index) - assert np.may_share_memory(string_series.index, identity.index) + assert tm.shares_memory(string_series.index, identity.index) assert identity.index.is_(string_series.index) assert identity.index.identical(string_series.index) diff --git a/pandas/tests/series/methods/test_rename.py b/pandas/tests/series/methods/test_rename.py index 119654bd19b3f..a8f3862d39f07 100644 --- a/pandas/tests/series/methods/test_rename.py +++ b/pandas/tests/series/methods/test_rename.py @@ -64,7 +64,7 @@ def test_rename_set_name_inplace(self, using_infer_string): assert ser.name == name exp = np.array(["a", "b", "c"], dtype=np.object_) if using_infer_string: - exp = array(exp, dtype="string[pyarrow_numpy]") + exp = array(exp, dtype="str") tm.assert_extension_array_equal(ser.index.values, exp) else: tm.assert_numpy_array_equal(ser.index.values, exp) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 4330153c186ca..0c2e0fdc2616f 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - import pandas as pd import pandas._testing as tm from pandas.core.arrays import IntervalArray @@ -391,7 +389,6 @@ def test_replace_mixed_types_with_string(self): expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) tm.assert_series_equal(expected, result) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") @pytest.mark.parametrize( "categorical, numeric", [ @@ -399,7 +396,7 @@ def test_replace_mixed_types_with_string(self): (pd.Categorical(["A", "B"], categories=["A", "B"]), [1, 2]), ], ) - def test_replace_categorical(self, categorical, numeric): + def test_replace_categorical(self, categorical, numeric, using_infer_string): # GH 24971, GH#23305 ser = pd.Series(categorical) msg = "Downcasting behavior in `replace`" @@ -731,17 +728,25 @@ def test_replace_nullable_numeric(self): with pytest.raises(TypeError, match="Invalid value"): ints.replace(1, 9.5) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 1 in string") @pytest.mark.parametrize("regex", [False, True]) def 
test_replace_regex_dtype_series(self, regex): # GH-48644 - series = pd.Series(["0"]) + series = pd.Series(["0"], dtype=object) expected = pd.Series([1]) msg = "Downcasting behavior in `replace`" with tm.assert_produces_warning(FutureWarning, match=msg): result = series.replace(to_replace="0", value=1, regex=regex) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("regex", [False, True]) + def test_replace_regex_dtype_series_string(self, regex): + series = pd.Series(["0"], dtype="str") + expected = pd.Series([1], dtype="int64") + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = series.replace(to_replace="0", value=1, regex=regex) + tm.assert_series_equal(result, expected) + def test_replace_different_int_types(self, any_int_numpy_dtype): # GH#45311 labs = pd.Series([1, 1, 1, 0, 0, 2, 2, 2], dtype=any_int_numpy_dtype) @@ -761,20 +766,18 @@ def test_replace_value_none_dtype_numeric(self, val): expected = pd.Series([1, None], dtype=object) tm.assert_series_equal(result, expected) - def test_replace_change_dtype_series(self, using_infer_string): + def test_replace_change_dtype_series(self): # GH#25797 - df = pd.DataFrame.from_dict({"Test": ["0.5", True, "0.6"]}) - warn = FutureWarning if using_infer_string else None - with tm.assert_produces_warning(warn, match="Downcasting"): - df["Test"] = df["Test"].replace([True], [np.nan]) - expected = pd.DataFrame.from_dict({"Test": ["0.5", np.nan, "0.6"]}) + df = pd.DataFrame({"Test": ["0.5", True, "0.6"]}, dtype=object) + df["Test"] = df["Test"].replace([True], [np.nan]) + expected = pd.DataFrame({"Test": ["0.5", np.nan, "0.6"]}, dtype=object) tm.assert_frame_equal(df, expected) - df = pd.DataFrame.from_dict({"Test": ["0.5", None, "0.6"]}) + df = pd.DataFrame({"Test": ["0.5", None, "0.6"]}, dtype=object) df["Test"] = df["Test"].replace([None], [np.nan]) tm.assert_frame_equal(df, expected) - df = pd.DataFrame.from_dict({"Test": ["0.5", None, "0.6"]}) + df = pd.DataFrame({"Test": ["0.5", None, "0.6"]}, dtype=object) df["Test"] = df["Test"].fillna(np.nan) tm.assert_frame_equal(df, expected) @@ -799,3 +802,15 @@ def test_replace_numeric_column_with_na(self, val): ser.replace(to_replace=1, value=pd.NA, inplace=True) tm.assert_series_equal(ser, expected) + + def test_replace_ea_float_with_bool(self): + # GH#55398 + ser = pd.Series([0.0], dtype="Float64") + expected = ser.copy() + result = ser.replace(False, 1.0) + tm.assert_series_equal(result, expected) + + ser = pd.Series([False], dtype="boolean") + expected = ser.copy() + result = ser.replace(0.0, True) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_reset_index.py b/pandas/tests/series/methods/test_reset_index.py index 48e2608a1032a..fa571fa126b38 100644 --- a/pandas/tests/series/methods/test_reset_index.py +++ b/pandas/tests/series/methods/test_reset_index.py @@ -193,7 +193,7 @@ def test_reset_index_dtypes_on_empty_series_with_multiindex( # GH 19602 - Preserve dtype on empty Series with MultiIndex idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = Series(dtype=object, index=idx)[:0].reset_index().dtypes - exp = "string" if using_infer_string else object + exp = "str" if using_infer_string else object expected = Series( { "level_0": np.int64, diff --git a/pandas/tests/series/methods/test_round.py b/pandas/tests/series/methods/test_round.py index 7f60c94f10e4f..c330b7a7dfbbb 100644 --- a/pandas/tests/series/methods/test_round.py +++ 
b/pandas/tests/series/methods/test_round.py @@ -63,3 +63,12 @@ def test_round_nat(self, method, freq, unit): round_method = getattr(ser.dt, method) result = round_method(freq) tm.assert_series_equal(result, expected) + + def test_round_ea_boolean(self): + # GH#55936 + ser = Series([True, False], dtype="boolean") + expected = ser.copy() + result = ser.round(2) + tm.assert_series_equal(result, expected) + result.iloc[0] = False + tm.assert_series_equal(ser, expected) diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index 1c17013d621c7..ba75c7786ef72 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -174,9 +174,6 @@ def test_to_csv_interval_index(self, using_infer_string): result = self.read_csv(path, index_col=0) # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) - expected = s.copy() - if using_infer_string: - expected.index = expected.index.astype("string[pyarrow_numpy]") - else: - expected.index = expected.index.astype(str) + expected = s + expected.index = expected.index.astype("str") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_to_numpy.py b/pandas/tests/series/methods/test_to_numpy.py index 5fe3e19b0a20b..4bc7631090761 100644 --- a/pandas/tests/series/methods/test_to_numpy.py +++ b/pandas/tests/series/methods/test_to_numpy.py @@ -1,9 +1,12 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( NA, Series, + Timedelta, ) import pandas._testing as tm @@ -23,3 +26,24 @@ def test_to_numpy_cast_before_setting_na(): result = ser.to_numpy(dtype=np.float64, na_value=np.nan) expected = np.array([1.0]) tm.assert_numpy_array_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_to_numpy_arrow_dtype_given(): + # GH#57121 + ser = Series([1, NA], dtype="int64[pyarrow]") + result = ser.to_numpy(dtype="float64") + expected = np.array([1.0, np.nan]) + tm.assert_numpy_array_equal(result, expected) + + +def test_astype_ea_int_to_td_ts(): + # GH#57093 + ser = Series([1, None], dtype="Int64") + result = ser.astype("m8[ns]") + expected = Series([1, Timedelta("nat")], dtype="m8[ns]") + tm.assert_series_equal(result, expected) + + result = ser.astype("M8[ns]") + expected = Series([1, Timedelta("nat")], dtype="M8[ns]") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_unstack.py b/pandas/tests/series/methods/test_unstack.py index 3c70e839c8e20..11995260dd0be 100644 --- a/pandas/tests/series/methods/test_unstack.py +++ b/pandas/tests/series/methods/test_unstack.py @@ -137,7 +137,7 @@ def test_unstack_mixed_type_name_in_multiindex( def test_unstack_multi_index_categorical_values(): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) mi = df.stack(future_stack=True).index.rename(["major", "minor"]) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 29d6e2036476e..7e10a337cdd3a 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -169,7 +169,6 @@ def test_attrs(self): def test_inspect_getmembers(self): # GH38782 - pytest.importorskip("jinja2") ser = Series(dtype=object) msg = "Series._data is deprecated" with tm.assert_produces_warning( diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 
b40e2e99dae2e..a65d7687cfb06 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -212,9 +212,9 @@ def test_series_integer_mod(self, index): s1 = Series(range(1, 10)) s2 = Series("foo", index=index) - msg = "not all arguments converted during string formatting|mod not" + msg = "not all arguments converted during string formatting|'mod' not supported" - with pytest.raises((TypeError, NotImplementedError), match=msg): + with pytest.raises(TypeError, match=msg): s2 % s1 def test_add_with_duplicate_index(self): @@ -499,27 +499,14 @@ def test_ser_cmp_result_names(self, names, comparison_op): result = op(ser, cidx) assert result.name == names[2] - def test_comparisons(self, using_infer_string): + def test_comparisons(self): s = Series(["a", "b", "c"]) s2 = Series([False, True, False]) # it works! exp = Series([False, False, False]) - if using_infer_string: - import pyarrow as pa - - msg = "has no kernel" - # TODO(3.0) GH56008 - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - s == s2 - with tm.assert_produces_warning( - DeprecationWarning, match="comparison", check_stacklevel=False - ): - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - s2 == s - else: - tm.assert_series_equal(s == s2, exp) - tm.assert_series_equal(s2 == s, exp) + tm.assert_series_equal(s == s2, exp) + tm.assert_series_equal(s2 == s, exp) # ----------------------------------------------------------------- # Categorical Dtype Comparisons diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index da069afe5e709..60b2ec7b6912d 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -14,6 +14,7 @@ iNaT, lib, ) +from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td @@ -166,7 +167,7 @@ def test_constructor(self, datetime_series, using_infer_string): # Mixed type Series mixed = Series(["hello", np.nan], index=[0, 1]) - assert mixed.dtype == np.object_ if not using_infer_string else "string" + assert mixed.dtype == np.object_ if not using_infer_string else "str" assert np.isnan(mixed[1]) assert not empty_series.index._is_all_dates @@ -229,7 +230,7 @@ def test_constructor_empty(self, input_class, using_infer_string): # GH 19853 : with empty string, index and dtype str empty = Series("", dtype=str, index=range(3)) if using_infer_string: - empty2 = Series("", index=range(3), dtype=object) + empty2 = Series("", index=range(3), dtype="str") else: empty2 = Series("", index=range(3)) tm.assert_series_equal(empty, empty2) @@ -1468,7 +1469,7 @@ def test_fromDict(self, using_infer_string): data = {"a": 0, "b": "1", "c": "2", "d": "3"} series = Series(data) - assert series.dtype == np.object_ if not using_infer_string else "string" + assert series.dtype == np.object_ if not using_infer_string else "str" data = {"a": "0", "b": "1"} series = Series(data, dtype=float) @@ -1480,7 +1481,7 @@ def test_fromValue(self, datetime_series, using_infer_string): assert len(nans) == len(datetime_series) strings = Series("foo", index=datetime_series.index) - assert strings.dtype == np.object_ if not using_infer_string else "string" + assert strings.dtype == np.object_ if not using_infer_string else "str" assert len(strings) == len(datetime_series) d = datetime.now() @@ -1958,9 +1959,15 @@ def test_constructor_int64_dtype(self, any_int_dtype): def 
test_constructor_raise_on_lossy_conversion_of_strings(self): # GH#44923 - with pytest.raises( - ValueError, match="string values cannot be losslessly cast to int8" - ): + if not np_version_gt2: + raises = pytest.raises( + ValueError, match="string values cannot be losslessly cast to int8" + ) + else: + raises = pytest.raises( + OverflowError, match="The elements provided in the data" + ) + with raises: Series(["128"], dtype="int8") def test_constructor_dtype_timedelta_alternative_construct(self): @@ -2088,11 +2095,10 @@ def test_series_from_index_dtype_equal_does_not_copy(self): def test_series_string_inference(self): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" - expected = Series(["a", "b"], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Series(["a", "b"]) + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) + expected = Series(["a", "b"], dtype=dtype) tm.assert_series_equal(ser, expected) expected = Series(["a", 1], dtype="object") @@ -2103,37 +2109,43 @@ def test_series_string_inference(self): @pytest.mark.parametrize("na_value", [None, np.nan, pd.NA]) def test_series_string_with_na_inference(self, na_value): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" - expected = Series(["a", na_value], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Series(["a", na_value]) + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) + expected = Series(["a", None], dtype=dtype) tm.assert_series_equal(ser, expected) def test_series_string_inference_scalar(self): # GH#54430 - pytest.importorskip("pyarrow") - expected = Series("a", index=[1], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): ser = Series("a", index=[1]) + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) + expected = Series("a", index=[1], dtype=dtype) tm.assert_series_equal(ser, expected) def test_series_string_inference_array_string_dtype(self): # GH#54496 - pytest.importorskip("pyarrow") - expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): ser = Series(np.array(["a", "b"])) + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) + expected = Series(["a", "b"], dtype=dtype) tm.assert_series_equal(ser, expected) def test_series_string_inference_storage_definition(self): - # GH#54793 - pytest.importorskip("pyarrow") - expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") + # https://github.com/pandas-dev/pandas/issues/54793 + # but after PDEP-14 (string dtype), it was decided to keep dtype="string" + # returning the NA string dtype, so expected is changed from + # "string[pyarrow_numpy]" to "string[python]" + expected = Series(["a", "b"], dtype="string[python]") with pd.option_context("future.infer_string", True): result = Series(["a", "b"], dtype="string") tm.assert_series_equal(result, expected) + expected = Series(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)) + with pd.option_context("future.infer_string", True): + result = Series(["a", "b"], dtype="str") + tm.assert_series_equal(result, expected) +
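
The comment in the hunk above records the PDEP-14 outcome this test pins down: under `future.infer_string`, an explicit `dtype="string"` still selects the NA-variant string dtype, while the new `dtype="str"` alias selects the NaN-variant. A minimal sketch of the difference, assuming pandas 2.3+ with the PDEP-14 string dtypes available; the variable names are illustrative only, not part of the patch:

```python
import pandas as pd

with pd.option_context("future.infer_string", True):
    # dtype="string" keeps returning the NA-variant StringDtype ...
    s_string = pd.Series(["a", "b"], dtype="string")
    # ... while dtype="str" maps to the NaN-variant introduced by PDEP-14.
    s_str = pd.Series(["a", "b"], dtype="str")

print(s_string.dtype.na_value)  # <NA>
print(s_str.dtype.na_value)     # nan
```
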
Series([pd.NA, "b"], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): result = Series([pd.NA, "b"]) + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) + expected = Series([None, "b"], dtype=dtype) tm.assert_series_equal(result, expected) def test_inference_on_pandas_objects(self): @@ -2185,6 +2197,25 @@ def test_series_constructor_infer_multiindex(self, container, data): multi = Series(data, index=indexes) assert isinstance(multi.index, MultiIndex) + # TODO: make this not cast to object in pandas 3.0 + @pytest.mark.skipif( + not np_version_gt2, reason="StringDType only available in numpy 2 and above" + ) + @pytest.mark.parametrize( + "data", + [ + ["a", "b", "c"], + ["a", "b", np.nan], + ], + ) + def test_np_string_array_object_cast(self, data): + from numpy.dtypes import StringDType + + arr = np.array(data, dtype=StringDType()) + res = Series(arr) + assert res.dtype == np.object_ + assert (res == data).all() + class TestSeriesConstructorInternals: def test_constructor_no_pandas_array(self, using_array_manager): diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index e6f7b2a5e69e0..97f5fb4a9f96f 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -6,6 +6,8 @@ tests.frame.test_cumulative """ +import re + import numpy as np import pytest @@ -155,3 +157,56 @@ def test_cumprod_timedelta(self): ser = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=3)]) with pytest.raises(TypeError, match="cumprod not supported for Timedelta"): ser.cumprod() + + @pytest.mark.parametrize( + "data, op, skipna, expected_data", + [ + ([], "cumsum", True, []), + ([], "cumsum", False, []), + (["x", "z", "y"], "cumsum", True, ["x", "xz", "xzy"]), + (["x", "z", "y"], "cumsum", False, ["x", "xz", "xzy"]), + (["x", pd.NA, "y"], "cumsum", True, ["x", pd.NA, "xy"]), + (["x", pd.NA, "y"], "cumsum", False, ["x", pd.NA, pd.NA]), + ([pd.NA, "x", "y"], "cumsum", True, [pd.NA, "x", "xy"]), + ([pd.NA, "x", "y"], "cumsum", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cumsum", True, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cumsum", False, [pd.NA, pd.NA, pd.NA]), + ([], "cummin", True, []), + ([], "cummin", False, []), + (["y", "z", "x"], "cummin", True, ["y", "y", "x"]), + (["y", "z", "x"], "cummin", False, ["y", "y", "x"]), + (["y", pd.NA, "x"], "cummin", True, ["y", pd.NA, "x"]), + (["y", pd.NA, "x"], "cummin", False, ["y", pd.NA, pd.NA]), + ([pd.NA, "y", "x"], "cummin", True, [pd.NA, "y", "x"]), + ([pd.NA, "y", "x"], "cummin", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummin", True, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummin", False, [pd.NA, pd.NA, pd.NA]), + ([], "cummax", True, []), + ([], "cummax", False, []), + (["x", "z", "y"], "cummax", True, ["x", "z", "z"]), + (["x", "z", "y"], "cummax", False, ["x", "z", "z"]), + (["x", pd.NA, "y"], "cummax", True, ["x", pd.NA, "y"]), + (["x", pd.NA, "y"], "cummax", False, ["x", pd.NA, pd.NA]), + ([pd.NA, "x", "y"], "cummax", True, [pd.NA, "x", "y"]), + ([pd.NA, "x", "y"], "cummax", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummax", True, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]), + ], + ) + def test_cum_methods_ea_strings( + self, string_dtype_no_object, data, op, skipna, expected_data + ): + # https://github.com/pandas-dev/pandas/pull/60633 - pyarrow + # https://github.com/pandas-dev/pandas/pull/60938 - Python + 
ser = pd.Series(data, dtype=string_dtype_no_object) + method = getattr(ser, op) + expected = pd.Series(expected_data, dtype=string_dtype_no_object) + result = method(skipna=skipna) + tm.assert_series_equal(result, expected) + + def test_cumprod_pyarrow_strings(self, pyarrow_string_dtype, skipna): + # https://github.com/pandas-dev/pandas/pull/60633 + ser = pd.Series(list("xyz"), dtype=pyarrow_string_dtype) + msg = re.escape(f"operation 'cumprod' not supported for dtype '{ser.dtype}'") + with pytest.raises(TypeError, match=msg): + ser.cumprod(skipna=skipna) diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py index a1c5018ea7961..4f93e7424bfd5 100644 --- a/pandas/tests/series/test_formats.py +++ b/pandas/tests/series/test_formats.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - import pandas as pd from pandas import ( Categorical, @@ -144,11 +142,13 @@ def test_tidy_repr_name_0(self, arg): rep_str = repr(ser) assert "Name: 0" in rep_str - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="TODO: investigate why this is failing" - ) - def test_newline(self): - ser = Series(["a\n\r\tb"], name="a\n\r\td", index=["a\n\r\tf"]) + def test_newline(self, any_string_dtype): + ser = Series( + ["a\n\r\tb"], + name="a\n\r\td", + index=Index(["a\n\r\tf"], dtype=any_string_dtype), + dtype=any_string_dtype, + ) assert "\t" not in repr(ser) assert "\r" not in repr(ser) assert "a\n" not in repr(ser) @@ -323,7 +323,7 @@ def test_categorical_repr(self, using_infer_string): "0 a\n1 b\n" " ..\n" "48 a\n49 b\n" - "Length: 50, dtype: category\nCategories (2, string): [a, b]" + "Length: 50, dtype: category\nCategories (2, str): [a, b]" ) else: exp = ( @@ -341,7 +341,7 @@ def test_categorical_repr(self, using_infer_string): exp = ( "0 a\n1 b\n" "dtype: category\n" - "Categories (26, string): [a < b < c < d ... w < x < y < z]" + "Categories (26, str): [a < b < c < d ... 
w < x < y < z]" ) else: exp = ( diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index d9c94e871bd4b..8d7adc1c1aae6 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -4,10 +4,14 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( + ArrowDtype, DataFrame, Index, Series, + StringDtype, bdate_range, ) import pandas._testing as tm @@ -146,7 +150,7 @@ def test_logical_operators_int_dtype_with_bool(self): expected = Series([False, True, True, True]) tm.assert_series_equal(result, expected) - def test_logical_operators_int_dtype_with_object(self, using_infer_string): + def test_logical_operators_int_dtype_with_object(self): # GH#9016: support bitwise op for integer types s_0123 = Series(range(4), dtype="int64") @@ -155,14 +159,10 @@ def test_logical_operators_int_dtype_with_object(self, using_infer_string): tm.assert_series_equal(result, expected) s_abNd = Series(["a", "b", np.nan, "d"]) - if using_infer_string: - import pyarrow as pa - - with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"): - s_0123 & s_abNd - else: - with pytest.raises(TypeError, match="unsupported.* 'int' and 'str'"): - s_0123 & s_abNd + with pytest.raises( + TypeError, match="unsupported.* 'int' and 'str'|'rand_' not supported" + ): + s_0123 & s_abNd def test_logical_operators_bool_dtype_with_int(self): index = list("bca") @@ -360,6 +360,7 @@ def test_reverse_ops_with_index(self, op, expected): result = op(ser, idx) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_logical_ops_label_based(self, using_infer_string): # GH#4947 # logical ops should be label based @@ -428,15 +429,13 @@ def test_logical_ops_label_based(self, using_infer_string): tm.assert_series_equal(result, a[a]) for e in [Series(["z"])]: - warn = FutureWarning if using_infer_string else None if using_infer_string: - import pyarrow as pa - - with tm.assert_produces_warning(warn, match="Operation between non"): - with pytest.raises( - pa.lib.ArrowNotImplementedError, match="has no kernel" - ): - result = a[a | e] + # TODO(infer_string) should this behave differently? 
+ # -> https://github.com/pandas-dev/pandas/issues/60234 + with pytest.raises( + TypeError, match="not supported for dtype|unsupported operand type" + ): + result = a[a | e] else: result = a[a | e] tm.assert_series_equal(result, a[a]) @@ -531,18 +530,38 @@ def test_int_dtype_different_index_not_bool(self): result = ser1 ^ ser2 tm.assert_series_equal(result, expected) + # TODO: this belongs in comparison tests def test_pyarrow_numpy_string_invalid(self): # GH#56008 - pytest.importorskip("pyarrow") + pa = pytest.importorskip("pyarrow") ser = Series([False, True]) - ser2 = Series(["a", "b"], dtype="string[pyarrow_numpy]") + ser2 = Series(["a", "b"], dtype=StringDtype(na_value=np.nan)) result = ser == ser2 - expected = Series(False, index=ser.index) - tm.assert_series_equal(result, expected) + expected_eq = Series(False, index=ser.index) + tm.assert_series_equal(result, expected_eq) result = ser != ser2 - expected = Series(True, index=ser.index) - tm.assert_series_equal(result, expected) + expected_ne = Series(True, index=ser.index) + tm.assert_series_equal(result, expected_ne) with pytest.raises(TypeError, match="Invalid comparison"): ser > ser2 + + # GH#59505 + ser3 = ser2.astype("string[pyarrow]") + result3_eq = ser3 == ser + tm.assert_series_equal(result3_eq, expected_eq.astype("bool[pyarrow]")) + result3_ne = ser3 != ser + tm.assert_series_equal(result3_ne, expected_ne.astype("bool[pyarrow]")) + + with pytest.raises(TypeError, match="Invalid comparison"): + ser > ser3 + + ser4 = ser2.astype(ArrowDtype(pa.string())) + result4_eq = ser4 == ser + tm.assert_series_equal(result4_eq, expected_eq.astype("bool[pyarrow]")) + result4_ne = ser4 != ser + tm.assert_series_equal(result4_ne, expected_ne.astype("bool[pyarrow]")) + + with pytest.raises(TypeError, match="Invalid comparison"): + ser > ser4 diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index 76353ab25fca6..5415f220cadd4 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -166,19 +166,15 @@ def test_validate_stat_keepdims(): def test_mean_with_convertible_string_raises(using_array_manager, using_infer_string): # GH#44008 ser = Series(["1", "2"]) - if using_infer_string: - msg = "does not support" - with pytest.raises(TypeError, match=msg): - ser.sum() - else: - assert ser.sum() == "12" - msg = "Could not convert string '12' to numeric|does not support" + assert ser.sum() == "12" + + msg = "Could not convert string '12' to numeric|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): ser.mean() df = ser.to_frame() if not using_array_manager: - msg = r"Could not convert \['12'\] to numeric|does not support" + msg = r"Could not convert \['12'\] to numeric|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): df.mean() @@ -189,30 +185,33 @@ def test_mean_dont_convert_j_to_complex(using_array_manager): if using_array_manager: msg = "Could not convert string 'J' to numeric" else: - msg = r"Could not convert \['J'\] to numeric|does not support" + msg = r"Could not convert \['J'\] to numeric|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): df.mean() with pytest.raises(TypeError, match=msg): df.agg("mean") - msg = "Could not convert string 'J' to numeric|does not support" + msg = "Could not convert string 'J' to numeric|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): df["db"].mean() - msg = "Could not convert string 'J' to numeric|ufunc 'divide'" + msg = "Could not 
convert string 'J' to numeric|ufunc 'divide'|Cannot perform" with pytest.raises(TypeError, match=msg): np.mean(df["db"].astype("string").array) def test_median_with_convertible_string_raises(using_array_manager): # GH#34671 this _could_ return a string "2", but definitely not float 2.0 - msg = r"Cannot convert \['1' '2' '3'\] to numeric|does not support" + msg = r"Cannot convert \['1' '2' '3'\] to numeric|does not support|Cannot perform" ser = Series(["1", "2", "3"]) with pytest.raises(TypeError, match=msg): ser.median() if not using_array_manager: - msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric|does not support" + msg = ( + r"Cannot convert \[\['1' '2' '3'\]\] to numeric|does not support" + "|Cannot perform" + ) df = ser.to_frame() with pytest.raises(TypeError, match=msg): df.median() diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index 9d13ebf740eab..e03e87a44107f 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -18,7 +18,10 @@ def ufunc(request): return request.param -@pytest.fixture(params=[True, False], ids=["sparse", "dense"]) +@pytest.fixture( + params=[pytest.param(True, marks=pytest.mark.fails_arm_wheels), False], + ids=["sparse", "dense"], +) def sparse(request): return request.param diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py index 01b49b5e5b633..6c4bec6a23789 100644 --- a/pandas/tests/strings/__init__.py +++ b/pandas/tests/strings/__init__.py @@ -2,12 +2,20 @@ import pandas as pd -object_pyarrow_numpy = ("object", "string[pyarrow_numpy]") + +def is_object_or_nan_string_dtype(dtype): + """ + Check if string-like dtype is following NaN semantics, i.e. is object + dtype or a NaN-variant of the StringDtype. + """ + return (isinstance(dtype, np.dtype) and dtype == "object") or ( + dtype.na_value is np.nan + ) def _convert_na_value(ser, expected): if ser.dtype != object: - if ser.dtype.storage == "pyarrow_numpy": + if ser.dtype.na_value is np.nan: expected = expected.fillna(np.nan) else: # GH#18463 diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py index 31e005466af7b..8987fc36656c5 100644 --- a/pandas/tests/strings/test_api.py +++ b/pandas/tests/strings/test_api.py @@ -111,6 +111,7 @@ def test_api_per_method( any_allowed_skipna_inferred_dtype, any_string_method, request, + using_infer_string, ): # this test does not check correctness of the different methods, # just that the methods work on the specified (inferred) dtypes, @@ -149,6 +150,10 @@ def test_api_per_method( t = box(values, dtype=dtype) # explicit dtype to avoid casting method = getattr(t.str, method_name) + if using_infer_string and dtype == "category": + string_allowed = method_name not in ["decode"] + else: + string_allowed = True bytes_allowed = method_name in ["decode", "get", "len", "slice"] # as of v0.23.4, all methods except 'cat' are very lenient with the # allowed data types, just returning NaN for entries that error. @@ -157,7 +162,8 @@ def test_api_per_method( mixed_allowed = method_name not in ["cat"] allowed_types = ( - ["string", "unicode", "empty"] + ["empty"] + + ["string", "unicode"] * string_allowed + ["bytes"] * bytes_allowed + ["mixed", "mixed-integer"] * mixed_allowed ) @@ -171,6 +177,7 @@ def test_api_per_method( msg = ( f"Cannot use .str.{method_name} with values of " f"inferred dtype {repr(inferred_dtype)}." 
+ "|a bytes-like object is required, not 'str'" ) with pytest.raises(TypeError, match=msg): method(*args, **kwargs) diff --git a/pandas/tests/strings/test_case_justify.py b/pandas/tests/strings/test_case_justify.py index 41aedae90ca76..819556f961fa3 100644 --- a/pandas/tests/strings/test_case_justify.py +++ b/pandas/tests/strings/test_case_justify.py @@ -291,11 +291,7 @@ def test_center_ljust_rjust_mixed_object(): def test_center_ljust_rjust_fillchar(any_string_dtype): - if any_string_dtype == "string[pyarrow_numpy]": - pytest.skip( - "Arrow logic is different, " - "see https://github.com/pandas-dev/pandas/pull/54533/files#r1299808126", - ) + # GH#54533, GH#54792 s = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype) result = s.str.center(5, fillchar="X") diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 3f58c6d703f8f..dfa9a36995480 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -4,7 +4,6 @@ import numpy as np import pytest -from pandas.errors import PerformanceWarning import pandas.util._test_decorators as td import pandas as pd @@ -14,7 +13,7 @@ ) from pandas.tests.strings import ( _convert_na_value, - object_pyarrow_numpy, + is_object_or_nan_string_dtype, ) # -------------------------------------------------------------------------------------- @@ -22,10 +21,6 @@ # -------------------------------------------------------------------------------------- -def using_pyarrow(dtype): - return dtype in ("string[pyarrow]", "string[pyarrow_numpy]") - - def test_contains(any_string_dtype): values = np.array( ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ @@ -34,18 +29,28 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" - expected = Series( - np.array([False, np.nan, True, True, False], dtype=np.object_), - dtype=expected_dtype, - ) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series([False, False, True, True, False], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series( + np.array([False, np.nan, True, True, False], dtype=np.object_), + dtype=expected_dtype, + ) + tm.assert_series_equal(result, expected) result = values.str.contains(pat, regex=False) - expected = Series( - np.array([False, np.nan, False, False, True], dtype=np.object_), - dtype=expected_dtype, - ) + if any_string_dtype == "str": + expected = Series([False, False, False, False, True], dtype=bool) + else: + expected = Series( + np.array([False, np.nan, False, False, True], dtype=np.object_), + dtype=expected_dtype, + ) tm.assert_series_equal(result, expected) values = Series( @@ -53,7 +58,9 @@ def test_contains(any_string_dtype): dtype=any_string_dtype, ) result = values.str.contains(pat) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -80,14 +87,22 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" - expected = Series( - np.array([False, np.nan, True, 
True], dtype=np.object_), dtype=expected_dtype - ) + if any_string_dtype == "str": + expected = Series([False, False, True, True], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series( + np.array([False, np.nan, True, True], dtype=np.object_), + dtype=expected_dtype, + ) tm.assert_series_equal(result, expected) result = values.str.contains(pat, na=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -159,7 +174,16 @@ def test_contains_na_kwarg_for_nullable_string_dtype( # https://github.com/pandas-dev/pandas/pull/41025#issuecomment-824062416 values = Series(["a", "b", "c", "a", np.nan], dtype=nullable_string_dtype) - result = values.str.contains("a", na=na, regex=regex) + + msg = ( + "Allowing a non-bool 'na' in obj.str.contains is deprecated and " + "will raise in a future version" + ) + warn = None + if not pd.isna(na) and not isinstance(na, bool): + warn = FutureWarning + with tm.assert_produces_warning(warn, match=msg): + result = values.str.contains("a", na=na, regex=regex) expected = Series([True, False, False, True, expected], dtype="boolean") tm.assert_series_equal(result, expected) @@ -172,37 +196,45 @@ def test_contains_moar(any_string_dtype): ) result = s.str.contains("a") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + na_value = False + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + na_value = np.nan expected = Series( - [False, False, False, True, True, False, np.nan, False, False, True], + [False, False, False, True, True, False, na_value, False, False, True], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("a", case=False) expected = Series( - [True, False, False, True, True, False, np.nan, True, False, True], + [True, False, False, True, True, False, na_value, True, False, True], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("Aa") expected = Series( - [False, False, False, True, False, False, np.nan, False, False, False], + [False, False, False, True, False, False, na_value, False, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("ba") expected = Series( - [False, False, False, True, False, False, np.nan, False, False, False], + [False, False, False, True, False, False, na_value, False, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("ba", case=False) expected = Series( - [False, False, False, True, True, False, np.nan, True, False, False], + [False, False, False, True, True, False, na_value, True, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) @@ -213,7 +245,9 @@ def test_contains_nan(any_string_dtype): s = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype) result = s.str.contains("foo", na=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([False, 
False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -221,18 +255,38 @@ def test_contains_nan(any_string_dtype): expected = Series([True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) - result = s.str.contains("foo", na="foo") - if any_string_dtype == "object": - expected = Series(["foo", "foo", "foo"], dtype=np.object_) - elif any_string_dtype == "string[pyarrow_numpy]": - expected = Series([True, True, True], dtype=np.bool_) - else: - expected = Series([True, True, True], dtype="boolean") - tm.assert_series_equal(result, expected) + # TODO(infer_string) + # this particular combination of events is broken on 2.3 + # would require cherry picking #58483, which in turn requires #57481 + # which introduce many behavioral changes + if not ( + hasattr(any_string_dtype, "storage") + and any_string_dtype.storage == "python" + and any_string_dtype.na_value is np.nan + ): + msg = ( + "Allowing a non-bool 'na' in obj.str.contains is deprecated and " + "will raise in a future version" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.str.contains("foo", na="foo") + if any_string_dtype == "object": + expected = Series(["foo", "foo", "foo"], dtype=np.object_) + elif any_string_dtype.na_value is np.nan: + expected = Series([True, True, True], dtype=np.bool_) + else: + expected = Series([True, True, True], dtype="boolean") + tm.assert_series_equal(result, expected) result = s.str.contains("foo") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" - expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series([False, False, False], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -241,11 +295,33 @@ def test_contains_nan(any_string_dtype): # -------------------------------------------------------------------------------------- +def test_startswith_endswith_validate_na(request, any_string_dtype): + if ( + any_string_dtype == "string" + and any_string_dtype.na_value is np.nan + and any_string_dtype.storage == "python" + ): + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) + # GH#59615 + ser = Series( + ["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"], + dtype=any_string_dtype, + ) + + msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str.startswith("kapow", na="baz") + msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str.endswith("bar", na="baz") + + +@pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) @pytest.mark.parametrize("dtype", ["object", "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) -def test_startswith(pat, dtype, null_value, na): +def test_startswith(pat, dtype, null_value, na, using_infer_string): # add category dtype parametrizations for GH-36241 values = Series( ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"], @@ -259,6 +335,8 @@ def test_startswith(pat, dtype, null_value, na): exp = exp.fillna(null_value) elif dtype == "object" and 
null_value is None: exp[exp.isna()] = None + elif using_infer_string and dtype == "category": + exp = exp.fillna(False).astype(bool) tm.assert_series_equal(result, exp) result = values.str.startswith(pat, na=na) @@ -276,20 +354,31 @@ def test_startswith(pat, dtype, null_value, na): @pytest.mark.parametrize("na", [None, True, False]) -def test_startswith_nullable_string_dtype(nullable_string_dtype, na): +def test_startswith_string_dtype(any_string_dtype, na): values = Series( ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."], - dtype=nullable_string_dtype, + dtype=any_string_dtype, ) result = values.str.startswith("foo", na=na) + + expected_dtype = ( + (object if na is None else bool) + if is_object_or_nan_string_dtype(any_string_dtype) + else "boolean" + ) + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + if na is None: + na = False exp = Series( - [False, na, True, False, False, na, True, False, False], dtype="boolean" + [False, na, True, False, False, na, True, False, False], dtype=expected_dtype ) tm.assert_series_equal(result, exp) result = values.str.startswith("rege.", na=na) exp = Series( - [False, na, False, False, False, na, False, False, True], dtype="boolean" + [False, na, False, False, False, na, False, False, True], dtype=expected_dtype ) tm.assert_series_equal(result, exp) @@ -299,11 +388,12 @@ def test_startswith_nullable_string_dtype(nullable_string_dtype, na): # -------------------------------------------------------------------------------------- +@pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) @pytest.mark.parametrize("dtype", ["object", "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) -def test_endswith(pat, dtype, null_value, na): +def test_endswith(pat, dtype, null_value, na, using_infer_string): # add category dtype parametrizations for GH-36241 values = Series( ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"], @@ -317,6 +407,8 @@ def test_endswith(pat, dtype, null_value, na): exp = exp.fillna(null_value) elif dtype == "object" and null_value is None: exp[exp.isna()] = None + elif using_infer_string and dtype == "category": + exp = exp.fillna(False).astype(bool) tm.assert_series_equal(result, exp) result = values.str.endswith(pat, na=na) @@ -334,20 +426,30 @@ def test_endswith(pat, dtype, null_value, na): @pytest.mark.parametrize("na", [None, True, False]) -def test_endswith_nullable_string_dtype(nullable_string_dtype, na): +def test_endswith_string_dtype(any_string_dtype, na): values = Series( ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."], - dtype=nullable_string_dtype, + dtype=any_string_dtype, ) result = values.str.endswith("foo", na=na) + expected_dtype = ( + (object if na is None else bool) + if is_object_or_nan_string_dtype(any_string_dtype) + else "boolean" + ) + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + if na is None: + na = False exp = Series( - [False, na, False, False, True, na, True, False, False], dtype="boolean" + [False, na, False, False, True, na, True, False, False], dtype=expected_dtype ) tm.assert_series_equal(result, exp) result = values.str.endswith("rege.", na=na) exp = Series( - [False, na, False, False, False, na, False, False, True], dtype="boolean" + [False, na, False, False, False, na, False, False, True], dtype=expected_dtype ) 
tm.assert_series_equal(result, exp) @@ -391,8 +493,7 @@ def test_replace_mixed_object(): def test_replace_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) + result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) tm.assert_series_equal(result, expected) @@ -412,8 +513,7 @@ def test_replace_callable(any_string_dtype): # test with callable repl = lambda m: m.group(0).swapcase() - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) + result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -431,10 +531,7 @@ def test_replace_callable_raises(any_string_dtype, repl): r"(?(3)required )positional arguments?" ) with pytest.raises(TypeError, match=msg): - with tm.maybe_produces_warning( - PerformanceWarning, using_pyarrow(any_string_dtype) - ): - values.str.replace("a", repl, regex=True) + values.str.replace("a", repl, regex=True) def test_replace_callable_named_groups(any_string_dtype): @@ -442,8 +539,7 @@ def test_replace_callable_named_groups(any_string_dtype): ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype) pat = r"(?P\w+) (?P\w+) (?P\w+)" repl = lambda m: m.group("middle").swapcase() - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(pat, repl, regex=True) + result = ser.str.replace(pat, repl, regex=True) expected = Series(["bAR", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -454,13 +550,11 @@ def test_replace_compiled_regex(any_string_dtype): # test with compiled regex pat = re.compile(r"BAD_*") - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(pat, "", regex=True) + result = ser.str.replace(pat, "", regex=True) expected = Series(["foobar", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(pat, "", n=1, regex=True) + result = ser.str.replace(pat, "", n=1, regex=True) expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -481,8 +575,7 @@ def test_replace_compiled_regex_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(pat, ", ", regex=True) + result = ser.str.replace(pat, ", ", regex=True) tm.assert_series_equal(result, expected) @@ -509,8 +602,7 @@ def test_replace_compiled_regex_callable(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) repl = lambda m: m.group(0).swapcase() pat = re.compile("[a-z][A-Z]{2}") - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(pat, repl, n=2, regex=True) + result = 
ser.str.replace(pat, repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -558,8 +650,7 @@ def test_replace_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace("A", "YYY", case=False) + result = ser.str.replace("A", "YYY", case=False) expected = Series( [ "YYY", @@ -577,8 +668,7 @@ def test_replace_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) + result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) expected = Series( [ "A", @@ -601,13 +691,11 @@ def test_replace_not_case_sensitive_not_regex(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/41602 ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace("a", "c", case=False, regex=False) + result = ser.str.replace("a", "c", case=False, regex=False) expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace("a.", "c.", case=False, regex=False) + result = ser.str.replace("a.", "c.", case=False, regex=False) expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -640,34 +728,41 @@ def test_replace_regex_single_character(regex, any_string_dtype): def test_match(any_string_dtype): - # New match behavior introduced in 0.13 - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + na_value = False + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + na_value = np.nan values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype) result = values.str.match(".*(BAD[_]+).*(BAD)") - expected = Series([True, np.nan, False], dtype=expected_dtype) + expected = Series([True, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) values = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = values.str.match(".*BAD[_]+.*BAD") - expected = Series([True, True, np.nan, False], dtype=expected_dtype) + expected = Series([True, True, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = values.str.match("BAD[_]+.*BAD") - expected = Series([False, True, np.nan, False], dtype=expected_dtype) + expected = Series([False, True, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) values = Series( ["fooBAD__barBAD", "^BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = values.str.match("^BAD[_]+.*BAD") - expected = Series([False, False, np.nan, False], dtype=expected_dtype) + expected = Series([False, False, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = values.str.match("\\^BAD[_]+.*BAD") - expected = Series([False, True, np.nan, False], dtype=expected_dtype) + expected = Series([False, True, na_value, False], 
dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -696,20 +791,33 @@ def test_match_na_kwarg(any_string_dtype): s = Series(["a", "b", np.nan], dtype=any_string_dtype) result = s.str.match("a", na=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = s.str.match("a") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" - expected = Series([True, False, np.nan], dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + na_value = False + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + na_value = np.nan + + expected = Series([True, False, na_value], dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_match_case_kwarg(any_string_dtype): values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) result = values.str.match("ab", case=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -725,8 +833,29 @@ def test_fullmatch(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" - expected = Series([True, False, np.nan, False], dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series([True, False, False, False], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series([True, False, np.nan, False], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + +def test_fullmatch_dollar_literal(any_string_dtype): + # GH 56652 + ser = Series(["foo", "foo$foo", np.nan, "foo$"], dtype=any_string_dtype) + result = ser.str.fullmatch("foo\\$") + if any_string_dtype == "str": + # NaN propagates as False + expected = Series([False, False, False, True], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series([False, False, np.nan, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -735,14 +864,18 @@ def test_fullmatch_na_kwarg(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_fullmatch_case_kwarg(any_string_dtype): ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, False, 
False], dtype=expected_dtype) @@ -754,8 +887,7 @@ def test_fullmatch_case_kwarg(any_string_dtype): result = ser.str.fullmatch("ab", case=False) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.fullmatch("ab", flags=re.IGNORECASE) + result = ser.str.fullmatch("ab", flags=re.IGNORECASE) tm.assert_series_equal(result, expected) @@ -814,7 +946,9 @@ def test_find(any_string_dtype): ser = Series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.int64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) result = ser.str.find("EF") expected = Series([4, 3, 1, 0, -1], dtype=expected_dtype) @@ -866,7 +1000,9 @@ def test_find_nan(any_string_dtype): ser = Series( ["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"], dtype=any_string_dtype ) - expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) result = ser.str.find("EF") expected = Series([4, np.nan, 1, np.nan, -1], dtype=expected_dtype) @@ -936,17 +1072,13 @@ def test_flags_kwarg(any_string_dtype): pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})" - use_pyarrow = using_pyarrow(any_string_dtype) - result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) assert result.iloc[0].tolist() == ["dave", "google", "com"] - with tm.maybe_produces_warning(PerformanceWarning, use_pyarrow): - result = data.str.match(pat, flags=re.IGNORECASE) + result = data.str.match(pat, flags=re.IGNORECASE) assert result.iloc[0] - with tm.maybe_produces_warning(PerformanceWarning, use_pyarrow): - result = data.str.fullmatch(pat, flags=re.IGNORECASE) + result = data.str.fullmatch(pat, flags=re.IGNORECASE) assert result.iloc[0] result = data.str.findall(pat, flags=re.IGNORECASE) @@ -956,8 +1088,6 @@ def test_flags_kwarg(any_string_dtype): assert result.iloc[0] == 1 msg = "has match groups" - with tm.assert_produces_warning( - UserWarning, match=msg, raise_on_extra_warnings=not use_pyarrow - ): + with tm.assert_produces_warning(UserWarning, match=msg): result = data.str.contains(pat, flags=re.IGNORECASE) assert result.iloc[0] diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 9ff1fc0e13ae9..423993e881b98 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -14,7 +14,7 @@ ) from pandas.tests.strings import ( _convert_na_value, - object_pyarrow_numpy, + is_object_or_nan_string_dtype, ) @@ -384,7 +384,7 @@ def test_split_nan_expand(any_string_dtype): # check that these are actually np.nan/pd.NA and not None # TODO see GH 18463 # tm.assert_frame_equal does not differentiate - if any_string_dtype in object_pyarrow_numpy: + if is_object_or_nan_string_dtype(any_string_dtype): assert all(np.isnan(x) for x in result.iloc[1]) else: assert all(x is pd.NA for x in result.iloc[1]) diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index 0b3f368afea5e..cd3c512328139 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -12,7 +12,6 @@ ) -@pytest.mark.filterwarnings("ignore:Falling back") def test_string_array(nullable_string_dtype, any_string_method): method_name, args, kwargs = 
any_string_method @@ -39,7 +38,7 @@ def test_string_array(nullable_string_dtype, any_string_method): expected.values, skipna=True ): assert result.dtype == "boolean" - result = result.astype(object) + expected = expected.astype("boolean") elif expected.dtype == "bool": assert result.dtype == "boolean" diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index f662dfd7e2b14..c729b910d05a7 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -14,7 +14,7 @@ ) import pandas._testing as tm from pandas.core.strings.accessor import StringMethods -from pandas.tests.strings import object_pyarrow_numpy +from pandas.tests.strings import is_object_or_nan_string_dtype @pytest.mark.parametrize("pattern", [0, True, Series(["foo", "bar"])]) @@ -41,7 +41,9 @@ def test_iter_raises(): def test_count(any_string_dtype): ser = Series(["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=any_string_dtype) result = ser.str.count("f[o]+") - expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) expected = Series([1, 2, np.nan, 4], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -93,7 +95,8 @@ def test_repeat_with_null(any_string_dtype, arg, repeat): def test_empty_str_methods(any_string_dtype): empty_str = empty = Series(dtype=any_string_dtype) - if any_string_dtype in object_pyarrow_numpy: + empty_inferred_str = Series(dtype="str") + if is_object_or_nan_string_dtype(any_string_dtype): empty_int = Series(dtype="int64") empty_bool = Series(dtype=bool) else: @@ -152,7 +155,7 @@ def test_empty_str_methods(any_string_dtype): tm.assert_series_equal(empty_str, empty.str.rstrip()) tm.assert_series_equal(empty_str, empty.str.wrap(42)) tm.assert_series_equal(empty_str, empty.str.get(0)) - tm.assert_series_equal(empty_object, empty_bytes.str.decode("ascii")) + tm.assert_series_equal(empty_inferred_str, empty_bytes.str.decode("ascii")) tm.assert_series_equal(empty_bytes, empty.str.encode("ascii")) # ismethods should always return boolean (GH 29624) tm.assert_series_equal(empty_bool, empty.str.isalnum()) @@ -207,14 +210,31 @@ def test_ismethods(method, expected, any_string_dtype): ser = Series( ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "], dtype=any_string_dtype ) - expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) # compare with standard library - expected = [getattr(item, method)() for item in ser] - assert list(result) == expected + expected_stdlib = [getattr(item, method)() for item in ser] + assert list(result) == expected_stdlib + + # with missing value + ser.iloc[[1, 2, 3, 4]] = np.nan + result = getattr(ser.str, method)() + if ser.dtype == "object": + expected = expected.astype(object) + expected.iloc[[1, 2, 3, 4]] = np.nan + elif ser.dtype == "str": + # NaN propagates as False + expected.iloc[[1, 2, 3, 4]] = False + else: + # nullable dtypes propagate NaN + expected.iloc[[1, 2, 3, 4]] = np.nan + + tm.assert_series_equal(result, expected)
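
The three branches above pin down how missing values flow through the `.str.is*` predicates: object dtype keeps `np.nan` (object result), the NaN-variant string dtype ("str") maps missing entries to `False` and returns plain `bool`, and the nullable dtypes keep `<NA>` in a `boolean` result. A minimal sketch of the first two cases, assuming pandas 2.3+ with the PDEP-14 string dtypes; names are illustrative only, not part of the patch:

```python
import numpy as np
import pandas as pd

# NaN-variant string dtype: missing entries come back as False, plain bool result
ser = pd.Series(["A", None, "7"], dtype=pd.StringDtype(na_value=np.nan))
print(ser.str.isdigit().tolist())  # [False, False, True]
print(ser.str.isdigit().dtype)     # bool

# NA-variant ("string"): missing entries stay <NA> in a nullable boolean result
ser_na = pd.Series(["A", None, "7"], dtype="string")
print(ser_na.str.isdigit().dtype)  # boolean
```
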
@pytest.mark.parametrize( "method, expected", [ @@ -232,7 +250,9 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): ser = Series( ["A", "3", "¼", "★", "፸", "3", "four"], dtype=any_string_dtype # noqa: RUF001 ) - expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -242,6 +262,7 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): assert list(result) == expected +@pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize( "method, expected", [ @@ -252,8 +273,14 @@ def test_isnumeric_unicode_missing(method, expected, any_string_dtype): values = ["A", np.nan, "¼", "★", np.nan, "3", "four"] # noqa: RUF001 ser = Series(values, dtype=any_string_dtype) - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" - expected = Series(expected, dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series(expected, dtype=object).fillna(False).astype(bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -283,7 +310,9 @@ def test_len(any_string_dtype): dtype=any_string_dtype, ) result = ser.str.len() - expected_dtype = "float64" if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + "float64" if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -312,7 +341,9 @@ def test_index(method, sub, start, end, index_or_series, any_string_dtype, expec obj = index_or_series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.int64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) expected = index_or_series(expected, dtype=expected_dtype) result = getattr(obj.str, method)(sub, start, end) @@ -353,7 +384,9 @@ def test_index_wrong_type_raises(index_or_series, any_string_dtype, method): ) def test_index_missing(any_string_dtype, method, exp): ser = Series(["abcb", "ab", "bcbe", np.nan], dtype=any_string_dtype) - expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) result = getattr(ser.str, method)("b") expected = Series(exp + [np.nan], dtype=expected_dtype) @@ -379,6 +412,7 @@ def test_pipe_failures(any_string_dtype): (2, 5, None, ["foo", "bar", np.nan, "baz"]), (0, 3, -1, ["", "", np.nan, ""]), (None, None, -1, ["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"]), + (None, 2, -1, ["owtoo", "owtra", np.nan, "xuqza"]), (3, 10, 2, ["oto", "ato", np.nan, "aqx"]), (3, 0, -1, ["ofa", "aba", np.nan, "aba"]), ], @@ -531,7 +565,7 @@ def test_string_slice_out_of_bounds(any_string_dtype): def test_encode_decode(any_string_dtype): ser = Series(["a", "b", "a\xe4"], dtype=any_string_dtype).str.encode("utf-8") result = ser.str.decode("utf-8") - expected = ser.map(lambda x: x.decode("utf-8")).astype(object) + expected = Series(["a", "b", "a\xe4"], dtype="str") tm.assert_series_equal(result, expected) @@ -561,10 +595,34 @@ def test_decode_errors_kwarg(): ser.str.decode("cp1252") result = 
ser.str.decode("cp1252", "ignore") - expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype(object) + expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype("str") + tm.assert_series_equal(result, expected) + + +def test_decode_string_dtype(string_dtype): + # https://github.com/pandas-dev/pandas/pull/60940 + ser = Series([b"a", b"b"]) + result = ser.str.decode("utf-8", dtype=string_dtype) + expected = Series(["a", "b"], dtype=string_dtype) tm.assert_series_equal(result, expected) +def test_decode_object_dtype(object_dtype): + # https://github.com/pandas-dev/pandas/pull/60940 + ser = Series([b"a", rb"\ud800"]) + result = ser.str.decode("utf-8", dtype=object_dtype) + expected = Series(["a", r"\ud800"], dtype=object_dtype) + tm.assert_series_equal(result, expected) + + +def test_decode_bad_dtype(): + # https://github.com/pandas-dev/pandas/pull/60940 + ser = Series([b"a", b"b"]) + msg = "dtype must be string or object, got dtype='int64'" + with pytest.raises(ValueError, match=msg): + ser.str.decode("utf-8", dtype="int64") + + @pytest.mark.parametrize( "form, expected", [ @@ -716,5 +774,5 @@ def test_get_with_dict_label(): def test_series_str_decode(): # GH 22613 result = Series([b"x", b"y"]).str.decode(encoding="UTF-8", errors="strict") - expected = Series(["x", "y"], dtype="object") + expected = Series(["x", "y"], dtype="str") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 718d1b3ee2e83..80ee0f6e067f9 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import ( algos as libalgos, hashtable as ht, @@ -63,6 +65,7 @@ def test_factorize_complex(self): expected_uniques = np.array([(1 + 0j), (2 + 0j), (2 + 1j)], dtype=object) tm.assert_numpy_array_equal(uniques, expected_uniques) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("sort", [True, False]) def test_factorize(self, index_or_series_obj, sort): obj = index_or_series_obj @@ -1280,7 +1283,7 @@ def test_value_counts_nat(self): result_dt = algos.value_counts(dt) tm.assert_series_equal(result_dt, exp_dt) - exp_td = Series({np.timedelta64(10000): 1}, name="count") + exp_td = Series([1], index=[np.timedelta64(10000)], name="count") with tm.assert_produces_warning(FutureWarning, match=msg): result_td = algos.value_counts(td) tm.assert_series_equal(result_td, exp_td) @@ -1704,8 +1707,14 @@ class TestHashTable: @pytest.mark.parametrize( "htable, data", [ - (ht.PyObjectHashTable, [f"foo_{i}" for i in range(1000)]), - (ht.StringHashTable, [f"foo_{i}" for i in range(1000)]), + ( + ht.PyObjectHashTable, + np.array([f"foo_{i}" for i in range(1000)], dtype=object), + ), + ( + ht.StringHashTable, + np.array([f"foo_{i}" for i in range(1000)], dtype=object), + ), (ht.Float64HashTable, np.arange(1000, dtype=np.float64)), (ht.Int64HashTable, np.arange(1000, dtype=np.int64)), (ht.UInt64HashTable, np.arange(1000, dtype=np.uint64)), @@ -1713,7 +1722,7 @@ class TestHashTable: ) def test_hashtable_unique(self, htable, data, writable): # output of maker has guaranteed unique elements - s = Series(data) + s = Series(data, dtype=data.dtype) if htable == ht.Float64HashTable: # add NaN for float column s.loc[500] = np.nan @@ -1743,8 +1752,14 @@ def test_hashtable_unique(self, htable, data, writable): @pytest.mark.parametrize( "htable, data", [ - (ht.PyObjectHashTable, [f"foo_{i}" for i 
in range(1000)]), - (ht.StringHashTable, [f"foo_{i}" for i in range(1000)]), + ( + ht.PyObjectHashTable, + np.array([f"foo_{i}" for i in range(1000)], dtype=object), + ), + ( + ht.StringHashTable, + np.array([f"foo_{i}" for i in range(1000)], dtype=object), + ), (ht.Float64HashTable, np.arange(1000, dtype=np.float64)), (ht.Int64HashTable, np.arange(1000, dtype=np.int64)), (ht.UInt64HashTable, np.arange(1000, dtype=np.uint64)), @@ -1752,7 +1767,7 @@ def test_hashtable_unique(self, htable, data, writable): ) def test_hashtable_factorize(self, htable, writable, data): # output of maker has guaranteed unique elements - s = Series(data) + s = Series(data, dtype=data.dtype) if htable == ht.Float64HashTable: # add NaN for float column s.loc[500] = np.nan @@ -1896,13 +1911,16 @@ def test_strobj_mode(self): tm.assert_series_equal(ser.mode(), exp) @pytest.mark.parametrize("dt", [str, object]) - def test_strobj_multi_char(self, dt): + def test_strobj_multi_char(self, dt, using_infer_string): exp = ["bar"] data = ["foo"] * 2 + ["bar"] * 3 ser = Series(data, dtype=dt) exp = Series(exp, dtype=dt) - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + if using_infer_string and dt is str: + tm.assert_extension_array_equal(algos.mode(ser.values), exp.values) + else: + tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) tm.assert_series_equal(ser.mode(), exp) def test_datelike_mode(self): diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 51ce73ef54300..d448773c3bd4a 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -23,6 +23,7 @@ DatetimeArray, TimedeltaArray, ) +from pandas.util.version import Version @pytest.fixture @@ -223,7 +224,7 @@ def test_missing_required_dependency(): assert name in output -def test_frame_setitem_dask_array_into_new_col(): +def test_frame_setitem_dask_array_into_new_col(request): # GH#47128 # dask sets "compute.use_numexpr" to False, so catch the current value @@ -231,7 +232,14 @@ def test_frame_setitem_dask_array_into_new_col(): olduse = pd.get_option("compute.use_numexpr") try: + dask = pytest.importorskip("dask") da = pytest.importorskip("dask.array") + if Version(dask.__version__) <= Version("2025.1.0") and Version( + np.__version__ + ) >= Version("2.1"): + request.applymarker( + pytest.mark.xfail(reason="loc.__setitem__ incorrectly mutated column c") + ) dda = da.array([1, 2]) df = DataFrame({"a": ["a", "b"]}) diff --git a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py index c1d1948d6c31a..52b5f636b1254 100644 --- a/pandas/tests/test_optional_dependency.py +++ b/pandas/tests/test_optional_dependency.py @@ -50,6 +50,20 @@ def test_bad_version(monkeypatch): result = import_optional_dependency("fakemodule") assert result is module + with pytest.raises(ImportError, match="Pandas requires version '1.1.0'"): + import_optional_dependency("fakemodule", min_version="1.1.0") + + with tm.assert_produces_warning(UserWarning): + result = import_optional_dependency( + "fakemodule", errors="warn", min_version="1.1.0" + ) + assert result is None + + result = import_optional_dependency( + "fakemodule", errors="ignore", min_version="1.1.0" + ) + assert result is None + def test_submodule(monkeypatch): # Create a fake module with a submodule diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 6791ac0340640..e7e8f3ac63cd1 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ 
-1207,10 +1207,8 @@ def test_out_of_bounds_errors_ignore2(self): # GH#12424 msg = "errors='ignore' is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): - res = to_datetime( - Series(["2362-01-01", np.nan], dtype=object), errors="ignore" - ) - exp = Series(["2362-01-01", np.nan], dtype=object) + res = to_datetime(Series(["2362-01-01", np.nan]), errors="ignore") + exp = Series(["2362-01-01", np.nan]) tm.assert_series_equal(res, exp) def test_to_datetime_tz(self, cache): @@ -1494,7 +1492,9 @@ def test_datetime_invalid_index(self, values, format): warn, match="Could not infer format", raise_on_extra_warnings=False ): res = to_datetime(values, errors="ignore", format=format) - tm.assert_index_equal(res, Index(values, dtype=object)) + tm.assert_index_equal( + res, Index(values, dtype="object" if format is None else "str") + ) with tm.assert_produces_warning( warn, match="Could not infer format", raise_on_extra_warnings=False @@ -1912,6 +1912,14 @@ def test_unit(self, cache): with pytest.raises(ValueError, match=msg): to_datetime([1], unit="D", format="%Y%m%d", cache=cache) + def test_unit_str(self, cache): + # GH 57051 + # Test that strs aren't dropping precision to 32-bit accidentally. + with tm.assert_produces_warning(FutureWarning): + res = to_datetime(["1704660000"], unit="s", origin="unix") + expected = to_datetime([1704660000], unit="s", origin="unix") + tm.assert_index_equal(res, expected) + def test_unit_array_mixed_nans(self, cache): values = [11111111111111111, 1, 1.0, iNaT, NaT, np.nan, "NaT", ""] result = to_datetime(values, unit="D", errors="ignore", cache=cache) @@ -3399,7 +3407,18 @@ def test_invalid_origin(self, unit): with pytest.raises(ValueError, match=msg): to_datetime("2005-01-01", origin="1960-01-01", unit=unit) - def test_epoch(self, units, epochs, epoch_1960, units_from_epochs): + @pytest.mark.parametrize( + "epochs", + [ + Timestamp(1960, 1, 1), + datetime(1960, 1, 1), + "1960-01-01", + np.datetime64("1960-01-01"), + ], + ) + def test_epoch(self, units, epochs): + epoch_1960 = Timestamp(1960, 1, 1) + units_from_epochs = np.arange(5, dtype=np.int64) expected = Series( [pd.Timedelta(x, unit=units) + epoch_1960 for x in units_from_epochs] ) @@ -3696,7 +3715,7 @@ def test_to_datetime_mixed_not_necessarily_iso8601_raise(): ("errors", "expected"), [ ("coerce", DatetimeIndex(["2020-01-01 00:00:00", NaT])), - ("ignore", Index(["2020-01-01", "01-01-2000"], dtype=object)), + ("ignore", Index(["2020-01-01", "01-01-2000"], dtype="str")), ], ) def test_to_datetime_mixed_not_necessarily_iso8601_coerce(errors, expected): diff --git a/pandas/tests/tseries/offsets/conftest.py b/pandas/tests/tseries/offsets/conftest.py deleted file mode 100644 index 2fc846353dcb5..0000000000000 --- a/pandas/tests/tseries/offsets/conftest.py +++ /dev/null @@ -1,13 +0,0 @@ -import datetime - -import pytest - -from pandas._libs.tslibs import Timestamp - - -@pytest.fixture -def dt(): - """ - Fixture for common Timestamp. 
- """ - return Timestamp(datetime.datetime(2008, 1, 2)) diff --git a/pandas/tests/tseries/offsets/test_business_quarter.py b/pandas/tests/tseries/offsets/test_business_quarter.py index 44a7f16ab039d..6d7a115054b7f 100644 --- a/pandas/tests/tseries/offsets/test_business_quarter.py +++ b/pandas/tests/tseries/offsets/test_business_quarter.py @@ -9,6 +9,7 @@ import pytest +import pandas._testing as tm from pandas.tests.tseries.offsets.common import ( assert_is_on_offset, assert_offset_equal, @@ -54,9 +55,12 @@ def test_repr(self): assert repr(BQuarterBegin(startingMonth=1)) == expected def test_is_anchored(self): - assert BQuarterBegin(startingMonth=1).is_anchored() - assert BQuarterBegin().is_anchored() - assert not BQuarterBegin(2, startingMonth=1).is_anchored() + msg = "BQuarterBegin.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert BQuarterBegin(startingMonth=1).is_anchored() + assert BQuarterBegin().is_anchored() + assert not BQuarterBegin(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner @@ -177,9 +181,12 @@ def test_repr(self): assert repr(BQuarterEnd(startingMonth=1)) == expected def test_is_anchored(self): - assert BQuarterEnd(startingMonth=1).is_anchored() - assert BQuarterEnd().is_anchored() - assert not BQuarterEnd(2, startingMonth=1).is_anchored() + msg = "BQuarterEnd.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert BQuarterEnd(startingMonth=1).is_anchored() + assert BQuarterEnd().is_anchored() + assert not BQuarterEnd(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner diff --git a/pandas/tests/tseries/offsets/test_common.py b/pandas/tests/tseries/offsets/test_common.py index 5b80b8b1c4ab4..aa4e22f71ad66 100644 --- a/pandas/tests/tseries/offsets/test_common.py +++ b/pandas/tests/tseries/offsets/test_common.py @@ -250,7 +250,8 @@ def test_sub(date, offset_box, offset2): [BusinessHour, BusinessHour()], ], ) -def test_Mult1(offset_box, offset1, dt): +def test_Mult1(offset_box, offset1): + dt = Timestamp(2008, 1, 2) assert dt + 10 * offset1 == dt + offset_box(10) assert dt + 5 * offset1 == dt + offset_box(5) diff --git a/pandas/tests/tseries/offsets/test_fiscal.py b/pandas/tests/tseries/offsets/test_fiscal.py index 7f8c34bc6832e..824e66a1ddef1 100644 --- a/pandas/tests/tseries/offsets/test_fiscal.py +++ b/pandas/tests/tseries/offsets/test_fiscal.py @@ -7,6 +7,7 @@ import pytest from pandas import Timestamp +import pandas._testing as tm from pandas.tests.tseries.offsets.common import ( WeekDay, assert_is_on_offset, @@ -295,15 +296,18 @@ def test_apply(self): class TestFY5253LastOfMonthQuarter: def test_is_anchored(self): - assert makeFY5253LastOfMonthQuarter( - startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 - ).is_anchored() - assert makeFY5253LastOfMonthQuarter( - weekday=WeekDay.SAT, startingMonth=3, qtr_with_extra_week=4 - ).is_anchored() - assert not makeFY5253LastOfMonthQuarter( - 2, startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 - ).is_anchored() + msg = "FY5253Quarter.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ).is_anchored() + assert makeFY5253LastOfMonthQuarter( + weekday=WeekDay.SAT, startingMonth=3, qtr_with_extra_week=4 + ).is_anchored() + assert not makeFY5253LastOfMonthQuarter( + 2, startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + 
).is_anchored() def test_equality(self): assert makeFY5253LastOfMonthQuarter( diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index ddf56e68b1611..62afb8b83d576 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -625,8 +625,11 @@ def test_default_constructor(self, dt): assert (dt + DateOffset(2)) == datetime(2008, 1, 4) def test_is_anchored(self): - assert not DateOffset(2).is_anchored() - assert DateOffset(1).is_anchored() + msg = "DateOffset.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert not DateOffset(2).is_anchored() + assert DateOffset(1).is_anchored() def test_copy(self): assert DateOffset(months=2).copy() == DateOffset(months=2) diff --git a/pandas/tests/tseries/offsets/test_quarter.py b/pandas/tests/tseries/offsets/test_quarter.py index d183645da507d..5fd3ba0a5fb87 100644 --- a/pandas/tests/tseries/offsets/test_quarter.py +++ b/pandas/tests/tseries/offsets/test_quarter.py @@ -9,6 +9,7 @@ import pytest +import pandas._testing as tm from pandas.tests.tseries.offsets.common import ( assert_is_on_offset, assert_offset_equal, @@ -53,9 +54,12 @@ def test_repr(self): assert repr(QuarterBegin(startingMonth=1)) == expected def test_is_anchored(self): - assert QuarterBegin(startingMonth=1).is_anchored() - assert QuarterBegin().is_anchored() - assert not QuarterBegin(2, startingMonth=1).is_anchored() + msg = "QuarterBegin.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert QuarterBegin(startingMonth=1).is_anchored() + assert QuarterBegin().is_anchored() + assert not QuarterBegin(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner @@ -161,9 +165,12 @@ def test_repr(self): assert repr(QuarterEnd(startingMonth=1)) == expected def test_is_anchored(self): - assert QuarterEnd(startingMonth=1).is_anchored() - assert QuarterEnd().is_anchored() - assert not QuarterEnd(2, startingMonth=1).is_anchored() + msg = "QuarterEnd.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert QuarterEnd(startingMonth=1).is_anchored() + assert QuarterEnd().is_anchored() + assert not QuarterEnd(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner diff --git a/pandas/tests/tseries/offsets/test_ticks.py b/pandas/tests/tseries/offsets/test_ticks.py index b68b91826bc6f..399b7038d3426 100644 --- a/pandas/tests/tseries/offsets/test_ticks.py +++ b/pandas/tests/tseries/offsets/test_ticks.py @@ -339,7 +339,10 @@ def test_tick_equalities(cls): @pytest.mark.parametrize("cls", tick_classes) def test_tick_offset(cls): - assert not cls().is_anchored() + msg = f"{cls.__name__}.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert not cls().is_anchored() @pytest.mark.parametrize("cls", tick_classes) diff --git a/pandas/tests/tseries/offsets/test_week.py b/pandas/tests/tseries/offsets/test_week.py index f42ff091af277..0cd6f769769ae 100644 --- a/pandas/tests/tseries/offsets/test_week.py +++ b/pandas/tests/tseries/offsets/test_week.py @@ -21,6 +21,7 @@ WeekOfMonth, ) +import pandas._testing as tm from pandas.tests.tseries.offsets.common import ( WeekDay, assert_is_on_offset, @@ -42,10 +43,13 @@ def test_corner(self): Week(weekday=-1) def test_is_anchored(self): - assert Week(weekday=0).is_anchored() - assert not Week().is_anchored() - assert not Week(2, weekday=2).is_anchored() - 
assert not Week(2).is_anchored() + msg = "Week.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert Week(weekday=0).is_anchored() + assert not Week().is_anchored() + assert not Week(2, weekday=2).is_anchored() + assert not Week(2).is_anchored() offset_cases = [] # not business week diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 632d3b4cc3c84..82175c67764f8 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -296,6 +296,23 @@ def test_to_datetime_barely_out_of_bounds(): tslib.array_to_datetime(arr) +@pytest.mark.parametrize( + "timestamp", + [ + # Close enough to bounds that scaling micros to nanos overflows + # but adding nanos would result in an in-bounds datetime. + "1677-09-21T00:12:43.145224193", + "1677-09-21T00:12:43.145224999", + # this always worked + "1677-09-21T00:12:43.145225000", + ], +) +def test_to_datetime_barely_inside_bounds(timestamp): + # see gh-57150 + result, _ = tslib.array_to_datetime(np.array([timestamp], dtype=object)) + tm.assert_numpy_array_equal(result, np.array([timestamp], dtype="M8[ns]")) + + class SubDatetime(datetime): pass diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index d8f23156bd4d4..fb05a57056a83 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -17,6 +17,7 @@ from pandas._libs.tslibs.parsing import parse_datetime_string_with_reso from pandas.compat import ( ISMUSL, + is_platform_arm, is_platform_windows, ) import pandas.util._test_decorators as td @@ -26,7 +27,7 @@ @pytest.mark.skipif( - is_platform_windows() or ISMUSL, + is_platform_windows() or ISMUSL or is_platform_arm(), reason="TZ setting incorrect on Windows and MUSL Linux", ) def test_parsing_tzlocal_deprecated(): diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py index ef68408305232..8ca55648f3780 100644 --- a/pandas/tests/tslibs/test_to_offset.py +++ b/pandas/tests/tslibs/test_to_offset.py @@ -45,6 +45,7 @@ def test_to_offset_negative(freqstr, expected): assert result.n == expected +@pytest.mark.filterwarnings("ignore:.*'m' is deprecated.*:FutureWarning") @pytest.mark.parametrize( "freqstr", [ @@ -172,3 +173,51 @@ def test_to_offset_pd_timedelta(kwargs, expected): def test_anchored_shortcuts(shortcut, expected): result = to_offset(shortcut) assert result == expected + + +@pytest.mark.parametrize( + "freq_depr", + [ + "2ye-mar", + "2ys", + "2qe", + "2qs-feb", + "2bqs", + "2sms", + "2bms", + "2cbme", + "2me", + "2w", + ], +) +def test_to_offset_lowercase_frequency_deprecated(freq_depr): + # GH#54939 + depr_msg = ( + f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version, please use '{freq_depr.upper()[1:]}' instead." + ) + + with pytest.raises(FutureWarning, match=depr_msg): + to_offset(freq_depr) + + +@pytest.mark.parametrize( + "freq_depr", + [ + "2H", + "2BH", + "2MIN", + "2S", + "2Us", + "2NS", + ], +) +def test_to_offset_uppercase_frequency_deprecated(freq_depr): + # GH#54939 + depr_msg = ( + f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version, please use '{freq_depr.lower()[1:]}' instead." + )
+ + with pytest.raises(FutureWarning, match=depr_msg): + to_offset(freq_depr) diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index a074898f6046d..dd5218ab9404f 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -111,7 +111,7 @@ def test_empty_dtypes(check_dtype): @pytest.mark.parametrize("check_like", [True, False]) def test_frame_equal_index_mismatch(check_like, obj_fixture, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" msg = f"""{obj_fixture}\\.index are different @@ -131,7 +131,7 @@ def test_frame_equal_index_mismatch(check_like, obj_fixture, using_infer_string) @pytest.mark.parametrize("check_like", [True, False]) def test_frame_equal_columns_mismatch(check_like, obj_fixture, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" msg = f"""{obj_fixture}\\.columns are different @@ -211,10 +211,7 @@ def test_assert_frame_equal_extension_dtype_mismatch(): "\\[right\\]: int[32|64]" ) - # TODO: this shouldn't raise (or should raise a better error message) - # https://github.com/pandas-dev/pandas/issues/56131 - with pytest.raises(AssertionError, match="classes are different"): - tm.assert_frame_equal(left, right, check_dtype=False) + tm.assert_frame_equal(left, right, check_dtype=False) with pytest.raises(AssertionError, match=msg): tm.assert_frame_equal(left, right, check_dtype=True) @@ -246,7 +243,6 @@ def test_assert_frame_equal_ignore_extension_dtype_mismatch(): tm.assert_frame_equal(left, right, check_dtype=False) -@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/56131") def test_assert_frame_equal_ignore_extension_dtype_mismatch_cross_class(): # https://github.com/pandas-dev/pandas/issues/35715 left = DataFrame({"a": [1, 2, 3]}, dtype="Int64") @@ -300,9 +296,7 @@ def test_frame_equal_mixed_dtypes(frame_or_series, any_numeric_ea_dtype, indexer dtypes = (any_numeric_ea_dtype, "int64") obj1 = frame_or_series([1, 2], dtype=dtypes[indexer[0]]) obj2 = frame_or_series([1, 2], dtype=dtypes[indexer[1]]) - msg = r'(Series|DataFrame.iloc\[:, 0\] \(column name="0"\) classes) are different' - with pytest.raises(AssertionError, match=msg): - tm.assert_equal(obj1, obj2, check_exact=True, check_dtype=False) + tm.assert_equal(obj1, obj2, check_exact=True, check_dtype=False) def test_assert_frame_equal_check_like_different_indexes(): diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py index dc6efdcec380e..ab52d6c8e9f39 100644 --- a/pandas/tests/util/test_assert_index_equal.py +++ b/pandas/tests/util/test_assert_index_equal.py @@ -207,7 +207,7 @@ def test_index_equal_names(name1, name2): def test_index_equal_category_mismatch(check_categorical, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" msg = f"""Index are different diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index f722f619bc456..0d56885a1cb84 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -221,9 +221,9 @@ def test_series_equal_categorical_values_mismatch(rtol, using_infer_string): Series values are different \\(66\\.66667 %\\) \\[index\\]: \\[0, 1, 2\\] \\[left\\]: \\['a', 'b', 'c'\\] -Categories \\(3, string\\): \\[a, b, c\\] +Categories \\(3, str\\): \\[a, b, c\\] 
\\[right\\]: \\['a', 'c', 'b'\\] -Categories \\(3, string\\): \\[a, b, c\\]""" +Categories \\(3, str\\): \\[a, b, c\\]""" else: msg = """Series are different @@ -258,7 +258,7 @@ def test_series_equal_datetime_values_mismatch(rtol): def test_series_equal_categorical_mismatch(check_categorical, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" msg = f"""Attributes of Series are different @@ -290,10 +290,7 @@ def test_assert_series_equal_extension_dtype_mismatch(): \\[left\\]: Int64 \\[right\\]: int[32|64]""" - # TODO: this shouldn't raise (or should raise a better error message) - # https://github.com/pandas-dev/pandas/issues/56131 - with pytest.raises(AssertionError, match="Series classes are different"): - tm.assert_series_equal(left, right, check_dtype=False) + tm.assert_series_equal(left, right, check_dtype=False) with pytest.raises(AssertionError, match=msg): tm.assert_series_equal(left, right, check_dtype=True) @@ -372,7 +369,6 @@ def test_assert_series_equal_ignore_extension_dtype_mismatch(): tm.assert_series_equal(left, right, check_dtype=False) -@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/56131") def test_assert_series_equal_ignore_extension_dtype_mismatch_cross_class(): # https://github.com/pandas-dev/pandas/issues/35715 left = Series([1, 2, 3], dtype="Int64") @@ -456,3 +452,33 @@ def test_large_unequal_ints(dtype): right = Series([1577840521123543], dtype=dtype) with pytest.raises(AssertionError, match="Series are different"): tm.assert_series_equal(left, right) + + +@pytest.mark.parametrize("dtype", [None, object]) +@pytest.mark.parametrize("check_exact", [True, False]) +@pytest.mark.parametrize("val", [3, 3.5]) +def test_ea_and_numpy_no_dtype_check(val, check_exact, dtype): + # GH#56651 + left = Series([1, 2, val], dtype=dtype) + right = Series(pd.array([1, 2, val])) + tm.assert_series_equal(left, right, check_dtype=False, check_exact=check_exact) + + +def test_assert_series_equal_int_tol(): + # GH#56646 + left = Series([81, 18, 121, 38, 74, 72, 81, 81, 146, 81, 81, 170, 74, 74]) + right = Series([72, 9, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72]) + tm.assert_series_equal(left, right, rtol=1.5) + + tm.assert_frame_equal(left.to_frame(), right.to_frame(), rtol=1.5) + tm.assert_extension_array_equal( + left.astype("Int64").values, right.astype("Int64").values, rtol=1.5 + ) + + +def test_assert_series_equal_index_exact_default(): + # GH#57067 + ser1 = Series(np.zeros(6, dtype=int), [0, 0.2, 0.4, 0.6, 0.8, 1]) + ser2 = Series(np.zeros(6, dtype=int), np.linspace(0, 1, 6)) + tm.assert_series_equal(ser1, ser2) + tm.assert_frame_equal(ser1.to_frame(), ser2.to_frame()) diff --git a/pandas/tests/util/test_shares_memory.py b/pandas/tests/util/test_shares_memory.py index 00a897d574a07..8f1ac93b40247 100644 --- a/pandas/tests/util/test_shares_memory.py +++ b/pandas/tests/util/test_shares_memory.py @@ -1,3 +1,5 @@ +import numpy as np + import pandas.util._test_decorators as td import pandas as pd @@ -20,10 +22,10 @@ def test_shares_memory_string(): # GH#55823 import pyarrow as pa - obj = pd.array(["a", "b"], dtype="string[pyarrow]") + obj = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=pd.NA)) assert tm.shares_memory(obj, obj) - obj = pd.array(["a", "b"], dtype="string[pyarrow_numpy]") + obj = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=np.nan)) assert tm.shares_memory(obj, obj) obj = pd.array(["a", "b"], dtype=pd.ArrowDtype(pa.string())) diff --git 
a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index fe2da210c6fe9..948565be36b5b 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -71,7 +71,7 @@ def test_sum_object_str_raises(step): df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) r = df.rolling(window=3, step=step) with pytest.raises( - DataError, match="Cannot aggregate non-numeric type: object|string" + DataError, match="Cannot aggregate non-numeric type: object|str" ): # GH#42738, enforced in 2.0 r.sum() diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index b1cc7ec186f19..9ee7ed0c2f3e6 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm from pandas.errors import NumbaUtilError import pandas.util._test_decorators as td @@ -11,8 +12,17 @@ to_datetime, ) import pandas._testing as tm +from pandas.util.version import Version -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] + +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) @pytest.fixture(params=["single", "table"]) @@ -446,3 +456,10 @@ def test_table_method_ewm(self, data, method, axis, nogil, parallel, nopython): engine_kwargs=engine_kwargs, engine="numba" ) tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("numba") +def test_npfunc_no_warnings(): + df = DataFrame({"col1": [1, 2, 3, 4, 5]}) + with tm.assert_produces_warning(False): + df.col1.rolling(2).apply(np.prod, raw=True, engine="numba") diff --git a/pandas/tests/window/test_online.py b/pandas/tests/window/test_online.py index 14d3a39107bc4..43d55a7992b3c 100644 --- a/pandas/tests/window/test_online.py +++ b/pandas/tests/window/test_online.py @@ -1,15 +1,24 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm + from pandas import ( DataFrame, Series, ) import pandas._testing as tm +from pandas.util.version import Version -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] -pytest.importorskip("numba") +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) @pytest.mark.filterwarnings("ignore") diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index c99fc8a8eb60f..bd0fadeb3e475 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -1,9 +1,12 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, DatetimeIndex, + Index, MultiIndex, NaT, Series, @@ -697,3 +700,16 @@ def test_nat_axis_error(msg, axis): with pytest.raises(ValueError, match=f"{msg} values must not have NaT"): with tm.assert_produces_warning(FutureWarning, match=warn_msg): df.rolling("D", axis=axis).mean() + + +@td.skip_if_no("pyarrow") +def test_arrow_datetime_axis(): + # GH 55849 + expected = Series( + np.arange(5, dtype=np.float64), + index=Index( + date_range("2020-01-01", periods=5), dtype="timestamp[ns][pyarrow]" + ), + ) + result = expected.rolling("1D").sum() + tm.assert_series_equal(result, expected) diff --git 
a/pandas/util/_exceptions.py b/pandas/util/_exceptions.py index 573f76a63459b..5f50838d37315 100644 --- a/pandas/util/_exceptions.py +++ b/pandas/util/_exceptions.py @@ -9,6 +9,7 @@ if TYPE_CHECKING: from collections.abc import Generator + from types import FrameType @contextlib.contextmanager @@ -42,15 +43,20 @@ def find_stack_level() -> int: test_dir = os.path.join(pkg_dir, "tests") # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow - frame = inspect.currentframe() - n = 0 - while frame: - fname = inspect.getfile(frame) - if fname.startswith(pkg_dir) and not fname.startswith(test_dir): - frame = frame.f_back - n += 1 - else: - break + frame: FrameType | None = inspect.currentframe() + try: + n = 0 + while frame: + filename = inspect.getfile(frame) + if filename.startswith(pkg_dir) and not filename.startswith(test_dir): + frame = frame.f_back + n += 1 + else: + break + finally: + # See note in + # https://docs.python.org/3/library/inspect.html#inspect.Traceback + del frame return n diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index e39c2f7badb1d..4ede5627c28b9 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -45,7 +45,7 @@ def _get_sys_info() -> dict[str, JSONSerializable]: language_code, encoding = locale.getlocale() return { "commit": _get_commit_hash(), - "python": ".".join([str(i) for i in sys.version_info]), + "python": platform.python_version(), "python-bits": struct.calcsize("P") * 8, "OS": uname_result.system, "OS-release": uname_result.release, @@ -70,33 +70,25 @@ def _get_dependency_info() -> dict[str, JSONSerializable]: "pytz", "dateutil", # install / build, - "setuptools", "pip", "Cython", - # test - "pytest", - "hypothesis", # docs "sphinx", - # Other, need a min version - "blosc", - "feather", - "xlsxwriter", - "lxml.etree", - "html5lib", - "pymysql", - "psycopg2", - "jinja2", # Other, not imported. "IPython", - "pandas_datareader", ] + # Optional dependencies deps.extend(list(VERSIONS)) result: dict[str, JSONSerializable] = {} for modname in deps: - mod = import_optional_dependency(modname, errors="ignore") - result[modname] = get_version(mod) if mod else None + try: + mod = import_optional_dependency(modname, errors="ignore") + except Exception: + # Dependency conflicts may cause a non ImportError + result[modname] = "N/A" + else: + result[modname] = get_version(mod) if mod else None return result diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index a47f622216ef7..cb0b4d549f49e 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -26,7 +26,7 @@ BoolishNoneT = TypeVar("BoolishNoneT", bool, int, None) -def _check_arg_length(fname, args, max_fname_arg_count, compat_args): +def _check_arg_length(fname, args, max_fname_arg_count, compat_args) -> None: """ Checks whether 'args' has length of at most 'compat_args'. Raises a TypeError if that is not the case, similar to in Python when a @@ -46,7 +46,7 @@ def _check_arg_length(fname, args, max_fname_arg_count, compat_args): ) -def _check_for_default_values(fname, arg_val_dict, compat_args): +def _check_for_default_values(fname, arg_val_dict, compat_args) -> None: """ Check that the keys in `arg_val_dict` are mapped to their default values as specified in `compat_args`. 
@@ -125,7 +125,7 @@ def validate_args(fname, args, max_fname_arg_count, compat_args) -> None: _check_for_default_values(fname, kwargs, compat_args) -def _check_for_invalid_keys(fname, kwargs, compat_args): +def _check_for_invalid_keys(fname, kwargs, compat_args) -> None: """ Checks whether 'kwargs' contains any keys that are not in 'compat_args' and raises a TypeError if there is one. diff --git a/pyproject.toml b/pyproject.toml index 5e65edf81f9c7..9f2c7c0c56295 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,16 +2,13 @@ # Minimum requirements for the build system to execute. # See https://github.com/scipy/scipy/pull/12940 for the AIX issue. requires = [ - "meson-python==0.13.1", - "meson==1.2.1", + "meson-python>=0.13.1", + "meson>=1.2.1,<2", "wheel", - "Cython==3.0.5", # Note: sync with setup.py, environment.yml and asv.conf.json - # Any NumPy version should be fine for compiling. Users are unlikely - # to get a NumPy<1.25 so the result will be compatible with all relevant - # NumPy versions (if not it is presumably compatible with their version). - # Pin <2.0 for releases until tested against an RC. But explicitly allow - # testing the `.dev0` nightlies (which require the extra index). - "numpy>1.22.4,<=2.0.0.dev0", + "Cython~=3.0.5", # Note: sync with setup.py, environment.yml and asv.conf.json + # Force numpy higher than 2.0, so that built wheels are compatible + # with both numpy 1 and 2 + "numpy>=2.0", "versioneer[toml]" ] @@ -51,6 +48,7 @@ classifiers = [ 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', + 'Programming Language :: Python :: 3.13', 'Topic :: Scientific/Engineering' ] @@ -64,6 +62,7 @@ matplotlib = "pandas:plotting._matplotlib" [project.optional-dependencies] test = ['hypothesis>=6.46.1', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0'] +pyarrow = ['pyarrow>=10.0.1'] performance = ['bottleneck>=1.3.6', 'numba>=0.56.4', 'numexpr>=2.8.4'] computation = ['scipy>=1.10.0', 'xarray>=2022.12.0'] fss = ['fsspec>=2022.11.0'] @@ -155,21 +154,28 @@ setup = ['--vsenv'] # For Windows skip = "cp36-* cp37-* cp38-* pp* *_i686 *_ppc64le *_s390x" build-verbosity = "3" environment = {LDFLAGS="-Wl,--strip-all"} -test-requires = "hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0" +# pytz 2024.2 causing some failures +test-requires = "hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytz<2024.2" test-command = """ PANDAS_CI='1' python -c 'import pandas as pd; \ pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db", "-n 2", "--no-strict-data-files"]); \ pd.test(extra_args=["-m not clipboard and single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \ """ - -[tool.cibuildwheel.macos] -archs = "x86_64 arm64" -test-skip = "*_arm64" +free-threaded-support = true +before-build = "PACKAGE_DIR={package} bash {package}/scripts/cibw_before_build.sh" [tool.cibuildwheel.windows] -before-build = "pip install delvewheel" +before-build = "pip install delvewheel && bash {package}/scripts/cibw_before_build.sh" repair-wheel-command = "delvewheel repair -w {dest_dir} {wheel}" +[[tool.cibuildwheel.overrides]] +select = "*-manylinux_aarch64*" +test-command = """ + PANDAS_CI='1' python -c 'import pandas as pd; \ + pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db and not fails_arm_wheels", "-n 2", "--no-strict-data-files"]); \ + pd.test(extra_args=["-m not clipboard and single_cpu and not slow and not network and 
not db", "--no-strict-data-files"]);' \ + """ + [[tool.cibuildwheel.overrides]] select = "*-musllinux*" before-test = "apk update && apk add musl-locales" @@ -259,6 +265,8 @@ select = [ "FLY", # flake8-logging-format "G", + # flake8-future-annotations + "FA", ] ignore = [ @@ -469,7 +477,11 @@ disable = [ "unnecessary-lambda", "unused-argument", "unused-variable", - "using-constant-test" + "using-constant-test", + + # disabled on 2.3.x branch + "consider-using-in", + "simplifiable-if-expression", ] [tool.pytest.ini_options] @@ -524,6 +536,10 @@ markers = [ "clipboard: mark a pd.read_clipboard test", "arm_slow: mark a test as slow for arm64 architecture", "skip_ubsan: Tests known to fail UBSAN check", + # TODO: someone should investigate this ... + # these tests only fail in the wheel builder and don't fail in regular + # ARM CI + "fails_arm_wheels: Tests that fail in the ARM wheel build only", ] [tool.mypy] diff --git a/requirements-dev.txt b/requirements-dev.txt index cbfb6336b2e16..712b5e01257ff 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -43,7 +43,7 @@ s3fs>=2022.11.0 scipy>=1.10.0 SQLAlchemy>=2.0.0 tabulate>=0.9.0 -xarray>=2022.12.0 +xarray>=2022.12.0, <=2024.9.0 xlrd>=2.0.1 xlsxwriter>=3.0.5 zstandard>=0.19.0 @@ -53,7 +53,7 @@ moto flask asv>=0.6.1 flake8==6.1.0 -mypy==1.7.1 +mypy==1.8.0 tokenize-rt pre-commit>=3.6.0 gitpython diff --git a/scripts/cibw_before_build.sh b/scripts/cibw_before_build.sh new file mode 100644 index 0000000000000..04333f446a7ff --- /dev/null +++ b/scripts/cibw_before_build.sh @@ -0,0 +1,14 @@ +# Add 3rd party licenses, like numpy does +for file in $PACKAGE_DIR/LICENSES/*; do + cat $file >> $PACKAGE_DIR/LICENSE +done + +# TODO: Delete when there's a PyPI Cython release that supports free-threaded Python 3.13. 
+FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" +if [[ $FREE_THREADED_BUILD == "True" ]]; then + python -m pip install -U pip + # python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython + # TODO: Remove below and uncomment above once https://github.com/cython/cython/pull/6717 no longer breaks tests + python -m pip install git+https://github.com/cython/cython.git@3276b588720a053c78488e5de788605950f4b136 + python -m pip install ninja meson-python versioneer[toml] numpy +fi diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index 5fcf09cd073fe..1e6e8585f0b90 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -26,6 +26,8 @@ EXCLUDE = {"python", "c-compiler", "cxx-compiler"} REMAP_VERSION = {"tzdata": "2022.7"} CONDA_TO_PIP = { + "versioneer": "versioneer[toml]", + "meson": "meson[ninja]", "pytables": "tables", "psycopg2": "psycopg2-binary", "dask-core": "dask", diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 89b67ddd9f5b6..0d724779abfda 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -58,6 +58,7 @@ "_iLocIndexer", # TODO(3.0): GH#55043 - remove upon removal of ArrayManager "_get_option", + "_fill_limit_area_1d", } diff --git a/web/pandas/pdeps/0014-string-dtype.md b/web/pandas/pdeps/0014-string-dtype.md new file mode 100644 index 0000000000000..5b74f71216454 --- /dev/null +++ b/web/pandas/pdeps/0014-string-dtype.md @@ -0,0 +1,375 @@ +# PDEP-14: Dedicated string data type for pandas 3.0 + +- Created: May 3, 2024 +- Status: Accepted +- Discussion: https://github.com/pandas-dev/pandas/pull/58551 +- Author: [Joris Van den Bossche](https://github.com/jorisvandenbossche) +- Revision: 1 + +## Abstract + +This PDEP proposes to introduce a dedicated string dtype that will be used by +default in pandas 3.0: + +* In pandas 3.0, enable a string dtype (`"str"`) by default, using PyArrow if available + or otherwise a string dtype using numpy object-dtype under the hood as fallback. +* The default string dtype will use missing value semantics (using NaN) consistent + with the other default data types. + +This will give users a long-awaited proper string dtype for 3.0, while 1) not +(yet) making PyArrow a _hard_ dependency, but only a dependency used by default, +and 2) leaving room for future improvements (different missing value semantics, +using NumPy 2.0 strings, etc). + +## Background + +Currently, pandas by default stores text data in an `object`-dtype NumPy array. +The current implementation has two primary drawbacks. First, `object` dtype is +not specific to strings: any Python object can be stored in an `object`-dtype +array, not just strings, and seeing `object` as the dtype for a column with +strings is confusing for users. Second: this is not efficient (all string +methods on a Series are eventually calling Python methods on the individual +string objects). + +To solve the first issue, a dedicated extension dtype for string data has +already been +[added in pandas 1.0](https://pandas.pydata.org/docs/whatsnew/v1.0.0.html#dedicated-string-data-type). +This has always been opt-in for now, requiring users to explicitly request the +dtype (with `dtype="string"` or `dtype=pd.StringDtype()`). The array backing +this string dtype was initially almost the same as the default implementation, +i.e. 
an `object`-dtype NumPy array of Python strings. + +To solve the second issue (performance), pandas contributed to the development +of string kernels in the PyArrow package, and a variant of the string dtype +backed by PyArrow was +[added in pandas 1.3](https://pandas.pydata.org/docs/whatsnew/v1.3.0.html#pyarrow-backed-string-data-type). +This could be specified with the `storage` keyword in the opt-in string dtype +(`pd.StringDtype(storage="pyarrow")`). + +Since its introduction, the `StringDtype` has always been opt-in, and has used +the experimental `pd.NA` sentinel for missing values (which was also [introduced +in pandas 1.0](https://pandas.pydata.org/docs/whatsnew/v1.0.0.html#experimental-na-scalar-to-denote-missing-values)). +However, up to this date, pandas has not yet taken the step to use `pd.NA` +for any default dtype, and thus the `StringDtype` deviates in missing value +behaviour compared to the default data types. + +In 2023, [PDEP-10](https://pandas.pydata.org/pdeps/0010-required-pyarrow-dependency.html) +proposed to start using a PyArrow-backed string dtype by default in pandas 3.0 +(i.e. infer this type for string data instead of object dtype). To ensure we +could use the variant of `StringDtype` backed by PyArrow instead of Python +objects (for better performance), it proposed to make `pyarrow` a new required +runtime dependency of pandas. + +In the meantime, NumPy has also been working on a native variable-width string +data type, which was made available [starting with NumPy +2.0](https://numpy.org/devdocs/release/2.0.0-notes.html#stringdtype-has-been-added-to-numpy). +This can provide a potential alternative to PyArrow for implementing a string +data type in pandas that is not backed by Python objects. + +After acceptance of PDEP-10, two aspects of the proposal have been under +reconsideration: + +- Based on feedback from users and maintainers from other packages (mostly + around installation complexity and size), it has been considered to relax the + new `pyarrow` requirement to not be a _hard_ runtime dependency. In addition, + NumPy 2.0 could in the future potentially reduce the need to make PyArrow a + required dependency specifically for a dedicated pandas string dtype. +- PDEP-10 did not consider the usage of the experimental `pd.NA` as a + consequence of adopting one of the existing implementations of the + `StringDtype`. + +For the second aspect, another variant of the `StringDtype` was +[introduced in pandas 2.1](https://pandas.pydata.org/docs/whatsnew/v2.1.0.html#whatsnew-210-enhancements-infer-strings) +that is still backed by PyArrow but follows the default missing values semantics +pandas uses for all other default data types (and using `NaN` as the missing +value sentinel) ([GH-54792](https://github.com/pandas-dev/pandas/issues/54792)). +At the time, the `storage` option for this new variant was called +`"pyarrow_numpy"` to disambiguate from the existing `"pyarrow"` option using +`pd.NA` (but this PDEP proposes a better naming scheme, see the "Naming" +subsection below). + +This last dtype variant is what users currently (pandas 2.2) get for string data +when enabling the ``future.infer_string`` option (to enable the behaviour which +is intended to become the default in pandas 3.0). + +## Proposal + +To be able to move forward with a string data type in pandas 3.0, this PDEP proposes: + +1. For pandas 3.0, a `"str"` string dtype is enabled by default, i.e.
this + string dtype will be used as the default dtype for text data when creating + pandas objects (e.g. inference in constructors, I/O functions). +2. This default string dtype will follow the same behaviour for missing values + as other default data types, and use `NaN` as the missing value sentinel. +3. The string dtype will use PyArrow if installed, and otherwise falls back to + an in-house functionally-equivalent (but slower) version. This fallback can + reuse (with minor code additions) the existing numpy object-dtype backed + StringArray for its implementation. +4. Installation guidelines are updated to clearly encourage users to install + pyarrow for the default user experience. + +Those string dtypes enabled by default will then no longer be considered as +experimental. + +### Default inference of a string dtype + +By default, pandas will infer this new string dtype instead of object dtype for +string data (when creating pandas objects, such as in constructors or IO +functions). + +In pandas 2.2, the existing `future.infer_string` option can be used to opt-in to the future +default behaviour: + +```python +>>> pd.options.future.infer_string = True +>>> pd.Series(["a", "b", None]) +0 a +1 b +2 NaN +dtype: string +``` + +Right now (pandas 2.2), the existing option only enables the PyArrow-based +future dtype. For the remaining 2.x releases, this option will be expanded to +also work when PyArrow is not installed to enable the object-dtype fallback in +that case. + +### Missing value semantics + +As mentioned in the background section, the original `StringDtype` has always +used the experimental `pd.NA` sentinel for missing values. In addition to using +`pd.NA` as the scalar for a missing value, this essentially means that: + +- String columns follow ["NA-semantics"](https://pandas.pydata.org/docs/user_guide/missing_data.html#na-semantics) + for missing values, where `NA` propagates in boolean operations such as + comparisons or predicates. +- Operations on the string column that give a numeric or boolean result use the + nullable Integer/Float/Boolean data types (e.g. `ser.str.len()` returns the + nullable `"Int64"` / `pd.Int64Dtype()` dtype instead of the numpy `int64` + dtype (or `float64` in case of missing values)). + +However, up to this date, all other default data types still use `NaN` semantics +for missing values. Therefore, this proposal says that a new default string +dtype should also still use the same default missing value semantics and return +default data types when doing operations on the string column, to be consistent +with the other default dtypes at this point. + +In practice, this means that the default string dtype will use `NaN` as +the missing value sentinel, and: + +- String columns will follow NaN-semantics for missing values, where `NaN` gives + False in boolean operations such as comparisons or predicates. +- Operations on the string column that give a numeric or boolean result will use + the default data types (i.e. numpy `int64`/`float64`/`bool`). + +Because the original `StringDtype` implementations already use `pd.NA` and +return masked integer and boolean arrays in operations, a new variant of the +existing dtypes that uses `NaN` and default data types was needed. The original +variant of `StringDtype` using `pd.NA` will continue to be available for those +who were already using it. 
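+
+As an illustrative sketch of the behaviour described above (the output shown is
+indicative only, assuming the proposed default `"str"` dtype is enabled):
+
+```python
+>>> ser = pd.Series(["a", "bc", None], dtype="str")
+>>> ser.str.len()  # default numpy float64, not the nullable Int64
+0    1.0
+1    2.0
+2    NaN
+dtype: float64
+>>> ser == "a"  # missing values give False, and the result is numpy bool
+0     True
+1    False
+2    False
+dtype: bool
+```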
+ +### Object-dtype "fallback" implementation + +To avoid a hard dependency on PyArrow for pandas 3.0, this PDEP proposes to keep +a "fallback" option in case PyArrow is not installed. The original `StringDtype` +backed by a numpy object-dtype array of Python strings can be mostly reused for +this (adding a new variant of the dtype) and a new `StringArray` subclass only +needs minor changes to follow the above-mentioned missing value semantics +([GH-58451](https://github.com/pandas-dev/pandas/pull/58451)). + +For pandas 3.0, this is the most realistic option given this implementation has +already been available for a long time. Beyond 3.0, further improvements such as +using NumPy 2.0 ([GH-58503](https://github.com/pandas-dev/pandas/issues/58503)) +or nanoarrow ([GH-58552](https://github.com/pandas-dev/pandas/issues/58552)) can +still be explored, but at that point that is an implementation detail that +should not have a direct impact on users (except for performance). + +For the original variant of `StringDtype` using `pd.NA`, currently the default +storage is `"python"` (the object-dtype based implementation). Also for this +variant, it is proposed to follow the same logic for determining the default +storage, i.e. default to `"pyarrow"` if available, and otherwise +fall back to `"python"`. + +### Naming + +Given the long history of this discussion, the naming of the dtypes is a +difficult topic. + +In the first place, it should be acknowledged that most users should not need to +use storage-specific options. Users are expected to specify a generic name (such +as `"str"` or `"string"`), and that will give them their default string dtype +(which depends on whether PyArrow is installed or not). + +For the generic string alias to specify the dtype, `"string"` is already used +for the `StringDtype` using `pd.NA`. This PDEP proposes to use `"str"` for the +new default `StringDtype` using `NaN`. This ensures backwards compatibility for +code using `dtype="string"`, and was also chosen because `dtype="str"` or +`dtype=str` currently already works to ensure your data is converted to +strings (only using object dtype for the result). + +But for testing purposes and advanced use cases that want control over the exact +variant of the `StringDtype`, we need some way to specify this and distinguish +them from the other string dtypes. + +Currently (pandas 2.2), `StringDtype(storage="pyarrow_numpy")` is used for the new variant using `NaN`, +where the `"pyarrow_numpy"` storage was used to disambiguate from the existing +`"pyarrow"` option using `pd.NA`. However, `"pyarrow_numpy"` is a rather confusing +option and doesn't generalize well. Therefore, this PDEP proposes a new naming +scheme as outlined below, and `"pyarrow_numpy"` will be deprecated as an alias +in pandas 2.3 and removed in pandas 3.0. + +The `storage` keyword of `StringDtype` is kept to disambiguate the underlying +storage of the string data (using pyarrow or python objects), but an additional +`na_value` is introduced to disambiguate the variants using NA semantics +and NaN semantics.
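+
+As an illustrative sketch of how these two keywords combine (assuming pyarrow
+is installed; the resulting aliases follow the overview below):
+
+```python
+>>> import numpy as np
+>>> import pandas as pd
+>>> pd.Series(["a", None], dtype=pd.StringDtype(na_value=np.nan)).dtype
+str
+>>> pd.Series(["a", None], dtype=pd.StringDtype(storage="python", na_value=np.nan)).dtype
+str
+>>> pd.Series(["a", None], dtype=pd.StringDtype()).dtype
+string[pyarrow]
+```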
+ +Overview of the different ways to specify a dtype and the resulting concrete +dtype of the data: + +| User specification | Concrete dtype | String alias | Note | +|---------------------------------------------|---------------------------------------------------------------|---------------------------------------|----------| +| Unspecified (inference) | `StringDtype(storage="pyarrow"\|"python", na_value=np.nan)` | "str" | (1) | +| `"str"` or `StringDtype(na_value=np.nan)` | `StringDtype(storage="pyarrow"\|"python", na_value=np.nan)` | "str" | (1) | +| `StringDtype("pyarrow", na_value=np.nan)` | `StringDtype(storage="pyarrow", na_value=np.nan)` | "str" | | +| `StringDtype("python", na_value=np.nan)` | `StringDtype(storage="python", na_value=np.nan)` | "str" | | +| `StringDtype("pyarrow")` | `StringDtype(storage="pyarrow", na_value=pd.NA)` | "string[pyarrow]" | | +| `StringDtype("python")` | `StringDtype(storage="python", na_value=pd.NA)` | "string[python]" | | +| `"string"` or `StringDtype()` | `StringDtype(storage="pyarrow"\|"python", na_value=pd.NA)` | "string[pyarrow]" or "string[python]" | (1) | +| `StringDtype("pyarrow_numpy")` | `StringDtype(storage="pyarrow", na_value=np.nan)` | "string[pyarrow_numpy]" | (2) | + +Notes: + +- (1) You get "pyarrow" or "python" depending on pyarrow being installed. +- (2) "pyarrow_numpy" is kept temporarily because this is already in a released + version, but it will be deprecated in 2.x and removed for 3.0. + +For the new default string dtype, only the `"str"` alias can be used to +specify the dtype as a string, i.e. pandas would not provide a way to make the +underlying storage (pyarrow or python) explicit through the string alias. This +string alias is only a convenience shortcut and for most users `"str"` is +sufficient (they don't need to specify the storage), and the explicit +`pd.StringDtype(storage=..., na_value=np.nan)` is still available for more +fine-grained control. + +Also for the existing variant using `pd.NA`, specifying the storage through the +string alias could be deprecated, but that is left for a separate decision. + +## Alternatives + +### Why not delay introducing a default string dtype? + +To avoid introducing a new string dtype while other discussions and changes are +in flux (eventually making pyarrow a required dependency? adopting `pd.NA` as +the default missing value sentinel? using the new NumPy 2.0 capabilities? +overhauling all our dtypes to use a logical data type system?), introducing a +default string dtype could also be delayed until there is more clarity in those +other discussions. Specifically, it would avoid temporarily switching to use +`NaN` for the string dtype, while in a future version we might switch back +to `pd.NA` by default. + +However: + +1. Delaying has a cost: it further postpones introducing a dedicated string + dtype that has significant benefits for users, both in usability and (for the + part of the user base that has PyArrow installed) in performance. +2. In case pandas eventually transitions to use `pd.NA` as the default missing value + sentinel, a migration path for _all_ pandas data types will be needed, and thus + the challenges around this will not be unique to the string dtype and + therefore not a reason to delay this. + +Making this change now for 3.0 will benefit the majority of users, and the PDEP +author believes this is worth the cost of the added complexity around "yet +another dtype" (also for other data types we already have multiple variants).
+ +### Why not use the existing StringDtype with `pd.NA`? + +Wouldn't adding even more variants of the string dtype make things only more +confusing? Indeed, this proposal unfortunately introduces more variants of the +string dtype. However, the reason for this is to ensure the actual default user +experience is _less_ confusing, and the new string dtype fits better with the +other default data types. + +If the new default string data type were to use `pd.NA`, then after some +operations, a user can easily end up with a DataFrame that mixes columns using +`NaN` semantics and columns using `NA` semantics (and thus a DataFrame that +could have columns with two different int64 dtypes, two different float64 dtypes, +two different bool dtypes, etc.). This would lead to a very confusing +default experience. + +With the proposed new variant of the StringDtype, this will ensure that for the +_default_ experience, a user will see only one kind of integer dtype, one +kind of bool dtype, etc. For now, a user should only get columns using `pd.NA` +when explicitly opting into this. + +### Naming alternatives + +An initial version of this PDEP proposed to use the `"string"` alias and the +default `pd.StringDtype()` class constructor for the new default dtype. +However, that caused a lot of discussion around backwards compatibility for +existing users of `dtype=pd.StringDtype()` and `dtype="string"`, which use +`pd.NA` to represent missing values. + +During the discussion, several alternatives were brought up, both +alternative keyword names and the use of a different constructor. In the end, +this PDEP proposes to use a different string alias (`"str"`) but to keep +using the existing `pd.StringDtype` (with the existing `storage` keyword but +with an additional `na_value` keyword) for now to keep the changes as +minimal as possible, leaving a larger overhaul of the dtype system (potentially +including different constructor functions or namespace) for a future discussion. +See [GH-58613](https://github.com/pandas-dev/pandas/issues/58613) for the full +discussion. + +One consequence is that when using the class constructor for the default dtype, +it has to be used with non-default arguments, i.e. a user needs to specify +`pd.StringDtype(na_value=np.nan)` to get the default dtype using `NaN`. +Therefore, the pandas documentation will focus on the usage of `dtype="str"`. + +## Backward compatibility + +The most visible backwards incompatible change will be that columns with string +data will no longer have an `object` dtype. Therefore, code that assumes +`object` dtype (such as `ser.dtype == object`) will need to be updated. This +change is done as a hard break in a major release, since warning in advance +about the changed inference is deemed too noisy. + +To allow testing code in advance, the +`pd.options.future.infer_string = True` option is available for users. + +Otherwise, the actual string-specific functionality (such as the `.str` accessor +methods) should generally all keep working as is. + +By preserving the current missing value semantics, this proposal is also mostly +backwards compatible on this aspect. When storing strings in object dtype, +however, pandas did allow using `None` as the missing value indicator as well (and in +certain cases such as the `shift` method, pandas even introduced this itself). +For all the cases where currently `None` was used as the missing value sentinel, +this will change to consistently use `NaN`.
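+
+As a sketch of the kind of update this implies for user code (illustrative
+only; `pd.api.types.is_string_dtype` is shown as one possible dtype-agnostic
+check):
+
+```python
+>>> ser = pd.Series(["a", "b"])
+>>> ser.dtype == object  # True with the pandas 2.x default, False under this proposal
+>>> ser.dtype == "str"   # True under this proposal
+>>> pd.api.types.is_string_dtype(ser)  # True in both cases
+```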
+ +### For existing users of `StringDtype` + +Existing code that already opted in to use the `StringDtype` using `pd.NA` +should generally keep working as is. The latest version of this PDEP preserves +the behaviour of `dtype="string"` or `dtype=pd.StringDtype()` to mean the +`pd.NA` variant of the dtype. + +It does propose to change the default storage to `"pyarrow"` (if available) for +the opt-in `pd.NA` variant as well, but this should have limited, if any, +user-visible impact. + +## Timeline + +The future PyArrow-backed string dtype was already made available behind a feature +flag in pandas 2.1 (enabled by `pd.options.future.infer_string = True`). + +The variant using numpy object-dtype can also be backported to the 2.2.x branch +to allow easier testing. It is proposed to release this as 2.3.0 (created from +the 2.2.x branch, given that the main branch already includes many other changes +targeted for 3.0), together with the changes to the naming scheme. + +The 2.3.0 release would then have all future string functionality available +(both the pyarrow and object-dtype based variants of the default string dtype). + +For pandas 3.0, this `future.infer_string` flag becomes enabled by default. + +## PDEP-14 History + +- 3 May 2024: Initial version diff --git a/web/pandas/versions.json b/web/pandas/versions.json index e355005c7c937..2d2599ae8585b 100644 --- a/web/pandas/versions.json +++ b/web/pandas/versions.json @@ -5,11 +5,16 @@ "url": "https://pandas.pydata.org/docs/dev/" }, { - "name": "2.1 (stable)", - "version": "2.1", + "name": "2.2 (stable)", + "version": "2.2", "url": "https://pandas.pydata.org/docs/", "preferred": true }, + { + "name": "2.1", + "version": "2.1", + "url": "https://pandas.pydata.org/pandas-docs/version/2.1/" + }, { "name": "2.0", "version": "2.0",