diff --git a/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md
similarity index 54%
rename from ISSUE_TEMPLATE.md
rename to .github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md
index bcd5e0c1e..11290dc66 100644
--- a/ISSUE_TEMPLATE.md
+++ b/.github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md
@@ -1,3 +1,15 @@
+<!--
+It is recommended to check that your issue complies with the
+following rules before submitting:
+
+-  Verify that your issue is not being currently addressed by other
+   issues (https://github.com/openml/openml-python/issues)
+   or pull requests (https://github.com/openml/openml-python/pulls).
+
+-  Please ensure all code snippets and error messages are formatted in
+   appropriate code blocks. See https://help.github.com/articles/creating-and-highlighting-code-blocks
+-->
+
 #### Description
 <!-- Example: Joblib Error thrown when calling fit on LatentDirichletAllocation with evaluate_every > 0-->
 
@@ -20,7 +32,10 @@ it in the issue: https://gist.github.com
 
 #### Versions
 <!--
-Please run the following snippet and paste the output below.
+Please include your operating system type and version number, as well
+as your Python, openml, scikit-learn, numpy, and scipy versions. This information
+can be found by running the following code snippet:
+
 import platform; print(platform.platform())
 import sys; print("Python", sys.version)
 import numpy; print("NumPy", numpy.__version__)
@@ -30,4 +45,5 @@ import openml; print("OpenML", openml.__version__)
 -->
 
 
-<!-- Thanks for contributing! -->
\ No newline at end of file
+<!-- Thanks for contributing! -->
+
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644
index 000000000..89ad09697
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,31 @@
+<!--
+Thanks for contributing a pull request to the OpenML python connector! Please ensure you have taken a look at
+the contribution guidelines: https://github.com/openml/openml-python/blob/main/CONTRIBUTING.md#Contributing-Pull-Requests
+
+Please make sure that:
+
+* the title of the pull request is descriptive
+* this pull request is against the `main` branch
+* for any new functionality, consider adding a relevant example
+* add unit tests for new functionalities
+    * collect files uploaded to test server using _mark_entity_for_removal()
+* add the BSD 3-Clause license to any new file created
+-->
+
+#### Metadata
+* Reference Issue: <!-- Example: Fixes #1234 or NA-->
+* New Tests Added: <!-- Yes/No/NA -->
+* Documentation Updated: <!-- Yes/No/NA -->
+* Change Log Entry: <!-- Short String, example: "Add new function `foo()` to module `bar`"; or "Fixes a bug with `bar`" -->
+
+
+#### Details 
+<!--
+If necessary, please share the following:
+
+* What does this PR implement/fix? Explain your changes.
+* Why is this change necessary? What is the problem it solves?
+* How can I reproduce the issue this PR is solving and its solution?
+* Any other comments?
+-->
+
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 000000000..e5e5092a2
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,9 @@
+version: 2
+
+updates:
+  # This will check for updates to github actions every day
+  # https://docs.github.com/en/enterprise-server@3.4/code-security/dependabot/working-with-dependabot/keeping-your-actions-up-to-date-with-dependabot
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "daily"
diff --git a/.github/workflows/dist.yaml b/.github/workflows/dist.yaml
new file mode 100644
index 000000000..ecf6f0a7f
--- /dev/null
+++ b/.github/workflows/dist.yaml
@@ -0,0 +1,48 @@
+name: dist-check
+
+on:
+  workflow_dispatch:
+
+  push:
+    branches:
+      - main
+      - develop
+    tags:
+      - "v*.*.*"
+
+  pull_request:
+    branches:
+      - main
+      - develop
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  dist:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v6
+    - name: Setup Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: "3.10"
+    - name: Build dist
+      run: |
+        pip install build
+        python -m build --sdist
+    - name: Twine check
+      run: |
+        pip install twine
+        last_dist=$(ls -t dist/openml-*.tar.gz | head -n 1)
+        twine check $last_dist
+    - name: Install dist
+      run: |
+        last_dist=$(ls -t dist/openml-*.tar.gz | head -n 1)
+        pip install $last_dist
+    - name: PEP 561 Compliance
+      run: |
+        pip install mypy
+        cd ..  # required to use the installed version of openml
+        if ! python -m mypy -c "import openml"; then exit 1; fi
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
new file mode 100644
index 000000000..1a5a36a87
--- /dev/null
+++ b/.github/workflows/docs.yaml
@@ -0,0 +1,62 @@
+name: Docs
+on:
+  workflow_dispatch:
+
+  push:
+    branches:
+      - main
+      - develop
+    tags:
+      - "v*.*.*"
+
+  pull_request:
+    branches:
+      - main
+      - develop
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build-and-deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+      - name: Install dependencies
+        run: |
+          pip install -e .[docs,examples]
+      - name: Make docs
+        run: |
+          mkdocs build
+      - name: Deploy to GitHub Pages
+        env:
+          CI: false
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PAGES_BRANCH: gh-pages
+        if: (contains(github.ref, 'develop') || contains(github.ref, 'main')) && github.event_name == 'push'
+        run: |
+          git config user.name doc-bot
+          git config user.email doc-bot@openml.com
+          current_version=$(git tag | sort --version-sort | tail -n 1)
+          # Reset the titles of previously retitled versions (e.g. "1.2.3 (latest)") to the bare version
+          retitled_versions=$(mike list -j | jq ".[] | select(.title != .version) | .version" | tr -d '"')
+          for version in $retitled_versions; do
+            mike retitle "${version}" "${version}"
+          done
+
+          echo "Deploying docs for ${current_version}"
+          mike set-default latest
+          mike deploy \
+            --push \
+            --title "${current_version} (latest)" \
+            --update-aliases \
+            "${current_version}" \
+            "latest"\
+            -b $PAGES_BRANCH
diff --git a/.github/workflows/release_docker.yaml b/.github/workflows/release_docker.yaml
new file mode 100644
index 000000000..fcea357e4
--- /dev/null
+++ b/.github/workflows/release_docker.yaml
@@ -0,0 +1,66 @@
+name: release-docker
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - 'develop'
+      - 'docker'
+    tags:
+      - 'v*'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+
+  docker:
+
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to DockerHub
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Check out the repo
+        uses: actions/checkout@v6
+
+      - name: Extract metadata (tags, labels) for Docker Hub
+        id: meta_dockerhub
+        uses: docker/metadata-action@v5
+        with:
+          images: "openml/openml-python"
+
+      - name: Build and push
+        id: docker_build
+        uses: docker/build-push-action@v6
+        with:
+          context: ./docker/
+          tags: ${{ steps.meta_dockerhub.outputs.tags }}
+          labels: ${{ steps.meta_dockerhub.outputs.labels }}
+          platforms: linux/amd64,linux/arm64
+          push: ${{ github.event_name == 'push' }}
+
+      - name: Update repo description
+        if: ${{ startsWith(github.ref, 'refs/tags/v') }}
+        uses: peter-evans/dockerhub-description@v4
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+          repository: openml/openml-python
+          short-description: "pre-installed openml-python environment"
+          readme-filepath: ./docker/readme.md
+          
+      - name: Image digest
+        run: echo ${{ steps.docker_build.outputs.digest }}
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 000000000..dc0995fc6
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,239 @@
+---
+name: Tests
+
+on:
+  workflow_dispatch:
+
+  push:
+    branches:
+      - main
+      - develop
+    tags:
+      - "v*.*.*"
+
+  pull_request:
+    branches:
+      - main
+      - develop
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test:
+    name: (${{ matrix.os }},Py${{ matrix.python-version }},sk${{ matrix.scikit-learn }}${{ matrix.pandas-version != '' && format(',pd:{0}', matrix.pandas-version) || '' }},sk-only:${{ matrix.sklearn-only }})
+    runs-on: ${{ matrix.os }}
+
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+        scikit-learn: ["1.3.*", "1.4.*", "1.5.*", "1.6.*", "1.7.*"]
+        os: [ubuntu-latest]
+        sklearn-only: ["true"]
+
+        exclude:
+          # (python, sklearn) combinations for which there is no PyPI release
+          # scikit-learn 1.3
+          - python-version: "3.13"
+            scikit-learn: "1.3.*"
+          - python-version: "3.14"
+            scikit-learn: "1.3.*"
+          # scikit-learn 1.4
+          - python-version: "3.13"
+            scikit-learn: "1.4.*"
+          - python-version: "3.14"
+            scikit-learn: "1.4.*"
+          # scikit-learn 1.5
+          - python-version: "3.14"
+            scikit-learn: "1.5.*"
+          # scikit-learn 1.6
+          - python-version: "3.14"
+            scikit-learn: "1.6.*"
+          # scikit-learn 1.7 is installed with pandas 3, which does not support Python 3.10
+          - python-version: "3.10"
+            scikit-learn: "1.7.*"
+
+
+        include:
+          # Full test run on ubuntu, 3.14
+          - os: ubuntu-latest
+            python-version: "3.14"
+            scikit-learn: "1.7.*"
+            sklearn-only: "false"
+
+          # Full test run on Windows
+          - os: windows-latest
+            python-version: "3.12"
+            scikit-learn: "1.5.*"
+            sklearn-only: "false"
+
+          # Coverage run
+          - os: ubuntu-latest
+            python-version: "3.12"
+            scikit-learn: "1.5.*"
+            sklearn-only: "false"
+            code-cov: true
+
+    steps:
+    - uses: actions/checkout@v6
+      with:
+        fetch-depth: 2
+
+    - name: Setup Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Install test dependencies, scikit-learn, and pandas
+      shell: bash
+      run: |
+        python -m pip install --upgrade pip
+        pip install -e .[test] scikit-learn==${{ matrix.scikit-learn }}
+
+        # scikit-learn 1.7+ is tested with pandas 3.x, earlier versions with pandas 2.x
+        version="${{ matrix.scikit-learn }}"
+        major=$(echo "$version" | cut -d. -f1)
+        minor=$(echo "$version" | cut -d. -f2)
+
+        if [[ "$major" -gt 1 ]] || { [[ "$major" -eq 1 ]] && [[ "$minor" -ge 7 ]]; }; then
+          pip install "pandas==3.*"
+        else
+          pip install "pandas==2.*"
+        fi
+
+    - name: Store repository status
+      id: status-before
+      if: matrix.os != 'windows-latest'
+      run: |
+        git_status=$(git status --porcelain -b)
+        echo "BEFORE=$git_status" >> $GITHUB_ENV
+        echo "Repository status before tests: $git_status"
+
+    - name: Clone Services
+      if: matrix.os == 'ubuntu-latest'
+      id: clone-services
+      run: |
+        git clone --depth 1 https://github.com/openml/services.git
+
+    - name: Start Docker Services
+      id: start-services
+      if: matrix.os == 'ubuntu-latest'
+      working-directory: ./services
+      run: |
+        chmod -R a+rw ./data
+        chmod -R a+rw ./logs
+        docker compose --profile rest-api --profile minio --profile evaluation-engine up -d
+
+        echo "Waiting for PHP API to boot..."
+        timeout 60s bash -c 'until [ "$(docker inspect -f {{.State.Health.Status}} openml-php-rest-api)" == "healthy" ]; do sleep 5; done'
+
+        echo "Final Verification: Gateway Connectivity..."
+        curl -sSfL http://localhost:8000/api/v1/xml/data/1 | head -n 15
+
+        docker container ls
+
+    - name: Show installed dependencies
+      run: python -m pip list
+
+    - name: Run tests on Ubuntu Test
+      if: matrix.os == 'ubuntu-latest'
+      env:
+        OPENML_TEST_SERVER_ADMIN_KEY: ${{ secrets.OPENML_TEST_SERVER_ADMIN_KEY }}
+        OPENML_USE_LOCAL_SERVICES: "true"
+      run: |
+        if [ "${{ matrix.code-cov }}" = "true" ]; then
+          codecov="--cov=openml --long --cov-report=xml"
+        fi
+
+        if [ "${{ matrix.sklearn-only }}" = "true" ]; then
+          marks="sklearn and not production_server"
+        else
+          marks="not production_server"
+        fi
+
+        pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
+
+    - name: Run tests on Ubuntu Production
+      if: matrix.os == 'ubuntu-latest'
+      env:
+        OPENML_TEST_SERVER_ADMIN_KEY: ${{ secrets.OPENML_TEST_SERVER_ADMIN_KEY }}
+        OPENML_USE_LOCAL_SERVICES: "true"
+      run: |
+        if [ "${{ matrix.code-cov }}" = "true" ]; then
+          codecov="--cov=openml --long --cov-report=xml"
+        fi
+
+        if [ "${{ matrix.sklearn-only }}" = "true" ]; then
+          marks="sklearn and production_server"
+        else
+          marks="production_server"
+        fi
+
+        pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
+
+    - name: Run tests on Windows
+      if: matrix.os == 'windows-latest'
+      env:
+        OPENML_TEST_SERVER_ADMIN_KEY: ${{ secrets.OPENML_TEST_SERVER_ADMIN_KEY }}
+      run: |  # we need a separate step because of the bash-specific if-statement in the previous one.
+        pytest -n 4 --durations=20 --dist load -sv --reruns 5 --reruns-delay 1 -m "not test_server"
+
+    - name: Upload coverage
+      if: matrix.code-cov && always()
+      uses: codecov/codecov-action@v4
+      with:
+        files: coverage.xml
+        token: ${{ secrets.CODECOV_TOKEN }}
+        fail_ci_if_error: true
+        verbose: true
+
+    - name: Dump server logs
+      if: always() && steps.start-services.outcome == 'success'
+      run: |
+        docker logs openml-php-rest-api -t
+
+    - name: Cleanup Docker setup
+      if: always() && steps.clone-services.outcome == 'success'
+      run: |
+        sudo rm -rf services
+
+    - name: Check for files left behind by test
+      if: matrix.os != 'windows-latest' && always()
+      run: |
+        before="${{ env.BEFORE }}"
+        after="$(git status --porcelain -b)"
+        if [[ "$before" != "$after" ]]; then
+            echo "git status from before: $before"
+            echo "git status from after: $after"
+            echo "Not all generated files have been deleted!"
+            exit 1
+        fi
+
+  dummy_windows_py_sk024:
+    name: (windows-latest, Py, sk0.24.*, sk-only:false)
+    runs-on: ubuntu-latest
+    steps:
+      - name: Dummy step
+        run: |
+          echo "This is a temporary dummy job."
+          echo "Always succeeds."
+
+  dummy_windows_py_sk023:
+    name: (ubuntu-latest, Py3.8, sk0.23.1, sk-only:false)
+    runs-on: ubuntu-latest
+    steps:
+      - name: Dummy step
+        run: |
+          echo "This is a temporary dummy job."
+          echo "Always succeeds."
+
+  dummy_docker:
+    name: docker
+    runs-on: ubuntu-latest
+    steps:
+      - name: Dummy step
+        run: |
+          echo "This is a temporary dummy docker job."
+          echo "Always succeeds."
diff --git a/.gitignore b/.gitignore
index 3e5102233..d512c0ee6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,10 @@
 *~
 doc/generated
 examples/.ipynb_checkpoints
+venv
+.uv-lock
+uv.lock
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -15,6 +19,14 @@ doc/auto_examples/
 doc/modules/generated/
 doc/datasets/generated/
 
+# Some stuff from testing?
+tests/files/org/openml/test/datasets/1/
+tests/files/org/openml/test/datasets/2/features.xml.pkl
+tests/files/org/openml/test/datasets/2/qualities.xml.pkl
+tests/files/org/openml/test/locks/
+tests/files/org/openml/test/tasks/1/datasplits.pkl.py3
+tests/files/org/openml/test/tasks/1882/datasplits.pkl.py3
+
 # Distribution / packaging
 
 .Python
@@ -75,6 +87,9 @@ target/
 # IDE
 .idea
 *.swp
+.vscode
+.cursorignore
+.cursorindexingignore
 
 # MYPY
 .mypy_cache
@@ -83,3 +98,17 @@ dmypy.sock
 
 # Tests
 .pytest_cache
+
+# Virtual environments
+oenv/
+venv/
+.env/
+.venv
+.venv/
+
+# Python cache
+__pycache__/
+*.pyc
+
+# Ruff
+.ruff_cache/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 000000000..0987bad90
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,48 @@
+default_language_version:
+  python: python3
+files: |
+  (?x)^(
+    openml|
+    tests
+  )/.*\.py$
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.14.10
+    hooks:
+      - id: ruff
+        args: [--fix, --exit-non-zero-on-fix, --no-cache]
+      - id: ruff-format
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.13.0
+    hooks:
+      - id: mypy
+        additional_dependencies:
+          - types-requests
+          - types-python-dateutil
+  - repo: https://github.com/python-jsonschema/check-jsonschema
+    rev: 0.29.4
+    hooks:
+      - id: check-github-workflows
+        files: '^\.github/workflows/.*\.ya?ml$'
+        types: ["yaml"]
+      - id: check-dependabot
+        files: '^\.github/dependabot\.ya?ml$'
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+      - id: check-added-large-files
+        files: ".*"
+      - id: check-case-conflict
+        files: ".*"
+      - id: check-merge-conflict
+        files: ".*"
+      - id: check-yaml
+        files: ".*"
+      - id: end-of-file-fixer
+        files: ".*"
+        types: ["yaml"]
+      - id: check-toml
+        files: ".*"
+        types: ["toml"]
+      - id: debug-statements
+        files: '^src/.*\.py$'
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index beaa3b53e..000000000
--- a/.travis.yml
+++ /dev/null
@@ -1,52 +0,0 @@
-language: python
-
-sudo: false
-
-cache:
-  apt: true
-  # We use three different cache directory
-  # to work around a Travis bug with multi-platform cache
-  directories:
-  - $HOME/.cache/pip
-  - $HOME/download
-env:
-  global:
-  # Directory where tests are run from
-  - TEST_DIR=/tmp/test_dir/
-  - MODULE=openml
-  matrix:
-  - DISTRIB="conda" PYTHON_VERSION="3.5" SKLEARN_VERSION="0.21.2"
-  - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2"
-  - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" RUN_FLAKE8="true" SKIP_TESTS="true"
-  - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" COVERAGE="true" DOCPUSH="true"
-  - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.2"
-  # Checks for older scikit-learn versions (which also don't nicely work with
-  # Python3.7)
-  - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2"
-  - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.18.2" SCIPY_VERSION=1.2.0
-
-# Travis issue
-# https://github.com/travis-ci/travis-ci/issues/8920
-before_install:
- - python -c "import fcntl; fcntl.fcntl(1, fcntl.F_SETFL, 0)"
-
-install: source ci_scripts/install.sh
-script: bash ci_scripts/test.sh
-after_success: source ci_scripts/success.sh && source ci_scripts/create_doc.sh $TRAVIS_BRANCH "doc_result"
-
-# travis will check the deploy on condition, before actually running before_deploy
-# before_deploy: source ci_scripts/create_doc.sh $TRAVIS_BRANCH "doc_result"
-
-# For more info regarding the deploy process and the github token look at:
-# https://docs.travis-ci.com/user/deployment/pages/
-
-deploy:
-  provider: pages
-  skip_cleanup: true
-  github_token: $GITHUB_TOKEN
-  keep-history: true
-  committer-from-gh: true
-  on:
-    all_branches: true
-    condition: $doc_result = "success"
-  local_dir: doc/$TRAVIS_BRANCH
diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 000000000..c5454ef6f
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,40 @@
+cff-version: 1.2.0
+message: "If you use this software in a publication, please cite the metadata from preferred-citation."
+preferred-citation:
+  type: article
+  authors:
+  - family-names: "Feurer"
+    given-names: "Matthias"
+    orcid: "https://orcid.org/0000-0001-9611-8588"
+  - family-names: "van Rijn"
+    given-names: "Jan N."
+    orcid: "https://orcid.org/0000-0003-2898-2168"
+  - family-names: "Kadra"
+    given-names: "Arlind"
+  - family-names: "Gijsbers"
+    given-names: "Pieter"
+    orcid: "https://orcid.org/0000-0001-7346-8075"
+  - family-names: "Mallik"
+    given-names: "Neeratyoy"
+    orcid: "https://orcid.org/0000-0002-0598-1608"
+  - family-names: "Ravi"
+    given-names: "Sahithya"
+  - family-names: "Müller"
+    given-names: "Andreas"
+    orcid: "https://orcid.org/0000-0002-2349-9428"
+  - family-names: "Vanschoren"
+    given-names: "Joaquin"
+    orcid: "https://orcid.org/0000-0001-7044-9805"
+  - family-names: "Hutter"
+    given-names: "Frank"
+    orcid: "https://orcid.org/0000-0002-2037-3694"
+  journal: "Journal of Machine Learning Research"
+  title: "OpenML-Python: an extensible Python API for OpenML"
+  abstract: "OpenML is an online platform for open science collaboration in machine learning, used to share datasets and results of machine learning experiments. In this paper, we introduce OpenML-Python, a client API for Python, which opens up the OpenML platform for a wide range of Python-based machine learning tools. It provides easy access to all datasets, tasks and experiments on OpenML from within Python. It also provides functionality to conduct machine learning experiments, upload the results to OpenML, and reproduce results which are stored on OpenML. Furthermore, it comes with a scikit-learn extension and an extension mechanism to easily integrate other machine learning libraries written in Python into the OpenML ecosystem. Source code and documentation are available at https://github.com/openml/openml-python/."
+  volume: 22
+  year: 2021
+  start: 1
+  end: 5
+  pages: 5
+  number: 100
+  url: https://jmlr.org/papers/v22/19-920.html
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 5a77dfd58..d194525ef 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,11 +1,137 @@
-How to contribute
------------------
+# Contributing to `openml-python`
+This document describes the workflow on how to contribute to the openml-python package.
+If you are interested in connecting a machine learning package with OpenML (i.e.
+write an openml-python extension) or want to find other ways to contribute, see [this page](https://openml.github.io/openml-python/main/contributing.html#contributing).
+
+## Scope of the package
+
+The scope of the OpenML Python package is to provide a Python interface to
+the OpenML platform which integrates well with Python's scientific stack, most
+notably [numpy](http://www.numpy.org/), [scipy](https://www.scipy.org/) and
+[pandas](https://pandas.pydata.org/).
+To reduce opportunity costs and demonstrate the usage of the package, it also
+implements an interface to the most popular machine learning package written
+in Python, [scikit-learn](http://scikit-learn.org/stable/index.html).
+Thereby it will automatically be compatible with many machine learning
+libraries written in Python.
+
+We aim to keep the package as light-weight as possible, and we will try to
+keep the number of potential installation dependencies as low as possible.
+Therefore, the connection to other machine learning libraries such as
+*pytorch*, *keras* or *tensorflow* should not be done directly inside this
+package, but in a separate package using the OpenML Python connector.
+More information on OpenML Python connectors can be found [here](https://openml.github.io/openml-python/main/contributing.html#contributing).
+
+## Determine what contribution to make
+
+Great! You've decided you want to help out. Now what?
+All contributions should be linked to issues on the [GitHub issue tracker](https://github.com/openml/openml-python/issues).
+In particular for new contributors, the *good first issue* label should help you find
+issues which are suitable for beginners.  Resolving these issues allows you to start
+contributing to the project without much prior knowledge. Your assistance in this area 
+will be greatly appreciated by the more experienced developers as it helps free up 
+their time to concentrate on other issues.
+
+If you encounter a particular part of the documentation or code that you want to improve,
+but there is no related open issue yet, open one first.
+This is important since you can first get feedback or pointers from experienced contributors.
+
+To let everyone know you are working on an issue, please leave a comment that states you will work on the issue
+(or, if you have the permission, *assign* yourself to the issue). This avoids double work!
+
+## Contributing Workflow Overview 
+To contribute to the openml-python package, follow these steps:
+
+0. Determine how you want to contribute (see above).
+1. Set up your local development environment.
+   1. Fork and clone the `openml-python` repository. Then, create a new branch from the ``main`` branch. If you are new to `git`, see our [detailed documentation](#basic-git-workflow), or rely on your favorite IDE.   
+   2. [Install the local dependencies](#install-local-dependencies) to run the tests for your contribution.
+   3. [Test your installation](#testing-your-installation) to ensure everything is set up correctly.
+2. Implement your contribution. If contributing to the documentation, see [here](#contributing-to-the-documentation).
+3. [Create a pull request](#pull-request-checklist). 
+
+### Install Local Dependencies
+
+We recommend following the instructions below to install all requirements locally.
+However, it is also possible to use the [openml-python docker image](https://github.com/openml/openml-python/blob/main/docker/readme.md) for testing and building documentation. Moreover, feel free to use any alternative package managers, such as `pip`.
+
+
+1. To ensure a smooth development experience, we recommend using the `uv` package manager. Thus, first install `uv`. If any Python version already exists on your system, follow the steps below, otherwise see [here](https://docs.astral.sh/uv/getting-started/installation/). 
+    ```bash
+    pip install uv
+    ```
+2. Create a virtual environment using `uv` and activate it. This will ensure that the dependencies for `openml-python` do not interfere with other Python projects on your system. 
+   ```bash
+   uv venv --seed --python 3.10 ~/.venvs/openml-python
+   source ~/.venvs/openml-python/bin/activate
+   pip install uv # Install uv within the virtual environment
+   ```
+3. Then install openml with its test dependencies by running
+   ```bash
+   uv pip install -e .[test]
+   ```
+   from the repository folder.
+   Then set up [pre-commit](#pre-commit-details), which runs the checks configured in `.pre-commit-config.yaml` on every commit, through:
+   ```bash
+   pre-commit install
+   ```
+
+### Testing (Your Installation)
+To test your installation and run the tests for the first time, run the following from the repository folder:
+```bash
+pytest tests
+```
+For Windows systems, you may need to add `pytest` to PATH before executing the command.
+
+Executing a specific unit test can be done by specifying the module, test case, and test.
+You may then run a specific module, test case, or unit test respectively:
+```bash
+pytest tests/test_datasets/test_dataset.py
+pytest tests/test_datasets/test_dataset.py::OpenMLDatasetTest
+pytest tests/test_datasets/test_dataset.py::OpenMLDatasetTest::test_get_data
+```
+
+To test your new contribution, add [unit tests](https://github.com/openml/openml-python/tree/main/tests), and, if needed, [examples](https://github.com/openml/openml-python/tree/main/examples) for any new functionality being introduced. Some notes on unit tests and examples:
+* If a unit test contains an upload to the test server, please ensure that it is followed by a file collection for deletion, to prevent the test server from bulking up. For example, `TestBase._mark_entity_for_removal('data', dataset.dataset_id)`, `TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))`.
+* Please ensure that the example is run on the test server by beginning with the call to `openml.config.start_using_configuration_for_example()`, which is done by default for tests derived from `TestBase`.
+* Add the `@pytest.mark.sklearn` marker to your unit tests if they have a dependency on scikit-learn.
+
+#### Running Tests That Require Admin Privileges
+
+Some tests require admin privileges on the test server and will be automatically skipped unless you provide an admin API key. For regular contributors, the tests will skip gracefully. For core contributors who need to run these tests locally, you can set up the key by exporting the variable as below before running the tests:
+
+```bash
+# For windows
+$env:OPENML_TEST_SERVER_ADMIN_KEY = "admin-key"
+# For linux/mac
+export OPENML_TEST_SERVER_ADMIN_KEY="admin-key"
+```
+
+### Pull Request Checklist
+
+You can go to the `openml-python` GitHub repository to create the pull request by [comparing the branch](https://github.com/openml/openml-python/compare) from your fork with the `main` branch of the `openml-python` repository. When creating a pull request, make sure to follow the comments and structure provided by the template on GitHub.
+
+**An incomplete contribution** -- where you expect to do more work before
+receiving a full review -- should be submitted as a `draft`. These may be useful
+to: indicate you are working on something to avoid duplicated work,
+request broad review of functionality or API, or seek collaborators.
+Drafts often benefit from the inclusion of a
+[task list](https://github.com/blog/1375-task-lists-in-gfm-issues-pulls-comments)
+in the PR description.
+
+--- 
 
-The preferred workflow for contributing to the OpenML python connector is to
+# Appendix
+
+## Basic `git` Workflow
+
+The preferred workflow for contributing to openml-python is to
 fork the [main repository](https://github.com/openml/openml-python) on
-GitHub, clone, check out the branch `develop`, and develop on a new branch
+GitHub, clone, check out the branch `main`, and develop on a new feature
 branch. Steps:
 
+0. Make sure you have git installed, and a GitHub account.
+
 1. Fork the [project repository](https://github.com/openml/openml-python)
    by clicking on the 'Fork' button near the top right of the page. This creates
    a copy of the code under your GitHub user account. For more details on
@@ -15,186 +141,77 @@ branch. Steps:
 local disk:
 
    ```bash
-   $ git clone git@github.com:YourLogin/openml-python.git
-   $ cd openml-python
+   git clone git@github.com:YourLogin/openml-python.git
+   cd openml-python
    ```
 
 3. Switch to the ``develop`` branch:
 
    ```bash
-   $ git checkout develop
+   git checkout main
    ```
 
 3. Create a ``feature`` branch to hold your development changes:
 
    ```bash
-   $ git checkout -b feature/my-feature
+   git checkout -b feature/my-feature
    ```
 
-   Always use a ``feature`` branch. It's good practice to never work on the ``master`` or ``develop`` branch! 
+   Always use a ``feature`` branch. It's good practice to never work on the ``main`` branch! 
    To make the nature of your pull request easily visible, please prepend the name of the branch with the type of changes you want to merge, such as ``feature`` if it contains a new feature, ``fix`` for a bugfix, ``doc`` for documentation and ``maint`` for other maintenance on the package.
 
 4. Develop the feature on your feature branch. Add changed files using ``git add`` and then ``git commit`` files:
 
    ```bash
-   $ git add modified_files
-   $ git commit
+   git add modified_files
+   git commit
    ```
 
    to record your changes in Git, then push the changes to your GitHub account with:
 
    ```bash
-   $ git push -u origin my-feature
+   git push -u origin feature/my-feature
    ```
 
 5. Follow [these instructions](https://help.github.com/articles/creating-a-pull-request-from-a-fork)
-to create a pull request from your fork. This will send an email to the committers.
+to create a pull request from your fork.
 
 (If any of the above seems like magic to you, please look up the
 [Git documentation](https://git-scm.com/documentation) on the web, or ask a friend or another contributor for help.)
 
-Pull Request Checklist
-----------------------
-
-We recommended that your contribution complies with the
-following rules before you submit a pull request:
-
--  Follow the
-   [pep8 style guide](https://www.python.org/dev/peps/pep-0008/).
-   With the following exceptions or additions:
-    - The max line length is 100 characters instead of 80.
-    - When creating a multi-line expression with binary operators, break before the operator.
-    - Add type hints to all function signatures.
-    (note: not all functions have type hints yet, this is work in progress.)
-    - Use the [`str.format`](https://docs.python.org/3/library/stdtypes.html#str.format) over [`printf`](https://docs.python.org/3/library/stdtypes.html#printf-style-string-formatting) style formatting.
-     E.g. use `"{} {}".format('hello', 'world')` not `"%s %s" % ('hello', 'world')`.
-     (note: old code may still use `printf`-formatting, this is work in progress.)
-
--  If your pull request addresses an issue, please use the pull request title
-   to describe the issue and mention the issue number in the pull request description. This will make sure a link back to the original issue is
-   created.
-
--  An incomplete contribution -- where you expect to do more work before
-   receiving a full review -- should be submitted as a `draft`. These may be useful
-   to: indicate you are working on something to avoid duplicated work,
-   request broad review of functionality or API, or seek collaborators.
-   Drafts often benefit from the inclusion of a
-   [task list](https://github.com/blog/1375-task-lists-in-gfm-issues-pulls-comments)
-   in the PR description.
-   
-- Add [unit tests](https://github.com/openml/openml-python/tree/develop/tests) and [examples](https://github.com/openml/openml-python/tree/develop/examples) for any new functionality being introduced. 
-    - If an unit test contains an upload to the test server, please ensure that it is followed by a file collection for deletion, to prevent the test server from bulking up. For example, `TestBase._mark_entity_for_removal('data', dataset.dataset_id)`, `TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))`.
-    - Please ensure that the example is run on the test server by beginning with the call to `openml.config.start_using_configuration_for_example()`.      
-
--  All tests pass when running `pytest`. On
-   Unix-like systems, check with (from the toplevel source folder):
-
-      ```bash
-      $ pytest
-      ```
-   
-   For Windows systems, execute the command from an Anaconda Prompt or add `pytest` to PATH before executing the command.
-
--  Documentation and high-coverage tests are necessary for enhancements to be
-   accepted. Bug-fixes or new features should be provided with
-   [non-regression tests](https://en.wikipedia.org/wiki/Non-regression_testing).
-   These tests verify the correct behavior of the fix or feature. In this
-   manner, further modifications on the code base are granted to be consistent
-   with the desired behavior.
-   For the Bug-fixes case, at the time of the PR, this tests should fail for
-   the code base in develop and pass for the PR code.
-
- - Add your changes to the changelog in the file doc/progress.rst.
-
-
-You can also check for common programming errors with the following
-tools:
-
--  Code with good unittest **coverage** (at least 80%), check with:
-
-  ```bash
-  $ pip install pytest pytest-cov
-  $ pytest --cov=. path/to/tests_for_package
-  ```
-
--  No style warnings, check with:
-
-  ```bash
-  $ pip install flake8
-  $ flake8 --ignore E402,W503 --show-source --max-line-length 100
-  ```
-
--  No mypy (typing) issues, check with:
-
-  ```bash
-  $ pip install mypy
-  $ mypy openml --ignore-missing-imports --follow-imports skip
-  ```
-
-Filing bugs
------------
-We use GitHub issues to track all bugs and feature requests; feel free to
-open an issue if you have found a bug or wish to see a feature implemented.
-
-It is recommended to check that your issue complies with the
-following rules before submitting:
-
--  Verify that your issue is not being currently addressed by other
-   [issues](https://github.com/openml/openml-python/issues)
-   or [pull requests](https://github.com/openml/openml-python/pulls).
-
--  Please ensure all code snippets and error messages are formatted in
-   appropriate code blocks.
-   See [Creating and highlighting code blocks](https://help.github.com/articles/creating-and-highlighting-code-blocks).
-
--  Please include your operating system type and version number, as well
-   as your Python, openml, scikit-learn, numpy, and scipy versions. This information
-   can be found by running the following code snippet:
-
-  ```python
-  import platform; print(platform.platform())
-  import sys; print("Python", sys.version)
-  import numpy; print("NumPy", numpy.__version__)
-  import scipy; print("SciPy", scipy.__version__)
-  import sklearn; print("Scikit-Learn", sklearn.__version__)
-  import openml; print("OpenML", openml.__version__)
-  ```
-
-New contributor tips
---------------------
-
-A great way to start contributing to openml-python is to pick an item
-from the list of [Good First Issues](https://github.com/openml/openml-python/labels/Good%20first%20issue)
-in the issue tracker. Resolving these issues allow you to start
-contributing to the project without much prior knowledge. Your
-assistance in this area will be greatly appreciated by the more
-experienced developers as it helps free up their time to concentrate on
-other issues.
-
-Documentation
--------------
-
-We are glad to accept any sort of documentation: function docstrings,
-reStructuredText documents (like this one), tutorials, etc.
-reStructuredText documents live in the source code repository under the
-doc/ directory.
-
-You can edit the documentation using any text editor and then generate
-the HTML output by typing ``make html`` from the doc/ directory.
-The resulting HTML files will be placed in ``build/html/`` and are viewable in
-a web browser. See the ``README`` file in the ``doc/`` directory for more
-information.
-
-For building the documentation, you will need
-[sphinx](http://sphinx.pocoo.org/),
-[sphinx-bootstrap-theme](https://ryan-roemer.github.io/sphinx-bootstrap-theme/),
-[sphinx-gallery](https://sphinx-gallery.github.io/)
-and
-[numpydoc](https://numpydoc.readthedocs.io/en/latest/).
+
+## Pre-commit Details
+[Pre-commit](https://pre-commit.com/) is used for various style checking and code formatting.
+Before each commit, it will automatically run:
+ - [ruff](https://docs.astral.sh/ruff/), a code formatter and linter.
+   This will automatically format your code.
+   Make sure to take a second look after any formatting takes place,
+   if the resulting code is very bloated, consider a (small) refactor.
+ - [mypy](https://mypy.readthedocs.io/en/stable/), a static type checker.
+   In particular, make sure each function you work on has type hints.
+    
+If you want to run the pre-commit tests without doing a commit, run:
+```bash
+make check
+```
+or on a system without make, like Windows:
+```bash
+pre-commit run --all-files
+```
+Make sure to do this at least once before your first commit to check your setup works.
+
+## Contributing to the Documentation
+
+We welcome all forms of documentation contributions — whether it's Markdown docstrings, tutorials, guides, or general improvements.
+
+Our documentation is written either in Markdown or as a Jupyter notebook and lives in the docs/ and examples/ directories of the source code repository.
+
+To preview the documentation locally, you will need to install a few additional dependencies:
 ```bash
-$ pip install sphinx sphinx-bootstrap-theme sphinx-gallery numpydoc
+uv pip install -e ".[examples,docs]"
 ```
 When dependencies are installed, run
 ```bash
-$ sphinx-build -b html doc YOUR_PREFERRED_OUTPUT_DIRECTORY
+mkdocs serve
 ```
+This will start a local development server with a live preview of the website (by default at http://127.0.0.1:8000).
\ No newline at end of file
diff --git a/Makefile b/Makefile
index c36acbe9f..a25e2972c 100644
--- a/Makefile
+++ b/Makefile
@@ -7,9 +7,12 @@ CTAGS ?= ctags
 
 all: clean inplace test
 
+check:
+	pre-commit run --all-files
+
 clean:
 	$(PYTHON) setup.py clean
-	rm -rf dist
+	rm -rf dist openml.egg-info
 
 in: inplace # just a shortcut
 inplace:
@@ -17,11 +20,9 @@ inplace:
 
 test-code: in
 	$(PYTEST) -s -v tests
-test-doc:
-	$(PYTEST) -s -v doc/*.rst
 
 test-coverage:
 	rm -rf coverage .coverage
 	$(PYTEST) -s -v --cov=. tests
 
-test: test-code test-sphinxext test-doc
+test: test-code
diff --git a/PULL_REQUEST_TEMPLATE.md b/PULL_REQUEST_TEMPLATE.md
deleted file mode 100644
index 571ae0d1c..000000000
--- a/PULL_REQUEST_TEMPLATE.md
+++ /dev/null
@@ -1,27 +0,0 @@
-<!--
-Thanks for contributing a pull request to the OpenML python connector! Please ensure you have taken a look at
-the contribution guidelines: https://github.com/openml/openml-python/blob/master/CONTRIBUTING.md#Contributing-Pull-Requests
-
-Please make sure that:
-
-* this pull requests is against the `develop` branch
-* you updated all docs, this includes the changelog!
-* for any new function or class added, please add it to doc/api.rst
-    * the list of classes and functions should be alphabetical 
-* for any new functionality, consider adding a relevant example
-* add unit tests for new functionalities
-    * collect files uploaded to test server using _mark_entity_for_removal()
--->
-
-#### Reference Issue
-<!-- Example: Fixes #1234 -->
-
-
-#### What does this PR implement/fix? Explain your changes.
-
-
-#### How should this PR be tested?
-
-
-#### Any other comments?
-
diff --git a/README.md b/README.md
index b5e4d6c0c..974c9fa53 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,102 @@
+
+
+<div align="center">
+
+<div id="user-content-toc">
+  <ul align="center" style="list-style: none;">
+    <summary>
+      <img src="https://github.com/openml/openml.org/blob/master/app/public/static/svg/logo.svg" width="50" alt="OpenML Logo"/>
+      <h1>OpenML-Python</h1>
+      <img src="https://github.com/openml/docs/blob/master/docs/img/python.png" width="50" alt="Python Logo"/>
+    </summary>
+  </ul>
+</div>
+
+## The Python API for a World of Data and More :dizzy:
+
+[![Latest Release](https://img.shields.io/github/v/release/openml/openml-python)](https://github.com/openml/openml-python/releases)
+[![Python Versions](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12%20%7C%203.13%20%7C%203.14-blue)](https://pypi.org/project/openml/)
+[![Downloads](https://static.pepy.tech/badge/openml)](https://pepy.tech/project/openml)
 [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause)
+<!-- Add green badges for CI and precommit -->
+
+[Installation](https://openml.github.io/openml-python/main/#how-to-get-openml-for-python) | [Documentation](https://openml.github.io/openml-python) | [Contribution guidelines](https://github.com/openml/openml-python/blob/main/CONTRIBUTING.md)
+</div>
+
+OpenML-Python provides an easy-to-use and straightforward Python interface for [OpenML](http://openml.org), an online platform for open science collaboration in machine learning.
+It can download or upload data from OpenML, such as datasets and machine learning experiment results.
+
+## :joystick: Minimal Example
+
+Use the following code to get the [credit-g](https://www.openml.org/search?type=data&sort=runs&status=active&id=31) [dataset](https://docs.openml.org/concepts/data/):
+
+```python
+import openml
+
+dataset = openml.datasets.get_dataset("credit-g") # or by ID get_dataset(31)
+X, y, categorical_indicator, attribute_names = dataset.get_data(target="class")
+```
+
+Get a [task](https://docs.openml.org/concepts/tasks/) for [supervised classification on credit-g](https://www.openml.org/search?type=task&id=31&source_data.data_id=31):
+
+```python
+import openml
+
+task = openml.tasks.get_task(31)
+dataset = task.get_dataset()
+X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name)
+# get splits for the first fold of 10-fold cross-validation
+train_indices, test_indices = task.get_train_test_split_indices(fold=0)
+```
+
+Use an [OpenML benchmarking suite](https://docs.openml.org/concepts/benchmarking/) to get a curated list of machine-learning tasks:
+```python
+import openml
+
+suite = openml.study.get_suite("amlb-classification-all")  # Get a curated list of tasks for classification
+for task_id in suite.tasks:
+    task = openml.tasks.get_task(task_id)
+```
+
+## :magic_wand: Installation
+
+OpenML-Python is supported on Python 3.10 - 3.14 and is available on Linux, macOS, and Windows.
+
+You can install OpenML-Python with:
+
+```bash
+pip install openml
+```
+
+## :page_facing_up: Citing OpenML-Python
 
-A python interface for [OpenML](http://openml.org). You can find the documentation on the [openml-python website](https://openml.github.io/openml-python).
+If you use OpenML-Python in a scientific publication, we would appreciate a reference to the following paper:
 
-Please commit to the right branches following the gitflow pattern:
-http://nvie.com/posts/a-successful-git-branching-model/
+[Matthias Feurer, Jan N. van Rijn, Arlind Kadra, Pieter Gijsbers, Neeratyoy Mallik, Sahithya Ravi, Andreas Müller, Joaquin Vanschoren, Frank Hutter<br/>
+**OpenML-Python: an extensible Python API for OpenML**<br/>
+Journal of Machine Learning Research, 22(100):1−5, 2021](https://www.jmlr.org/papers/v22/19-920.html)
 
-Master branch:
+Bibtex entry:
+```bibtex
+@article{JMLR:v22:19-920,
+  author  = {Matthias Feurer and Jan N. van Rijn and Arlind Kadra and Pieter Gijsbers and Neeratyoy Mallik and Sahithya Ravi and Andreas Müller and Joaquin Vanschoren and Frank Hutter},
+  title   = {OpenML-Python: an extensible Python API for OpenML},
+  journal = {Journal of Machine Learning Research},
+  year    = {2021},
+  volume  = {22},
+  number  = {100},
+  pages   = {1--5},
+  url     = {http://jmlr.org/papers/v22/19-920.html}
+}
+```
+## :handshake: Contributing
 
-[![Build Status](https://travis-ci.org/openml/openml-python.svg?branch=master)](https://travis-ci.org/openml/openml-python)
-[![Code Health](https://landscape.io/github/openml/openml-python/master/landscape.svg)](https://landscape.io/github/openml/openml-python/master)
-[![Coverage Status](https://coveralls.io/repos/github/openml/openml-python/badge.svg?branch=master)](https://coveralls.io/github/openml/openml-python?branch=master)
+We welcome contributions from both new and experienced developers!
 
-Development branch:
+If you would like to contribute to OpenML-Python, please read our  
+[Contribution Guidelines](https://github.com/openml/openml-python/blob/main/CONTRIBUTING.md).
 
-[![Build Status](https://travis-ci.org/openml/openml-python.svg?branch=develop)](https://travis-ci.org/openml/openml-python)
-[![Code Health](https://landscape.io/github/openml/openml-python/master/landscape.svg)](https://landscape.io/github/openml/openml-python/master)
-[![Coverage Status](https://coveralls.io/repos/github/openml/openml-python/badge.svg?branch=develop)](https://coveralls.io/github/openml/openml-python?branch=develop)
+If you are new to open-source development, a great way to get started is by
+looking at issues labeled **"good first issue"** in our GitHub issue tracker.
+These tasks are beginner-friendly and help you understand the project structure,
+development workflow, and how to submit a pull request.
diff --git a/appveyor.yml b/appveyor.yml
deleted file mode 100644
index 8a8da9963..000000000
--- a/appveyor.yml
+++ /dev/null
@@ -1,46 +0,0 @@
-clone_folder: C:\\projects\\openml-python
-
-environment:
-# global:
-#     CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\appveyor\\scikit-learn-contrib\\run_with_env.cmd"
-
- matrix:
-    - PYTHON: "C:\\Python35-x64"
-      PYTHON_VERSION: "3.5"
-      PYTHON_ARCH: "64"
-      MINICONDA: "C:\\Miniconda35-x64"
-
-matrix:
-    fast_finish: true
-
-
-install:
-  # Miniconda is pre-installed in the worker build
-  - "SET PATH=%MINICONDA%;%MINICONDA%\\Scripts;%PATH%"
-  - "python -m pip install -U pip"
-
-  # Check that we have the expected version and architecture for Python
-  - "python --version"
-  - "python -c \"import struct; print(struct.calcsize('P') * 8)\""
-  - "pip --version"
-
-  # Remove cygwin because it clashes with conda
-  # see http://help.appveyor.com/discussions/problems/3712-git-remote-https-seems-to-be-broken
-  - rmdir C:\\cygwin /s /q
-
-  # Update previous packages and install the build and runtime dependencies of the project.
-  - conda update conda --yes
-  - conda update --all --yes
-
-  # Install the build and runtime dependencies of the project.
-  - "cd C:\\projects\\openml-python"
-  - "pip install .[examples,test]"
-  - conda install --quiet --yes scikit-learn=0.20.0
-
-
-# Not a .NET project, we build scikit-learn in the install step instead
-build: false
-
-test_script:
-  - "cd C:\\projects\\openml-python"
-  - "%CMD_IN_ENV% pytest -n 4 --timeout=600 --timeout-method=thread -sv --ignore='test_OpenMLDemo.py'"
diff --git a/appveyor/run_with_env.cmd b/appveyor/run_with_env.cmd
deleted file mode 100644
index 5da547c49..000000000
--- a/appveyor/run_with_env.cmd
+++ /dev/null
@@ -1,88 +0,0 @@
-:: To build extensions for 64 bit Python 3, we need to configure environment
-:: variables to use the MSVC 2010 C++ compilers from GRMSDKX_EN_DVD.iso of:
-:: MS Windows SDK for Windows 7 and .NET Framework 4 (SDK v7.1)
-::
-:: To build extensions for 64 bit Python 2, we need to configure environment
-:: variables to use the MSVC 2008 C++ compilers from GRMSDKX_EN_DVD.iso of:
-:: MS Windows SDK for Windows 7 and .NET Framework 3.5 (SDK v7.0)
-::
-:: 32 bit builds, and 64-bit builds for 3.5 and beyond, do not require specific
-:: environment configurations.
-::
-:: Note: this script needs to be run with the /E:ON and /V:ON flags for the
-:: cmd interpreter, at least for (SDK v7.0)
-::
-:: More details at:
-:: https://github.com/cython/cython/wiki/64BitCythonExtensionsOnWindows
-:: http://stackoverflow.com/a/13751649/163740
-::
-:: Author: Olivier Grisel
-:: License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/
-::
-:: Notes about batch files for Python people:
-::
-:: Quotes in values are literally part of the values:
-::      SET FOO="bar"
-:: FOO is now five characters long: " b a r "
-:: If you don't want quotes, don't include them on the right-hand side.
-::
-:: The CALL lines at the end of this file look redundant, but if you move them
-:: outside of the IF clauses, they do not run properly in the SET_SDK_64==Y
-:: case, I don't know why.
-@ECHO OFF
-
-SET COMMAND_TO_RUN=%*
-SET WIN_SDK_ROOT=C:\Program Files\Microsoft SDKs\Windows
-SET WIN_WDK=c:\Program Files (x86)\Windows Kits\10\Include\wdf
-
-:: Extract the major and minor versions, and allow for the minor version to be
-:: more than 9.  This requires the version number to have two dots in it.
-SET MAJOR_PYTHON_VERSION=%PYTHON_VERSION:~0,1%
-IF "%PYTHON_VERSION:~3,1%" == "." (
-    SET MINOR_PYTHON_VERSION=%PYTHON_VERSION:~2,1%
-) ELSE (
-    SET MINOR_PYTHON_VERSION=%PYTHON_VERSION:~2,2%
-)
-
-:: Based on the Python version, determine what SDK version to use, and whether
-:: to set the SDK for 64-bit.
-IF %MAJOR_PYTHON_VERSION% == 2 (
-    SET WINDOWS_SDK_VERSION="v7.0"
-    SET SET_SDK_64=Y
-) ELSE (
-    IF %MAJOR_PYTHON_VERSION% == 3 (
-        SET WINDOWS_SDK_VERSION="v7.1"
-        IF %MINOR_PYTHON_VERSION% LEQ 4 (
-            SET SET_SDK_64=Y
-        ) ELSE (
-            SET SET_SDK_64=N
-            IF EXIST "%WIN_WDK%" (
-                :: See: https://connect.microsoft.com/VisualStudio/feedback/details/1610302/
-                REN "%WIN_WDK%" 0wdf
-            )
-        )
-    ) ELSE (
-        ECHO Unsupported Python version: "%MAJOR_PYTHON_VERSION%"
-        EXIT 1
-    )
-)
-
-IF %PYTHON_ARCH% == 64 (
-    IF %SET_SDK_64% == Y (
-        ECHO Configuring Windows SDK %WINDOWS_SDK_VERSION% for Python %MAJOR_PYTHON_VERSION% on a 64 bit architecture
-        SET DISTUTILS_USE_SDK=1
-        SET MSSdk=1
-        "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Setup\WindowsSdkVer.exe" -q -version:%WINDOWS_SDK_VERSION%
-        "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Bin\SetEnv.cmd" /x64 /release
-        ECHO Executing: %COMMAND_TO_RUN%
-        call %COMMAND_TO_RUN% || EXIT 1
-    ) ELSE (
-        ECHO Using default MSVC build environment for 64 bit architecture
-        ECHO Executing: %COMMAND_TO_RUN%
-        call %COMMAND_TO_RUN% || EXIT 1
-    )
-) ELSE (
-    ECHO Using default MSVC build environment for 32 bit architecture
-    ECHO Executing: %COMMAND_TO_RUN%
-    call %COMMAND_TO_RUN% || EXIT 1
-)
diff --git a/ci_scripts/create_doc.sh b/ci_scripts/create_doc.sh
deleted file mode 100644
index c9dd800a0..000000000
--- a/ci_scripts/create_doc.sh
+++ /dev/null
@@ -1,59 +0,0 @@
-set -euo pipefail
-
-# Check if DOCPUSH is set
-if ! [[ -z ${DOCPUSH+x} ]]; then
-
-    if [[ "$DOCPUSH" == "true" ]]; then
-
-        # install documentation building dependencies
-        pip install matplotlib seaborn sphinx pillow sphinx-gallery sphinx_bootstrap_theme cython numpydoc nbformat nbconvert
-
-        # $1 is the branch name
-        # $2 is the global variable where we set the script status
-
-        if ! { [ $1 = "master" ] || [ $1 = "develop" ]; }; then
-            { echo "Not one of the allowed branches"; exit 0; }
-        fi
-
-        # delete any previous documentation folder
-        if [ -d doc/$1 ]; then
-            rm -rf doc/$1
-        fi
-
-        # create the documentation
-        cd doc && make html 2>&1
-
-        # create directory with branch name
-        # the documentation for dev/stable from git will be stored here
-        mkdir $1
-
-        # get previous documentation from github
-        git clone https://github.com/openml/openml-python.git --branch gh-pages --single-branch
-
-        # copy previous documentation
-        cp -r openml-python/. $1
-        rm -rf openml-python
-
-        # if the documentation for the branch exists, remove it
-        if [ -d $1/$1 ]; then
-            rm -rf $1/$1
-        fi
-
-        # copy the updated documentation for this branch
-        mkdir $1/$1
-        cp -r build/html/. $1/$1
-
-        # takes a variable name as an argument and assigns the script outcome to a
-        # variable with the given name. If it got this far, the script was successful
-        function set_return() {
-            # $1 is the variable where we save the script outcome
-            local __result=$1
-            local  status='success'
-            eval $__result="'$status'"
-        }
-
-        set_return "$2"
-    fi
-fi
-# Workaround for travis failure
-set +u
diff --git a/ci_scripts/flake8_diff.sh b/ci_scripts/flake8_diff.sh
deleted file mode 100755
index d74577341..000000000
--- a/ci_scripts/flake8_diff.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-
-# Update /CONTRIBUTING.md if these commands change.
-# The reason for not advocating using this script directly is that it
-# might not work out of the box on Windows.
-flake8 --ignore E402,W503 --show-source --max-line-length 100 $options
-mypy openml --ignore-missing-imports --follow-imports skip
diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh
deleted file mode 100644
index ee8ec3b14..000000000
--- a/ci_scripts/install.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-# Deactivate the travis-provided virtual environment and setup a
-# conda-based environment instead
-deactivate
-
-# Use the miniconda installer for faster download / install of conda
-# itself
-pushd .
-cd
-mkdir -p download
-cd download
-echo "Cached in $HOME/download :"
-ls -l
-echo
-if [[ ! -f miniconda.sh ]]
-   then
-   wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \
-       -O miniconda.sh
-   fi
-chmod +x miniconda.sh && ./miniconda.sh -b -p $HOME/miniconda
-cd ..
-export PATH=/home/travis/miniconda/bin:$PATH
-conda update --yes conda
-popd
-
-# Configure the conda environment and put it in the path using the
-# provided versions
-conda create -n testenv --yes python=$PYTHON_VERSION pip
-source activate testenv
-
-if [[ -v SCIPY_VERSION ]]; then
-    conda install --yes scipy=$SCIPY_VERSION
-fi
-
-python --version
-pip install -e '.[test]'
-python -c "import numpy; print('numpy %s' % numpy.__version__)"
-python -c "import scipy; print('scipy %s' % scipy.__version__)"
-
-if [[ "$EXAMPLES" == "true" ]]; then
-    pip install -e '.[examples]'
-fi
-if [[ "$DOCTEST" == "true" ]]; then
-    pip install sphinx_bootstrap_theme
-fi
-if [[ "$COVERAGE" == "true" ]]; then
-    pip install codecov pytest-cov
-fi
-if [[ "$RUN_FLAKE8" == "true" ]]; then
-    pip install flake8 mypy
-fi
-
-# Install scikit-learn last to make sure the openml package installation works
-# from a clean environment without scikit-learn.
-pip install scikit-learn==$SKLEARN_VERSION
diff --git a/ci_scripts/success.sh b/ci_scripts/success.sh
deleted file mode 100644
index dbeb18e58..000000000
--- a/ci_scripts/success.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-set -e
-
-if [[ "$COVERAGE" == "true" ]]; then
-    # Need to run coveralls from a git checkout, so we copy .coverage
-    # from TEST_DIR where pytest has been run
-    cp $TEST_DIR/.coverage $TRAVIS_BUILD_DIR
-    cd $TRAVIS_BUILD_DIR
-    # Ignore coveralls failures as the coveralls server is not
-    # very reliable but we don't want travis to report a failure
-    # in the github UI just because the coverage report failed to
-    # be published.
-    codecov || echo "Codecov upload failed"
-fi
diff --git a/ci_scripts/test.sh b/ci_scripts/test.sh
deleted file mode 100644
index 9e7bc1326..000000000
--- a/ci_scripts/test.sh
+++ /dev/null
@@ -1,50 +0,0 @@
-set -e
-
-# check status and branch before running the unit tests
-before="`git status --porcelain -b`"
-before="$before"
-# storing current working directory
-curr_dir=`pwd`
-
-run_tests() {
-    # Get into a temp directory to run test from the installed scikit learn and
-    # check if we  do not leave artifacts
-    mkdir -p $TEST_DIR
-
-    cwd=`pwd`
-    test_dir=$cwd/tests
-    doctest_dir=$cwd/doc
-
-    cd $TEST_DIR
-    if [[ "$EXAMPLES" == "true" ]]; then
-        pytest -sv $test_dir/test_examples/
-    elif [[ "$DOCTEST" == "true" ]]; then
-        python -m doctest $doctest_dir/usage.rst
-    fi
-
-    if [[ "$COVERAGE" == "true" ]]; then
-        PYTEST_ARGS='--cov=openml'
-    else
-        PYTEST_ARGS=''
-    fi
-
-    pytest -n 4 --durations=20 --timeout=600 --timeout-method=thread -sv --ignore='test_OpenMLDemo.py' $PYTEST_ARGS $test_dir
-}
-
-if [[ "$RUN_FLAKE8" == "true" ]]; then
-    source ci_scripts/flake8_diff.sh
-fi
-
-if [[ "$SKIP_TESTS" != "true" ]]; then
-    run_tests
-fi
-
-# changing directory to stored working directory
-cd $curr_dir
-# check status and branch after running the unit tests
-# compares with $before to check for remaining files
-after="`git status --porcelain -b`"
-if [[ "$before" != "$after" ]]; then
-    echo "All generated files have not been deleted!"
-    exit 1
-fi
\ No newline at end of file
diff --git a/doc/.nojekyll b/doc/.nojekyll
deleted file mode 100644
index 8b1378917..000000000
--- a/doc/.nojekyll
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/doc/Makefile b/doc/Makefile
deleted file mode 100644
index 767a9927b..000000000
--- a/doc/Makefile
+++ /dev/null
@@ -1,181 +0,0 @@
-# Makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line.
-SPHINXOPTS    =
-SPHINXBUILD   = sphinx-build
-PAPER         =
-BUILDDIR      = build
-
-# User-friendly check for sphinx-build
-ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
-$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
-endif
-
-# Internal variables.
-PAPEROPT_a4     = -D latex_paper_size=a4
-PAPEROPT_letter = -D latex_paper_size=letter
-ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
-# the i18n builder cannot share the environment and doctrees with the others
-I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
-
-.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
-
-all: html
-
-help:
-	@echo "Please use \`make <target>' where <target> is one of"
-	@echo "  html       to make standalone HTML files"
-	@echo "  dirhtml    to make HTML files named index.html in directories"
-	@echo "  singlehtml to make a single large HTML file"
-	@echo "  pickle     to make pickle files"
-	@echo "  json       to make JSON files"
-	@echo "  htmlhelp   to make HTML files and a HTML help project"
-	@echo "  qthelp     to make HTML files and a qthelp project"
-	@echo "  devhelp    to make HTML files and a Devhelp project"
-	@echo "  epub       to make an epub"
-	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
-	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
-	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
-	@echo "  text       to make text files"
-	@echo "  man        to make manual pages"
-	@echo "  texinfo    to make Texinfo files"
-	@echo "  info       to make Texinfo files and run them through makeinfo"
-	@echo "  gettext    to make PO message catalogs"
-	@echo "  changes    to make an overview of all changed/added/deprecated items"
-	@echo "  xml        to make Docutils-native XML files"
-	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
-	@echo "  linkcheck  to check all external links for integrity"
-	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
-
-clean:
-	rm -rf $(BUILDDIR)/*
-	rm -rf generated/
-	rm -rf examples/
-
-html:
-	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
-	@echo
-	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
-
-dirhtml:
-	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
-	@echo
-	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
-
-singlehtml:
-	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
-	@echo
-	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
-
-pickle:
-	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
-	@echo
-	@echo "Build finished; now you can process the pickle files."
-
-json:
-	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
-	@echo
-	@echo "Build finished; now you can process the JSON files."
-
-htmlhelp:
-	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
-	@echo
-	@echo "Build finished; now you can run HTML Help Workshop with the" \
-	      ".hhp project file in $(BUILDDIR)/htmlhelp."
-
-qthelp:
-	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
-	@echo
-	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
-	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
-	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/OpenML.qhcp"
-	@echo "To view the help file:"
-	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/OpenML.qhc"
-
-devhelp:
-	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
-	@echo
-	@echo "Build finished."
-	@echo "To view the help file:"
-	@echo "# mkdir -p $$HOME/.local/share/devhelp/OpenML"
-	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/OpenML"
-	@echo "# devhelp"
-
-epub:
-	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
-	@echo
-	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
-
-latex:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
-	@echo
-	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
-	@echo "Run \`make' in that directory to run these through (pdf)latex" \
-	      "(use \`make latexpdf' here to do that automatically)."
-
-latexpdf:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
-	@echo "Running LaTeX files through pdflatex..."
-	$(MAKE) -C $(BUILDDIR)/latex all-pdf
-	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
-
-latexpdfja:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
-	@echo "Running LaTeX files through platex and dvipdfmx..."
-	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
-	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
-
-text:
-	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
-	@echo
-	@echo "Build finished. The text files are in $(BUILDDIR)/text."
-
-man:
-	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
-	@echo
-	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
-
-texinfo:
-	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
-	@echo
-	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
-	@echo "Run \`make' in that directory to run these through makeinfo" \
-	      "(use \`make info' here to do that automatically)."
-
-info:
-	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
-	@echo "Running Texinfo files through makeinfo..."
-	make -C $(BUILDDIR)/texinfo info
-	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
-
-gettext:
-	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
-	@echo
-	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
-
-changes:
-	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
-	@echo
-	@echo "The overview file is in $(BUILDDIR)/changes."
-
-linkcheck:
-	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
-	@echo
-	@echo "Link check complete; look for any errors in the above output " \
-	      "or in $(BUILDDIR)/linkcheck/output.txt."
-
-doctest:
-	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
-	@echo "Testing of doctests in the sources finished, look at the " \
-	      "results in $(BUILDDIR)/doctest/output.txt."
-
-xml:
-	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
-	@echo
-	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
-
-pseudoxml:
-	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
-	@echo
-	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
diff --git a/doc/_static/codehighlightstyle.css b/doc/_static/codehighlightstyle.css
deleted file mode 100644
index ab16693ee..000000000
--- a/doc/_static/codehighlightstyle.css
+++ /dev/null
@@ -1,7 +0,0 @@
-.highlight .n { color: #000000 } /* code */
-.highlight .c1 { color: #1d8908 } /* comments */
-.highlight .mi { color: #0d9fe3; font-weight: bold } /* integers */
-.highlight .s1 { color: #d73c00 } /* string */
-.highlight .o { color: #292929 } /* operators */
- /* Background color for code highlights. Color for bash highlights */
-pre { background-color: #fbfbfb; color: #000000 }
diff --git a/doc/_templates/class.rst b/doc/_templates/class.rst
deleted file mode 100644
index 307b0199c..000000000
--- a/doc/_templates/class.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-:mod:`{{module}}`.{{objname}}
-{{ underline }}==============
-
-.. currentmodule:: {{ module }}
-
-.. autoclass:: {{ objname }}
diff --git a/doc/_templates/class_without_init.rst b/doc/_templates/class_without_init.rst
deleted file mode 100644
index 79ff2cf80..000000000
--- a/doc/_templates/class_without_init.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-:mod:`{{module}}`.{{objname}}
-{{ underline }}==============
-
-.. currentmodule:: {{ module }}
-
-.. autoclass:: {{ objname }}
-
-.. include:: {{module}}.{{objname}}.examples
-
-.. raw:: html
-
-    <div class="clearer"></div>
diff --git a/doc/_templates/function.rst b/doc/_templates/function.rst
deleted file mode 100644
index d8c9bd480..000000000
--- a/doc/_templates/function.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-:mod:`{{module}}`.{{objname}}
-{{ underline }}====================
-
-.. currentmodule:: {{ module }}
-
-.. autofunction:: {{ objname }}
-
-.. raw:: html
-
-    <div class="clearer"></div>
diff --git a/doc/_templates/layout.html b/doc/_templates/layout.html
deleted file mode 100644
index 11777457e..000000000
--- a/doc/_templates/layout.html
+++ /dev/null
@@ -1,23 +0,0 @@
-{% extends "!layout.html" %}
-
-{# Custom CSS overrides #}
-{# set bootswatch_css_custom = ['_static/my-styles.css'] #}
-
-{# Add github banner (from: https://github.com/blog/273-github-ribbons). #}
-{% block header %}
-  {{ super() }}
-  <a href="https://github.com/openml/openml-python"
-     class="visible-desktop hidden-xs"><img
-    id="gh-banner"
-    style="position: absolute; top: 50px; right: 0; border: 0;"
-    src="https://s3.amazonaws.com/github/ribbons/forkme_right_red_aa0000.png"
-    alt="Fork me on GitHub"></a>
-  <script>
-    // Adjust banner height.
-    $(function () {
-      var navHeight = $(".navbar .container").css("height");
-      $("#gh-banner").css("top", navHeight);
-    });
-  </script>
-{% endblock %}
-
diff --git a/doc/api.rst b/doc/api.rst
deleted file mode 100644
index 7979c7bfc..000000000
--- a/doc/api.rst
+++ /dev/null
@@ -1,166 +0,0 @@
-:orphan:
-
-.. _api:
-
-APIs
-****
-
-Top-level Classes
------------------
-.. currentmodule:: openml
-
-.. autosummary::
-   :toctree: generated/
-   :template: class.rst
-
-   OpenMLBenchmarkSuite
-   OpenMLClassificationTask
-   OpenMLClusteringTask
-   OpenMLDataFeature
-   OpenMLDataset
-   OpenMLEvaluation
-   OpenMLFlow
-   OpenMLLearningCurveTask
-   OpenMLParameter
-   OpenMLRegressionTask
-   OpenMLRun
-   OpenMLSetup
-   OpenMLSplit
-   OpenMLStudy
-   OpenMLSupervisedTask
-   OpenMLTask
-
-.. _api_extensions:
-
-Extensions
-----------
-
-.. currentmodule:: openml.extensions
-
-.. autosummary::
-   :toctree: generated/
-   :template: class.rst
-
-   Extension
-   sklearn.SklearnExtension
-
-.. currentmodule:: openml.extensions
-
-.. autosummary::
-   :toctree: generated/
-   :template: function.rst
-
-    get_extension_by_flow
-    get_extension_by_model
-    register_extension
-
-
-Modules
--------
-
-:mod:`openml.datasets`: Dataset Functions
------------------------------------------
-.. currentmodule:: openml.datasets
-
-.. autosummary::
-   :toctree: generated/
-   :template: function.rst
-
-    attributes_arff_from_df
-    check_datasets_active
-    create_dataset
-    get_dataset
-    get_datasets
-    list_datasets
-    list_qualities
-    status_update
-
-:mod:`openml.evaluations`: Evaluation Functions
------------------------------------------------
-.. currentmodule:: openml.evaluations
-
-.. autosummary::
-   :toctree: generated/
-   :template: function.rst
-
-    list_evaluations
-    list_evaluation_measures
-
-:mod:`openml.flows`: Flow Functions
------------------------------------
-.. currentmodule:: openml.flows
-
-.. autosummary::
-   :toctree: generated/
-   :template: function.rst
-
-    assert_flows_equal
-    flow_exists
-    get_flow
-    list_flows
-
-:mod:`openml.runs`: Run Functions
-----------------------------------
-.. currentmodule:: openml.runs
-
-.. autosummary::
-   :toctree: generated/
-   :template: function.rst
-
-    get_run
-    get_runs
-    get_run_trace
-    initialize_model_from_run
-    initialize_model_from_trace
-    list_runs
-    run_model_on_task
-    run_flow_on_task
-    run_exists
-
-:mod:`openml.setups`: Setup Functions
--------------------------------------
-.. currentmodule:: openml.setups
-
-.. autosummary::
-   :toctree: generated/
-   :template: function.rst
-
-    get_setup
-    initialize_model
-    list_setups
-    setup_exists
-
-:mod:`openml.study`: Study Functions
-------------------------------------
-.. currentmodule:: openml.study
-
-.. autosummary::
-   :toctree: generated/
-   :template: function.rst
-
-   attach_to_study
-   attach_to_suite
-   create_benchmark_suite
-   create_study
-   delete_study
-   delete_suite
-   detach_from_study
-   detach_from_suite
-   get_study
-   get_suite
-   list_studies
-   list_suites
-   update_study_status
-   update_suite_status
-
-:mod:`openml.tasks`: Task Functions
------------------------------------
-.. currentmodule:: openml.tasks
-
-.. autosummary::
-   :toctree: generated/
-   :template: function.rst
-
-    get_task
-    get_tasks
-    list_tasks
diff --git a/doc/conf.py b/doc/conf.py
deleted file mode 100644
index 03a2ec0db..000000000
--- a/doc/conf.py
+++ /dev/null
@@ -1,358 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# OpenML documentation build configuration file, created by
-# sphinx-quickstart on Wed Nov 26 10:46:10 2014.
-#
-# This file is execfile()d with the current directory set to its
-# containing dir.
-#
-# Note that not all possible configuration values are present in this
-# autogenerated file.
-#
-# All configuration values have a default; values that are commented out
-# serve to show the default.
-
-import os
-import sys
-import sphinx_bootstrap_theme
-import time
-import openml
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-# sys.path.insert(0, os.path.abspath('.')# )
-
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))
-
-# -- General configuration ------------------------------------------------
-
-# If your documentation needs a minimal Sphinx version, state it here.
-# needs_sphinx = '1.0'
-
-#  Add any Sphinx extension module names here, as strings. They can be
-#  extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-#  ones.
-extensions = [
-    'sphinx.ext.autodoc',
-    'sphinx.ext.autosummary',
-    'sphinx.ext.doctest',
-    'sphinx.ext.coverage',
-    'sphinx.ext.mathjax',
-    'sphinx.ext.ifconfig',
-    'sphinx.ext.autosectionlabel',
-    'sphinx_gallery.gen_gallery',
-    'numpydoc'
-]
-
-autosummary_generate = True
-numpydoc_show_class_members = False
-
-autodoc_default_flags = ['members', 'inherited-members']
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-
-# The suffix of source filenames.
-source_suffix = '.rst'
-
-# The encoding of source files.
-# source_encoding = 'utf-8-sig'
-
-# The master toctree document.
-master_doc = 'index'
-
-# General information about the project.
-project = u'OpenML'
-copyright = (
-    u'2014-{}, the OpenML-Python team.'.format(time.strftime("%Y,%m,%d,%H,%M,%S").split(',')[0])
-)
-
-# The version info for the project you're documenting, acts as replacement for
-# |version| and |release|, also used in various other places throughout the
-# built documents.
-#
-# The short X.Y version.
-version = openml.__version__
-# The full version, including alpha/beta/rc tags.
-release = openml.__version__
-
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-# language = None
-
-# There are two options for replacing |today|: either, you set today to some
-# non-false value, then it is used:
-# today = ''
-# Else, today_fmt is used as the format for a strftime call.
-# today_fmt = '%B %d, %Y'
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-exclude_patterns = ['_build', '_templates', '_static']
-
-# The reST default role (used for this markup: `text`) to use for all
-# documents.
-# default_role = None
-
-# If true, '()' will be appended to :func: etc. cross-reference text.
-# add_function_parentheses = True
-
-# If true, the current module name will be prepended to all description
-# unit titles (such as .. function::).
-# add_module_names = True
-
-# If true, sectionauthor and moduleauthor directives will be shown in the
-# output. They are ignored by default.
-# show_authors = False
-
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
-
-# A list of ignored prefixes for module index sorting.
-# modindex_common_prefix = []
-
-# If true, keep warnings as "system message" paragraphs in the built documents.
-# keep_warnings = False
-
-
-# -- Options for HTML output ----------------------------------------------
-
-# The theme to use for HTML and HTML Help pages.  See the documentation for
-# a list of builtin themes.
-html_theme = 'bootstrap'
-
-html_theme_options = {
-    # Navigation bar title. (Default: ``project`` value)
-    'navbar_title': "OpenML",
-
-    # Tab name for entire site. (Default: "Site")
-    # 'navbar_site_name': "Site",
-
-    # A list of tuples containting pages to link to.  The value should
-    # be in the form [(name, page), ..]
-    'navbar_links': [
-        ('Start', 'index'),
-        ('User Guide', 'usage'),
-        ('API', 'api'),
-        ('Examples', 'examples/index'),
-        ('Contributing', 'contributing'),
-        ('Changelog', 'progress'),
-    ],
-
-    # Render the next and previous page links in navbar. (Default: true)
-    'navbar_sidebarrel': False,
-
-    # Render the current pages TOC in the navbar. (Default: true)
-    'navbar_pagenav': False,
-
-    # Tab name for the current pages TOC. (Default: "Page")
-    'navbar_pagenav_name': "On this page",
-
-    # Global TOC depth for "site" navbar tab. (Default: 1)
-    # Switching to -1 shows all levels.
-    'globaltoc_depth': 1,
-
-    # Include hidden TOCs in Site navbar?
-    #
-    # Note: If this is "false", you cannot have mixed ``:hidden:`` and
-    # non-hidden ``toctree`` directives in the same page, or else the build
-    # will break.
-    #
-    # Values: "true" (default) or "false"
-    'globaltoc_includehidden': "false",
-
-    # HTML navbar class (Default: "navbar") to attach to <div> element.
-    # For black navbar, do "navbar navbar-inverse"
-    'navbar_class': "navbar",
-
-    # Fix navigation bar to top of page?
-    # Values: "true" (default) or "false"
-    'navbar_fixed_top': "true",
-
-    # Location of link to source.
-    # Options are "nav" (default), "footer" or anything else to exclude.
-    'source_link_position': "None",
-
-    # Bootswatch (http://bootswatch.com/) theme.
-    #
-    # Options are nothing with "" (default) or the name of a valid theme
-    # such as "amelia" or "cosmo".
-    'bootswatch_theme': "flatly",
-
-    # Choose Bootstrap version.
-    # Values: "3" (default) or "2" (in quotes)
-    'bootstrap_version': "3",
-}
-
-# Add any paths that contain custom themes here, relative to this directory.
-html_theme_path = sphinx_bootstrap_theme.get_html_theme_path()
-
-# The name for this set of Sphinx documents.  If None, it defaults to
-# "<project> v<release> documentation".
-# html_title = None
-
-# A shorter title for the navigation bar.  Default is the same as html_title.
-# html_short_title = None
-
-# The name of an image file (relative to this directory) to place at the top
-# of the sidebar.
-# html_logo = None
-
-# The name of an image file (within the static path) to use as favicon of the
-# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
-# pixels large.
-# html_favicon = None
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ["_static"]
-
-# Add any extra paths that contain custom files (such as robots.txt or
-# .htaccess) here, relative to this directory. These files are copied
-# directly to the root of the documentation.
-# html_extra_path = []
-
-# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
-# using the given strftime format.
-# html_last_updated_fmt = '%b %d, %Y'
-
-# If true, SmartyPants will be used to convert quotes and dashes to
-# typographically correct entities.
-# html_use_smartypants = True
-
-# Custom sidebar templates, maps document names to template names.
-html_sidebars = {'**': ['localtoc.html']}
-
-# Additional templates that should be rendered to pages, maps page names to
-# template names.
-# html_additional_pages = {}
-
-# If false, no module index is generated.
-# html_domain_indices = True
-
-# If false, no index is generated.
-# html_use_index = True
-
-# If true, the index is split into individual pages for each letter.
-# html_split_index = False
-
-# If true, links to the reST sources are added to the pages.
-# html_show_sourcelink = True
-
-# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
-# html_show_sphinx = True
-
-# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
-# html_show_copyright = True
-
-# If true, an OpenSearch description file will be output, and all pages will
-# contain a <link> tag referring to it.  The value of this option must be the
-# base URL from which the finished HTML is served.
-# html_use_opensearch = ''
-
-# This is the file name suffix for HTML files (e.g. ".xhtml").
-# html_file_suffix = None
-
-# Output file base name for HTML help builder.
-htmlhelp_basename = 'OpenMLdoc'
-
-
-# -- Options for LaTeX output ---------------------------------------------
-
-latex_elements = {
-    # The paper size ('letterpaper' or 'a4paper').
-    # 'papersize': 'letterpaper',
-
-    # The font size ('10pt', '11pt' or '12pt').
-    # 'pointsize': '10pt',
-
-    # Additional stuff for the LaTeX preamble.
-    # 'preamble': '',
-}
-
-# Grouping the document tree into LaTeX files. List of tuples
-# (source start file, target name, title,
-#  author, documentclass [howto, manual, or own class]).
-latex_documents = [('index', 'OpenML.tex', u'OpenML Documentation',
-                    u'Matthias Feurer', 'manual'), ]
-
-# The name of an image file (relative to this directory) to place at the top of
-# the title page.
-# latex_logo = None
-
-# For "manual" documents, if this is true, then toplevel headings are parts,
-# not chapters.
-# latex_use_parts = False
-
-# If true, show page references after internal links.
-# latex_show_pagerefs = False
-
-# If true, show URL addresses after external links.
-# latex_show_urls = False
-
-# Documents to append as an appendix to all manuals.
-# latex_appendices = []
-
-# If false, no module index is generated.
-# latex_domain_indices = True
-
-
-# -- Options for manual page output ---------------------------------------
-
-# One entry per manual page. List of tuples
-# (source start file, name, description, authors, manual section).
-man_pages = [
-    ('index', 'openml', u'OpenML Documentation',
-     [u'Matthias Feurer'], 1)
-]
-
-# If true, show URL addresses after external links.
-# man_show_urls = False
-
-
-# -- Options for Texinfo output -------------------------------------------
-
-# Grouping the document tree into Texinfo files. List of tuples
-# (source start file, target name, title, author,
-#  dir menu entry, description, category)
-texinfo_documents = [
-    ('index', 'OpenML', u'OpenML Documentation',
-     u'Matthias Feurer', 'OpenML', 'One line description of project.',
-     'Miscellaneous'),
-]
-
-# Documents to append as an appendix to all manuals.
-# texinfo_appendices = []
-
-# If false, no module index is generated.
-# texinfo_domain_indices = True
-
-# How to display URL addresses: 'footnote', 'no', or 'inline'.
-# texinfo_show_urls = 'footnote'
-
-# If true, do not generate a @detailmenu in the "Top" node's menu.
-# texinfo_no_detailmenu = False
-
-# prefix each section label with the name of the document it is in,
-# in order to avoid ambiguity when there are multiple same section
-# labels in different documents.
-autosectionlabel_prefix_document = True
-# Sphinx-gallery configuration.
-sphinx_gallery_conf = {
-    # disable mini galleries clustered by the used functions
-    'backreferences_dir': False,
-    # path to the examples
-    'examples_dirs': '../examples',
-    # path where to save gallery generated examples
-    'gallery_dirs': 'examples',
-    # compile execute examples in the examples dir
-    'filename_pattern': '.*example.py$|.*tutorial.py$',
-    # TODO: fix back/forward references for the examples.
-}
-
-
-def setup(app):
-    app.add_stylesheet("codehighlightstyle.css")
diff --git a/doc/contributing.rst b/doc/contributing.rst
deleted file mode 100644
index e614c8a25..000000000
--- a/doc/contributing.rst
+++ /dev/null
@@ -1,162 +0,0 @@
-:orphan:
-
-.. _contributing:
-
-
-============
-Contributing
-============
-
-Contribution to the OpenML package is highly appreciated. Currently,
-there is a lot of work left on implementing API calls,
-testing them and providing examples to allow new users to easily use the
-OpenML package. See the :ref:`issues` section for open tasks.
-
-Please mark yourself as contributor in a github issue if you start working on
-something to avoid duplicate work. If you're part of the OpenML organization
-you can use github's assign feature, otherwise you can just leave a comment.
-
-.. _scope:
-
-Scope of the package
-====================
-
-The scope of the OpenML python package is to provide a python interface to
-the OpenML platform which integrates well with pythons scientific stack, most
-notably `numpy <http://www.numpy.org/>`_ and `scipy <https://www.scipy.org/>`_.
-To reduce opportunity costs and demonstrate the usage of the package, it also
-implements an interface to the most popular machine learning package written
-in python, `scikit-learn <http://scikit-learn.org/stable/index.html>`_.
-Thereby it will automatically be compatible with many machine learning
-libraries written in Python.
-
-We aim to keep the package as light-weight as possible and we will try to
-keep the number of potential installation dependencies as low as possible.
-Therefore, the connection to other machine learning libraries such as
-*pytorch*, *keras* or *tensorflow* should not be done directly inside this
-package, but in a separate package using the OpenML python connector.
-
-.. _issues:
-
-Open issues and potential todos
-===============================
-
-We collect open issues and feature requests in an `issue tracker on github <https://github.com/openml/openml-python/issues>`_.
-The issue tracker contains issues marked as *Good first issue*, which shows
-issues which are good for beginners. We also maintain a somewhat up-to-date
-`roadmap <https://github.com/openml/openml-python/issues/410>`_ which
-contains longer-term goals.
-
-.. _how_to_contribute:
-
-How to contribute
-=================
-
-There are many ways to contribute to the development of the OpenML python
-connector and OpenML in general. We welcome all kinds of contributions,
-especially:
-
-* Source code which fixes an issue, improves usability or implements a new
-  feature.
-* Improvements to the documentation, which can be found in the ``doc``
-  directory.
-* New examples - current examples can be found in the ``examples`` directory.
-* Bug reports - if something doesn't work for you or is cumbersome, please
-  open a new issue to let us know about the problem.
-* Use the package and spread the word.
-* `Cite OpenML <https://www.openml.org/cite>`_ if you use it in a scientific
-  publication.
-* Visit one of our `hackathons <https://meet.openml.org/>`_.
-* Check out how to `contribute to the main OpenML project <https://github.com/openml/OpenML/blob/master/CONTRIBUTING.md>`_.
-
-Contributing code
-~~~~~~~~~~~~~~~~~
-
-Our guidelines on code contribution can be found in `this file <https://github.com/openml/openml-python/blob/master/CONTRIBUTING.md>`_.
-
-.. _installation:
-
-Installation
-============
-
-Installation from github
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-The package source code is available from
-`github <https://github.com/openml/openml-python>`_ and can be obtained with:
-
-.. code:: bash
-
-    git clone https://github.com/openml/openml-python.git
-
-
-Once you cloned the package, change into the new directory.
-If you are a regular user, install with
-
-.. code:: bash
-
-    pip install -e .
-
-If you are a contributor, you will also need to install test dependencies
-
-.. code:: bash
-
-    pip install -e ".[test]"
-
-
-Testing
-=======
-
-From within the directory of the cloned package, execute:
-
-.. code:: bash
-
-    pytest tests/
-
-Executing a specific test can be done by specifying the module, test case, and test.
-To obtain a hierarchical list of all tests, run
-
-.. code:: bash
-
-    pytest --collect-only
-
-.. code:: bash
-
-    <Module 'tests/test_datasets/test_dataset.py'>
-      <UnitTestCase 'OpenMLDatasetTest'>
-        <TestCaseFunction 'test_dataset_format_constructor'>
-        <TestCaseFunction 'test_get_data'>
-        <TestCaseFunction 'test_get_data_rowid_and_ignore_and_target'>
-        <TestCaseFunction 'test_get_data_with_ignore_attributes'>
-        <TestCaseFunction 'test_get_data_with_rowid'>
-        <TestCaseFunction 'test_get_data_with_target'>
-      <UnitTestCase 'OpenMLDatasetTestOnTestServer'>
-        <TestCaseFunction 'test_tagging'>
-
-
-To run a specific module, add the module name, for instance:
-
-.. code:: bash
-
-    pytest tests/test_datasets/test_dataset.py
-
-To run a specific unit test case, add the test case name, for instance:
-
-.. code:: bash
-
-    pytest tests/test_datasets/test_dataset.py::OpenMLDatasetTest
-
-To run a specific unit test, add the test name, for instance:
-
-.. code:: bash
-
-    pytest tests/test_datasets/test_dataset.py::OpenMLDatasetTest::test_get_data
-
-Happy testing!
-
-
-Connecting new machine learning libraries
-=========================================
-
-Coming soon - please stay tuned!
-
diff --git a/doc/index.rst b/doc/index.rst
deleted file mode 100644
index 96e534705..000000000
--- a/doc/index.rst
+++ /dev/null
@@ -1,86 +0,0 @@
-.. OpenML documentation master file, created by
-   sphinx-quickstart on Wed Nov 26 10:46:10 2014.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
-
-======
-OpenML
-======
-
-**Collaborative Machine Learning in Python**
-
-Welcome to the documentation of the OpenML Python API, a connector to the
-collaborative machine learning platform `OpenML.org <https://www.openml.org>`_.
-The OpenML Python package allows to use datasets and tasks from OpenML together
-with scikit-learn and share the results online.
-
--------
-Example
--------
-
-.. code:: python
-
-    import openml
-    from sklearn import impute, tree, pipeline
-
-    # Define a scikit-learn classifier or pipeline
-    clf = pipeline.Pipeline(
-        steps=[
-            ('imputer', impute.SimpleImputer()),
-            ('estimator', tree.DecisionTreeClassifier())
-        ]
-    )
-    # Download the OpenML task for the german credit card dataset with 10-fold
-    # cross-validation.
-    task = openml.tasks.get_task(31)
-    # Run the scikit-learn model on the task.
-    run = openml.runs.run_model_on_task(clf, task)
-    # Publish the experiment on OpenML (optional, requires an API key.
-    # You can get your own API key by signing up to OpenML.org)
-    run.publish()
-    print('View the run online: %s/run/%d' % (openml.config.server, run.run_id))
-
-You can find more examples in our `examples gallery <examples/index.html>`_.
-
-----------------------------
-How to get OpenML for python
-----------------------------
-You can install the OpenML package via `pip`:
-
-.. code:: bash
-
-    pip install openml
-
-For more advanced installation information, please see the
-:ref:`installation` section.
-
--------
-Content
--------
-
-* :ref:`usage`
-* :ref:`api`
-* `Examples <examples/index.html>`_
-* :ref:`contributing`
-* :ref:`progress`
-
--------------------
-Further information
--------------------
-
-* `OpenML documentation <https://docs.openml.org/>`_
-* `OpenML client APIs <https://docs.openml.org/APIs/>`_
-* `OpenML developer guide <https://docs.openml.org/developers/>`_
-* `Contact information <https://www.openml.org/contact>`_
-* `Citation request <https://www.openml.org/cite>`_
-* `OpenML blog <https://medium.com/open-machine-learning>`_
-* `OpenML twitter account <https://twitter.com/open_ml>`_
-
-------------
-Contributing
-------------
-
-Contribution to the OpenML package is highly appreciated. The OpenML package
-currently has a 1/4 position for the development and all help possible is
-needed to extend and maintain the package, create new examples and improve
-the usability. Please see the :ref:`contributing` page for more information.
diff --git a/doc/progress.rst b/doc/progress.rst
deleted file mode 100644
index 5ce263fce..000000000
--- a/doc/progress.rst
+++ /dev/null
@@ -1,118 +0,0 @@
-:orphan:
-
-.. _progress:
-
-=========
-Changelog
-=========
-
-0.10.0
-~~~~~~
-* FIX #261: Test server is cleared of all files uploaded during unit testing.
-* FIX #447: All files created by unit tests no longer persist in local.
-* FIX #608: Fixing dataset_id referenced before assignment error in get_run function.
-* FIX #447: All files created by unit tests are deleted after the completion of all unit tests.
-* FIX #589: Fixing a bug that did not successfully upload the columns to ignore when creating and publishing a dataset.
-* FIX #608: Fixing dataset_id referenced before assignment error in get_run function.
-* DOC #639: More descriptive documention for function to convert array format.
-* ADD #687: Adds a function to retrieve the list of evaluation measures available.
-* ADD #695: A function to retrieve all the data quality measures available.
-* ADD #412: Add a function to trim flow names for scikit-learn flows.
-* ADD #715: `list_evaluations` now has an option to sort evaluations by score (value).
-* ADD #722: Automatic reinstantiation of flow in `run_model_on_task`. Clearer errors if that's not possible.
-* MAINT #726: Update examples to remove deprecation warnings from scikit-learn
-
-0.9.0
-~~~~~
-* ADD #560: OpenML-Python can now handle regression tasks as well.
-* ADD #620, #628, #632, #649, #682: Full support for studies and distinguishes suites from studies.
-* ADD #607: Tasks can now be created and uploaded.
-* ADD #647, #673: Introduced the extension interface. This provides an easy way to create a hook for machine learning packages to perform e.g. automated runs.
-* ADD #548, #646, #676: Support for Pandas DataFrame and SparseDataFrame
-* ADD #662: Results of listing functions can now be returned as pandas.DataFrame.
-* ADD #59: Datasets can now also be retrieved by name.
-* ADD #672: Add timing measurements for runs, when possible.
-* ADD #661: Upload time and error messages now displayed with `list_runs`.
-* ADD #644: Datasets can now be downloaded 'lazily', retrieving only metadata at first, and the full dataset only when necessary.
-* ADD #659: Lazy loading of task splits.
-* ADD #516: `run_flow_on_task` flow uploading is now optional.
-* ADD #680: Adds `openml.config.start_using_configuration_for_example` (and resp. stop) to easily connect to the test server.
-* ADD #75, #653: Adds a pretty print for objects of the top-level classes.
-* FIX #642: `check_datasets_active` now correctly also returns active status of deactivated datasets.
-* FIX #304, #636: Allow serialization of numpy datatypes and list of lists of more types (e.g. bools, ints) for flows.
-* FIX #651: Fixed a bug that would prevent openml-python from finding the user's config file.
-* FIX #693: OpenML-Python uses liac-arff instead of scipy.io for loading task splits now.
-* DOC #678: Better color scheme for code examples in documentation.
-* DOC #681: Small improvements and removing list of missing functions.
-* DOC #684: Add notice to examples that connect to the test server.
-* DOC #688: Add new example on retrieving evaluations.
-* DOC #691: Update contributing guidelines to use Github draft feature instead of tags in title.
-* DOC #692: All functions are documented now.
-* MAINT #184: Dropping Python2 support.
-* MAINT #596: Fewer dependencies for regular pip install.
-* MAINT #652: Numpy and Scipy are no longer required before installation.
-* MAINT #655: Lazy loading is now preferred in unit tests.
-* MAINT #667: Different tag functions now share code.
-* MAINT #666: More descriptive error message for `TypeError` in `list_runs`.
-* MAINT #668: Fix some type hints.
-* MAINT #677: `dataset.get_data` now has consistent behavior in its return type.
-* MAINT #686: Adds ignore directives for several `mypy` folders.
-* MAINT #629, #630: Code now adheres to single PEP8 standard.
-
-0.8.0
-~~~~~
-
-* ADD #440: Improved dataset upload.
-* ADD #545, #583: Allow uploading a dataset from a pandas DataFrame.
-* ADD #528: New functions to update the status of a dataset.
-* ADD #523: Support for scikit-learn 0.20's new ColumnTransformer.
-* ADD #459: Enhanced support to store runs on disk prior to uploading them to
-  OpenML.
-* ADD #564: New helpers to access the structure of a flow (and find its
-  subflows).
-* ADD #618: The software will from now on retry to connect to the server if a
-  connection failed. The number of retries can be configured.
-* FIX #538: Support loading clustering tasks.
-* FIX #464: Fixes a bug related to listing functions (returns correct listing
-  size).
-* FIX #580: Listing function now works properly when there are less results
-  than requested.
-* FIX #571: Fixes an issue where tasks could not be downloaded in parallel.
-* FIX #536: Flows can now be printed when the flow name is None.
-* FIX #504: Better support for hierarchical hyperparameters when uploading
-  scikit-learn's grid and random search.
-* FIX #569: Less strict checking of flow dependencies when loading flows.
-* FIX #431: Pickle of task splits are no longer cached.
-* DOC #540: More examples for dataset uploading.
-* DOC #554: Remove the doubled progress entry from the docs.
-* MAINT #613: Utilize the latest updates in OpenML evaluation listings.
-* MAINT #482: Cleaner interface for handling search traces.
-* MAINT #557: Continuous integration works for scikit-learn 0.18-0.20.
-* MAINT #542: Continuous integration now runs python3.7 as well.
-* MAINT #535: Continuous integration now enforces PEP8 compliance for new code.
-* MAINT #527: Replace deprecated nose by pytest.
-* MAINT #510: Documentation is now built by travis-ci instead of circle-ci.
-* MAINT: Completely re-designed documentation built on sphinx gallery.
-* MAINT #462: Appveyor CI support.
-* MAINT #477: Improve error handling for issue
-  `#479 <https://github.com/openml/openml-python/pull/479>`_:
-  the OpenML connector fails earlier and with a better error message when
-  failing to create a flow from the OpenML description.
-* MAINT #561: Improve documentation on running specific unit tests.
-
-0.4.-0.7
-~~~~~~~~
-
-There is no changelog for these versions.
-
-0.3.0
-~~~~~
-
-* Add this changelog
-* 2nd example notebook PyOpenML.ipynb
-* Pagination support for list datasets and list tasks
-
-Prior
-~~~~~
-
-There is no changelog for prior versions.
diff --git a/doc/usage.rst b/doc/usage.rst
deleted file mode 100644
index b607c1433..000000000
--- a/doc/usage.rst
+++ /dev/null
@@ -1,136 +0,0 @@
-:orphan:
-
-.. _usage:
-
-.. role:: bash(code)
-   :language: bash
-
-.. role:: python(code)
-   :language: python
-
-**********
-User Guide
-**********
-
-This document will guide you through the most important use cases, functions
-and classes in the OpenML Python API. Throughout this document, we will use
-`pandas <http://pandas.pydata.org/>`_ to format and filter tables.
-
-~~~~~~~~~~~~~~~~~~~~~~
-Installation & Set up
-~~~~~~~~~~~~~~~~~~~~~~
-
-The OpenML Python package is a connector to `OpenML <https://www.openml.org/>`_.
-It allows to use and share datasets and tasks, run
-machine learning algorithms on them and then share the results online.
-
-The following tutorial gives a short introduction on how to install and set up
-the OpenML python connector, followed up by a simple example.
-
-* `Introduction <examples/introduction_tutorial.html>`_
-
-
-~~~~~~~~~~~~
-Key concepts
-~~~~~~~~~~~~
-
-OpenML contains several key concepts which it needs to make machine learning
-research shareable. A machine learning experiment consists of one or several
-**runs**, which describe the performance of an algorithm (called a **flow** in
-OpenML), its hyperparameter settings (called a **setup**) on a **task**. A
-**Task** is the combination of a **dataset**, a split and an evaluation
-metric. In this user guide we will go through listing and exploring existing
-**tasks** to actually running machine learning algorithms on them. In a further
-user guide we will examine how to search through **datasets** in order to curate
-a list of **tasks**.
-
-A further explanation is given in the
-`OpenML user guide <https://openml.github.io/OpenML/#concepts>`_.
-
-~~~~~~~~~~~~~~~~~~
-Working with tasks
-~~~~~~~~~~~~~~~~~~
-
-You can think of a task as an experimentation protocol, describing how to apply
-a machine learning model to a dataset in a way that it is comparable with the
-results of others (more on how to do that further down). Tasks are containers,
-defining which dataset to use, what kind of task we're solving (regression,
-classification, clustering, etc...) and which column to predict. Furthermore,
-it also describes how to split the dataset into a train and test set, whether
-to use several disjoint train and test splits (cross-validation) and whether
-this should be repeated several times. Also, the task defines a target metric
-for which a flow should be optimized.
-
-Below you can find our tutorial regarding tasks and if you want to know more
-you can read the `OpenML guide <https://docs.openml.org/#tasks>`_:
-
-* `Tasks <examples/tasks_tutorial.html>`_
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Running machine learning algorithms and uploading results
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-In order to upload and share results of running a machine learning algorithm
-on a task, we need to create an :class:`~openml.OpenMLRun`. A run object can
-be created by running a :class:`~openml.OpenMLFlow` or a scikit-learn compatible
-model on a task. We will focus on the simpler example of running a
-scikit-learn model.
-
-Flows are descriptions of something runable which does the machine learning.
-A flow contains all information to set up the necessary machine learning
-library and its dependencies as well as all possible parameters.
-
-A run is the outcome of running a flow on a task. It contains all parameter
-settings for the flow, a setup string (most likely a command line call) and all
-predictions of that run. When a run is uploaded to the server, the server
-automatically calculates several metrics which can be used to compare the
-performance of different flows to each other.
-
-So far, the OpenML python connector works only with estimator objects following
-the `scikit-learn estimator API <http://scikit-learn.org/dev/developers/contributing.html#apis-of-scikit-learn-objects>`_.
-Those can be directly run on a task, and a flow will automatically be created or
-downloaded from the server if it already exists.
-
-The next tutorial covers how to train different machine learning models,
-how to run machine learning models on OpenML data and how to share the results:
-
-* `Flows and Runs <examples/flows_and_runs_tutorial.html>`_
-
-~~~~~~~~
-Datasets
-~~~~~~~~
-
-OpenML provides a large collection of datasets and the benchmark
-"`OpenML100 <https://docs.openml.org/benchmark/>`_" which consists of a curated
-list of datasets.
-
-You can find the dataset that best fits your requirements by making use of the
-available metadata. The tutorial which follows explains how to get a list of
-datasets, how to filter the list to find the dataset that suits your
-requirements and how to download a dataset:
-
-* `Filter and explore datasets <examples/datasets_tutorial.html>`_
-
-OpenML is about sharing machine learning results and the datasets they were
-obtained on. Learn how to share your datasets in the following tutorial:
-
-* `Upload a dataset <examples/create_upload_tutorial.html>`_
-
-~~~~~~~~~~~~~~~~~~~~~~~
-Extending OpenML-Python
-~~~~~~~~~~~~~~~~~~~~~~~
-
-OpenML-Python provides an extension interface to connect other machine learning libraries than
-scikit-learn to OpenML. Please check the :ref:`api_extensions` and use the
-scikit-learn extension in :class:`openml.extensions.sklearn.SklearnExtension` as a starting point.
-
-~~~~~~~~~~~~~~~
-Advanced topics
-~~~~~~~~~~~~~~~
-
-We are working on tutorials for the following topics:
-
-* Querying datasets (TODO)
-* Creating tasks (TODO)
-* Working offline (TODO)
-* Analyzing large amounts of results (TODO)
diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100644
index 000000000..a84723309
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,21 @@
+# Dockerfile to build an image with preinstalled dependencies.
+# Useful for building docs or running unix tests from a Windows host.
+FROM python:3.10
+
+RUN git clone https://github.com/openml/openml-python.git /openml
+WORKDIR /openml
+RUN python -m venv venv
+RUN venv/bin/pip install wheel setuptools
+RUN venv/bin/pip install -e ".[test,examples,docs,examples_unix]"
+
+WORKDIR /
+RUN mkdir scripts
+COPY startup.sh scripts/
+COPY readme.md /
+
+# Due to the nature of the Docker container it might often be built from Windows.
+# It is typical to have the files with \r\n line-ending, we want to remove it for the unix image.
+RUN sed -i 's/\r//g' scripts/startup.sh
+
+# overwrite the default `python` entrypoint
+ENTRYPOINT ["/bin/bash", "/scripts/startup.sh"]
diff --git a/docker/readme.md b/docker/readme.md
new file mode 100644
index 000000000..d0af9d9fe
--- /dev/null
+++ b/docker/readme.md
@@ -0,0 +1,131 @@
+# OpenML Python Container
+
+This docker container has the latest version of openml-python downloaded and pre-installed.
+It can also be used by developers to run unit tests or build the docs in 
+a fresh and/or isolated unix environment. 
+This document contains information about:
+
+ 1. [Usage](#usage): how to use the image and its main modes.
+ 2. [Using local or remote code](#using-local-or-remote-code): useful when testing your own latest changes.
+ 3. [Versions](#versions): identify which image to use.
+ 4. [Development](#for-developers): information about the Docker image for developers.
+
+*note:* each docker image is shipped with a readme, which you can read with:
+`docker run --entrypoint=/bin/cat openml/openml-python:TAG readme.md`
+
+## Usage
+
+There are three main ways to use the image: running a pre-installed Python environment,
+running tests, and building documentation.
+
+### Running `Python` with pre-installed `OpenML-Python` (default):
+
+To run `Python` with a pre-installed `OpenML-Python` environment run:
+
+```text
+docker run -it openml/openml-python
+```
+
+this accepts the normal `Python` arguments, e.g.:
+
+```text
+docker run openml/openml-python -c "import openml; print(openml.__version__)"
+```
+
+if you want to run a local script, it needs to be mounted first. Mount it into the
+`openml` folder:
+
+```
+docker run -v PATH/TO/FILE:/openml/MY_SCRIPT.py openml/openml-python MY_SCRIPT.py
+```
+
+### Running unit tests
+
+You can run the unit tests by passing `test` as the first argument.
+It also requires a local or remote repository to be specified, which is explained 
+[below](#using-local-or-remote-code). For this example, we specify to test the
+`develop` branch:
+
+```text
+docker run openml/openml-python test develop
+```
+
+### Building documentation
+
+You can build the documentation by passing `doc` as the first argument, 
+you should [mount](https://docs.docker.com/storage/bind-mounts/#start-a-container-with-a-bind-mount)
+an output directory in which the docs will be stored. You also need to provide a remote
+or local repository as explained in [the section below](#using-local-or-remote-code).
+In this example, we build documentation for the `develop` branch.
+On Windows:
+
+```text
+    docker run --mount type=bind,source="E:\\files/output",destination="/output" openml/openml-python doc develop
+```
+
+on Linux:
+```text
+    docker run --mount type=bind,source="./output",destination="/output" openml/openml-python doc develop
+```
+    
+see [the section below](#using-local-or-remote-code) for running against local changes
+or a remote branch.
+
+*Note: you can forgo mounting an output directory to test if the docs build successfully,
+but the result will only be available within the docker container under `/openml/docs/build`.*
+
+## Using local or remote code
+
+You can build docs or run tests against your local repository or a Github repository.
+In the examples below, change the `source` to match the location of your local repository.
+
+### Using a local repository
+
+To use a local directory, mount it in the `/code` directory,  on Windows:
+
+```text
+    docker run --mount type=bind,source="E:\\repositories/openml-python",destination="/code" openml/openml-python test
+```
+
+on Linux:
+```text
+    docker run --mount type=bind,source="/Users/pietergijsbers/repositories/openml-python",destination="/code" openml/openml-python test
+```
+
+when building docs, you also need to mount an output directory as shown above, so add both:
+
+```text
+docker run --mount type=bind,source="./output",destination="/output" --mount type=bind,source="/Users/pietergijsbers/repositories/openml-python",destination="/code" openml/openml-python doc
+```
+
+### Using a Github repository
+Building from a remote repository requires you to specify a branch.
+The branch may be specified by name directly if it exists on the original repository (https://github.com/openml/openml-python/):
+
+    docker run --mount type=bind,source=PATH_TO_OUTPUT,destination=/output openml/openml-python [test,doc] BRANCH
+
+Where `BRANCH` is the name of the branch for which to generate the documentation.
+It is also possible to build the documentation from the branch on a fork,
+in this case the `BRANCH` should be specified as `GITHUB_NAME#BRANCH` (e.g. 
+`PGijsbers#my_feature_branch`) and the name of the forked repository should be `openml-python`.
+
+## For developers
+This section contains some notes about the structure of the image, 
+intended for those who want to work on it.
+
+### Added Directories
+The `openml/openml-python` image is built on a vanilla `python:3` image.
+Additionally, it contains the following files and directories:
+
+ - `/openml`: contains the openml-python repository in the state with which the image 
+   was built by default. If working with a `BRANCH`, this repository will be set to 
+   the `HEAD` of `BRANCH`.
+ - `/openml/venv/`: contains the used virtual environment for `doc` and `test`. It has
+   `openml-python` dependencies pre-installed.  When invoked with `doc` or `test`, the 
+   dependencies will be updated based on the `setup.py` of the `BRANCH` or mounted `/code`.
+ - `/scripts/startup.sh`: the entrypoint of the image. Takes care of the automated features (e.g. `doc` and `test`).
+
+## Building the image
+To build the image yourself, execute `docker build -f Dockerfile .` from the `docker`
+directory of the `openml-python` repository. It will use the `startup.sh` as is, so any 
+local changes will be present in the image.
diff --git a/docker/startup.sh b/docker/startup.sh
new file mode 100644
index 000000000..34a5c61f3
--- /dev/null
+++ b/docker/startup.sh
@@ -0,0 +1,80 @@
+# Entry script to switch between the different Docker functionalities.
+# By default, execute Python with OpenML pre-installed.
+#
+# Entry script to allow docker to be run for Python, tests and docs.
+# The script assumes a code repository can be mounted to ``/code`` and an output directory to ``/output``.
+# Executes ``mode`` on ``branch`` or the provided ``code`` directory.
+# $1: Mode, optional. Options:
+#        - test: execute unit tests
+#        - doc: build documentation, requires a mounted ``output`` directory if built from a branch.
+#        - anything else (or not provided): run ``python`` with all arguments passed through.
+# $2: Branch, optional.
+#        Mutually exclusive with mounting a ``code`` directory.
+#        Can be a branch on a Github fork, specified with the USERNAME#BRANCH format.
+#        The test or doc build is executed on this branch.
+
+if [[ ! ( $1 = "doc" || $1 = "test" ) ]]; then
+  cd /openml
+  source venv/bin/activate
+  python "$@"
+  exit $?  # propagate Python's exit status instead of always reporting success
+fi
+
+# doc and test modes require mounted directories and/or specified branches
+if ! [ -d "/code" ] && [ -z "$2" ]; then
+  echo "To perform $1 a code repository must be mounted to '/code' or a branch must be specified." >&2
+  exit 1
+fi
+if [ -d "/code" ] && [ -n "$2" ]; then
+  # We want to avoid switching the git environment from within the docker container
+  echo "You can not specify a branch for a mounted code repository." >&2
+  exit 1
+fi
+if [ "$1" == "doc" ]  && [ -n "$2" ] && ! [ -d "/output" ]; then
+    echo "To build docs from an online repository, you need to mount an output directory." >&2
+    exit 1
+fi
+
+if [ -n "$2" ]; then
+  # if a branch is provided, we will pull it into the `openml` local repository that was created with the image.
+  cd /openml
+  if [[ $2 == *#* ]]; then
+    # If a branch is specified on a fork (with NAME#BRANCH format), we have to construct the url before pulling
+    # We add a trailing '#' delimiter so the second element doesn't get the trailing newline from <<<
+    readarray -d '#' -t fork_name_and_branch<<<"$2#"
+    fork_url="https://github.com/${fork_name_and_branch[0]}/openml-python.git"
+    fork_branch="${fork_name_and_branch[1]}"
+    echo git fetch "$fork_url" "$fork_branch":branch_from_fork
+    git fetch "$fork_url" "$fork_branch":branch_from_fork
+    branch=branch_from_fork
+  else
+    git fetch origin "$2"
+    branch=$2
+  fi
+  if ! git checkout "$branch" ; then
+    echo "Could not checkout $branch. If the branch lives on a fork, specify it as USER#BRANCH. Make sure to push the branch." >&2
+    exit 1
+  fi
+  git pull
+  code_dir="/openml"
+else
+  code_dir="/code"
+fi
+
+source /openml/venv/bin/activate
+cd "$code_dir"
+# The most recent ``main`` is already installed, but we want to update any outdated dependencies
+pip install -e ".[test,examples,docs,examples_unix]"
+
+if [ "$1" == "test" ]; then
+  pytest -n 4 --durations=20 --timeout=600 --timeout-method=thread --dist load -sv || exit $?  # report test failures to the caller
+fi
+
+if [ "$1" == "doc" ]; then
+  cd doc
+  make html
+  make linkcheck
+  if [ -d "/output" ]; then
+    cp -r "$code_dir/doc/build" /output  # copy the docs that were just built (may live in /code, not /openml)
+  fi
+fi
\ No newline at end of file
diff --git a/docs/contributing.md b/docs/contributing.md
new file mode 100644
index 000000000..39072d64e
--- /dev/null
+++ b/docs/contributing.md
@@ -0,0 +1,22 @@
+# Contributing
+
+Contribution to the OpenML package is highly appreciated in all forms.
+In particular, a few ways to contribute to openml-python are:
+
+-   A direct contribution to the package, by means of improving the
+    code, documentation or examples. To get started, see [this
+    file](https://github.com/openml/openml-python/blob/main/CONTRIBUTING.md)
+    with details on how to set up your environment to develop for
+    openml-python.
+-   A contribution to an openml-python extension. An extension package
+    allows OpenML to interface with a machine learning package (such
+    as scikit-learn or keras). These extensions are hosted in separate
+    repositories and may have their own guidelines. For more
+    information, see also [extensions](extensions.md).
+-   Bug reports. If something doesn't work for you or is cumbersome,
+    please open a new issue to let us know about the problem.
+-   [Cite OpenML](https://www.openml.org/terms) if you use it in a
+    scientific publication.
+-   Visit one of our [hackathons](https://www.openml.org/meet).
+-   Contribute to another OpenML project, such as [the main OpenML
+    project](https://github.com/openml/OpenML/blob/master/CONTRIBUTING.md).
diff --git a/docs/details.md b/docs/details.md
new file mode 100644
index 000000000..bf4b0cd2b
--- /dev/null
+++ b/docs/details.md
@@ -0,0 +1,76 @@
+# Advanced User Guide
+
+This document highlights some of the more advanced features of
+`openml-python`. 
+
+## Configuration
+
+The configuration file resides in a directory `.config/openml` in the
+home directory of the user and is called config (more specifically, it
+resides in the [configuration directory specified by the XDG Base
+Directory
+Specification](https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html)).
+It consists of `key = value` pairs which are separated by newlines. The
+following keys are defined:
+
+- apikey: required to access the server.
+- server: the server to connect to (default: `http://www.openml.org`).
+          For connection to the test server, set this to `test.openml.org`.
+- cachedir: the root folder where the cache file directories should be created.
+    If not given, will default to `~/.openml/cache`
+- avoid_duplicate_runs: if set to `True` (default), when certain functions
+            are called a lookup is performed to see if there already
+            exists such a run on the server. If so, download those
+            results instead.
+- retry_policy: Defines how to react when the server is unavailable or
+            experiencing high load. It determines both how often to
+            attempt to reconnect and how quickly to do so. Please don't
+            use `human` in an automated script that you run more than
+            one instance of, it might increase the time to complete your
+            jobs and that of others. One of:
+            -   human (default): For people running openml in interactive
+                fashion. Try only a few times, but in quick succession.
+            -   robot: For people using openml in an automated fashion. Keep
+                trying to reconnect for a longer time, quickly increasing
+                the time between retries.
+
+- connection_n_retries: number of times to retry a request if it fails.
+Default depends on retry_policy (5 for `human`, 50 for `robot`)
+- verbosity: the level of output:
+      -   0: normal output
+      -   1: info output
+      -   2: debug output
+
+This file is easily configurable by the `openml` command line interface.
+To see where the file is stored, and what its values are, use
+`openml configure none`.
+
+## Docker
+
+It is also possible to try out the latest development version of
+`openml-python` with docker:
+
+``` bash
+docker run -it openml/openml-python
+```
+
+See the [openml-python docker
+documentation](https://github.com/openml/openml-python/blob/main/docker/readme.md)
+for more information.
+
+## Key concepts
+
+OpenML contains several key concepts which it needs to make machine
+learning research shareable. A machine learning experiment consists of
+one or several **runs**, which describe the performance of an algorithm
+(called a **flow** in OpenML), its hyperparameter settings (called a
+**setup**) on a **task**. A **Task** is the combination of a
+**dataset**, a split and an evaluation metric. In this user guide we
+will go through listing and exploring existing **tasks** to actually
+running machine learning algorithms on them. In a further user guide we
+will examine how to search through **datasets** in order to curate a
+list of **tasks**.
+
+A further explanation is given in the [OpenML user
+guide](https://docs.openml.org/concepts/).
+
diff --git a/docs/developer_setup.md b/docs/developer_setup.md
new file mode 100644
index 000000000..55a73fef9
--- /dev/null
+++ b/docs/developer_setup.md
@@ -0,0 +1,210 @@
+# OpenML Local Development Environment Setup
+
+This guide outlines the standard procedures for setting up a local development environment for the OpenML ecosystem. It covers the configuration of the backend servers (API v1 and API v2) and the Python Client SDK.
+
+OpenML currently has two backend architectures:
+
+* **API v1**: The PHP-based server currently serving production traffic.
+* **API v2**: The Python-based server (FastAPI) currently under active development.
+
+> Note on Migration: API v1 is projected to remain operational through at least 2026. API v2 is the target architecture for future development.
+
+## 1. API v1 Setup (PHP Backend)
+
+This section details the deployment of the legacy PHP backend.
+
+### Prerequisites
+
+* **Docker**: Docker Desktop (Ensure the daemon is running).
+* **Version Control**: Git.
+
+### Installation Steps
+
+#### 1. Clone the Repository
+
+Retrieve the OpenML services source code:
+
+```bash
+git clone https://github.com/openml/services
+cd services
+```
+
+#### 2. Configure File Permissions
+
+To ensure the containerized PHP service can write to the local filesystem, initialize the data directory permissions.
+
+From the repository root:
+
+```bash
+chown -R www-data:www-data data/php
+```
+
+If the `www-data` user does not exist on the host system, grant full permissions as a fallback:
+
+```bash
+chmod -R 777 data/php
+```
+
+#### 3. Launch Services
+
+Initialize the container stack:
+
+```bash
+docker compose --profile all up -d
+```
+
+#### Warning: Container Conflicts
+
+If API v2 (Python backend) containers are present on the system, name conflicts may occur. To resolve this, stop and remove existing containers before launching API v1:
+
+```bash
+docker compose --profile all down
+docker compose --profile all up -d
+```
+
+#### 4. Verification
+
+Validate the deployment by accessing the flow endpoint. A successful response will return structured JSON data.
+
+* **Endpoint**: http://localhost:8080/api/v1/json/flow/181
+
+### Client Configuration
+
+To direct the `openml-python` client to the local API v1 instance, modify the configuration as shown below. The API key corresponds to the default key located in `services/config/php/.env`.
+
+```python
+import openml
+from openml_sklearn.extension import SklearnExtension
+from sklearn.neighbors import KNeighborsClassifier
+
+# Configure client to use local Docker instance
+openml.config.server = "http://localhost:8080/api/v1/xml"
+openml.config.apikey = "AD000000000000000000000000000000"
+
+# Test flow publication
+clf = KNeighborsClassifier(n_neighbors=3)
+extension = SklearnExtension()
+knn_flow = extension.model_to_flow(clf)
+
+knn_flow.publish()
+```
+
+## 2. API v2 Setup (Python Backend)
+
+This section details the deployment of the FastAPI backend.
+
+### Prerequisites
+
+* **Docker**: Docker Desktop (Ensure the daemon is running).
+* **Version Control**: Git.
+
+### Installation Steps
+
+#### 1. Clone the Repository
+
+Retrieve the API v2 source code:
+
+```bash
+git clone https://github.com/openml/server-api
+cd server-api
+```
+
+#### 2. Launch Services
+
+Build and start the container stack:
+
+```bash
+docker compose --profile all up
+```
+
+#### 3. Verification
+
+Validate the deployment using the following endpoints:
+
+* **Task Endpoint**: http://localhost:8001/tasks/31
+* **Swagger UI (Documentation)**: http://localhost:8001/docs
+
+## 3. Python SDK (`openml-python`) Setup
+
+This section outlines the environment setup for contributing to the OpenML Python client.
+
+### Installation Steps
+
+#### 1. Clone the Repository
+
+```bash
+git clone https://github.com/openml/openml-python
+cd openml-python
+```
+
+#### 2. Environment Initialization
+
+Create an isolated virtual environment (example using Conda):
+
+```bash
+conda create -n openml-python-dev python=3.12
+conda activate openml-python-dev
+```
+
+#### 3. Install Dependencies
+
+Install the package in editable mode, including development and documentation dependencies:
+
+```bash
+python -m pip install -e ".[dev,docs]"
+```
+
+#### 4. Configure Quality Gates
+
+Install pre-commit hooks to enforce coding standards:
+
+```bash
+pre-commit install
+pre-commit run --all-files
+```
+
+## 4. Testing Guidelines
+
+The OpenML Python SDK utilizes `pytest` markers to categorize tests based on dependencies and execution context.
+
+| Marker            | Description                                                                 |
+|-------------------|-----------------------------------------------------------------------------|
+| `sklearn`          | Tests requiring `scikit-learn`. Skipped if the library is missing.          |
+| `production_server`| Tests that interact with the live OpenML server (real API calls).         |
+| `test_server`     | Tests requiring the OpenML test server environment.                       |
+
+### Execution Examples
+
+Run the full test suite:
+
+```bash
+pytest
+```
+
+Run a specific subset (e.g., `scikit-learn` tests):
+
+```bash
+pytest -m sklearn
+```
+
+Exclude production tests (local only):
+
+```bash
+pytest -m "not production_server"
+```
+
+### Admin Privilege Tests
+
+Certain tests require administrative privileges on the test server. These are skipped automatically unless an admin API key is provided via environment variables.
+
+#### Windows (PowerShell):
+
+```shell
+$env:OPENML_TEST_SERVER_ADMIN_KEY = "admin-key"
+```
+
+#### Linux/macOS:
+
+```bash
+export OPENML_TEST_SERVER_ADMIN_KEY="admin-key"
+```
diff --git a/docs/extensions.md b/docs/extensions.md
new file mode 100644
index 000000000..858447440
--- /dev/null
+++ b/docs/extensions.md
@@ -0,0 +1,160 @@
+# Extensions
+
+OpenML-Python provides an extension interface to connect machine
+learning libraries other than scikit-learn to OpenML. Please check the
+[`api_extensions`](../reference/extensions/extension_interface/) and use the scikit-learn
+extension as a starting point.
+
+## List of extensions
+
+Here is a list of currently maintained OpenML extensions:
+
+-   [openml-sklearn](https://github.com/openml/openml-sklearn)
+-   [openml-keras](https://github.com/openml/openml-keras)
+-   [openml-pytorch](https://github.com/openml/openml-pytorch)
+-   [openml-tensorflow (for tensorflow
+    2+)](https://github.com/openml/openml-tensorflow)
+
+## Connecting new machine learning libraries
+
+### Content of the Library
+
+To leverage support from the community and to tap into the potential of
+OpenML, interfacing with popular machine learning libraries is
+essential. The OpenML-Python package is capable of downloading meta-data
+and results (data, flows, runs), regardless of the library that was used
+to upload it. However, in order to simplify the process of uploading
+flows and runs from a specific library, an additional interface can be
+built. The OpenML-Python team does not have the capacity to develop and
+maintain such interfaces on its own. For this reason, we have built an
+extension interface to allow others to contribute back. Building a
+suitable extension therefore requires an understanding of the
+current OpenML-Python support.
+
+[This tutorial](../examples/Basics/simple_flows_and_runs_tutorial) shows how the scikit-learn 
+extension works with OpenML-Python.
+
+#### API
+
+-   The extension scripts must import the openml-python package
+    and be able to interface with any function from the API.
+-   The extension has to be defined as a Python class and must inherit
+    from [`openml.extensions.Extension`](../reference/extensions/extension_interface/#openml.extensions.extension_interface.Extension).
+-   This class needs to have all the functions from `openml.extensions.Extension` overloaded as required.
+-   The redefined functions should have adequate and appropriate
+    docstrings. The sklearn Extension API is a good example to follow.
+
+#### Interfacing with OpenML-Python
+
+Once the new extension class has been defined, the openml-python function
+[`openml.extensions.register_extension`](../reference/extensions/functions/#openml.extensions.functions.register_extension)
+must be called to allow OpenML-Python to interface with the new extension.
+
+The following methods should get implemented. Although the documentation
+in the extension interface should always be leading, here
+we list some additional information and best practices. 
+Note that most methods are relatively simple
+and can be implemented in several lines of code.
+
+-   General setup (required)
+    -   `can_handle_flow`: Takes as
+        argument an OpenML flow, and checks whether this can be handled
+        by the current extension. The OpenML database consists of many
+        flows, from various workbenches (e.g., scikit-learn, Weka, mlr).
+        This method is called before a model is being deserialized.
+        Typically, the flow-dependency field is used to check whether
+        the specific library is present, and no unknown libraries are
+        present there.
+    -   `can_handle_model`: Similar to
+        `can_handle_flow`, except that in
+        this case a Python object is given. As such, in many cases, this
+        method can be implemented by checking whether this adheres to a
+        certain base class.
+-   Serialization and De-serialization (required)
+    -   `flow_to_model`: deserializes the
+        OpenML Flow into a model (if the library can indeed handle the
+        flow). This method has an important interplay with
+        `model_to_flow`. Running these
+        two methods in succession should result in exactly the same
+        model (or flow). This property can be used for unit testing
+        (e.g., build a model with hyperparameters, make predictions on a
+        task, serialize it to a flow, deserialize it back, make it
+        predict on the same task, and check whether the predictions are
+        exactly the same.) The example in the scikit-learn interface
+        might seem daunting, but note that here some complicated design
+        choices were made, that allow for all sorts of interesting
+        research questions. It is probably good practice to start easy.
+    -   `model_to_flow`: The inverse of `flow_to_model`. Serializes a
+        model into an OpenML Flow. The flow should preserve the class,
+        the library version, and the tunable hyperparameters.
+    -   `get_version_information`: Return
+        a tuple with the version information of the important libraries.
+    -   `create_setup_string`: No longer
+        used, and will be deprecated soon.
+-   Performing runs (required)
+    -   `is_estimator`: Gets as input a
+        class, and checks whether it has the status of estimator in the
+        library (typically, whether it has a train method and a predict
+        method).
+    -   `seed_model`: Sets a random seed to the model.
+    -   `_run_model_on_fold`: One of the
+        main requirements for a library to generate run objects for the
+        OpenML server. Obtains a train split (with labels) and a test
+        split (without labels) and the goal is to train a model on the
+        train split and return the predictions on the test split. On top
+        of the actual predictions, also the class probabilities should
+        be determined. For classifiers that do not return class
+        probabilities, this can just be the one-hot encoded predicted label.
+        The predictions will be evaluated on the OpenML server. Also,
+        additional information can be returned, for example,
+        user-defined measures (such as runtime information, as this can
+        not be inferred on the server). Additionally, information about
+        a hyperparameter optimization trace can be provided.
+    -   `obtain_parameter_values`:
+        Obtains the hyperparameters of a given model and the current
+        values. Please note that in the case of a hyperparameter
+        optimization procedure (e.g., random search), you only should
+        return the hyperparameters of this procedure (e.g., the
+        hyperparameter grid, budget, etc) and that the chosen model will
+        be inferred from the optimization trace.
+    -   `check_if_model_fitted`: Check
+        whether the train method of the model has been called (and as
+        such, whether the predict method can be used).
+-   Hyperparameter optimization (optional)
+    -   `instantiate_model_from_hpo_class`: If a given run has recorded the hyperparameter
+        optimization trace, then this method can be used to
+        reinstantiate the model with hyperparameters of a given
+        hyperparameter optimization iteration. Has some similarities
+        with `flow_to_model` (as this
+        method also sets the hyperparameters of a model). Note that
+        although this method is required, it is not necessary to
+        implement any logic if hyperparameter optimization is not
+        implemented. Simply raise a `NotImplementedError`
+        then.
+
+### Hosting the library
+
+Each extension created should be a stand-alone repository, compatible
+with the [OpenML-Python repository](https://github.com/openml/openml-python). 
+The extension repository should work off-the-shelf with *OpenML-Python* installed.
+
+Create a public Github repo with the following directory structure:
+
+    | [repo name]
+    |    |-- [extension name]
+    |    |    |-- __init__.py
+    |    |    |-- extension.py
+    |    |    |-- config.py (optionally)
+
+### Recommended
+
+-   Test cases to keep the extension up to date with the
+    OpenML-Python upstream changes.
+-   Documentation of the extension API, especially if any new
+    functionality is added to OpenML-Python's extension design.
+-   Examples to show how the new extension interfaces and works with
+    OpenML-Python.
+-   Create a PR to add the new extension to the OpenML-Python API
+    documentation.
+
+Happy contributing!
diff --git a/docs/images/openml_icon.png b/docs/images/openml_icon.png
new file mode 100644
index 000000000..4808572ff
Binary files /dev/null and b/docs/images/openml_icon.png differ
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 000000000..1058c3956
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,103 @@
+# OpenML
+
+**The Python API for a World of Data and More**
+
+Welcome to the documentation of the OpenML Python API, a connector to
+the collaborative machine learning platform
+[OpenML.org](https://www.openml.org). 
+OpenML-Python can download or upload data from OpenML, such as datasets
+and machine learning experiment results.
+
+If you are new to OpenML, we recommend checking out the [OpenML documentation](https://docs.openml.org/)
+to get familiar with the concepts and features of OpenML. In particular, we recommend 
+reading more about the [OpenML concepts](https://docs.openml.org/concepts/). 
+
+## :joystick: Minimal Examples
+
+Use the following code to get the [credit-g](https://www.openml.org/search?type=data&sort=runs&status=active&id=31) [dataset](https://docs.openml.org/concepts/data/):
+
+```python
+import openml
+
+dataset = openml.datasets.get_dataset("credit-g") # or by ID get_dataset(31)
+X, y, categorical_indicator, attribute_names = dataset.get_data(target="class")
+```
+
+Get a [task](https://docs.openml.org/concepts/tasks/) for [supervised classification on credit-g](https://www.openml.org/search?type=task&id=31&source_data.data_id=31):
+
+```python
+import openml
+
+task = openml.tasks.get_task(31)
+dataset = task.get_dataset()
+X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name)
+# get splits for the first fold of 10-fold cross-validation
+train_indices, test_indices = task.get_train_test_split_indices(fold=0)
+```
+
+Use an [OpenML benchmarking suite](https://docs.openml.org/concepts/benchmarking/) to get a curated list of machine-learning tasks:
+```python
+import openml
+
+suite = openml.study.get_suite("amlb-classification-all")  # Get a curated list of tasks for classification
+for task_id in suite.tasks:
+    task = openml.tasks.get_task(task_id)
+```
+Find more examples in the navbar at the top.
+
+## :magic_wand: Installation
+
+OpenML-Python is available on Linux, MacOS, and Windows.
+
+You can install OpenML-Python with:
+
+```bash
+pip install openml
+```
+
+For more advanced installation information, please see the
+["Introduction"](../examples/Basics/introduction_tutorial) example.
+
+
+## Further information
+
+-   [OpenML documentation](https://docs.openml.org/)
+-   [OpenML client APIs](https://docs.openml.org/APIs/)
+-   [OpenML developer guide](https://docs.openml.org/contributing/)
+-   [Contact information](https://www.openml.org/contact)
+-   [Citation request](https://www.openml.org/cite)
+-   [OpenML blog](https://medium.com/open-machine-learning)
+-   [OpenML twitter account](https://twitter.com/open_ml)
+
+
+## Contributing
+
+Contributing to the OpenML package is highly appreciated. Please see the
+["Contributing"](contributing.md) page for more information.
+
+## Citing OpenML-Python
+
+If you use OpenML-Python in a scientific publication, we would
+appreciate a reference to our JMLR-MLOSS paper 
+["OpenML-Python: an extensible Python API for OpenML"](https://www.jmlr.org/papers/v22/19-920.html):
+
+=== "Bibtex"
+
+    ```bibtex
+    @article{JMLR:v22:19-920,
+        author  = {Matthias Feurer and Jan N. van Rijn and Arlind Kadra and Pieter Gijsbers and Neeratyoy Mallik and Sahithya Ravi and Andreas Müller and Joaquin Vanschoren and Frank Hutter},
+        title   = {OpenML-Python: an extensible Python API for OpenML},
+        journal = {Journal of Machine Learning Research},
+        year    = {2021},
+        volume  = {22},
+        number  = {100},
+        pages   = {1--5},
+        url     = {http://jmlr.org/papers/v22/19-920.html}
+    }
+    ```
+
+=== "MLA"
+
+    Feurer, Matthias, et al. 
+    "OpenML-Python: an extensible Python API for OpenML."
+    _Journal of Machine Learning Research_ 22.100 (2021):1−5.
diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css
new file mode 100644
index 000000000..d0c4f79d8
--- /dev/null
+++ b/docs/stylesheets/extra.css
@@ -0,0 +1,3 @@
+.jp-InputArea-prompt, .jp-InputPrompt {
+    display: none !important;
+}
diff --git a/examples/Advanced/configure_logging.py b/examples/Advanced/configure_logging.py
new file mode 100644
index 000000000..60b789846
--- /dev/null
+++ b/examples/Advanced/configure_logging.py
@@ -0,0 +1,50 @@
+# %% [markdown]
+# This tutorial explains openml-python logging, and shows how to configure it.
+# Openml-python uses the [Python logging module](https://docs.python.org/3/library/logging.html)
+# to provide users with log messages. Each log message is assigned a level of importance, see
+# the table in Python's logging tutorial
+# [here](https://docs.python.org/3/howto/logging.html#when-to-use-logging).
+#
+# By default, openml-python will print log messages of level `WARNING` and above to console.
+# All log messages (including `DEBUG` and `INFO`) are also saved in a file, which can be
+# found in your cache directory (see also the
+# [introduction tutorial](../Basics/introduction_tutorial)).
+# These file logs are automatically deleted if needed, and use at most 2MB of space.
+#
+# It is possible to configure what log levels to send to console and file.
+# When downloading a dataset from OpenML, a `DEBUG`-level message is written:
+
+# %%
+import openml
+
+openml.datasets.get_dataset("iris", version=1)
+
+# %% [markdown]
+# With default configuration, the above example will show no output to console.
+# However, in your cache directory you should find a file named 'openml_python.log',
+# which has a DEBUG message written to it. It should be either like
+# "[DEBUG] [10:46:19:openml.datasets.dataset] Saved dataset 61: iris to file ..."
+# or like
+# "[DEBUG] [10:49:38:openml.datasets.dataset] Data pickle file already exists and is up to date."
+# , depending on whether or not you had downloaded iris before.
+# The processed log levels can be configured programmatically:
+
+# %%
+import logging
+
+openml.config.set_console_log_level(logging.DEBUG)
+openml.config.set_file_log_level(logging.WARNING)
+openml.datasets.get_dataset("iris", version=1)
+
+# %% [markdown]
+# Now the log message that was previously written to file should be shown in the console.
+# The message is now no longer written to file as the `file_log` was set to level `WARNING`.
+#
+# It is also possible to specify the desired log levels through the configuration file.
+# This way you will not need to set them on each script separately.
+# Add the line **verbosity = NUMBER** and/or **file_verbosity = NUMBER** to the config file,
+# where 'NUMBER' should be one of:
+#
+# * 0: `logging.WARNING` and up.
+# * 1: `logging.INFO` and up.
+# * 2: `logging.DEBUG` and up (i.e. all messages).
diff --git a/examples/Advanced/create_upload_tutorial.py b/examples/Advanced/create_upload_tutorial.py
new file mode 100644
index 000000000..46ec96319
--- /dev/null
+++ b/examples/Advanced/create_upload_tutorial.py
@@ -0,0 +1,303 @@
+# %% [markdown]
+# A tutorial on how to create and upload a dataset to OpenML.
+
+# %%
+import numpy as np
+import pandas as pd
+import sklearn.datasets
+from scipy.sparse import coo_matrix
+
+import openml
+from openml.datasets.functions import create_dataset
+
+# %%
+openml.config.start_using_configuration_for_example()
+
+# %% [markdown]
+# Below we will cover the following cases of the dataset object:
+#
+# * A numpy array
+# * A list
+# * A pandas dataframe
+# * A sparse matrix
+# * A pandas sparse dataframe
+
+# %% [markdown]
+# ## Dataset is a numpy array
+# A numpy array can contain lists in the case of dense data or it can contain
+# OrderedDicts in the case of sparse data.
+#
+# ### Prepare dataset
+# Load an example dataset from scikit-learn which we will upload to OpenML.org
+# via the API.
+
+# %%
+diabetes = sklearn.datasets.load_diabetes()
+name = "Diabetes(scikit-learn)"
+X = diabetes.data
+y = diabetes.target
+attribute_names = diabetes.feature_names
+description = diabetes.DESCR
+
+# %% [markdown]
+# OpenML does not distinguish between the attributes and targets on the data
+# level and stores all data in a single matrix.
+#
+# The target feature is indicated as meta-data of the dataset (and tasks on
+# that data).
+
+# %%
+data = np.concatenate((X, y.reshape((-1, 1))), axis=1)
+attribute_names = list(attribute_names)
+attributes = [(attribute_name, "REAL") for attribute_name in attribute_names] + [
+    ("class", "INTEGER")
+]
+citation = (
+    "Bradley Efron, Trevor Hastie, Iain Johnstone and "
+    "Robert Tibshirani (2004) (Least Angle Regression) "
+    "Annals of Statistics (with discussion), 407-499"
+)
+paper_url = "https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf"
+
+# %% [markdown]
+# ## Create the dataset object
+# The definition of all fields can be found in the XSD files describing the
+# expected format:
+#
+# https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.data.upload.xsd
+
+# %%
+diabetes_dataset = create_dataset(
+    # The name of the dataset (needs to be unique).
+    # Must not be longer than 128 characters and only contain
+    # a-z, A-Z, 0-9 and the following special characters: _\-\.(),
+    name=name,
+    # Textual description of the dataset.
+    description=description,
+    # The person who created the dataset.
+    creator="Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani",
+    # People who contributed to the current version of the dataset.
+    contributor=None,
+    # The date the data was originally collected, given by the uploader.
+    collection_date="09-01-2012",
+    # Language in which the data is represented.
+    # Starts with 1 upper case letter, rest lower case, e.g. 'English'.
+    language="English",
+    # License under which the data is/will be distributed.
+    licence="BSD (from scikit-learn)",
+    # Name of the target. Can also have multiple values (comma-separated).
+    default_target_attribute="class",
+    # The attribute that represents the row-id column, if present in the
+    # dataset.
+    row_id_attribute=None,
+    # Attribute or list of attributes that should be excluded in modelling, such as
+    # identifiers and indexes. E.g. "feat1" or ["feat1","feat2"]
+    ignore_attribute=None,
+    # How to cite the paper.
+    citation=citation,
+    # Attributes of the data
+    attributes=attributes,
+    data=data,
+    # A version label which is provided by the user.
+    version_label="test",
+    original_data_url="https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html",
+    paper_url=paper_url,
+)
+
+# %%
+
+diabetes_dataset.publish()
+print(f"URL for dataset: {diabetes_dataset.openml_url}")
+
+# %% [markdown]
+# ## Dataset is a list
+# A list can contain lists in the case of dense data or it can contain
+# OrderedDicts in the case of sparse data.
+#
+# Weather dataset:
+# https://storm.cis.fordham.edu/~gweiss/data-mining/datasets.html
+
+# %%
+data = [
+    ["sunny", 85, 85, "FALSE", "no"],
+    ["sunny", 80, 90, "TRUE", "no"],
+    ["overcast", 83, 86, "FALSE", "yes"],
+    ["rainy", 70, 96, "FALSE", "yes"],
+    ["rainy", 68, 80, "FALSE", "yes"],
+    ["rainy", 65, 70, "TRUE", "no"],
+    ["overcast", 64, 65, "TRUE", "yes"],
+    ["sunny", 72, 95, "FALSE", "no"],
+    ["sunny", 69, 70, "FALSE", "yes"],
+    ["rainy", 75, 80, "FALSE", "yes"],
+    ["sunny", 75, 70, "TRUE", "yes"],
+    ["overcast", 72, 90, "TRUE", "yes"],
+    ["overcast", 81, 75, "FALSE", "yes"],
+    ["rainy", 71, 91, "TRUE", "no"],
+]
+
+attribute_names = [
+    ("outlook", ["sunny", "overcast", "rainy"]),
+    ("temperature", "REAL"),
+    ("humidity", "REAL"),
+    ("windy", ["TRUE", "FALSE"]),
+    ("play", ["yes", "no"]),
+]
+
+description = (
+    "The weather problem is a tiny dataset that we will use repeatedly"
+    " to illustrate machine learning methods. Entirely fictitious, it "
+    "supposedly concerns the conditions that are suitable for playing "
+    "some unspecified game. In general, instances in a dataset are "
+    "characterized by the values of features, or attributes, that measure "
+    "different aspects of the instance. In this case there are four "
+    "attributes: outlook, temperature, humidity, and windy. "
+    "The outcome is whether to play or not."
+)
+
+citation = (
+    "I. H. Witten, E. Frank, M. A. Hall, and ITPro,"
+    "Data mining practical machine learning tools and techniques, "
+    "third edition. Burlington, Mass.: Morgan Kaufmann Publishers, 2011"
+)
+
+weather_dataset = create_dataset(
+    name="Weather",
+    description=description,
+    creator="I. H. Witten, E. Frank, M. A. Hall, and ITPro",
+    contributor=None,
+    collection_date="01-01-2011",
+    language="English",
+    licence=None,
+    default_target_attribute="play",
+    row_id_attribute=None,
+    ignore_attribute=None,
+    citation=citation,
+    attributes=attribute_names,
+    data=data,
+    version_label="example",
+)
+
+
+# %%
+weather_dataset.publish()
+print(f"URL for dataset: {weather_dataset.openml_url}")
+
+# %% [markdown]
+# ## Dataset is a pandas DataFrame
+# It might happen that your dataset is made of heterogeneous data which can usually
+# be stored as a Pandas DataFrame. DataFrames offer the advantage of
+# storing the type of data for each column as well as the attribute names.
+# Therefore, when providing a Pandas DataFrame, OpenML can infer this
+# information without needing to explicitly provide it when calling the
+# function `openml.datasets.create_dataset`. In this regard, you only
+# need to pass `'auto'` to the `attributes` parameter.
+
+# %%
+df = pd.DataFrame(data, columns=[col_name for col_name, _ in attribute_names])
+
+# enforce the categorical column to have a categorical dtype
+df["outlook"] = df["outlook"].astype("category")
+df["windy"] = df["windy"].astype("bool")
+df["play"] = df["play"].astype("category")
+print(df.info())
+
+# %% [markdown]
+# We enforce the columns 'outlook' and 'play' to have a categorical
+# dtype while the column 'windy' is kept as a boolean column. 'temperature'
+# and 'humidity' are kept as numeric columns. Then, we can
+# call `openml.datasets.create_dataset` by passing the dataframe and
+# setting the parameter `attributes` to `'auto'`.
+
+# %%
+weather_dataset = create_dataset(
+    name="Weather",
+    description=description,
+    creator="I. H. Witten, E. Frank, M. A. Hall, and ITPro",
+    contributor=None,
+    collection_date="01-01-2011",
+    language="English",
+    licence=None,
+    default_target_attribute="play",
+    row_id_attribute=None,
+    ignore_attribute=None,
+    citation=citation,
+    attributes="auto",
+    data=df,
+    version_label="example",
+)
+
+# %%
+weather_dataset.publish()
+print(f"URL for dataset: {weather_dataset.openml_url}")
+
+# %% [markdown]
+# ## Dataset is a sparse matrix
+
+# %%
+sparse_data = coo_matrix(
+    ([0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1]))
+)
+
+column_names = [
+    ("input1", "REAL"),
+    ("input2", "REAL"),
+    ("y", "REAL"),
+]
+
+xor_dataset = create_dataset(
+    name="XOR",
+    description="Dataset representing the XOR operation",
+    creator=None,
+    contributor=None,
+    collection_date=None,
+    language="English",
+    licence=None,
+    default_target_attribute="y",
+    row_id_attribute=None,
+    ignore_attribute=None,
+    citation=None,
+    attributes=column_names,
+    data=sparse_data,
+    version_label="example",
+)
+
+
+# %%
+xor_dataset.publish()
+print(f"URL for dataset: {xor_dataset.openml_url}")
+
+
+# %% [markdown]
+# ## Dataset is a pandas dataframe with sparse columns
+# %%
+sparse_data = coo_matrix(
+    ([1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0], ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1]))
+)
+column_names = ["input1", "input2", "y"]
+df = pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names)
+print(df.info())
+
+xor_dataset = create_dataset(
+    name="XOR",
+    description="Dataset representing the XOR operation",
+    creator=None,
+    contributor=None,
+    collection_date=None,
+    language="English",
+    licence=None,
+    default_target_attribute="y",
+    row_id_attribute=None,
+    ignore_attribute=None,
+    citation=None,
+    attributes="auto",
+    data=df,
+    version_label="example",
+)
+
+# %%
+
+xor_dataset.publish()
+print(f"URL for dataset: {xor_dataset.openml_url}")
+
+# %%
+openml.config.stop_using_configuration_for_example()
diff --git a/examples/Advanced/datasets_tutorial.py b/examples/Advanced/datasets_tutorial.py
new file mode 100644
index 000000000..cc57686d0
--- /dev/null
+++ b/examples/Advanced/datasets_tutorial.py
@@ -0,0 +1,165 @@
+# %% [markdown]
+# How to list and download datasets.
+
+# %%
+import pandas as pd
+
+import openml
+from openml.datasets import edit_dataset, fork_dataset, get_dataset
+
+# %% [markdown]
+# ## Exercise 0
+#
+# * List datasets and return a dataframe
+
+# %%
+datalist = openml.datasets.list_datasets()
+datalist = datalist[["did", "name", "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses"]]
+
+print(f"First 10 of {len(datalist)} datasets...")
+datalist.head(n=10)
+
+# The same can be done with fewer lines of code
+openml_df = openml.datasets.list_datasets()
+openml_df.head(n=10)
+
+# %% [markdown]
+# ## Exercise 1
+#
+# * Find datasets with more than 10000 examples.
+# * Find a dataset called 'eeg-eye-state'.
+# * Find all datasets with more than 50 classes.
+
+# %%
+datalist[datalist.NumberOfInstances > 10000].sort_values(["NumberOfInstances"]).head(n=20)
+
+# %%
+datalist.query('name == "eeg-eye-state"')
+
+# %%
+datalist.query("NumberOfClasses > 50")
+
+# %% [markdown]
+# ## Download datasets
+
+# %%
+# This is done based on the dataset ID or, as shown here, the dataset name and version.
+dataset = openml.datasets.get_dataset(dataset_id="eeg-eye-state", version=1)
+
+# Print a summary
+print(
+    f"This is dataset '{dataset.name}', the target feature is '{dataset.default_target_attribute}'"
+)
+print(f"URL: {dataset.url}")
+print(dataset.description[:500])
+
+# %% [markdown]
+# Get the actual data.
+#
+# openml-python returns data as pandas dataframes (stored in the `eeg` variable below),
+# and also some additional metadata that we don't care about right now.
+
+# %%
+eeg, *_ = dataset.get_data()
+
+# %% [markdown]
+# You can optionally choose to have openml separate out a column from the
+# dataset. In particular, many datasets for supervised problems have a set
+# `default_target_attribute` which may help identify the target variable.
+
+# %%
+X, y, categorical_indicator, attribute_names = dataset.get_data(
+    target=dataset.default_target_attribute
+)
+print(X.head())
+print(X.info())
+
+# %% [markdown]
+# Sometimes you only need access to a dataset's metadata.
+# In those cases, you can download the dataset without downloading the
+# data file. The dataset object can be used as normal.
+# Whenever you use any functionality that requires the data,
+# such as `get_data`, the data will be downloaded.
+# Starting from 0.15, not downloading data will be the default behavior instead.
+# The data will be downloaded automatically when you try to access it through
+# openml objects, e.g., using `dataset.features`.
+
+# %%
+dataset = openml.datasets.get_dataset(1471)
+
+# %% [markdown]
+# ## Exercise 2
+# * Explore the data visually.
+
+# %%
+eegs = eeg.sample(n=1000)
+_ = pd.plotting.scatter_matrix(
+    X.iloc[:100, :4],
+    c=y[:100],
+    figsize=(10, 10),
+    marker="o",
+    hist_kwds={"bins": 20},
+    alpha=0.8,
+    cmap="plasma",
+)
+
+
+# %% [markdown]
+# ## Edit a created dataset
+# This example uses the test server, to avoid editing a dataset on the main server.
+
+# %%
+openml.config.start_using_configuration_for_example()
+# %% [markdown]
+# Edit non-critical fields, allowed for all authorized users:
+# description, creator, contributor, collection_date, language, citation,
+# original_data_url, paper_url
+
+# %%
+desc = (
+    "This data sets consists of 3 different types of irises' "
+    "(Setosa, Versicolour, and Virginica) petal and sepal length,"
+    " stored in a 150x4 numpy.ndarray"
+)
+did = 128
+data_id = edit_dataset(
+    did,
+    description=desc,
+    creator="R.A.Fisher",
+    collection_date="1937",
+    citation="The use of multiple measurements in taxonomic problems",
+    language="English",
+)
+edited_dataset = get_dataset(data_id)
+print(f"Edited dataset ID: {data_id}")
+
+
+# %% [markdown]
+# Editing critical fields (default_target_attribute, row_id_attribute, ignore_attribute) is allowed
+# only for the dataset owner. Further, critical fields cannot be edited if the dataset has any
+# tasks associated with it. To edit critical fields of a dataset (without tasks) owned by you,
+# configure the API key:
+# openml.config.apikey = 'FILL_IN_OPENML_API_KEY'
+# This example here only shows a failure when trying to work on a dataset not owned by you:
+
+# %%
+try:
+    data_id = edit_dataset(1, default_target_attribute="shape")
+except openml.exceptions.OpenMLServerException as e:
+    print(e)
+
+# %% [markdown]
+# ## Fork dataset
+# Used to create a copy of the dataset with you as the owner.
+# Use this API only if you are unable to edit the critical fields (default_target_attribute,
+# ignore_attribute, row_id_attribute) of a dataset through the edit_dataset API.
+# After the dataset is forked, you can edit the new version of the dataset using edit_dataset.
+
+# %%
+data_id = fork_dataset(1)
+print(data_id)
+data_id = edit_dataset(data_id, default_target_attribute="shape")
+print(f"Forked dataset ID: {data_id}")
+
+# %%
+openml.config.stop_using_configuration_for_example()
diff --git a/examples/fetch_evaluations_tutorial.py b/examples/Advanced/fetch_evaluations_tutorial.py
similarity index 51%
rename from examples/fetch_evaluations_tutorial.py
rename to examples/Advanced/fetch_evaluations_tutorial.py
index 10511c540..97b8d1bef 100644
--- a/examples/fetch_evaluations_tutorial.py
+++ b/examples/Advanced/fetch_evaluations_tutorial.py
@@ -1,113 +1,112 @@
-"""
-====================
-Fetching Evaluations
-====================
-
-Evalutions contain a concise summary of the results of all runs made. Each evaluation
-provides information on the dataset used, the flow applied, the setup used, the metric
-evaluated, and the result obtained on the metric, for each such run made. These collection
-of results can be used for efficient benchmarking of an algorithm and also allow transparent
-reuse of results from previous experiments on similar parameters.
-
-In this example, we shall do the following:
-
-* Retrieve evaluations based on different metrics
-* Fetch evaluations pertaining to a specific task
-* Sort the obtained results in descending order of the metric
-* Plot a cumulative distribution function for the evaluations
-* Compare the top 10 performing flows based on the evaluation performance
-"""
-
-############################################################################
+# %% [markdown]
+# Evaluations contain a concise summary of the results of all runs made. Each evaluation
+# provides information on the dataset used, the flow applied, the setup used, the metric
+# evaluated, and the result obtained on the metric, for each such run made. This collection
+# of results can be used for efficient benchmarking of an algorithm and also allows transparent
+# reuse of results from previous experiments on similar parameters.
+#
+# In this example, we shall do the following:
+#
+# * Retrieve evaluations based on different metrics
+# * Fetch evaluations pertaining to a specific task
+# * Sort the obtained results in descending order of the metric
+# * Plot a cumulative distribution function for the evaluations
+# * Compare the top 10 performing flows based on the evaluation performance
+# * Retrieve evaluations with hyperparameter settings
+
+# %%
 import openml
 
-############################################################################
-# Listing evaluations
-# *******************
+# %% [markdown]
+# ## Listing evaluations
 # Evaluations can be retrieved from the database in the chosen output format.
 # Required filters can be applied to retrieve results from runs as required.
 
 # We shall retrieve a small set (only 10 entries) to test the listing function for evaluations
-openml.evaluations.list_evaluations(function='predictive_accuracy', size=10,
-                                    output_format='dataframe')
+
+# %%
+openml.evaluations.list_evaluations(function="predictive_accuracy", size=10)
 
 # Using other evaluation metrics, 'precision' in this case
-evals = openml.evaluations.list_evaluations(function='precision', size=10,
-                                            output_format='dataframe')
+evals = openml.evaluations.list_evaluations(
+    function="precision", size=10, output_format="dataframe"
+)
 
 # Querying the returned results for precision above 0.98
 print(evals[evals.value > 0.98])
 
-#############################################################################
-# Viewing a sample task
-# =====================
+# %% [markdown]
+# ## Viewing a sample task
 # Over here we shall briefly take a look at the details of the task.
-
 # We will start by displaying a simple *supervised classification* task:
-task_id = 167140        # https://www.openml.org/t/167140
+
+# %%
+task_id = 167140  # https://www.openml.org/t/167140
 task = openml.tasks.get_task(task_id)
 print(task)
 
-#############################################################################
-# Obtaining all the evaluations for the task
-# ==========================================
+# %% [markdown]
+# ## Obtaining all the evaluations for the task
 # We'll now obtain all the evaluations that were uploaded for the task
 # we displayed previously.
 # Note that we now filter the evaluations based on another parameter 'task'.
 
-metric = 'predictive_accuracy'
-evals = openml.evaluations.list_evaluations(function=metric, task=[task_id],
-                                            output_format='dataframe')
+# %%
+metric = "predictive_accuracy"
+evals = openml.evaluations.list_evaluations(
+    function=metric, tasks=[task_id], output_format="dataframe"
+)
 # Displaying the first 10 rows
 print(evals.head(n=10))
 # Sorting the evaluations in decreasing order of the metric chosen
-evals = evals.sort_values(by='value', ascending=False)
+evals = evals.sort_values(by="value", ascending=False)
 print("\nDisplaying head of sorted dataframe: ")
 print(evals.head())
 
-#############################################################################
-# Obtaining CDF of metric for chosen task
-# ***************************************
+# %% [markdown]
+# ## Obtaining CDF of metric for chosen task
 # We shall now analyse how the performance of various flows have been on this task,
 # by seeing the likelihood of the accuracy obtained across all runs.
 # We shall now plot a cumulative distributive function (CDF) for the accuracies obtained.
 
+# %%
 from matplotlib import pyplot as plt
 
 
-def plot_cdf(values, metric='predictive_accuracy'):
+def plot_cdf(values, metric="predictive_accuracy"):
     max_val = max(values)
-    n, bins, patches = plt.hist(values, density=True, histtype='step',
-                                cumulative=True, linewidth=3)
+    _, _, patches = plt.hist(values, density=True, histtype="step", cumulative=True, linewidth=3)
     patches[0].set_xy(patches[0].get_xy()[:-1])
     plt.xlim(max(0, min(values) - 0.1), 1)
-    plt.title('CDF')
+    plt.title("CDF")
     plt.xlabel(metric)
-    plt.ylabel('Likelihood')
-    plt.grid(b=True, which='major', linestyle='-')
+    plt.ylabel("Likelihood")
+    plt.grid(visible=True, which="major", linestyle="-")
     plt.minorticks_on()
-    plt.grid(b=True, which='minor', linestyle='--')
-    plt.axvline(max_val, linestyle='--', color='gray')
-    plt.text(max_val, 0, "%.3f" % max_val, fontsize=9)
+    plt.grid(visible=True, which="minor", linestyle="--")
+    plt.axvline(max_val, linestyle="--", color="gray")
+    plt.text(max_val, 0, f"{max_val:.3f}", fontsize=9)
     plt.show()
 
 
 plot_cdf(evals.value, metric)
+
+# %% [markdown]
 # This CDF plot shows that for the given task, based on the results of the
 # runs uploaded, it is almost certain to achieve an accuracy above 52%, i.e.,
 # with non-zero probability. While the maximum accuracy seen till now is 96.5%.
 
-#############################################################################
-# Comparing top 10 performing flows
-# *********************************
+# %% [markdown]
+# ## Comparing top 10 performing flows
 # Let us now try to see which flows generally performed the best for this task.
 # For this, we shall compare the top performing flows.
 
+# %%
 import numpy as np
 import pandas as pd
 
 
-def plot_flow_compare(evaluations, top_n=10, metric='predictive_accuracy'):
+def plot_flow_compare(evaluations, top_n=10, metric="predictive_accuracy"):
     # Collecting the top 10 performing unique flow_id
     flow_ids = evaluations.flow_id.unique()[:top_n]
 
@@ -117,24 +116,26 @@ def plot_flow_compare(evaluations, top_n=10, metric='predictive_accuracy'):
     for i in range(len(flow_ids)):
         flow_values = evaluations[evaluations.flow_id == flow_ids[i]].value
         df = pd.concat([df, flow_values], ignore_index=True, axis=1)
-    fig, axs = plt.subplots()
+    _, axs = plt.subplots()
     df.boxplot()
-    axs.set_title('Boxplot comparing ' + metric + ' for different flows')
+    axs.set_title("Boxplot comparing " + metric + " for different flows")
     axs.set_ylabel(metric)
-    axs.set_xlabel('Flow ID')
+    axs.set_xlabel("Flow ID")
     axs.set_xticklabels(flow_ids)
-    axs.grid(which='major', linestyle='-', linewidth='0.5', color='gray', axis='y')
+    axs.grid(which="major", linestyle="-", linewidth="0.5", color="gray", axis="y")
     axs.minorticks_on()
-    axs.grid(which='minor', linestyle='--', linewidth='0.5', color='gray', axis='y')
+    axs.grid(which="minor", linestyle="--", linewidth="0.5", color="gray", axis="y")
     # Counting the number of entries for each flow in the data frame
     #   which gives the number of runs for each flow
     flow_freq = list(df.count(axis=0, numeric_only=True))
     for i in range(len(flow_ids)):
-        axs.text(i + 1.05, np.nanmin(df.values), str(flow_freq[i]) + '\nrun(s)', fontsize=7)
+        axs.text(i + 1.05, np.nanmin(df.values), str(flow_freq[i]) + "\nrun(s)", fontsize=7)
     plt.show()
 
 
 plot_flow_compare(evals, metric=metric, top_n=10)
+
+# %% [markdown]
 # The boxplots below show how the flows perform across multiple runs on the chosen
 # task. The green horizontal lines represent the median accuracy of all the runs for
 # that flow (number of runs denoted at the bottom of the boxplots). The higher the
@@ -142,8 +143,39 @@ def plot_flow_compare(evaluations, top_n=10, metric='predictive_accuracy'):
 # are in the descending order of the higest accuracy value seen under that flow.
 
 # Printing the corresponding flow names for the top 10 performing flow IDs
+
+# %%
 top_n = 10
 flow_ids = evals.flow_id.unique()[:top_n]
 flow_names = evals.flow_name.unique()[:top_n]
 for i in range(top_n):
     print((flow_ids[i], flow_names[i]))
+
+# %% [markdown]
+# ## Obtaining evaluations with hyperparameter settings
+# We'll now obtain the evaluations of a task and a flow with the hyperparameters
+
+# List evaluations in descending order based on predictive_accuracy with
+# hyperparameters
+
+# %%
+evals_setups = openml.evaluations.list_evaluations_setups(
+    function="predictive_accuracy",
+    tasks=[31],
+    size=100,
+    sort_order="desc",
+)
+
+print(evals_setups.head())
+
+# %% [markdown]
+# Return evaluations for flow_id in descending order based on predictive_accuracy
+# with hyperparameters. parameters_in_separate_columns returns parameters in
+# separate columns
+
+# %%
+evals_setups = openml.evaluations.list_evaluations_setups(
+    function="predictive_accuracy", flows=[6767], size=100, parameters_in_separate_columns=True
+)
+
+print(evals_setups.head(10))
diff --git a/examples/Advanced/study_tutorial.py b/examples/Advanced/study_tutorial.py
new file mode 100644
index 000000000..6912efd06
--- /dev/null
+++ b/examples/Advanced/study_tutorial.py
@@ -0,0 +1,120 @@
+# %% [markdown]
+# How to list, download and upload benchmark studies.
+# In contrast to
+# [benchmark suites](https://docs.openml.org/benchmark/#benchmarking-suites) which
+# hold a list of tasks, studies hold a list of runs. As runs contain all information on flows and
+# tasks, all required information about a study can be retrieved.
+
+# %%
+import uuid
+
+from sklearn.ensemble import RandomForestClassifier
+
+import openml
+
+# %% [markdown]
+# ##  Listing studies
+#
+# * Use the output_format parameter to select output type
+# * Default gives ``dict``, but we'll use ``dataframe`` to obtain an
+#   easier-to-work-with data structure
+
+# %%
+studies = openml.study.list_studies(status="all")
+print(studies.head(n=10))
+
+
+# %% [markdown]
+# ## Downloading studies
+# This is done based on the study ID.
+
+# %%
+study = openml.study.get_study(123)
+print(study)
+
+# %% [markdown]
+# Studies also feature a description:
+
+# %%
+print(study.description)
+
+# %% [markdown]
+# Studies are a container for runs:
+
+# %%
+print(study.runs)
+
+# %% [markdown]
+# And we can use the evaluation listing functionality to learn more about
+# the evaluations available for the conducted runs:
+
+# %%
+evaluations = openml.evaluations.list_evaluations(
+    function="predictive_accuracy",
+    study=study.study_id,
+    output_format="dataframe",
+)
+print(evaluations.head())
+
+# %% [markdown]
+# We'll use the test server for the rest of this tutorial.
+
+# %%
+openml.config.start_using_configuration_for_example()
+
+# %% [markdown]
+# ## Uploading studies
+#
+# Creating a study is as simple as creating any kind of other OpenML entity.
+# In this example we'll create a few runs for the OpenML-100 benchmark
+# suite which is available on the OpenML test server.
+
+# <div class="admonition warning">
+#     <p class="admonition-title">Warning</p>
+#     <p>
+#         For the rest of this tutorial, we will require the `openml-sklearn` package.
+#         Install it with `pip install openml-sklearn`.
+#     </p>
+# </div>
+
+# %%
+# Get sklearn extension to run sklearn models easily on OpenML tasks.
+from openml_sklearn import SklearnExtension
+
+extension = SklearnExtension()
+
+# Model to be used
+clf = RandomForestClassifier()
+
+# We'll create a study with one run on 3 datasets present in the suite
+tasks = [115, 259, 307]
+
+# To verify
+# https://test.openml.org/api/v1/study/1
+suite = openml.study.get_suite("OpenML100")
+print(all(t_id in suite.tasks for t_id in tasks))
+
+run_ids = []
+for task_id in tasks:
+    task = openml.tasks.get_task(task_id)
+    run = openml.runs.run_model_on_task(clf, task)
+    run.publish()
+    run_ids.append(run.run_id)
+
+# The study needs a machine-readable and unique alias. To obtain this,
+# we simply generate a random uuid.
+alias = uuid.uuid4().hex
+
+new_study = openml.study.create_study(
+    name="Test-Study",
+    description="Test study for the Python tutorial on studies",
+    run_ids=run_ids,
+    alias=alias,
+    benchmark_suite=suite.study_id,
+)
+new_study.publish()
+print(new_study)
+
+
+# %%
+openml.config.stop_using_configuration_for_example()
diff --git a/examples/Advanced/suites_tutorial.py b/examples/Advanced/suites_tutorial.py
new file mode 100644
index 000000000..8459510ef
--- /dev/null
+++ b/examples/Advanced/suites_tutorial.py
@@ -0,0 +1,92 @@
+# %% [markdown]
+# How to list, download and upload benchmark suites.
+
+# %%
+import uuid
+
+import numpy as np
+
+import openml
+
+# %% [markdown]
+# ## Listing suites
+#
+# * Use the output_format parameter to select output type
+# * Default gives ``dict``, but we'll use ``dataframe`` to obtain an
+#   easier-to-work-with data structure
+
+# %%
+suites = openml.study.list_suites(status="all")
+print(suites.head(n=10))
+
+# %% [markdown]
+# ## Downloading suites
+# This is done based on the suite ID.
+
+# %%
+suite = openml.study.get_suite(99)
+print(suite)
+
+# %% [markdown]
+# Suites also feature a description:
+
+# %%
+print(suite.description)
+
+# %% [markdown]
+# Suites are a container for tasks:
+
+# %%
+print(suite.tasks)
+
+# %% [markdown]
+# And we can use the task listing functionality to learn more about them:
+
+# %%
+tasks = openml.tasks.list_tasks()
+
+# %% [markdown]
+# Using ``@`` in
+# [pd.DataFrame.query](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html)
+# accesses variables outside of the current dataframe.
+
+# %%
+tasks = tasks.query("tid in @suite.tasks")
+print(tasks.describe().transpose())
+
+# %% [markdown]
+# We'll use the test server for the rest of this tutorial.
+
+# %%
+openml.config.start_using_configuration_for_example()
+
+# %% [markdown]
+# ## Uploading suites
+#
+# Uploading suites is as simple as uploading any kind of other OpenML
+# entity - the only reason why we need so much code in this example is
+# because we upload some random data.
+
+# We'll take a random subset of 20 tasks of all available tasks on
+# the test server:
+
+# %%
+all_tasks = list(openml.tasks.list_tasks()["tid"])
+task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20))  # noqa: NPY002
+
+# The suite needs a machine-readable and unique alias. To obtain this,
+# we simply generate a random uuid.
+
+alias = uuid.uuid4().hex
+
+new_suite = openml.study.create_benchmark_suite(
+    name="Test-Suite",
+    description="Test suite for the Python tutorial on benchmark suites",
+    task_ids=task_ids_for_suite,
+    alias=alias,
+)
+new_suite.publish()
+print(new_suite)
+
+# %%
+openml.config.stop_using_configuration_for_example()
diff --git a/examples/Advanced/task_manual_iteration_tutorial.py b/examples/Advanced/task_manual_iteration_tutorial.py
new file mode 100644
index 000000000..1e630e213
--- /dev/null
+++ b/examples/Advanced/task_manual_iteration_tutorial.py
@@ -0,0 +1,172 @@
+# %% [markdown]
+# Tasks define a target and a train/test split, which we can use for benchmarking.
+
+# %%
+import openml
+
+# %% [markdown]
+# For this tutorial we will use the famous King+Rook versus King+Pawn on A7 dataset, which has
+# the dataset ID 3 ([dataset on OpenML](https://www.openml.org/d/3)), and for which there exist
+# tasks with all important estimation procedures. It is small enough (less than 5000 samples) to
+# efficiently use it in an example.
+#
+# We will first start with ([task 233](https://www.openml.org/t/233)), which is a task with a
+# holdout estimation procedure.
+
+# %%
+task_id = 233
+task = openml.tasks.get_task(task_id)
+
+# %% [markdown]
+# Now that we have a task object we can obtain the number of repetitions, folds and samples as
+# defined by the task:
+
+# %%
+n_repeats, n_folds, n_samples = task.get_split_dimensions()
+
+# %% [markdown]
+# * ``n_repeats``: Number of times the model quality estimation is performed
+# * ``n_folds``: Number of folds per repeat
+# * ``n_samples``: How many data points to use. This is only relevant for learning curve tasks
+#
+# A list of all available estimation procedures is available
+# [here](https://www.openml.org/search?q=%2520measure_type%3Aestimation_procedure&type=measure).
+#
+# Task ``233`` is a simple task using the holdout estimation procedure and therefore has only a
+# single repeat, a single fold and a single sample size:
+
+# %%
+print(
+    f"Task {task_id}: number of repeats: {n_repeats}, number of folds: {n_folds}, number of samples {n_samples}."
+)
+
+# %% [markdown]
+# We can now retrieve the train/test split for this combination of repeats, folds and number of
+# samples (indexing is zero-based). Usually, one would loop over all repeats, folds and sample
+# sizes, but we can neglect this here as there is only a single repetition.
+
+# %%
+train_indices, test_indices = task.get_train_test_split_indices(
+    repeat=0,
+    fold=0,
+    sample=0,
+)
+
+print(train_indices.shape, train_indices.dtype)
+print(test_indices.shape, test_indices.dtype)
+
+# %% [markdown]
+# And then split the data based on this:
+
+# %%
+X, y = task.get_X_and_y()
+X_train = X.iloc[train_indices]
+y_train = y.iloc[train_indices]
+X_test = X.iloc[test_indices]
+y_test = y.iloc[test_indices]
+
+print(
+    f"X_train.shape: {X_train.shape}, y_train.shape: {y_train.shape}, X_test.shape: {X_test.shape}, y_test.shape: {y_test.shape}"
+)
+
+# %% [markdown]
+# Obviously, we can also retrieve cross-validation versions of the dataset used in task ``233``:
+
+# %%
+task_id = 3
+task = openml.tasks.get_task(task_id)
+X, y = task.get_X_and_y()
+n_repeats, n_folds, n_samples = task.get_split_dimensions()
+print(
+    f"Task {task_id}: number of repeats: {n_repeats}, number of folds: {n_folds}, number of samples {n_samples}."
+)
+
+# %% [markdown]
+# And then perform the aforementioned iteration over all splits:
+
+# %%
+for repeat_idx in range(n_repeats):
+    for fold_idx in range(n_folds):
+        for sample_idx in range(n_samples):
+            train_indices, test_indices = task.get_train_test_split_indices(
+                repeat=repeat_idx,
+                fold=fold_idx,
+                sample=sample_idx,
+            )
+            X_train = X.iloc[train_indices]
+            y_train = y.iloc[train_indices]
+            X_test = X.iloc[test_indices]
+            y_test = y.iloc[test_indices]
+
+            print(
+                f"Repeat #{repeat_idx}, fold #{fold_idx}, samples {sample_idx}: X_train.shape: {X_train.shape}, "
+                f"y_train.shape {y_train.shape}, X_test.shape {X_test.shape}, y_test.shape {y_test.shape}"
+            )
+
+# %% [markdown]
+# And also versions with multiple repeats:
+
+# %%
+task_id = 1767
+task = openml.tasks.get_task(task_id)
+X, y = task.get_X_and_y()
+n_repeats, n_folds, n_samples = task.get_split_dimensions()
+print(
+    f"Task {task_id}: number of repeats: {n_repeats}, number of folds: {n_folds}, number of samples {n_samples}."
+)
+
+# %% [markdown]
+# And then again perform the aforementioned iteration over all splits:
+
+# %%
+for repeat_idx in range(n_repeats):
+    for fold_idx in range(n_folds):
+        for sample_idx in range(n_samples):
+            train_indices, test_indices = task.get_train_test_split_indices(
+                repeat=repeat_idx,
+                fold=fold_idx,
+                sample=sample_idx,
+            )
+            X_train = X.iloc[train_indices]
+            y_train = y.iloc[train_indices]
+            X_test = X.iloc[test_indices]
+            y_test = y.iloc[test_indices]
+
+            print(
+                f"Repeat #{repeat_idx}, fold #{fold_idx}, samples {sample_idx}: X_train.shape: {X_train.shape}, "
+                f"y_train.shape {y_train.shape}, X_test.shape {X_test.shape}, y_test.shape {y_test.shape}"
+            )
+
+# %% [markdown]
+# And finally a task based on learning curves:
+
+# %%
+task_id = 1702
+task = openml.tasks.get_task(task_id)
+X, y = task.get_X_and_y()
+n_repeats, n_folds, n_samples = task.get_split_dimensions()
+print(
+    f"Task {task_id}: number of repeats: {n_repeats}, number of folds: {n_folds}, number of samples {n_samples}."
+)
+
+# %% [markdown]
+# And then again perform the aforementioned iteration over all splits:
+
+# %%
+for repeat_idx in range(n_repeats):
+    for fold_idx in range(n_folds):
+        for sample_idx in range(n_samples):
+            train_indices, test_indices = task.get_train_test_split_indices(
+                repeat=repeat_idx,
+                fold=fold_idx,
+                sample=sample_idx,
+            )
+            X_train = X.iloc[train_indices]
+            y_train = y.iloc[train_indices]
+            X_test = X.iloc[test_indices]
+            y_test = y.iloc[test_indices]
+
+            print(
+                f"Repeat #{repeat_idx}, fold #{fold_idx}, samples {sample_idx}: X_train.shape: {X_train.shape}, "
+                f"y_train.shape {y_train.shape}, X_test.shape {X_test.shape}, y_test.shape {y_test.shape}"
+            )
diff --git a/examples/Advanced/tasks_tutorial.py b/examples/Advanced/tasks_tutorial.py
new file mode 100644
index 000000000..dff7293ad
--- /dev/null
+++ b/examples/Advanced/tasks_tutorial.py
@@ -0,0 +1,201 @@
+# %% [markdown]
+# A tutorial on how to list and download tasks.
+
+# %%
+import openml
+from openml.tasks import TaskType
+
+# %% [markdown]
+#
+# Tasks are identified by IDs and can be accessed in two different ways:
+#
+# 1. In a list providing basic information on all tasks available on OpenML.
+#    This function will not download the actual tasks, but will instead download
+#    meta data that can be used to filter the tasks and retrieve a set of IDs.
+#    We can filter this list, for example, we can only list tasks having a
+#    special tag or only tasks for a specific target such as
+#    *supervised classification*.
+# 2. A single task by its ID. It contains all meta information, the target
+#    metric, the splits and an iterator which can be used to access the
+#    splits in a useful manner.
+
+# %% [markdown]
+# ## Listing tasks
+#
+# We will start by simply listing only *supervised classification* tasks.
+#
+# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, but we
+# request a
+# [pandas dataframe](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html)
+# instead to have better visualization capabilities and easier access:
+
+# %%
+tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)
+print(tasks.columns)
+print(f"First 5 of {len(tasks)} tasks:")
+print(tasks.head())
+
+# %% [markdown]
+# We can filter the list of tasks to only contain datasets with more than
+# 500 samples, but less than 1000 samples:
+
+# %%
+filtered_tasks = tasks.query("NumberOfInstances > 500 and NumberOfInstances < 1000")
+print(list(filtered_tasks.index))
+
+
+# %%
+# Number of tasks
+print(len(filtered_tasks))
+
+# %% [markdown]
+# Then, we can further restrict the tasks to all have the same resampling strategy:
+
+# %%
+filtered_tasks = filtered_tasks.query('estimation_procedure == "10-fold Crossvalidation"')
+print(list(filtered_tasks.index))
+
+# %%
+# Number of tasks
+print(len(filtered_tasks))
+
+# %% [markdown]
+# Resampling strategies can be found on the
+# [OpenML Website](https://www.openml.org/search?type=measure&q=estimation%20procedure).
+#
+# Similar to listing tasks by task type, we can list tasks by tags:
+
+# %%
+tasks = openml.tasks.list_tasks(tag="OpenML100")
+print(f"First 5 of {len(tasks)} tasks:")
+print(tasks.head())
+
+# %% [markdown]
+# Furthermore, we can list tasks based on the dataset id:
+
+# %%
+tasks = openml.tasks.list_tasks(data_id=1471)
+print(f"First 5 of {len(tasks)} tasks:")
+print(tasks.head())
+
+# %% [markdown]
+# In addition, a size limit and an offset can be applied both separately and simultaneously:
+
+# %%
+tasks = openml.tasks.list_tasks(size=10, offset=50)
+print(tasks)
+
+# %% [markdown]
+#
+# **OpenML 100**
+# is a curated list of 100 tasks to start using OpenML. They are all
+# supervised classification tasks with more than 500 instances and less than 50000
+# instances per task. To make things easier, the tasks do not contain highly
+# unbalanced data and sparse data. However, the tasks include missing values and
+# categorical features. You can find out more about the *OpenML 100* on
+# [the OpenML benchmarking page](https://docs.openml.org/benchmark/).
+#
+# Finally, it is also possible to list all tasks on OpenML with:
+
+# %%
+tasks = openml.tasks.list_tasks()
+print(len(tasks))
+
+# %% [markdown]
+# ## Exercise
+#
+# Search for the tasks on the 'eeg-eye-state' dataset.
+
+# %%
+tasks.query('name=="eeg-eye-state"')
+
+# %% [markdown]
+# ## Downloading tasks
+#
+# We provide two functions to download tasks, one which downloads only a
+# single task by its ID, and one which takes a list of IDs and downloads
+# all of these tasks:
+
+# %%
+task_id = 31
+task = openml.tasks.get_task(task_id)
+
+# %%
+# Properties of the task are stored as member variables:
+print(task)
+
+# %%
+# And:
+
+ids = [2, 1891, 31, 9983]
+tasks = openml.tasks.get_tasks(ids)
+print(tasks[0])
+
+# %% [markdown]
+# ## Creating tasks
+#
+# You can also create new tasks. Take the following into account:
+#
+# * You can only create tasks on *active* datasets
+# * For now, only the following tasks are supported: classification, regression,
+#   clustering, and learning curve analysis.
+# * For now, tasks can only be created on a single dataset.
+# * The exact same task must not already exist.
+#
+# Creating a task requires the following input:
+#
+# * task_type: The task type ID (see below). Required.
+# * dataset_id: The dataset ID. Required.
+# * target_name: The name of the attribute you aim to predict. Optional.
+# * estimation_procedure_id : The ID of the estimation procedure used to create train-test
+#   splits. Optional.
+# * evaluation_measure: The name of the evaluation measure. Optional.
+# * Any additional inputs for specific tasks
+#
+# It is best to leave the evaluation measure open if there is no strong prerequisite for a
+# specific measure. OpenML will always compute all appropriate measures and you can filter
+# or sort results on your favourite measure afterwards. Only add an evaluation measure if
+# necessary (e.g. when other measures make no sense), since it will create a new task, which
+# scatters results across tasks.
+
+# %% [markdown]
+# We'll use the test server for the rest of this tutorial.
+
+# %%
+openml.config.start_using_configuration_for_example()
+
+# %% [markdown]
+# ## Example
+#
+# Let's create a classification task on a dataset. In this example we will do this on the
+# Iris dataset (ID=128 (on test server)). We'll use 10-fold cross-validation (ID=1),
+# and *predictive accuracy* as the predefined measure (this can also be left open).
+# If a task with these parameters exists, we will get an appropriate exception.
+# If such a task doesn't exist, a task will be created and the corresponding task_id
+# will be returned.
+
+# %%
+try:
+    my_task = openml.tasks.create_task(
+        task_type=TaskType.SUPERVISED_CLASSIFICATION,
+        dataset_id=128,
+        target_name="class",
+        evaluation_measure="predictive_accuracy",
+        estimation_procedure_id=1,
+    )
+    my_task.publish()
+except openml.exceptions.OpenMLServerException as e:
+    # Error code for 'task already exists'
+    if e.code == 614:
+        # Lookup task
+        tasks = openml.tasks.list_tasks(data_id=128)
+        tasks = tasks.query(
+            'task_type == "Supervised Classification" '
+            'and estimation_procedure == "10-fold Crossvalidation" '
+            'and evaluation_measures == "predictive_accuracy"'
+        )
+        task_id = tasks.loc[:, "tid"].values[0]
+        print("Task already exists. Task ID is", task_id)
+
+# %%
+openml.config.stop_using_configuration_for_example()
diff --git a/examples/Basics/introduction_tutorial.py b/examples/Basics/introduction_tutorial.py
new file mode 100644
index 000000000..2ba2d0ef1
--- /dev/null
+++ b/examples/Basics/introduction_tutorial.py
@@ -0,0 +1,55 @@
+# %% [markdown]
+# ## Installation
+# Installation is done via ``pip``:
+#
+# ```bash
+# pip install openml
+# ```
+
+# %% [markdown]
+# ## Authentication
+#
+# For certain functionality, such as uploading tasks or datasets, users have to
+# sign up. Only accessing the data on OpenML does not require an account!
+#
+# If you don't have an account yet, sign up now.
+# You will receive an API key, which will authenticate you to the server
+# and allow you to download and upload datasets, tasks, runs and flows.
+#
+# * Create an OpenML account (free) on https://www.openml.org.
+# * After logging in, open your account page (avatar on the top right)
+# * Open 'Account Settings', then 'API authentication' to find your API key.
+#
+# There are two ways to permanently authenticate:
+#
+# * Use the ``openml`` CLI tool with ``openml configure apikey MYKEY``,
+#   replacing **MYKEY** with your API key.
+# * Create a plain text file **~/.openml/config** with the line
+#   **'apikey=MYKEY'**, replacing **MYKEY** with your API key. The config
+#   file must be in the directory ~/.openml/ and exist prior to
+#   importing the openml module.
+#
+# Alternatively, by running the code below and replacing 'YOURKEY' with your API key,
+# you authenticate for the duration of the Python process.
+
+# %%
+import openml
+
+openml.config.apikey = "YOURKEY"
+
+# %% [markdown]
+# ## Caching
+# When downloading datasets, tasks, runs and flows, they will be cached to
+# retrieve them without calling the server later. As with the API key,
+# the cache directory can be either specified through the config file or
+# through the API:
+#
+# * Add the line **cachedir = 'MYDIR'** to the config file, replacing
+#   'MYDIR' with the path to the cache directory. By default, OpenML
+#   will use **~/.openml/cache** as the cache directory.
+# * Run the code below, replacing 'YOURDIR' with the path to the cache directory.
+
+# %%
+import openml
+
+openml.config.set_root_cache_directory("YOURDIR")
diff --git a/examples/Basics/simple_datasets_tutorial.py b/examples/Basics/simple_datasets_tutorial.py
new file mode 100644
index 000000000..75d36ed0f
--- /dev/null
+++ b/examples/Basics/simple_datasets_tutorial.py
@@ -0,0 +1,57 @@
+# %% [markdown]
+# A basic tutorial on how to list, load and visualize datasets.
+#
+# In general, we recommend working with tasks, so that the results can
+# be easily reproduced. Furthermore, the results can be compared to existing results
+# at OpenML. However, for the purposes of this tutorial, we are going to work with
+# the datasets directly.
+
+# %%
+
+import openml
+
+# %% [markdown]
+# ## List datasets stored on OpenML
+
+# %%
+datasets_df = openml.datasets.list_datasets()
+print(datasets_df.head(n=10))
+
+# %% [markdown]
+# ## Download a dataset
+
+# %%
+# Iris dataset https://www.openml.org/d/61
+dataset = openml.datasets.get_dataset(dataset_id=61)
+
+# Print a summary
+print(
+    f"This is dataset '{dataset.name}', the target feature is '{dataset.default_target_attribute}'"
+)
+print(f"URL: {dataset.url}")
+print(dataset.description[:500])
+
+# %% [markdown]
+# ## Load a dataset
+# * `X` - A dataframe where each row represents one example with
+#   the corresponding feature values.
+# * `y` - the classes for each example
+# * `categorical_indicator` - a list that indicates which feature is categorical
+# * `attribute_names` - the names of the features for the examples (X) and
+# target feature (y)
+
+# %%
+X, y, categorical_indicator, attribute_names = dataset.get_data(
+    target=dataset.default_target_attribute
+)
+
+# %% [markdown]
+# ## Visualize the dataset
+
+# %%
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+
+iris_plot = sns.pairplot(pd.concat([X, y], axis=1), hue="class")
+plt.show()
diff --git a/examples/Basics/simple_flows_and_runs_tutorial.py b/examples/Basics/simple_flows_and_runs_tutorial.py
new file mode 100644
index 000000000..eb42c7d02
--- /dev/null
+++ b/examples/Basics/simple_flows_and_runs_tutorial.py
@@ -0,0 +1,122 @@
+# %% [markdown]
+# A simple tutorial on how to upload results from a machine learning experiment to OpenML.
+
+# %%
+import sklearn
+from sklearn.neighbors import KNeighborsClassifier
+
+import openml
+
+# %% [markdown]
+# <div class="admonition warning">
+#     <p class="admonition-title">Warning</p>
+#     <p>
+#         This example uploads data. For that reason, this example connects to the
+#         test server at <a href="https://test.openml.org"
+#         target="_blank">test.openml.org</a>.<br>
+#         This prevents the main server from becoming overloaded with example datasets, tasks,
+#         runs, and other submissions.<br>
+#         Using this test server may affect the behavior and performance of the
+#         OpenML-Python API.
+#     </p>
+# </div>
+
+# %%
+openml.config.start_using_configuration_for_example()
+
+# %% [markdown]
+# ## Train a machine learning model and evaluate it
+# NOTE: We are using task 119 from the test server: https://test.openml.org/d/20
+
+# %%
+task = openml.tasks.get_task(119)
+
+# Get the data
+dataset = task.get_dataset()
+X, y, categorical_indicator, attribute_names = dataset.get_data(
+    target=dataset.default_target_attribute
+)
+
+# Get the holdout split from the task
+train_indices, test_indices = task.get_train_test_split_indices(fold=0, repeat=0)
+X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
+y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
+
+knn_parameters = {
+    "n_neighbors": 3,
+}
+clf = KNeighborsClassifier(**knn_parameters)
+clf.fit(X_train, y_train)
+
+# Get experiment results
+y_pred = clf.predict(X_test)
+y_pred_proba = clf.predict_proba(X_test)
+
+# %% [markdown]
+# ## Upload the machine learning experiments to OpenML
+# First, create a flow and fill it with metadata about the machine learning model.
+
+# %%
+knn_flow = openml.flows.OpenMLFlow(
+    # Metadata
+    model=clf,  # or None, if you do not want to upload the model object.
+    name="CustomKNeighborsClassifier",
+    description="A custom KNeighborsClassifier flow for OpenML.",
+    external_version=f"{sklearn.__version__}",
+    language="English",
+    tags=["openml_tutorial_knn"],
+    dependencies=f"{sklearn.__version__}",
+    # Hyperparameters
+    parameters={k: str(v) for k, v in knn_parameters.items()},
+    parameters_meta_info={
+        "n_neighbors": {"description": "number of neighbors to use", "data_type": "int"}
+    },
+    # If you have a pipeline with subcomponents, such as preprocessing, add them here.
+    components={},
+)
+knn_flow.publish()
+print(f"knn_flow was published with the ID {knn_flow.flow_id}")
+
+# %% [markdown]
+# Second, we create a run to store the results associated with the flow.
+
+# %%
+
+# Format the predictions for OpenML
+predictions = []
+for test_index, y_true_i, y_pred_i, y_pred_proba_i in zip(
+    test_indices, y_test, y_pred, y_pred_proba, strict=False
+):
+    predictions.append(
+        openml.runs.functions.format_prediction(
+            task=task,
+            repeat=0,
+            fold=0,
+            index=test_index,
+            prediction=y_pred_i,
+            truth=y_true_i,
+            proba=dict(zip(task.class_labels, y_pred_proba_i, strict=False)),
+        )
+    )
+
+# Format the parameters for OpenML
+oml_knn_parameters = [
+    {"oml:name": k, "oml:value": v, "oml:component": knn_flow.flow_id}
+    for k, v in knn_parameters.items()
+]
+
+knn_run = openml.runs.OpenMLRun(
+    task_id=task.task_id,
+    flow_id=knn_flow.flow_id,
+    dataset_id=dataset.dataset_id,
+    parameter_settings=oml_knn_parameters,
+    data_content=predictions,
+    tags=["openml_tutorial_knn"],
+    description_text="Run generated by the tutorial.",
+)
+knn_run = knn_run.publish()
+print(f"Run was uploaded to {knn_run.openml_url}")
+print(f"The flow can be found at {knn_run.flow.openml_url}")
+
+# %%
+openml.config.stop_using_configuration_for_example()
diff --git a/examples/Basics/simple_suites_tutorial.py b/examples/Basics/simple_suites_tutorial.py
new file mode 100644
index 000000000..cc3c7b1cf
--- /dev/null
+++ b/examples/Basics/simple_suites_tutorial.py
@@ -0,0 +1,53 @@
+# %% [markdown]
+# This is a brief showcase of OpenML benchmark suites, which were introduced by
+# [Bischl et al. (2019)](https://arxiv.org/abs/1708.03731v2). Benchmark suites standardize the
+# datasets and splits to be used in an experiment or paper. They are fully integrated into OpenML
+# and simplify both the sharing of the setup and the results.
+
+# %%
+import openml
+
+# %% [markdown]
+# ## OpenML-CC18
+#
+# As an example we have a look at the OpenML-CC18, which is a suite of 72 classification datasets
+# from OpenML which were carefully selected to be usable by many algorithms. These are all datasets
+# from mid-2018 that satisfy a large set of clear requirements for thorough yet practical benchmarking:
+#
+# 1. the number of observations is between 500 and 100,000 to focus on medium-sized datasets,
+# 2. the number of features does not exceed 5,000 features to keep the runtime of the algorithms
+#    low
+# 3. the target attribute has at least two classes with no class having less than 20 observations
+# 4. the ratio of the minority class and the majority class is above 0.05 (to eliminate highly
+#    imbalanced datasets which require special treatment for both algorithms and evaluation
+#    measures).
+#
+# A full description can be found in the
+# [OpenML benchmarking docs](https://docs.openml.org/benchmark/#openml-cc18).
+#
+# In this example, we'll focus on how to use benchmark suites in practice.
+
+# %% [markdown]
+# ## Downloading benchmark suites
+
+# %%
+suite = openml.study.get_suite(99)
+print(suite)
+
+# %% [markdown]
+# The benchmark suite does not download the included tasks and datasets itself, but only contains
+# a list of which tasks constitute the study.
+#
+# Tasks can then be accessed via
+
+# %%
+tasks = suite.tasks
+print(tasks)
+
+# %% [markdown]
+# and iterated over for benchmarking. For speed reasons, we only iterate over the first three tasks:
+
+# %%
+for task_id in tasks[:3]:
+    task = openml.tasks.get_task(task_id)
+    print(task)
diff --git a/examples/Basics/simple_tasks_tutorial.py b/examples/Basics/simple_tasks_tutorial.py
new file mode 100644
index 000000000..598ce4e71
--- /dev/null
+++ b/examples/Basics/simple_tasks_tutorial.py
@@ -0,0 +1,27 @@
+# %% [markdown]
+# A brief example on how to use tasks from OpenML.
+
+# %%
+
+import openml
+
+# %% [markdown]
+# Get a [task](https://docs.openml.org/concepts/tasks/) for
+# [supervised classification on credit-g](https://www.openml.org/search?type=task&id=31&source_data.data_id=31):
+
+# %%
+task = openml.tasks.get_task(31)
+
+# %% [markdown]
+# Get the dataset and its data from the task.
+
+# %%
+dataset = task.get_dataset()
+X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name)
+
+# %% [markdown]
+# Get the first out of the 10 cross-validation splits from the task.
+
+# %%
+train_indices, test_indices = task.get_train_test_split_indices(fold=0)
+print(train_indices[:10])  # print the first 10 indices of the training set
diff --git a/examples/README.txt b/examples/README.txt
deleted file mode 100644
index e41bfd4fc..000000000
--- a/examples/README.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-Introductory Examples
-=====================
-
-General examples for OpenML usage.
diff --git a/examples/_external_or_deprecated/2015_neurips_feurer_example.py b/examples/_external_or_deprecated/2015_neurips_feurer_example.py
new file mode 100644
index 000000000..2dfc4bb97
--- /dev/null
+++ b/examples/_external_or_deprecated/2015_neurips_feurer_example.py
@@ -0,0 +1,91 @@
+"""
+Feurer et al. (2015)
+====================
+
+A tutorial on how to get the datasets used in the paper introducing *Auto-sklearn* by Feurer et al..
+
+Auto-sklearn website: https://automl.github.io/auto-sklearn/
+
+Publication
+~~~~~~~~~~~
+
+| Efficient and Robust Automated Machine Learning
+| Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter
+| In *Advances in Neural Information Processing Systems 28*, 2015
+| Available at https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf
+"""
+
+# License: BSD 3-Clause
+
+import openml
+
+####################################################################################################
+# List of dataset IDs given in the supplementary material of Feurer et al.:
+# https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning-supplemental.zip
+# fmt: off
+dataset_ids = [
+    3, 6, 12, 14, 16, 18, 21, 22, 23, 24, 26, 28, 30, 31, 32, 36, 38, 44, 46,
+    57, 60, 179, 180, 181, 182, 184, 185, 273, 293, 300, 351, 354, 357, 389,
+    390, 391, 392, 393, 395, 396, 398, 399, 401, 554, 679, 715, 718, 720, 722,
+    723, 727, 728, 734, 735, 737, 740, 741, 743, 751, 752, 761, 772, 797, 799,
+    803, 806, 807, 813, 816, 819, 821, 822, 823, 833, 837, 843, 845, 846, 847,
+    849, 866, 871, 881, 897, 901, 903, 904, 910, 912, 913, 914, 917, 923, 930,
+    934, 953, 958, 959, 962, 966, 971, 976, 977, 978, 979, 980, 991, 993, 995,
+    1000, 1002, 1018, 1019, 1020, 1021, 1036, 1040, 1041, 1049, 1050, 1053,
+    1056, 1067, 1068, 1069, 1111, 1112, 1114, 1116, 1119, 1120, 1128, 1130,
+    1134, 1138, 1139, 1142, 1146, 1161, 1166,
+]
+# fmt: on
+
+####################################################################################################
+# The dataset IDs could be used directly to load the dataset and split the data into a training set
+# and a test set. However, to be reproducible, we will first obtain the respective tasks from
+# OpenML, which define both the target feature and the train/test split.
+#
+# .. note::
+#    It is discouraged to work directly on datasets and only provide dataset IDs in a paper as
+#    this does not allow reproducibility (unclear splitting). Please do not use datasets but the
+#    respective tasks as basis for a paper and publish task IDs. This example is only given to
+#    showcase the use of OpenML-Python for a published paper and as a warning on how not to do it.
+#    Please check the `OpenML documentation of tasks <https://docs.openml.org/concepts/tasks/>`_ if you
+#    want to learn more about them.
+
+####################################################################################################
+# This lists both active and inactive tasks (because of ``status='all'``). Unfortunately,
+# this is necessary as some of the datasets contain issues found after the publication and became
+# deactivated, which also deactivated the tasks on them. More information on active or inactive
+# datasets can be found in the `online docs <https://docs.openml.org/concepts/data/#dataset-status>`_.
+tasks = openml.tasks.list_tasks(
+    task_type=openml.tasks.TaskType.SUPERVISED_CLASSIFICATION,
+    status="all",
+    output_format="dataframe",
+)
+
+# Query only those with holdout as the resampling strategy.
+tasks = tasks.query('estimation_procedure == "33% Holdout set"')
+
+task_ids = []
+for did in dataset_ids:
+    tasks_ = list(tasks.query(f"did == {did}").tid)
+    if len(tasks_) >= 1:  # if there are multiple tasks, take the one with lowest ID (oldest).
+        task_id = min(tasks_)
+    else:
+        raise ValueError(did)
+
+    # Optional - Check that the task has the same target attribute as the
+    # dataset default target attribute
+    # (disabled for this example as it needs to run fast to be rendered online)
+    # task = openml.tasks.get_task(task_id)
+    # dataset = task.get_dataset()
+    # if task.target_name != dataset.default_target_attribute:
+    #     raise ValueError(
+    #         (task.target_name, dataset.default_target_attribute)
+    #     )
+
+    task_ids.append(task_id)
+
+assert len(task_ids) == 140
+task_ids.sort()
+
+# These are the tasks to work with:
+print(task_ids)
diff --git a/examples/_external_or_deprecated/2018_ida_strang_example.py b/examples/_external_or_deprecated/2018_ida_strang_example.py
new file mode 100644
index 000000000..0e180badf
--- /dev/null
+++ b/examples/_external_or_deprecated/2018_ida_strang_example.py
@@ -0,0 +1,123 @@
+"""
+Strang et al. (2018)
+====================
+
+A tutorial on how to reproduce the analysis conducted for *Don't Rule Out Simple Models
+Prematurely: A Large Scale Benchmark Comparing Linear and Non-linear Classifiers in OpenML*.
+
+Publication
+~~~~~~~~~~~
+
+| Don't Rule Out Simple Models Prematurely: A Large Scale Benchmark Comparing Linear and Non-linear Classifiers in OpenML
+| Benjamin Strang, Peter van der Putten, Jan N. van Rijn and Frank Hutter
+| In *Advances in Intelligent Data Analysis XVII 17th International Symposium*, 2018
+| Available at https://link.springer.com/chapter/10.1007%2F978-3-030-01768-2_25
+"""
+
+# License: BSD 3-Clause
+
+import matplotlib.pyplot as plt
+
+import openml
+
+##############################################################################
+# A basic step for each data-mining or machine learning task is to determine
+# which model to choose based on the problem and the data at hand. In this
+# work we investigate when non-linear classifiers outperform linear
+# classifiers by means of a large scale experiment.
+#
+# The paper is accompanied with a study object, containing all relevant tasks
+# and runs (``study_id=123``). The paper features three experiment classes:
+# Support Vector Machines (SVM), Neural Networks (NN) and Decision Trees (DT).
+# This example demonstrates how to reproduce the plots, comparing two
+# classifiers given the OpenML flow ids. Note that this allows us to reproduce
+# the SVM and NN experiment, but not the DT experiment, as this requires a bit
+# more effort to distinguish the same flow with different hyperparameter
+# values.
+
+study_id = 123
+# for comparing svms: flow_ids = [7754, 7756]
+# for comparing nns: flow_ids = [7722, 7729]
+# for comparing dts: flow_ids = [7725], differentiate on hyper-parameter value
+classifier_family = "SVM"
+flow_ids = [7754, 7756]
+measure = "predictive_accuracy"
+meta_features = ["NumberOfInstances", "NumberOfFeatures"]
+class_values = ["non-linear better", "linear better", "equal"]
+
+# Downloads all evaluation records related to this study
+evaluations = openml.evaluations.list_evaluations(
+    measure, size=None, flows=flow_ids, study=study_id, output_format="dataframe"
+)
+# gives us a table with columns data_id, flow1_value, flow2_value
+evaluations = evaluations.pivot(index="data_id", columns="flow_id", values="value").dropna()
+# downloads all data qualities (for scatter plot)
+data_qualities = openml.datasets.list_datasets(
+    data_id=list(evaluations.index.values), output_format="dataframe"
+)
+# removes irrelevant data qualities
+data_qualities = data_qualities[meta_features]
+# makes a join between evaluation table and data qualities table,
+# now we have columns data_id, flow1_value, flow2_value, meta_feature_1,
+# meta_feature_2
+evaluations = evaluations.join(data_qualities, how="inner")
+
+# adds column that indicates the difference between the two classifiers
+evaluations["diff"] = evaluations[flow_ids[0]] - evaluations[flow_ids[1]]
+
+
+##############################################################################
+# makes the s-plot
+
+fig_splot, ax_splot = plt.subplots()
+ax_splot.plot(range(len(evaluations)), sorted(evaluations["diff"]))
+ax_splot.set_title(classifier_family)
+ax_splot.set_xlabel("Dataset (sorted)")
+ax_splot.set_ylabel("difference between linear and non-linear classifier")
+ax_splot.grid(linestyle="--", axis="y")
+plt.show()
+
+
+##############################################################################
+# adds column that indicates which of the two classifiers performed better,
+# needed for the scatter plot
+
+
+def determine_class(val_lin, val_nonlin):
+    if val_lin < val_nonlin:
+        return class_values[0]
+    if val_nonlin < val_lin:
+        return class_values[1]
+    return class_values[2]
+
+
+evaluations["class"] = evaluations.apply(
+    lambda row: determine_class(row[flow_ids[0]], row[flow_ids[1]]), axis=1
+)
+
+# does the plotting and formatting
+fig_scatter, ax_scatter = plt.subplots()
+for class_val in class_values:
+    df_class = evaluations[evaluations["class"] == class_val]
+    plt.scatter(df_class[meta_features[0]], df_class[meta_features[1]], label=class_val)
+ax_scatter.set_title(classifier_family)
+ax_scatter.set_xlabel(meta_features[0])
+ax_scatter.set_ylabel(meta_features[1])
+ax_scatter.legend()
+ax_scatter.set_xscale("log")
+ax_scatter.set_yscale("log")
+plt.show()
+
+##############################################################################
+# makes a scatter plot where each data point represents the performance of the
+# two algorithms on various axes (not in the paper)
+
+fig_diagplot, ax_diagplot = plt.subplots()
+ax_diagplot.grid(linestyle="--")
+ax_diagplot.plot([0, 1], ls="-", color="black")
+ax_diagplot.plot([0.2, 1.2], ls="--", color="black")
+ax_diagplot.plot([-0.2, 0.8], ls="--", color="black")
+ax_diagplot.scatter(evaluations[flow_ids[0]], evaluations[flow_ids[1]])
+ax_diagplot.set_xlabel(measure)
+ax_diagplot.set_ylabel(measure)
+plt.show()
diff --git a/examples/_external_or_deprecated/2018_kdd_rijn_example.py b/examples/_external_or_deprecated/2018_kdd_rijn_example.py
new file mode 100644
index 000000000..957281616
--- /dev/null
+++ b/examples/_external_or_deprecated/2018_kdd_rijn_example.py
@@ -0,0 +1,189 @@
+"""
+van Rijn and Hutter (2018)
+==========================
+
+A tutorial on how to reproduce the paper *Hyperparameter Importance Across Datasets*.
+
+Example Deprecation Warning!
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This example is not supported anymore by the OpenML-Python developers. The example is kept for reference purposes but not tested anymore.
+
+Publication
+~~~~~~~~~~~
+
+| Hyperparameter importance across datasets
+| Jan N. van Rijn and Frank Hutter
+| In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018
+| Available at https://dl.acm.org/doi/10.1145/3219819.3220058
+
+Requirements
+~~~~~~~~~~~~
+
+This is a Unix-only tutorial, as the requirements can not be satisfied on a Windows machine (Untested on other
+systems).
+
+The following Python packages are required:
+
+pip install "openml[examples,docs]" fanova "ConfigSpace<1.0"
+"""
+
+# License: BSD 3-Clause
+
+import sys
+
+if sys.platform == "win32":
+    print(
+        "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems"
+    )
+    sys.exit()
+
+# DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline
+print("This example is deprecated, remove the `if False` in this code to use it manually.")
+if False:
+    import json
+
+    import fanova
+    import matplotlib.pyplot as plt
+    import pandas as pd
+    import seaborn as sns
+
+    import openml
+
+    ##############################################################################
+    # With the advent of automated machine learning, automated hyperparameter
+    # optimization methods are by now routinely used in data mining. However, this
+    # progress is not yet matched by equal progress on automatic analyses that
+    # yield information beyond performance-optimizing hyperparameter settings.
+    # In this example, we aim to answer the following question: Given an
+    # algorithm, what are generally its most important hyperparameters?
+    #
+    # This work is carried out on the OpenML-100 benchmark suite, which can be
+    # obtained by ``openml.study.get_suite('OpenML100')``. In this example, we
+    # conduct the experiment on the Support Vector Machine (``flow_id=7707``)
+    # with specific kernel (we will perform a post-process filter operation for
+    # this). We should set some other experimental parameters (number of results
+    # per task, evaluation measure and the number of trees of the internal
+    # functional Anova) before the fun can begin.
+    #
+    # Note that we simplify the example in several ways:
+    #
+    # 1) We only consider numerical hyperparameters
+    # 2) We consider all hyperparameters that are numerical (in reality, some
+    #    hyperparameters might be inactive (e.g., ``degree``) or irrelevant
+    #    (e.g., ``random_state``))
+    # 3) We assume all hyperparameters to be on uniform scale
+    #
+    # Any difference in conclusion between the actual paper and the presented
+    # results is most likely due to one of these simplifications. For example,
+    # the hyperparameter C looks rather insignificant, whereas it is quite
+    # important when it is put on a log-scale. All these simplifications can be
+    # addressed by defining a ConfigSpace. For a more elaborate example that uses
+    # this, please see:
+    # https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py
+
+    suite = openml.study.get_suite("OpenML100")
+    flow_id = 7707
+    parameter_filters = {"sklearn.svm.classes.SVC(17)_kernel": "sigmoid"}
+    evaluation_measure = "predictive_accuracy"
+    limit_per_task = 500
+    limit_nr_tasks = 15
+    n_trees = 16
+
+    fanova_results = []
+    # we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the
+    # communication with OpenML, and for iterated experimenting it is better to cache the results in a local file.
+    for idx, task_id in enumerate(suite.tasks):
+        if limit_nr_tasks is not None and idx >= limit_nr_tasks:
+            continue
+        print(
+            f"Starting with task {task_id} ({idx + 1}/{len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks})"
+        )
+        # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop)
+        evals = openml.evaluations.list_evaluations_setups(
+            evaluation_measure,
+            flows=[flow_id],
+            tasks=[task_id],
+            size=limit_per_task,
+            output_format="dataframe",
+        )
+
+        performance_column = "value"
+        # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance
+        # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine
+        # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format
+        # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for
+        # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the
+        # setups that belong to the flows embedded in this example though.
+        try:
+            setups_evals = pd.DataFrame(
+                [
+                    dict(
+                        **{name: json.loads(value) for name, value in setup["parameters"].items()},
+                        **{performance_column: setup[performance_column]},
+                    )
+                    for _, setup in evals.iterrows()
+                ]
+            )
+        except json.decoder.JSONDecodeError as e:
+            print(f"Task {task_id} error: {e}")
+            continue
+        # apply our filters, to have only the setups that comply to the hyperparameters we want
+        for filter_key, filter_value in parameter_filters.items():
+            setups_evals = setups_evals[setups_evals[filter_key] == filter_value]
+        # in this simplified example, we only display integer and float hyperparameters. For categorical hyperparameters,
+        # the fanova library needs to be informed by using a configspace object.
+        setups_evals = setups_evals.select_dtypes(include=["int64", "float64"])
+        # drop columns with only one unique value. These are by definition not an interesting hyperparameter, e.g., ``axis``,
+        # ``verbose``.
+        setups_evals = setups_evals[
+            [
+                c
+                for c in list(setups_evals)
+                if len(setups_evals[c].unique()) > 1 or c == performance_column
+            ]
+        ]
+        # We are done with processing ``setups_evals``. Note that we still might have some irrelevant hyperparameters, e.g.,
+        # ``random_state``. We have dropped some relevant hyperparameters, i.e., several categoricals. Let's check it out:
+
+        # determine x values to pass to fanova library
+        parameter_names = [
+            pname for pname in setups_evals.columns.to_numpy() if pname != performance_column
+        ]
+        evaluator = fanova.fanova.fANOVA(
+            X=setups_evals[parameter_names].to_numpy(),
+            Y=setups_evals[performance_column].to_numpy(),
+            n_trees=n_trees,
+        )
+        for idx, pname in enumerate(parameter_names):  # noqa: PLW2901
+            try:
+                fanova_results.append(
+                    {
+                        "hyperparameter": pname.split(".")[-1],
+                        "fanova": evaluator.quantify_importance([idx])[(idx,)][
+                            "individual importance"
+                        ],
+                    }
+                )
+            except RuntimeError as e:
+                # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant
+                # for all configurations (there is no variance). We will skip these tasks (like the authors did in the
+                # paper).
+                print(f"Task {task_id} error: {e}")
+                continue
+
+    # transform ``fanova_results`` from a list of dicts into a DataFrame
+    fanova_results = pd.DataFrame(fanova_results)
+
+    ##############################################################################
+    # make the boxplot of the variance contribution. Obviously, we can also use
+    # this data to make the Nemenyi plot, but this relies on the rather complex
+    # ``Orange`` dependency (``pip install Orange3``). For the complete example,
+    # the reader is referred to the more elaborate script (referred to earlier)
+    fig, ax = plt.subplots()
+    sns.boxplot(x="hyperparameter", y="fanova", data=fanova_results, ax=ax)
+    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
+    ax.set_ylabel("Variance Contribution")
+    ax.set_xlabel(None)
+    plt.tight_layout()
+    plt.show()
diff --git a/examples/_external_or_deprecated/2018_neurips_perrone_example.py b/examples/_external_or_deprecated/2018_neurips_perrone_example.py
new file mode 100644
index 000000000..8a3c36994
--- /dev/null
+++ b/examples/_external_or_deprecated/2018_neurips_perrone_example.py
@@ -0,0 +1,260 @@
+"""
+Perrone et al. (2018)
+=====================
+
+A tutorial on how to build a surrogate model based on OpenML data as done for *Scalable
+Hyperparameter Transfer Learning* by Perrone et al.
+
+Publication
+~~~~~~~~~~~
+
+| Scalable Hyperparameter Transfer Learning
+| Valerio Perrone and Rodolphe Jenatton and Matthias Seeger and Cedric Archambeau
+| In *Advances in Neural Information Processing Systems 31*, 2018
+| Available at https://papers.nips.cc/paper/7917-scalable-hyperparameter-transfer-learning.pdf
+
+This example demonstrates how OpenML runs can be used to construct a surrogate model.
+
+In the following section, we shall do the following:
+
+* Retrieve tasks and flows as used in the experiments by Perrone et al. (2018).
+* Build a tabular data by fetching the evaluations uploaded to OpenML.
+* Impute missing values and handle categorical data before building a Random Forest model that
+  maps hyperparameter values to the area under curve score.
+"""
+
+############################################################################
+
+# License: BSD 3-Clause
+
+import numpy as np
+import pandas as pd
+from matplotlib import pyplot as plt
+from sklearn.compose import ColumnTransformer
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.impute import SimpleImputer
+from sklearn.metrics import mean_squared_error
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder
+
+import openml
+
+flow_type = "svm"  # this example will use the smaller svm flow evaluations
+############################################################################
+# The subsequent functions are defined to fetch tasks, flows, evaluations and preprocess them into
+# a tabular format that can be used to build models.
+
+
+def fetch_evaluations(run_full=False, flow_type="svm", metric="area_under_roc_curve"):  # noqa: FBT002
+    """
+    Retrieve the evaluations of the flow and tasks used in the paper's experiments.
+
+    Parameters
+    ----------
+    run_full : boolean
+        If True, use the full list of tasks used in the paper
+        If False, use a reduced list of 5 tasks
+    flow_type : str, {'svm', 'xgboost'}
+        'svm' selects the svm experiments, any other value the xgboost ones
+    metric : str
+        The evaluation measure that is passed to openml.evaluations.list_evaluations_setups
+
+    Returns
+    -------
+    eval_df : dataframe
+    task_ids : list
+    flow_id : int
+    """
+    use_svm = flow_type == "svm"
+    # Task IDs as used by the experiments from the paper, keyed by (use_svm, run_full)
+    # fmt: off
+    task_ids_by_setting = {
+        (True, True): [
+            10101, 145878, 146064, 14951, 34537, 3485, 3492, 3493, 3494,
+            37, 3889, 3891, 3899, 3902, 3903, 3913, 3918, 3950, 9889,
+            9914, 9946, 9952, 9967, 9971, 9976, 9978, 9980, 9983,
+        ],
+        (True, False): [9983, 3485, 3902, 3903, 145878],
+        (False, True): [
+            10093, 10101, 125923, 145847, 145857, 145862, 145872, 145878,
+            145953, 145972, 145976, 145979, 146064, 14951, 31, 3485,
+            3492, 3493, 37, 3896, 3903, 3913, 3917, 3918, 3, 49, 9914,
+            9946, 9952, 9967,
+        ],
+        (False, False): [3903, 37, 3485, 49, 3913],
+    }
+    # fmt: on
+    task_ids = task_ids_by_setting[use_svm, bool(run_full)]
+
+    # The flow whose runs are fetched depends only on the learner
+    flow_id = 5891 if use_svm else 6767
+
+    # Fetching evaluations together with their setups (parameters in separate columns)
+    eval_df = openml.evaluations.list_evaluations_setups(
+        function=metric,
+        tasks=task_ids,
+        flows=[flow_id],
+        uploaders=[2702],
+        output_format="dataframe",
+        parameters_in_separate_columns=True,
+    )
+    return eval_df, task_ids, flow_id
+
+
+def create_table_from_evaluations(
+    eval_df,
+    flow_type="svm",
+    run_count=np.iinfo(np.int64).max,  # noqa: B008
+    task_ids=None,
+):
+    """
+    Create a tabular data with its ground truth from a dataframe of evaluations.
+    Optionally, can filter out records based on task ids.
+
+    Parameters
+    ----------
+    eval_df : dataframe
+        Containing list of runs as obtained from list_evaluations_setups()
+    flow_type : str, {'svm', 'xgboost'}
+        To select whether svm or xgboost experiments are to be run
+    run_count : int
+        Maximum size of the table created, or number of runs included in the table
+    task_ids : list, (optional)
+        List of integers specifying the tasks to be retained from the evaluations dataframe
+
+    Returns
+    -------
+    eval_table : dataframe
+    value : series
+    """
+    if task_ids is not None:
+        eval_df = eval_df[eval_df["task_id"].isin(task_ids)]
+    if flow_type == "svm":
+        colnames = ["cost", "degree", "gamma", "kernel"]
+    else:
+        # fmt: off
+        colnames = [
+            "alpha", "booster", "colsample_bylevel", "colsample_bytree", "eta",
+            "lambda", "max_depth", "min_child_weight", "nrounds", "subsample",
+        ]
+        # fmt: on
+    eval_df = eval_df.sample(frac=1)  # shuffling rows
+    eval_df = eval_df.iloc[:run_count, :]
+    # Parameter columns are assumed to be named "<flow name>_<parameter>"; match the whole
+    # parameter name, since names such as "max_depth" contain underscores themselves.
+    renames = {
+        column: name
+        for column in eval_df.columns
+        for name in colnames
+        if column == name or column.endswith(f"_{name}")
+    }
+    eval_table = eval_df.rename(columns=renames).loc[:, colnames]
+    return eval_table, eval_df.loc[:, "value"]
+
+
+def list_categorical_attributes(flow_type="svm"):
+    """Return the categorical hyperparameter columns of the svm (else xgboost) flow."""
+    categorical = "kernel" if flow_type == "svm" else "booster"
+    return [categorical]
+
+
+#############################################################################
+# Fetching the data from OpenML
+# *****************************
+# Now, we read all the tasks and evaluations for them and collate into a table.
+# Here, we are reading all the tasks and evaluations for the SVM flow and
+# pre-processing all retrieved evaluations.
+
+eval_df, task_ids, flow_id = fetch_evaluations(run_full=False, flow_type=flow_type)
+X, y = create_table_from_evaluations(eval_df, flow_type=flow_type)
+print(X.head())
+print("Y : ", y[:5])
+
+#############################################################################
+# Creating pre-processing and modelling pipelines
+# ***********************************************
+# The two primary tasks are to impute the missing values, that is, account for the hyperparameters
+# that are not available with the runs from OpenML. And secondly, to handle categorical variables
+# using One-hot encoding prior to modelling.
+
+# Separating data into categorical and non-categorical (numeric for this example) columns
+cat_cols = list_categorical_attributes(flow_type=flow_type)
+num_cols = [column for column in X.columns if column not in cat_cols]
+
+# Missing value imputers for numeric columns
+num_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=-1)
+
+# Creating the one-hot encoder for numerical representation of categorical columns
+enc = OneHotEncoder(handle_unknown="ignore")
+
+# Combining column transformers
+ct = ColumnTransformer([("cat", enc, cat_cols), ("num", num_imputer, num_cols)])
+
+# Creating the full pipeline with the surrogate model
+clf = RandomForestRegressor(n_estimators=50)
+model = Pipeline(steps=[("preprocess", ct), ("surrogate", clf)])
+
+
+#############################################################################
+# Building a surrogate model on a task's evaluation
+# *************************************************
+# The same set of functions can be used for a single task to retrieve a singular table which can
+# be used for the surrogate model construction. We shall use the SVM flow here to keep execution
+# time simple and quick.
+
+# Selecting a task for the surrogate
+task_id = task_ids[-1]
+print("Task ID : ", task_id)
+X, y = create_table_from_evaluations(eval_df, task_ids=[task_id], flow_type="svm")
+
+model.fit(X, y)
+y_pred = model.predict(X)
+
+print(f"Training RMSE : {np.sqrt(mean_squared_error(y, y_pred)):.5}")
+
+
+#############################################################################
+# Evaluating the surrogate model
+# ******************************
+# The surrogate model built from a task's evaluations fetched from OpenML will be put into
+# trivial action here, where we shall randomly sample configurations and observe the trajectory
+# of the area under curve (auc) we can obtain from the surrogate we've built.
+#
+# NOTE: This section is written exclusively for the SVM flow
+
+
+# Sampling random configurations
+def random_sample_configurations(num_samples=100):
+    """Sample ``num_samples`` random SVM configurations into a DataFrame, one row each."""
+    # Tuples are (low, high) bounds sampled uniformly; lists are categorical choices.
+    ranges = {
+        "cost": (0.000986, 998.492437),
+        "degree": (2.0, 5.0),
+        "gamma": (0.000988, 913.373845),
+        "kernel": ["linear", "polynomial", "radial", "sigmoid"],
+    }
+    columns = {}
+    for name, spec in ranges.items():
+        if isinstance(spec, tuple):
+            columns[name] = np.random.uniform(low=spec[0], high=spec[1], size=num_samples)  # noqa: NPY002
+        else:
+            columns[name] = np.random.choice(spec, size=num_samples)  # noqa: NPY002
+    return pd.DataFrame(columns, index=range(num_samples))
+
+
+configs = random_sample_configurations(num_samples=1000)
+print(configs)
+
+#############################################################################
+preds = model.predict(configs)
+
+# tracking the maximum AUC obtained over the functions evaluations
+preds = np.maximum.accumulate(preds)
+# computing regret (1 - predicted_auc)
+regret = 1 - preds
+
+# plotting the regret curve
+plt.plot(regret)
+plt.title("AUC regret for Random Search on surrogate")
+plt.xlabel("Number of function evaluations")
+plt.ylabel("Regret")
diff --git a/examples/_external_or_deprecated/README.md b/examples/_external_or_deprecated/README.md
new file mode 100644
index 000000000..d25a81baa
--- /dev/null
+++ b/examples/_external_or_deprecated/README.md
@@ -0,0 +1,5 @@
+# External or Deprecated Examples
+
+This directory contains examples that are either external or deprecated. They may not be maintained or updated 
+regularly, and their functionality might not align with the latest version of the library. Moreover,
+they are not shown on the documentation website.
\ No newline at end of file
diff --git a/examples/_external_or_deprecated/benchmark_with_optunahub.py b/examples/_external_or_deprecated/benchmark_with_optunahub.py
new file mode 100644
index 000000000..38114bc44
--- /dev/null
+++ b/examples/_external_or_deprecated/benchmark_with_optunahub.py
@@ -0,0 +1,127 @@
+"""
+====================================================
+Hyperparameter Optimization Benchmark with OptunaHub
+====================================================
+
+In this tutorial, we walk through how to conduct hyperparameter optimization experiments using OpenML and OptunaHub.
+"""
+############################################################################
+# Please make sure to install the dependencies with:
+# ``pip install "openml>=0.15.1" plotly``
+# Then we import all the necessary modules.
+
+# License: BSD 3-Clause
+
+import logging
+
+import optuna
+from sklearn.compose import ColumnTransformer
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder
+
+import openml
+
+logger = logging.Logger(name="Experiment Logger", level=1)
+
+# <div class="admonition warning">
+#     <p class="admonition-title">Warning</p>
+#     <p>
+#         For the rest of this tutorial, we will require the `openml-sklearn` package.
+#         Install it with `pip install openml-sklearn`.
+#     </p>
+# </div>
+
+# %%
+# Get sklearn extension to run sklearn models easily on OpenML tasks.
+from openml_sklearn import SklearnExtension, cat, cont
+
+extension = SklearnExtension()
+
+# Set your openml api key if you want to upload your results to OpenML (eg:
+# https://openml.org/search?type=run&sort=date) . To get one, simply make an
+# account (you don't need one for anything else, just to upload your results),
+# go to your profile and select the API-KEY.
+# Or log in, and navigate to https://www.openml.org/auth/api-key
+openml.config.apikey = ""
+############################################################################
+# Prepare for preprocessors and an OpenML task
+# ============================================
+
+# OpenML contains several key concepts which it needs to make machine learning research shareable.
+# A machine learning experiment consists of one or several runs, which describe the performance of
+# an algorithm (called a flow in OpenML), its hyperparameter settings (called a setup) on a task.
+# A Task is the combination of a dataset, a split and an evaluation metric. We choose a dataset from
+# OpenML (https://www.openml.org/d/1464) and a subsequent task (https://www.openml.org/t/10101). To
+# make your own dataset and task, please refer to
+# https://openml.github.io/openml-python/main/examples/30_extended/create_upload_tutorial.html
+
+# https://www.openml.org/search?type=study&study_type=task&id=218
+task_id = 10101
+seed = 42
+categorical_preproc = (
+    "categorical",
+    OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
+    cat,
+)
+numerical_preproc = ("numerical", SimpleImputer(strategy="median"), cont)
+preproc = ColumnTransformer([categorical_preproc, numerical_preproc])
+
+############################################################################
+# Define a pipeline for the hyperparameter optimization (this is standard for Optuna)
+# =====================================================
+
+# Optuna explanation
+# we follow the `Optuna <https://github.com/optuna/optuna/>`__ search space design.
+
+# OpenML runs
+# We can simply pass the parametrized classifier to `run_model_on_task` to obtain the performance
+# of the pipeline
+# on the specified OpenML task.
+# If you want to share your results along with an easily reproducible pipeline, you can set an API
+# key and just upload your results.
+# You can find more examples on https://www.openml.org/
+
+
+def objective(trial: optuna.Trial) -> float:
+    """Run one trial's random forest on the OpenML task and return its best fold accuracy."""
+    clf = RandomForestClassifier(
+        max_depth=trial.suggest_int("max_depth", 2, 32, log=True),
+        min_samples_leaf=trial.suggest_float("min_samples_leaf", 0.0, 1.0),
+        random_state=seed,
+    )
+    pipe = Pipeline(steps=[("preproc", preproc), ("model", clf)])
+    logger.log(1, f"Running pipeline - {pipe}")
+    run = openml.runs.run_model_on_task(pipe, task=task_id, avoid_duplicate_runs=False)
+
+    logger.log(1, f"Model has been trained - {run}")
+    if openml.config.apikey != "":
+        try:
+            run.publish()
+
+            logger.log(1, f"Run was uploaded to - {run.openml_url}")
+        except Exception as e:  # noqa: BLE001
+            # Publishing is best effort: a failed upload must not abort the optimization.
+            logger.log(1, f"Could not publish run - {e}")
+    else:
+        logger.log(0, "If you want to publish your results to OpenML, please set an apikey")
+    # Score the trial by the highest accuracy over the folds of repeat 0.
+    accuracy = max(run.fold_evaluations["predictive_accuracy"][0].values())
+    logger.log(0, f"Accuracy {accuracy}")
+
+    return accuracy
+
+
+############################################################################
+# Optimize the pipeline
+# =====================
+study = optuna.create_study(direction="maximize")
+logger.log(0, f"Study {study}")
+study.optimize(objective, n_trials=15)
+
+############################################################################
+# Visualize the optimization history
+# ==================================
+fig = optuna.visualization.plot_optimization_history(study)
+fig.show()
diff --git a/examples/_external_or_deprecated/fetch_runtimes_tutorial.py b/examples/_external_or_deprecated/fetch_runtimes_tutorial.py
new file mode 100644
index 000000000..c8f85adc5
--- /dev/null
+++ b/examples/_external_or_deprecated/fetch_runtimes_tutorial.py
@@ -0,0 +1,480 @@
+# %% [markdown]
+# Measuring runtimes for Scikit-learn models
+#
+# The runtime of machine learning models on specific datasets can be a deciding
+# factor on the choice of algorithms, especially for benchmarking and comparison
+# purposes. OpenML's scikit-learn extension provides runtime data from runs of
+# model fit and prediction on tasks or datasets, for both the CPU-clock as well
+# as the actual wallclock-time incurred. The objective of this example is to
+# illustrate how to retrieve such timing measures, and also offer some potential
+# means of usage and interpretation of the same.
+#
+# It should be noted that there are multiple levels at which parallelism can occur.
+#
+# * At the outermost level, OpenML tasks contain fixed data splits, on which the
+#   defined model/flow is executed. Thus, a model can be fit on each OpenML dataset fold
+#   in parallel using the `n_jobs` parameter to `run_model_on_task` or `run_flow_on_task`
+#   (illustrated under Case 2 & 3 below).
+#
+# * The model/flow specified can also include scikit-learn models that perform their own
+#   parallelization. For instance, by specifying `n_jobs` in a Random Forest model definition
+#   (covered under Case 2 below).
+#
+# * The sklearn model can further be an HPO estimator and contain its own parallelization.
+#   If the base estimator used also supports `parallelization`, then there's at least a 2-level nested
+#   definition for parallelization possible (covered under Case 3 below).
+#
+# We shall cover the following 5 representative scenarios:
+#
+# * (Case 1) Retrieving runtimes for Random Forest training and prediction on each of the
+#   cross-validation folds
+#
+# * (Case 2) Testing the above setting in a parallel setup and monitor the difference using
+#   runtimes retrieved
+#
+# * (Case 3) Comparing RandomizedSearchCV and GridSearchCV on the above task based on runtimes
+#
+# * (Case 4) Running models that don't run in parallel or models which scikit-learn doesn't
+#   parallelize
+#
+# * (Case 5) Running models that do not release the Python Global Interpreter Lock (GIL)
+
+import numpy as np
+from joblib.parallel import parallel_backend
+from matplotlib import pyplot as plt
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
+from sklearn.naive_bayes import GaussianNB
+from sklearn.neural_network import MLPClassifier
+from sklearn.tree import DecisionTreeClassifier
+
+import openml
+
+# %% [markdown]
+# # Preparing tasks and scikit-learn models
+
+# %%
+task_id = 167119
+
+task = openml.tasks.get_task(task_id)
+print(task)
+
+# Viewing associated data
+n_repeats, n_folds, n_samples = task.get_split_dimensions()
+print(
+    f"Task {task_id}: number of repeats: {n_repeats}, number of folds: {n_folds}, number of samples {n_samples}."
+)
+
+
+# Creating utility function
+def print_compare_runtimes(measures):
+    """Print CPU vs. wall-clock training time (in ms) for every repeat and fold."""
+    cpu_times = measures["usercpu_time_millis_training"]
+    for repeat, cpu_per_fold in cpu_times.items():
+        for fold, cpu_time in cpu_per_fold.items():
+            # Looked up lazily so the wall-clock entry is only needed when a fold exists.
+            wall_time = measures["wall_clock_time_millis_training"][repeat][fold]
+            print(f"Repeat #{repeat}-Fold #{fold}: CPU-{cpu_time:.3f} vs Wall-{wall_time:.3f}")
+
+
+# %% [markdown]
+# # Case 1: Running a Random Forest model on an OpenML task
+# We'll run a Random Forest model and obtain an OpenML run object. We can
+# see the evaluations recorded per fold for the dataset and the information
+# available for this run.
+
+# %%
+clf = RandomForestClassifier(n_estimators=10)
+
+run1 = openml.runs.run_model_on_task(
+    model=clf,
+    task=task,
+    upload_flow=False,
+    avoid_duplicate_runs=False,
+)
+measures = run1.fold_evaluations
+
+print("The timing and performance metrics available: ")
+for key in measures:
+    print(key)
+print()
+
+print(
+    "The performance metric is recorded under `predictive_accuracy` per "
+    "fold and can be retrieved as: "
+)
+for repeat, val1 in measures["predictive_accuracy"].items():
+    for fold, val2 in val1.items():
+        print(f"Repeat #{repeat}-Fold #{fold}: {val2:.4f}")
+    print()
+
+# %% [markdown]
+# The remaining entries recorded in `measures` are the runtime records
+# related as:
+#
+# usercpu_time_millis = usercpu_time_millis_training + usercpu_time_millis_testing
+#
+# wall_clock_time_millis = wall_clock_time_millis_training + wall_clock_time_millis_testing
+#
+# The timing measures recorded as `*_millis_training` contain the per
+# repeat-per fold timing incurred for the execution of the `.fit()` procedure
+# of the model. For `usercpu_time_*` the time recorded using `time.process_time()`
+# is converted to `milliseconds` and stored. Similarly, `time.time()` is used
+# to record the time entry for `wall_clock_time_*`. The `*_millis_testing` entry
+# follows the same procedure but for time taken for the `.predict()` procedure.
+
+# Comparing the CPU and wall-clock training times of the Random Forest model
+
+# %%
+print_compare_runtimes(measures)
+
+# %% [markdown]
+# ## Case 2: Running Scikit-learn model on an OpenML task in parallel
+# Redefining the model to allow parallelism with `n_jobs=2` (2 cores)
+
+# %%
+clf = RandomForestClassifier(n_estimators=10, n_jobs=2)
+
+run2 = openml.runs.run_model_on_task(
+    model=clf, task=task, upload_flow=False, avoid_duplicate_runs=False
+)
+measures = run2.fold_evaluations
+# The wall-clock time recorded per fold should be lesser than Case 1 above
+print_compare_runtimes(measures)
+
+# %% [markdown]
+# Running a Random Forest model on an OpenML task in parallel (all cores available):
+
+# %%
+# Redefining the model to use all available cores with `n_jobs=-1`
+clf = RandomForestClassifier(n_estimators=10, n_jobs=-1)
+
+run3 = openml.runs.run_model_on_task(
+    model=clf, task=task, upload_flow=False, avoid_duplicate_runs=False
+)
+measures = run3.fold_evaluations
+
+# %% [markdown]
+# The wall-clock time recorded per fold should be lower than in the case above, if more
+# than 2 CPU cores are available. The speed-up is more pronounced for larger datasets.
+# %%
+print_compare_runtimes(measures)
+
+# %% [markdown]
+# We can now observe that the ratio of CPU time to wallclock time is lower
+# than in case 1. This happens because joblib by default spawns subprocesses
+# for the workloads for which CPU time cannot be tracked. Therefore, interpreting
+# the reported CPU and wallclock time requires knowledge of the parallelization
+# applied at runtime.
+
+# %% [markdown]
+# Running the same task with a different parallel backend. Joblib provides multiple
+# backends: {`loky` (default), `multiprocessing`, `dask`, `threading`, `sequential`}.
+# The backend can be explicitly set using a joblib context manager. The behaviour of
+# the job distribution can change and therefore the scale of runtimes recorded too.
+
+# %%
+with parallel_backend(backend="multiprocessing", n_jobs=-1):
+    run3_ = openml.runs.run_model_on_task(
+        model=clf, task=task, upload_flow=False, avoid_duplicate_runs=False
+    )
+measures = run3_.fold_evaluations
+print_compare_runtimes(measures)
+
+# %% [markdown]
+# The CPU time interpretation becomes ambiguous when jobs are distributed over an
+# unknown number of cores or when subprocesses are spawned for which the CPU time
+# cannot be tracked, as in the examples above. It is impossible for OpenML-Python
+# to capture the availability of the number of cores/threads, their eventual
+# utilisation and whether workloads are executed in subprocesses, for various
+# cases that can arise as demonstrated in the rest of the example. Therefore,
+# the final interpretation of the runtimes is left to the `user`.
+
+# %% [markdown]
+# ## Case 3: Running and benchmarking HPO algorithms with their runtimes
+# We shall now optimize a similar RandomForest model for the same task using
+# scikit-learn's HPO support by using GridSearchCV to optimize our earlier
+# RandomForest model's hyperparameter `n_estimators`. Scikit-learn also provides a
+# `refit_time_` for such HPO models, i.e., the time incurred by training
+# and evaluating the model on the best found parameter setting. This is
+# included in the `wall_clock_time_millis_training` measure recorded.
+
+# %%
+
+clf = RandomForestClassifier(n_estimators=10, n_jobs=2)
+
+# GridSearchCV model
+n_iter = 5
+grid_pipe = GridSearchCV(
+    estimator=clf,
+    param_grid={"n_estimators": np.linspace(start=1, stop=50, num=n_iter).astype(int).tolist()},
+    cv=2,
+    n_jobs=2,
+)
+
+run4 = openml.runs.run_model_on_task(
+    model=grid_pipe, task=task, upload_flow=False, avoid_duplicate_runs=False, n_jobs=2
+)
+measures = run4.fold_evaluations
+print_compare_runtimes(measures)
+
+# %% [markdown]
+# Like any optimisation problem, scikit-learn's HPO estimators also generate
+# a sequence of configurations which are evaluated, using which the best found
+# configuration is tracked throughout the trace.
+# The OpenML run object stores these traces as OpenMLRunTrace objects accessible
+# using keys of the pattern (repeat, fold, iterations). Here `fold` implies the
+# outer-cross validation fold as obtained from the task data splits in OpenML.
+# GridSearchCV here performs grid search over the inner-cross validation folds as
+# parameterized by the `cv` parameter. Since `GridSearchCV` in this example performs a
+# `2-fold` cross validation, the runtime recorded per repeat-per fold in the run object
+# is for the entire `fit()` procedure of GridSearchCV thus subsuming the runtimes of
+# the 2-fold (inner) CV search performed.
+
+# %%
+# We earlier extracted the number of repeats and folds for this task:
+print(f"# repeats: {n_repeats}\n# folds: {n_folds}")
+
+# To extract the training runtime of the first repeat, first fold:
+print(run4.fold_evaluations["wall_clock_time_millis_training"][0][0])
+
+# %% [markdown]
+# To extract the training runtime of the 1-st repeat, 4-th (outer) fold and also
+# to fetch the parameters and performance of the evaluations made during
+# the 1-st repeat, 4-th fold evaluation by the Grid Search model.
+
+# %%
+_repeat = 0
+_fold = 3
+print(
+    "Total runtime for repeat {}'s fold {}: {:.4f} ms".format(
+        _repeat, _fold, run4.fold_evaluations["wall_clock_time_millis_training"][_repeat][_fold]
+    )
+)
+for i in range(n_iter):
+    key = (_repeat, _fold, i)
+    r = run4.trace.trace_iterations[key]
+    print(
+        "n_estimators: {:>2} - score: {:.3f}".format(
+            r.parameters["parameter_n_estimators"], r.evaluation
+        )
+    )
+
+# %% [markdown]
+# Scikit-learn's HPO estimators also come with an argument `refit=True` as a default.
+# In our previous model definition it was set to True by default, which meant that the best
+# found hyperparameter configuration was used to refit or retrain the model without any inner
+# cross validation. This extra refit time measure is provided by the scikit-learn model as the
+# attribute `refit_time_`.
+# This time is included in the `wall_clock_time_millis_training` measure.
+#
+# For non-HPO estimators, `wall_clock_time_millis = wall_clock_time_millis_training + wall_clock_time_millis_testing`.
+#
+# For HPO estimators, `wall_clock_time_millis = wall_clock_time_millis_training + wall_clock_time_millis_testing + refit_time`.
+#
+# This refit time can therefore be explicitly extracted in this manner:
+
+
+# %%
+
+
+def extract_refit_time(run, repeat, fold):
+    """Return the wall-clock time of a fold left after removing training and testing time."""
+    timings = run.fold_evaluations
+    total = timings["wall_clock_time_millis"][repeat][fold]
+    training = timings["wall_clock_time_millis_training"][repeat][fold]
+    return total - training - timings["wall_clock_time_millis_testing"][repeat][fold]
+
+
+for repeat in range(n_repeats):
+    for fold in range(n_folds):
+        print(f"Repeat #{repeat}-Fold #{fold}: {extract_refit_time(run4, repeat, fold):.4f}")
+
+# %% [markdown]
+# Along with the GridSearchCV already used above, we demonstrate how such
+# optimisation traces can be retrieved by showing an application of these
+# traces - comparing the speed of finding the best configuration using
+# RandomizedSearchCV and GridSearchCV available with scikit-learn.
+
+# %%
+# RandomizedSearchCV model
+rs_pipe = RandomizedSearchCV(
+    estimator=clf,
+    param_distributions={
+        "n_estimators": np.linspace(start=1, stop=50, num=15).astype(int).tolist()
+    },
+    cv=2,
+    n_iter=n_iter,
+    n_jobs=2,
+)
+run5 = openml.runs.run_model_on_task(
+    model=rs_pipe, task=task, upload_flow=False, avoid_duplicate_runs=False, n_jobs=2
+)
+
+# %% [markdown]
+# Since for the call to ``openml.runs.run_model_on_task`` the parameter
+# ``n_jobs`` is set to its default ``None``, the evaluations across the OpenML folds
+# are not parallelized. Hence, the time recorded is agnostic to the ``n_jobs``
+# being set at both the HPO estimator ``GridSearchCV`` as well as the base
+# estimator ``RandomForestClassifier`` in this case. The OpenML extension only records the
+# time taken for the completion of the complete ``fit()`` call, per-repeat per-fold.
+#
+# This notion can be used to extract and plot the best found performance per
+# fold by the HPO model and the corresponding time taken for search across
+# that fold. Moreover, since ``n_jobs=None`` for ``openml.runs.run_model_on_task``
+# the runtimes per fold can be cumulatively added to plot the trace against time.
+
+
+# %%
+def extract_trace_data(run, n_repeats, n_folds, n_iter, key=None):
+    """Gather each fold's runtime (under ``key``) and the score of its selected iteration."""
+    key = "wall_clock_time_millis_training" if key is None else key
+    runtimes, scores = [], []
+    for rep in range(n_repeats):
+        for fold in range(n_folds):
+            runtimes.append(run.fold_evaluations[key][rep][fold])
+            iterations = (run.trace.trace_iterations[(rep, fold, it)] for it in range(n_iter))
+            chosen = next((entry for entry in iterations if entry.selected), None)
+            if chosen is not None:
+                scores.append(chosen.evaluation)
+    return {"score": scores, "runtime": runtimes}
+
+
+def get_incumbent_trace(trace):
+    """Return the running minimum of ``1 - score`` over ``trace`` (best error so far)."""
+    incumbents = []
+    for score in trace:
+        error = 1 - score
+        better = not incumbents or error < incumbents[-1]
+        incumbents.append(error if better else incumbents[-1])
+    return incumbents
+
+
+grid_data = extract_trace_data(run4, n_repeats, n_folds, n_iter)
+rs_data = extract_trace_data(run5, n_repeats, n_folds, n_iter)
+
+plt.clf()
+plt.plot(
+    np.cumsum(grid_data["runtime"]), get_incumbent_trace(grid_data["score"]), label="Grid Search"
+)
+plt.plot(
+    np.cumsum(rs_data["runtime"]), get_incumbent_trace(rs_data["score"]), label="Random Search"
+)
+plt.xscale("log")
+plt.yscale("log")
+plt.xlabel("Wallclock time (in milliseconds)")
+plt.ylabel("1 - Accuracy")
+plt.title("Optimisation Trace Comparison")
+plt.legend()
+plt.show()
+
+# %% [markdown]
+# ## Case 4: Running models that scikit-learn doesn't parallelize
+# Both scikit-learn and OpenML depend on parallelism implemented through `joblib`.
+# However, there can be cases where either models cannot be parallelized or don't
+# depend on joblib for its parallelism. 2 such cases are illustrated below.
+#
+# Running a Decision Tree model that doesn't support parallelism implicitly, but
+# using OpenML to parallelize evaluations for the outer-cross validation folds.
+
+# %%
+dt = DecisionTreeClassifier()
+
+run6 = openml.runs.run_model_on_task(
+    model=dt, task=task, upload_flow=False, avoid_duplicate_runs=False, n_jobs=2
+)
+measures = run6.fold_evaluations
+print_compare_runtimes(measures)
+
+# %% [markdown]
+# Although the decision tree does not run in parallel, it can release the
+# [Python GIL](https://docs.python.org/dev/glossary.html#term-global-interpreter-lock).
+# This can result in surprising runtime measures as demonstrated below:
+
+# %%
+with parallel_backend("threading", n_jobs=-1):
+    run7 = openml.runs.run_model_on_task(
+        model=dt, task=task, upload_flow=False, avoid_duplicate_runs=False
+    )
+measures = run7.fold_evaluations
+print_compare_runtimes(measures)
+
+# %% [markdown]
+# Running a Neural Network from scikit-learn that uses scikit-learn independent
+# parallelism using libraries such as
+# [MKL, OpenBLAS or BLIS](https://scikit-learn.org/stable/computing/parallelism.html#parallel-numpy-and-scipy-routines-from-numerical-libraries).
+
+# %%
+mlp = MLPClassifier(max_iter=10)
+
+run8 = openml.runs.run_model_on_task(
+    model=mlp, task=task, upload_flow=False, avoid_duplicate_runs=False
+)
+measures = run8.fold_evaluations
+print_compare_runtimes(measures)
+
+# %% [markdown]
+# ## Case 5: Running Scikit-learn models that don't release GIL
+# Certain Scikit-learn models do not release the
+# [Python GIL](https://docs.python.org/dev/glossary.html#term-global-interpreter-lock) and
+# are also not executed in parallel via a BLAS library. In such cases, the
+# CPU times and wallclock times are most likely trustworthy. Note however
+# that only very few models such as naive Bayes models are of this kind.
+
+# %%
+clf = GaussianNB()
+
+with parallel_backend("multiprocessing", n_jobs=-1):
+    run9 = openml.runs.run_model_on_task(
+        model=clf, task=task, upload_flow=False, avoid_duplicate_runs=False
+    )
+measures = run9.fold_evaluations
+print_compare_runtimes(measures)
+
+# %% [markdown]
+# ## Summary
+# The scikit-learn extension for OpenML-Python records model runtimes for the
+# CPU-clock and the wall-clock times. The above examples illustrated how these
+# recorded runtimes can be extracted when using a scikit-learn model and under
+# parallel setups too. To summarize, the scikit-learn extension measures the:
+#
+# * `CPU-time` & `wallclock-time` for the whole run
+#
+#   * A run here corresponds to a call to `run_model_on_task` or `run_flow_on_task`
+#   * The recorded time is for the model fit for each of the outer-cross validations folds,
+#     i.e., the OpenML data splits
+#
+# * Python's `time` module is used to compute the runtimes
+#
+#   * `CPU-time` is recorded using the responses of `time.process_time()`
+#   * `wallclock-time` is recorded using the responses of `time.time()`
+#
+# * The timings recorded by OpenML per outer-cross validation fold are agnostic to
+#   model parallelisation
+#
+#   * The wallclock times reported in Case 2 above highlight the speed-up on using `n_jobs=-1`
+#     in comparison to `n_jobs=2`, since the timing recorded by OpenML is for the entire
+#     `fit()` procedure, whereas the parallelisation is performed inside `fit()` by scikit-learn
+#   * The CPU-time for models that are run in parallel can be difficult to interpret
+#
+# * `CPU-time` & `wallclock-time` for each search per outer fold in an HPO run
+#
+#   * Reports the total time for performing search on each of the OpenML data splits, subsuming
+#     any sort of parallelism that happened as part of the HPO estimator or the underlying
+#     base estimator
+#   * Also allows extraction of the `refit_time` that scikit-learn measures using `time.time()`
+#     for retraining the model per outer fold, for the best found configuration
+#
+# * `CPU-time` & `wallclock-time` for models that scikit-learn doesn't parallelize
+#
+#   * Models like Decision Trees or naive Bayes don't parallelize and thus both the wallclock and
+#     CPU times are similar in runtime for the OpenML call
+#   * However, models implemented in Cython, such as the Decision Trees can release the GIL and
+#     still run in parallel if a `threading` backend is used by joblib.
+#   * Scikit-learn Neural Networks can undergo parallelization implicitly owing to thread-level
+#     parallelism involved in the linear algebraic operations and thus the wallclock-time and
+#     CPU-time can differ.
+#
+# Because of all the cases mentioned above it is crucial to understand which case is triggered
+# when reporting runtimes for scikit-learn models measured with OpenML-Python!
+# License: BSD 3-Clause
diff --git a/examples/_external_or_deprecated/flow_id_tutorial.py b/examples/_external_or_deprecated/flow_id_tutorial.py
new file mode 100644
index 000000000..19190cf0b
--- /dev/null
+++ b/examples/_external_or_deprecated/flow_id_tutorial.py
@@ -0,0 +1,85 @@
+# %% [markdown]
+# # Obtaining Flow IDs
+# This tutorial discusses different ways to obtain the ID of a flow in order to perform further
+# analysis.
+
+
+# %%
+import sklearn.tree
+
+import openml
+
+# %% [markdown]
+# .. warning::
+#    .. include:: ../../test_server_usage_warning.txt
+
+# %%
+openml.config.start_using_configuration_for_example()
+openml.config.server = "https://api.openml.org/api/v1/xml"
+
+# %%
+# Defining a classifier
+clf = sklearn.tree.DecisionTreeClassifier()
+
+# %% [markdown]
+# ## 1. Obtaining a flow given a classifier
+
+# %%
+flow = openml.extensions.get_extension_by_model(clf).model_to_flow(clf).publish()
+flow_id = flow.flow_id
+print(flow_id)
+
+# %% [markdown]
+# This piece of code is rather involved. First, it retrieves a
+# :class:`~openml.extensions.Extension` which is registered and can handle the given model,
+# in our case it is :class:`openml.extensions.sklearn.SklearnExtension`. Second, the extension
+# converts the classifier into an instance of :class:`openml.OpenMLFlow`. Third and finally,
+# the publish method checks whether the current flow is already present on OpenML. If not,
+# it uploads the flow, otherwise, it updates the current instance with all information computed
+# by the server (which is obviously also done when uploading/publishing a flow).
+#
+# To simplify the usage we have created a helper function which automates all these steps:
+
+# %%
+flow_id = openml.flows.get_flow_id(model=clf)
+print(flow_id)
+
+# %% [markdown]
+# ## 2. Obtaining a flow given its name
+# The schema of a flow is given in XSD (
+# [here](https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.implementation.upload.xsd)).
+# Only two fields are required, a unique name, and an external version. While it should be pretty
+# obvious why we need a name, the need for the additional external version information might not
+# be immediately clear. However, this information is very important as it allows having multiple
+# flows with the same name for different versions of a software. This might be necessary if an
+# algorithm or implementation introduces, renames or drops hyperparameters over time.
+
+# %%
+print(flow.name, flow.external_version)
+
+# %% [markdown]
+# The name and external version are automatically added to a flow when constructing it from a
+# model. We can then use them to retrieve the flow id as follows:
+
+# %%
+flow_id = openml.flows.flow_exists(name=flow.name, external_version=flow.external_version)
+print(flow_id)
+
+# %% [markdown]
+# We can also retrieve all flows for a given name:
+
+# %%
+flow_ids = openml.flows.get_flow_id(name=flow.name)
+print(flow_ids)
+
+# %% [markdown]
+# This also works with the actual model (generalizing the first part of this example):
+
+# %%
+flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False)
+print(flow_ids)
+
+# %%
+# Deactivating test configuration
+openml.config.stop_using_configuration_for_example()
+# License: BSD 3-Clause
diff --git a/examples/_external_or_deprecated/flows_and_runs_tutorial.py b/examples/_external_or_deprecated/flows_and_runs_tutorial.py
new file mode 100644
index 000000000..71d6960bd
--- /dev/null
+++ b/examples/_external_or_deprecated/flows_and_runs_tutorial.py
@@ -0,0 +1,250 @@
+# %% [markdown]
+# # Flows and Runs
+# This tutorial covers how to train/run a model and how to upload the results.
+
+# %%
+from sklearn import compose, ensemble, impute, neighbors, pipeline, preprocessing, tree
+
+import openml
+
+# %% [markdown]
+# We'll use the test server for the rest of this tutorial.
+#
+# .. warning::
+#    .. include:: ../../test_server_usage_warning.txt
+
+# %%
+openml.config.start_using_configuration_for_example()
+
+# %% [markdown]
+# ## Train machine learning models
+#
+# Train a scikit-learn model on the data manually.
+
+# %%
+# NOTE: We are using dataset 68 from the test server: https://test.openml.org/d/68
+dataset = openml.datasets.get_dataset(dataset_id="eeg-eye-state", version=1)
+X, y, categorical_indicator, attribute_names = dataset.get_data(
+    target=dataset.default_target_attribute
+)
+clf = neighbors.KNeighborsClassifier(n_neighbors=1)
+clf.fit(X, y)
+
+# %% [markdown]
+# You can also ask for meta-data to automatically preprocess the data.
+#
+# * e.g. categorical features -> do feature encoding
+
+# %%
+dataset = openml.datasets.get_dataset(17)
+X, y, categorical_indicator, attribute_names = dataset.get_data(
+    target=dataset.default_target_attribute
+)
+print(f"Categorical features: {categorical_indicator}")
+transformer = compose.ColumnTransformer(
+    [("one_hot_encoder", preprocessing.OneHotEncoder(categories="auto"), categorical_indicator)]
+)
+X = transformer.fit_transform(X)
+clf.fit(X, y)
+
+# %% [markdown]
+# ## Runs: Easily explore models
+# We can run (many) scikit-learn algorithms on (many) OpenML tasks.
+
+# %%
+# Get a task
+task = openml.tasks.get_task(403)
+
+# Build any classifier or pipeline
+clf = tree.DecisionTreeClassifier()
+
+# Run the flow
+run = openml.runs.run_model_on_task(clf, task)
+
+print(run)
+
+# %% [markdown]
+# Share the run on the OpenML server
+#
+# So far the run is only available locally. By calling the publish function,
+# the run is sent to the OpenML server:
+
+# %%
+myrun = run.publish()
+# For this tutorial, our configuration publishes to the test server
+# so as not to pollute the main server.
+print(f"Uploaded to {myrun.openml_url}")
+
+# %% [markdown]
+# We can now also inspect the flow object which was automatically created:
+
+# %%
+flow = openml.flows.get_flow(run.flow_id)
+print(flow)
+
+# %% [markdown]
+# ## It also works with pipelines
+#
+# When you need to handle 'dirty' data, build pipelines to model them automatically.
+# We demonstrate this using the dataset [credit-a](https://test.openml.org/d/16) via
+# [task](https://test.openml.org/t/96) as it contains both numerical and categorical
+# variables and missing values in both.
+
+# %%
+task = openml.tasks.get_task(96)
+
+# OpenML helper functions for sklearn can be plugged in directly for complicated pipelines
+from openml.extensions.sklearn import cat, cont
+
+pipe = pipeline.Pipeline(
+    steps=[
+        (
+            "Preprocessing",
+            compose.ColumnTransformer(
+                [
+                    (
+                        "categorical",
+                        preprocessing.OneHotEncoder(handle_unknown="ignore"),
+                        cat,  # returns the categorical feature indices
+                    ),
+                    (
+                        "continuous",
+                        impute.SimpleImputer(strategy="median"),
+                        cont,
+                    ),  # returns the numeric feature indices
+                ]
+            ),
+        ),
+        ("Classifier", ensemble.RandomForestClassifier(n_estimators=10)),
+    ]
+)
+
+run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)
+myrun = run.publish()
+print(f"Uploaded to {myrun.openml_url}")
+
+
+# %% [markdown]
+# The above pipeline works with the helper functions that internally deal with pandas DataFrame.
+# In case pandas is not available, or NumPy-based data processing is required, the
+# above pipeline is presented below to work with NumPy.
+
+# %%
+# Extracting the indices of the categorical columns
+features = task.get_dataset().features
+categorical_feature_indices = []
+numeric_feature_indices = []
+for i in range(len(features)):
+    if features[i].name == task.target_name:
+        continue
+    if features[i].data_type == "nominal":
+        categorical_feature_indices.append(i)
+    else:
+        numeric_feature_indices.append(i)
+
+pipe = pipeline.Pipeline(
+    steps=[
+        (
+            "Preprocessing",
+            compose.ColumnTransformer(
+                [
+                    (
+                        "categorical",
+                        preprocessing.OneHotEncoder(handle_unknown="ignore"),
+                        categorical_feature_indices,
+                    ),
+                    (
+                        "continuous",
+                        impute.SimpleImputer(strategy="median"),
+                        numeric_feature_indices,
+                    ),
+                ]
+            ),
+        ),
+        ("Classifier", ensemble.RandomForestClassifier(n_estimators=10)),
+    ]
+)
+
+run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)
+myrun = run.publish()
+print(f"Uploaded to {myrun.openml_url}")
+
+# %% [markdown]
+# ## Running flows on tasks offline for later upload
+# For those scenarios where there is no access to internet, it is possible to run
+# a model on a task without uploading results or flows to the server immediately.
+
+# To perform the following line offline, it must have been called once before (while online)
+# so that the task is cached in the local openml cache directory:
+
+# %%
+task = openml.tasks.get_task(96)
+
+# The following lines can then be executed offline:
+run = openml.runs.run_model_on_task(
+    pipe,
+    task,
+    avoid_duplicate_runs=False,
+    upload_flow=False,
+)
+
+# The run may be stored offline, and the flow will be stored along with it:
+run.to_filesystem(directory="myrun")
+
+# They may be loaded and uploaded at a later time
+run = openml.runs.OpenMLRun.from_filesystem(directory="myrun")
+run.publish()
+
+# Publishing the run will automatically upload the related flow if
+# it does not yet exist on the server.
+
+# %% [markdown]
+# Alternatively, one can also directly run flows.
+
+# %%
+# Get a task
+task = openml.tasks.get_task(403)
+
+# Build any classifier or pipeline
+clf = tree.ExtraTreeClassifier()
+
+# Obtain the scikit-learn extension interface to convert the classifier
+# into a flow object.
+extension = openml.extensions.get_extension_by_model(clf)
+flow = extension.model_to_flow(clf)
+
+run = openml.runs.run_flow_on_task(flow, task)
+
+# %% [markdown]
+# ## Challenge
+#
+# Try to build the best possible models on several OpenML tasks,
+# compare your results with the rest of the class and learn from
+# them. Some tasks you could try (or browse openml.org):
+#
+# * EEG eye state: data_id: [1471](https://www.openml.org/d/1471),
+#   task_id: [14951](https://www.openml.org/t/14951)
+# * Volcanoes on Venus: data_id: [1527](https://www.openml.org/d/1527),
+#   task_id: [10103](https://www.openml.org/t/10103)
+# * Walking activity: data_id: [1509](https://www.openml.org/d/1509),
+#   task_id: [9945](https://www.openml.org/t/9945), 150k instances.
+# * Covertype (Satellite): data_id: [150](https://www.openml.org/d/150),
+#   task_id: [218](https://www.openml.org/t/218), 500k instances.
+# * Higgs (Physics): data_id: [23512](https://www.openml.org/d/23512),
+#   task_id: [52950](https://www.openml.org/t/52950), 100k instances, missing values.
+
+# %%
+# Easy benchmarking:
+for task_id in [115]:  # Add further tasks. Disclaimer: they might take some time
+    task = openml.tasks.get_task(task_id)
+    data = openml.datasets.get_dataset(task.dataset_id)
+    clf = neighbors.KNeighborsClassifier(n_neighbors=5)
+
+    run = openml.runs.run_model_on_task(clf, task, avoid_duplicate_runs=False)
+    myrun = run.publish()
+    print(f"kNN on {data.name}: {myrun.openml_url}")
+
+
+# %%
+openml.config.stop_using_configuration_for_example()
+# License: BSD 3-Clause
diff --git a/examples/_external_or_deprecated/plot_svm_hyperparameters_tutorial.py b/examples/_external_or_deprecated/plot_svm_hyperparameters_tutorial.py
new file mode 100644
index 000000000..7bb72db5a
--- /dev/null
+++ b/examples/_external_or_deprecated/plot_svm_hyperparameters_tutorial.py
@@ -0,0 +1,83 @@
+# %% [markdown]
+# # Plotting hyperparameter surfaces
+
+# %%
+import numpy as np
+
+import openml
+
+# %% [markdown]
+# # First step - obtaining the data
+# First, we need to choose an SVM flow, for example 8353, and a task. Finding their IDs is
+# not part of this tutorial; this could for example be done via the website.
+#
+# For this we use the function ``list_evaluations_setups`` which can automatically join
+# evaluations conducted by the server with the hyperparameter settings extracted from the
+# uploaded runs (called *setup*).
+
+# %%
+df = openml.evaluations.list_evaluations_setups(
+    function="predictive_accuracy",
+    flows=[8353],
+    tasks=[6],
+    # Using this flag incorporates the hyperparameters into the returned dataframe. Otherwise,
+    # the dataframe would contain a field ``parameters`` containing an unparsed dictionary.
+    parameters_in_separate_columns=True,
+)
+print(df.head(n=10))
+
+# %% [markdown]
+# We can see all the hyperparameter names in the columns of the dataframe:
+
+# %%
+for name in df.columns:
+    print(name)
+
+# %% [markdown]
+# Next, we cast and transform the hyperparameters of interest (``C`` and ``gamma``) so that we
+# can nicely plot them.
+
+# %%
+hyperparameters = ["sklearn.svm.classes.SVC(16)_C", "sklearn.svm.classes.SVC(16)_gamma"]
+df[hyperparameters] = df[hyperparameters].astype(float).apply(np.log10)
+
+# %% [markdown]
+# ## Option 1 - plotting via the pandas helper functions
+
+# %%
+df.plot.hexbin(
+    x="sklearn.svm.classes.SVC(16)_C",
+    y="sklearn.svm.classes.SVC(16)_gamma",
+    C="value",
+    reduce_C_function=np.mean,
+    gridsize=25,
+    title="SVM performance landscape",
+)
+
+# %% [markdown]
+# ## Option 2 - plotting via matplotlib
+
+# %%
+import matplotlib.pyplot as plt
+
+fig, ax = plt.subplots()
+
+C = df["sklearn.svm.classes.SVC(16)_C"]
+gamma = df["sklearn.svm.classes.SVC(16)_gamma"]
+score = df["value"]
+
+# Plotting all evaluations:
+ax.plot(C, gamma, "ko", ms=1)
+# Create a contour plot
+cntr = ax.tricontourf(C, gamma, score, levels=12, cmap="RdBu_r")
+# Adjusting the colorbar
+fig.colorbar(cntr, ax=ax, label="accuracy")
+# Adjusting the axis limits
+ax.set(
+    xlim=(min(C), max(C)),
+    ylim=(min(gamma), max(gamma)),
+    xlabel="C (log10)",
+    ylabel="gamma (log10)",
+)
+ax.set_title("SVM performance landscape")
+# License: BSD 3-Clause
diff --git a/examples/_external_or_deprecated/run_setup_tutorial.py b/examples/_external_or_deprecated/run_setup_tutorial.py
new file mode 100644
index 000000000..25591bb58
--- /dev/null
+++ b/examples/_external_or_deprecated/run_setup_tutorial.py
@@ -0,0 +1,119 @@
+# %% [markdown]
+# # Run Setup
+# One of the key features of the openml-python library is that it allows
+# reinstantiating flows with hyperparameter settings that were uploaded before.
+# This tutorial uses the concept of setups. Although setups are not extensively
+# described in the OpenML documentation (because most users will not directly
+# use them), they form an important concept within OpenML distinguishing between
+# hyperparameter configurations.
+# A setup is the combination of a flow with all its hyperparameters set.
+#
+# A key requirement for reinstantiating a flow is to have the same scikit-learn
+# version as the flow that was uploaded. However, this tutorial will upload the
+# flow (that will later be reinstantiated) itself, so it can be run with any
+# scikit-learn version that is supported by this library. In this case, the
+# requirement of the corresponding scikit-learn versions is automatically met.
+#
+# In this tutorial we will
+#     1) Create a flow and use it to solve a task;
+#     2) Download the flow, reinstantiate the model with same hyperparameters,
+#        and solve the same task again;
+#     3) We will verify that the obtained results are exactly the same.
+
+# %%
+
+import numpy as np
+from sklearn.compose import ColumnTransformer
+from sklearn.decomposition import TruncatedSVD
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.preprocessing import OneHotEncoder
+
+import openml
+from openml.extensions.sklearn import cat, cont
+
+# %% [markdown]
+# .. warning::
+#    .. include:: ../../test_server_usage_warning.txt
+
+# %%
+openml.config.start_using_configuration_for_example()
+
+# %% [markdown]
+# ## 1) Create a flow and use it to solve a task
+
+# First, let's download the task that we are interested in
+
+# %%
+task = openml.tasks.get_task(6)
+
+# %% [markdown]
+# We will create a fairly complex model, with many preprocessing components and
+# many potential hyperparameters. Of course, the model can be as complex and as
+# easy as you want it to be
+
+
+# %%
+cat_imp = make_pipeline(
+    OneHotEncoder(handle_unknown="ignore"),
+    TruncatedSVD(),
+)
+cont_imp = SimpleImputer(strategy="median")
+ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
+model_original = Pipeline(
+    steps=[
+        ("transform", ct),
+        ("estimator", RandomForestClassifier()),
+    ]
+)
+
+# %% [markdown]
+# Let's change some hyperparameters. Of course, in any good application we
+# would tune them using, e.g., Random Search or Bayesian Optimization, but for
+# the purpose of this tutorial we set them to some specific values that might
+# or might not be optimal
+
+# %%
+hyperparameters_original = {
+    "estimator__criterion": "gini",
+    "estimator__n_estimators": 50,
+    "estimator__max_depth": 10,
+    "estimator__min_samples_leaf": 1,
+}
+model_original.set_params(**hyperparameters_original)
+
+# solve the task and upload the result (this implicitly creates the flow)
+run = openml.runs.run_model_on_task(model_original, task, avoid_duplicate_runs=False)
+run_original = run.publish()  # this implicitly uploads the flow
+
+# %% [markdown]
+# ## 2) Download the flow and solve the same task again.
+
+# %%
+# obtain setup id (note that the setup id is assigned by the OpenML server -
+# therefore it was not yet available in our local copy of the run)
+run_downloaded = openml.runs.get_run(run_original.run_id)
+setup_id = run_downloaded.setup_id
+
+# after this, we can easily reinstantiate the model
+model_duplicate = openml.setups.initialize_model(setup_id)
+# it will automatically have all the hyperparameters set
+
+# and run the task again
+run_duplicate = openml.runs.run_model_on_task(model_duplicate, task, avoid_duplicate_runs=False)
+
+
+# %% [markdown]
+# ## 3) We will verify that the obtained results are exactly the same.
+
+# %%
+# the run has stored all predictions in the field data content
+np.testing.assert_array_equal(run_original.data_content, run_duplicate.data_content)
+
+
+# %%
+openml.config.stop_using_configuration_for_example()
+
+# By: Jan N. van Rijn
+# License: BSD 3-Clause
diff --git a/examples/_external_or_deprecated/upload_amlb_flows_and_runs.py b/examples/_external_or_deprecated/upload_amlb_flows_and_runs.py
new file mode 100644
index 000000000..b43926d4e
--- /dev/null
+++ b/examples/_external_or_deprecated/upload_amlb_flows_and_runs.py
@@ -0,0 +1,222 @@
+# %% [markdown]
+# # Creating and Using a Custom Flow
+
+# The most convenient way to create a flow for your machine learning workflow is to generate it
+# automatically as described in the
+# ["Obtaining Flow IDs"](../../30_extended/flow_id_tutorial) tutorial.
+# However, there are scenarios where this is not possible, such
+# as when the flow uses a framework without an extension or when the flow is described by a script.
+
+# In those cases you can still create a custom flow by following the steps of this tutorial.
+# As an example we will use the flows generated for the
+# [AutoML Benchmark](https://openml.github.io/automlbenchmark/),
+# and also show how to link runs to the custom flow.
+
+# %%
+from collections import OrderedDict
+
+import numpy as np
+
+import openml
+from openml.runs.functions import format_prediction
+
+# %% [markdown]
+# .. warning::
+#    .. include:: ../../test_server_usage_warning.txt
+
+# %%
+openml.config.start_using_configuration_for_example()
+
+# %% [markdown]
+# ## 1. Defining the flow
+# The first step is to define all the hyperparameters of your flow.
+# The API pages feature a description of each variable of the :class:`openml.flows.OpenMLFlow`.
+# Note that `external version` and `name` together uniquely identify a flow.
+#
+# The AutoML Benchmark runs AutoML systems across a range of tasks.
+# OpenML stores Flows for each AutoML system. However, the AutoML benchmark adds
+# preprocessing to the flow, so it should be described in a new flow.
+#
+# We will break down the flow arguments into several groups, for the tutorial.
+# First we will define the name and version information.
+# Make sure to leave enough information so others can determine exactly which
+# version of the package/script is used. Use tags so users can find your flow easily.
+
+# %%
+general = {
+    "name": "automlbenchmark_autosklearn",
+    "description": (
+        "Auto-sklearn as set up by the AutoML Benchmark"
+        "Source: https://github.com/openml/automlbenchmark/releases/tag/v0.9"
+    ),
+    "external_version": "amlb==0.9",
+    "language": "English",
+    "tags": ["amlb", "benchmark", "study_218"],
+    "dependencies": "amlb==0.9",
+}
+
+# %% [markdown]
+# Next we define the flow hyperparameters. We define their name and default value in `parameters`,
+# and provide meta-data for each hyperparameter through `parameters_meta_info`.
+# Note that even though the argument name is `parameters` they describe the hyperparameters.
+# The use of ordered dicts is required.
+
+# %%
+flow_hyperparameters = {
+    "parameters": OrderedDict(time="240", memory="32", cores="8"),
+    "parameters_meta_info": OrderedDict(
+        cores=OrderedDict(description="number of available cores", data_type="int"),
+        memory=OrderedDict(description="memory in gigabytes", data_type="int"),
+        time=OrderedDict(description="time in minutes", data_type="int"),
+    ),
+}
+
+# %% [markdown]
+# It is possible to build a flow which uses other flows.
+# For example, the Random Forest Classifier is a flow, but you could also construct a flow
+# which uses a Random Forest Classifier in a ML pipeline. When constructing the pipeline flow,
+# you can use the Random Forest Classifier flow as a *subflow*. It allows for
+# all hyperparameters of the Random Forest Classifier flow to also be specified in your pipeline flow.
+#
+# Note: you can currently only specify one subflow as part of the components.
+#
+# In this example, the auto-sklearn flow is a subflow: the auto-sklearn flow is entirely executed as part of this flow.
+# This allows people to specify auto-sklearn hyperparameters used in this flow.
+# In general, using a subflow is not required.
+#
+# Note: flow 9313 is not actually the right flow on the test server,
+# but that does not matter for this demonstration.
+
+# %%
+autosklearn_flow = openml.flows.get_flow(9313)  # auto-sklearn 0.5.1
+subflow = {
+    "components": OrderedDict(automl_tool=autosklearn_flow),
+    # If you do not want to reference a subflow, you can use the following:
+    # components=OrderedDict(),
+}
+
+# %% [markdown]
+# With all parameters of the flow defined, we can now initialize the OpenMLFlow and publish.
+# Because we provided all the details already, we do not need to provide a `model` to the flow.
+#
+# In our case, we don't even have a model. It is possible to have a model but still require
+# to follow these steps when the model (python object) does not have an extension from which
+# to automatically extract the hyperparameters.
+# So whether you have a model with no extension or no model at all, explicitly set
+# the model of the flow to `None`.
+
+# %%
+autosklearn_amlb_flow = openml.flows.OpenMLFlow(
+    **general,
+    **flow_hyperparameters,
+    **subflow,
+    model=None,
+)
+autosklearn_amlb_flow.publish()
+print(f"autosklearn flow created: {autosklearn_amlb_flow.flow_id}")
+
+# %% [markdown]
+# ## 2. Using the flow
+# This section will show how to upload run data for your custom flow.
+# Take care to change the values of parameters as well as the task id,
+# to reflect the actual run.
+# Task and parameter values in the example are fictional.
+
+# %%
+flow_id = autosklearn_amlb_flow.flow_id
+
+parameters = [
+    OrderedDict([("oml:name", "cores"), ("oml:value", 4), ("oml:component", flow_id)]),
+    OrderedDict([("oml:name", "memory"), ("oml:value", 16), ("oml:component", flow_id)]),
+    OrderedDict([("oml:name", "time"), ("oml:value", 120), ("oml:component", flow_id)]),
+]
+
+task_id = 1200  # Iris Task
+task = openml.tasks.get_task(task_id)
+dataset_id = task.get_dataset().dataset_id
+
+
+# %% [markdown]
+# The last bit of information for the run we need are the predicted values.
+# The exact format of the predictions will depend on the task.
+#
+# The predictions should always be a list of lists, each list should contain:
+#
+# - the repeat number: for repeated evaluation strategies. (e.g. repeated cross-validation)
+# - the fold number: for cross-validation. (what should this be for holdout?)
+# - 0: this field is for backward compatibility.
+# - index: the row (of the original dataset) for which the prediction was made.
+# - p_1, ..., p_c: for each class the predicted probability of the sample
+#   belonging to that class. (no elements for regression tasks)
+#   Make sure the order of these elements follows the order of `task.class_labels`.
+# - the predicted class/value for the sample
+# - the true class/value for the sample
+#
+# When using openml-python extensions (such as through `run_model_on_task`),
+# all of this formatting is automatic.
+# Unfortunately we cannot automate this procedure for custom flows,
+# which means a little additional effort is required.
+#
+# Here we generated some random predictions in place.
+# You can ignore this code, or use it to better understand the formatting of the predictions.
+#
+# Find the repeats/folds for this task:
+
+# %%
+n_repeats, n_folds, _ = task.get_split_dimensions()
+all_test_indices = [
+    (repeat, fold, index)
+    for repeat in range(n_repeats)
+    for fold in range(n_folds)
+    for index in task.get_train_test_split_indices(fold, repeat)[1]
+]
+
+# random class probabilities (Iris has 150 samples and 3 classes):
+r = np.random.rand(150 * n_repeats, 3)  # noqa: NPY002
+# scale the random values so that the probabilities of each sample sum to 1:
+y_proba = r / r.sum(axis=1).reshape(-1, 1)
+y_pred = y_proba.argmax(axis=1)
+
+class_map = dict(zip(range(3), task.class_labels, strict=False))
+_, y_true = task.get_X_and_y()
+y_true = [class_map[y] for y in y_true]
+
+# We format the predictions with the utility function `format_prediction`.
+# It will organize the relevant data in the expected format/order.
+predictions = []
+for where, y, yp, proba in zip(all_test_indices, y_true, y_pred, y_proba, strict=False):
+    repeat, fold, index = where
+
+    prediction = format_prediction(
+        task=task,
+        repeat=repeat,
+        fold=fold,
+        index=index,
+        prediction=class_map[yp],
+        truth=y,
+        proba=dict(zip(task.class_labels, proba, strict=False)),
+    )
+    predictions.append(prediction)
+
+# %% [markdown]
+# Finally we can create the OpenMLRun object and upload.
+# We use the argument setup_string because the used flow was a script.
+
+# %%
+benchmark_command = "python3 runbenchmark.py auto-sklearn medium -m aws -t 119"
+my_run = openml.runs.OpenMLRun(
+    task_id=task_id,
+    flow_id=flow_id,
+    dataset_id=dataset_id,
+    parameter_settings=parameters,
+    setup_string=benchmark_command,
+    data_content=predictions,
+    tags=["study_218"],
+    description_text="Run generated by the Custom Flow tutorial.",
+)
+my_run.publish()
+print("run created:", my_run.run_id)
+
+# %%
+openml.config.stop_using_configuration_for_example()
+# License: BSD 3-Clause
diff --git a/examples/create_upload_tutorial.py b/examples/create_upload_tutorial.py
deleted file mode 100644
index cb5506cfd..000000000
--- a/examples/create_upload_tutorial.py
+++ /dev/null
@@ -1,318 +0,0 @@
-"""
-Dataset upload tutorial
-=======================
-
-A tutorial on how to create and upload a dataset to OpenML.
-"""
-import numpy as np
-import pandas as pd
-import sklearn.datasets
-from scipy.sparse import coo_matrix
-
-import openml
-from openml.datasets.functions import create_dataset
-
-############################################################################
-# .. warning:: This example uploads data. For that reason, this example
-#   connects to the test server at test.openml.org. This prevents the main
-#   server from crowding with example datasets, tasks, runs, and so on.
-
-openml.config.start_using_configuration_for_example()
-############################################################################
-
-############################################################################
-# Below we will cover the following cases of the dataset object:
-#
-# * A numpy array
-# * A list
-# * A pandas dataframe
-# * A sparse matrix
-# * A pandas sparse dataframe
-
-############################################################################
-# Dataset is a numpy array
-# ========================
-# A numpy array can contain lists in the case of dense data or it can contain
-# OrderedDicts in the case of sparse data.
-#
-# Prepare dataset
-# ^^^^^^^^^^^^^^^
-# Load an example dataset from scikit-learn which we will upload to OpenML.org
-# via the API.
-
-diabetes = sklearn.datasets.load_diabetes()
-name = 'Diabetes(scikit-learn)'
-X = diabetes.data
-y = diabetes.target
-attribute_names = diabetes.feature_names
-description = diabetes.DESCR
-
-############################################################################
-# OpenML does not distinguish between the attributes and targets on the data
-# level and stores all data in a single matrix.
-#
-# The target feature is indicated as meta-data of the dataset (and tasks on
-# that data).
-
-data = np.concatenate((X, y.reshape((-1, 1))), axis=1)
-attribute_names = list(attribute_names)
-attributes = [
-    (attribute_name, 'REAL') for attribute_name in attribute_names
-] + [('class', 'INTEGER')]
-citation = (
-    "Bradley Efron, Trevor Hastie, Iain Johnstone and "
-    "Robert Tibshirani (2004) (Least Angle Regression) "
-    "Annals of Statistics (with discussion), 407-499"
-)
-paper_url = (
-    'http://web.stanford.edu/~hastie/Papers/'
-    'LARS/LeastAngle_2002.pdf'
-)
-
-############################################################################
-# Create the dataset object
-# ^^^^^^^^^^^^^^^^^^^^^^^^^
-# The definition of all fields can be found in the XSD files describing the
-# expected format:
-#
-# https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.data.upload.xsd
-
-diabetes_dataset = create_dataset(
-    # The name of the dataset (needs to be unique).
-    # Must not be longer than 128 characters and only contain
-    # a-z, A-Z, 0-9 and the following special characters: _\-\.(),
-    name=name,
-    # Textual description of the dataset.
-    description=description,
-    # The person who created the dataset.
-    creator="Bradley Efron, Trevor Hastie, "
-            "Iain Johnstone and Robert Tibshirani",
-    # People who contributed to the current version of the dataset.
-    contributor=None,
-    # The date the data was originally collected, given by the uploader.
-    collection_date='09-01-2012',
-    # Language in which the data is represented.
-    # Starts with 1 upper case letter, rest lower case, e.g. 'English'.
-    language='English',
-    # License under which the data is/will be distributed.
-    licence='BSD (from scikit-learn)',
-    # Name of the target. Can also have multiple values (comma-separated).
-    default_target_attribute='class',
-    # The attribute that represents the row-id column, if present in the
-    # dataset.
-    row_id_attribute=None,
-    # Attributes that should be excluded in modelling, such as identifiers and
-    # indexes.
-    ignore_attribute=None,
-    # How to cite the paper.
-    citation=citation,
-    # Attributes of the data
-    attributes=attributes,
-    data=data,
-    # A version label which is provided by the user.
-    version_label='test',
-    original_data_url=(
-        'http://www4.stat.ncsu.edu/~boos/var.select/diabetes.html'
-    ),
-    paper_url=paper_url,
-)
-
-############################################################################
-
-upload_did = diabetes_dataset.publish()
-print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
-
-############################################################################
-# Dataset is a list
-# =================
-# A list can contain lists in the case of dense data or it can contain
-# OrderedDicts in the case of sparse data.
-#
-# Weather dataset:
-# http://storm.cis.fordham.edu/~gweiss/data-mining/datasets.html
-
-data = [
-    ['sunny', 85, 85, 'FALSE', 'no'],
-    ['sunny', 80, 90, 'TRUE', 'no'],
-    ['overcast', 83, 86, 'FALSE', 'yes'],
-    ['rainy', 70, 96, 'FALSE', 'yes'],
-    ['rainy', 68, 80, 'FALSE', 'yes'],
-    ['rainy', 65, 70, 'TRUE', 'no'],
-    ['overcast', 64, 65, 'TRUE', 'yes'],
-    ['sunny', 72, 95, 'FALSE', 'no'],
-    ['sunny', 69, 70, 'FALSE', 'yes'],
-    ['rainy', 75, 80, 'FALSE', 'yes'],
-    ['sunny', 75, 70, 'TRUE', 'yes'],
-    ['overcast', 72, 90, 'TRUE', 'yes'],
-    ['overcast', 81, 75, 'FALSE', 'yes'],
-    ['rainy', 71, 91, 'TRUE', 'no'],
-]
-
-attribute_names = [
-    ('outlook', ['sunny', 'overcast', 'rainy']),
-    ('temperature', 'REAL'),
-    ('humidity', 'REAL'),
-    ('windy', ['TRUE', 'FALSE']),
-    ('play', ['yes', 'no']),
-]
-
-description = (
-    'The weather problem is a tiny dataset that we will use repeatedly'
-    ' to illustrate machine learning methods. Entirely fictitious, it '
-    'supposedly concerns the conditions that are suitable for playing '
-    'some unspecified game. In general, instances in a dataset are '
-    'characterized by the values of features, or attributes, that measure '
-    'different aspects of the instance. In this case there are four '
-    'attributes: outlook, temperature, humidity, and windy. '
-    'The outcome is whether to play or not.'
-)
-
-citation = (
-    'I. H. Witten, E. Frank, M. A. Hall, and ITPro,'
-    'Data mining practical machine learning tools and techniques, '
-    'third edition. Burlington, Mass.: Morgan Kaufmann Publishers, 2011'
-)
-
-weather_dataset = create_dataset(
-    name="Weather",
-    description=description,
-    creator='I. H. Witten, E. Frank, M. A. Hall, and ITPro',
-    contributor=None,
-    collection_date='01-01-2011',
-    language='English',
-    licence=None,
-    default_target_attribute='play',
-    row_id_attribute=None,
-    ignore_attribute=None,
-    citation=citation,
-    attributes=attribute_names,
-    data=data,
-    version_label='example',
-)
-
-############################################################################
-
-upload_did = weather_dataset.publish()
-print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
-
-############################################################################
-# Dataset is a pandas DataFrame
-# =============================
-# It might happen that your dataset is made of heterogeneous data which can be
-# usually stored as a Pandas DataFrame. DataFrame offers the adavantages to
-# store the type of data for each column as well as the attribute names.
-# Therefore, when providing a Pandas DataFrame, OpenML can infer those
-# information without the need to specifically provide them when calling the
-# function :func:`create_dataset`. In this regard, you only need to pass
-# ``'auto'`` to the ``attributes`` parameter.
-
-df = pd.DataFrame(data, columns=[col_name for col_name, _ in attribute_names])
-# enforce the categorical column to have a categorical dtype
-df['outlook'] = df['outlook'].astype('category')
-df['windy'] = df['windy'].astype('bool')
-df['play'] = df['play'].astype('category')
-print(df.info())
-
-############################################################################
-# We enforce the column 'outlook', 'windy', and 'play' to be a categorical
-# dtype while the column 'rnd_str' is kept as a string column. Then, we can
-# call :func:`create_dataset` by passing the dataframe and fixing the parameter
-# ``attributes`` to ``'auto'``.
-
-weather_dataset = create_dataset(
-    name="Weather",
-    description=description,
-    creator='I. H. Witten, E. Frank, M. A. Hall, and ITPro',
-    contributor=None,
-    collection_date='01-01-2011',
-    language='English',
-    licence=None,
-    default_target_attribute='play',
-    row_id_attribute=None,
-    ignore_attribute=None,
-    citation=citation,
-    attributes='auto',
-    data=df,
-    version_label='example',
-)
-
-############################################################################
-
-upload_did = weather_dataset.publish()
-print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
-
-############################################################################
-# Dataset is a sparse matrix
-# ==========================
-
-sparse_data = coo_matrix((
-    [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
-    ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
-))
-
-column_names = [
-    ('input1', 'REAL'),
-    ('input2', 'REAL'),
-    ('y', 'REAL'),
-]
-
-xor_dataset = create_dataset(
-    name="XOR",
-    description='Dataset representing the XOR operation',
-    creator=None,
-    contributor=None,
-    collection_date=None,
-    language='English',
-    licence=None,
-    default_target_attribute='y',
-    row_id_attribute=None,
-    ignore_attribute=None,
-    citation=None,
-    attributes=column_names,
-    data=sparse_data,
-    version_label='example',
-)
-
-############################################################################
-
-upload_did = xor_dataset.publish()
-print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
-
-
-############################################################################
-# Dataset is a pandas sparse dataframe
-# ====================================
-
-sparse_data = coo_matrix((
-    [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
-    ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
-))
-column_names = ['input1', 'input2', 'y']
-df = pd.SparseDataFrame(sparse_data, columns=column_names)
-print(df.info())
-
-xor_dataset = create_dataset(
-    name="XOR",
-    description='Dataset representing the XOR operation',
-    creator=None,
-    contributor=None,
-    collection_date=None,
-    language='English',
-    licence=None,
-    default_target_attribute='y',
-    row_id_attribute=None,
-    ignore_attribute=None,
-    citation=None,
-    attributes='auto',
-    data=df,
-    version_label='example',
-)
-
-############################################################################
-
-upload_did = xor_dataset.publish()
-print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
-
-
-############################################################################
-openml.config.stop_using_configuration_for_example()
diff --git a/examples/datasets_tutorial.py b/examples/datasets_tutorial.py
deleted file mode 100644
index 70da03d15..000000000
--- a/examples/datasets_tutorial.py
+++ /dev/null
@@ -1,110 +0,0 @@
-"""
-========
-Datasets
-========
-
-How to list and download datasets.
-"""
-############################################################################
-import openml
-import pandas as pd
-
-############################################################################
-# Exercise 0
-# **********
-#
-# * List datasets
-#   * Use the output_format parameter to select output type
-#   * Default gives 'dict' (other option: 'dataframe')
-
-openml_list = openml.datasets.list_datasets()  # returns a dict
-
-# Show a nice table with some key data properties
-datalist = pd.DataFrame.from_dict(openml_list, orient='index')
-datalist = datalist[[
-    'did', 'name', 'NumberOfInstances',
-    'NumberOfFeatures', 'NumberOfClasses'
-]]
-
-print("First 10 of %s datasets..." % len(datalist))
-datalist.head(n=10)
-
-# The same can be done with lesser lines of code
-openml_df = openml.datasets.list_datasets(output_format='dataframe')
-openml_df.head(n=10)
-
-############################################################################
-# Exercise 1
-# **********
-#
-# * Find datasets with more than 10000 examples.
-# * Find a dataset called 'eeg_eye_state'.
-# * Find all datasets with more than 50 classes.
-datalist[datalist.NumberOfInstances > 10000
-         ].sort_values(['NumberOfInstances']).head(n=20)
-############################################################################
-datalist.query('name == "eeg-eye-state"')
-############################################################################
-datalist.query('NumberOfClasses > 50')
-
-############################################################################
-# Download datasets
-# =================
-
-# This is done based on the dataset ID.
-dataset = openml.datasets.get_dataset(1471)
-
-# Print a summary
-print("This is dataset '%s', the target feature is '%s'" %
-      (dataset.name, dataset.default_target_attribute))
-print("URL: %s" % dataset.url)
-print(dataset.description[:500])
-
-############################################################################
-# Get the actual data.
-#
-# The dataset can be returned in 2 possible formats: as a NumPy array, a SciPy
-# sparse matrix, or as a Pandas DataFrame (or SparseDataFrame). The format is
-# controlled with the parameter ``dataset_format`` which can be either 'array'
-# (default) or 'dataframe'. Let's first build our dataset from a NumPy array
-# and manually create a dataframe.
-X, y, categorical_indicator, attribute_names = dataset.get_data(
-    dataset_format='array',
-    target=dataset.default_target_attribute
-)
-eeg = pd.DataFrame(X, columns=attribute_names)
-eeg['class'] = y
-print(eeg[:10])
-
-############################################################################
-# Instead of manually creating the dataframe, you can already request a
-# dataframe with the correct dtypes.
-X, y, categorical_indicator, attribute_names = dataset.get_data(
-    target=dataset.default_target_attribute,
-    dataset_format='dataframe'
-)
-print(X.head())
-print(X.info())
-
-############################################################################
-# Sometimes you only need access to a dataset's metadata.
-# In those cases, you can download the dataset without downloading the
-# data file. The dataset object can be used as normal.
-# Whenever you use any functionality that requires the data,
-# such as `get_data`, the data will be downloaded.
-dataset = openml.datasets.get_dataset(1471, download_data=False)
-
-############################################################################
-# Exercise 2
-# **********
-# * Explore the data visually.
-eegs = eeg.sample(n=1000)
-_ = pd.plotting.scatter_matrix(
-    eegs.iloc[:100, :4],
-    c=eegs[:100]['class'],
-    figsize=(10, 10),
-    marker='o',
-    hist_kwds={'bins': 20},
-    alpha=.8,
-    cmap='plasma'
-)
diff --git a/examples/flows_and_runs_tutorial.py b/examples/flows_and_runs_tutorial.py
deleted file mode 100644
index d65abdf28..000000000
--- a/examples/flows_and_runs_tutorial.py
+++ /dev/null
@@ -1,189 +0,0 @@
-"""
-Flows and Runs
-==============
-
-How to train/run a model and how to upload the results.
-"""
-
-import openml
-from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree
-
-############################################################################
-# Train machine learning models
-# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-#
-# Train a scikit-learn model on the data manually.
-#
-# .. warning:: This example uploads data. For that reason, this example
-#   connects to the test server at test.openml.org. This prevents the main
-#   server from crowding with example datasets, tasks, runs, and so on.
-
-openml.config.start_using_configuration_for_example()
-# NOTE: We are using dataset 68 from the test server: https://test.openml.org/d/68
-dataset = openml.datasets.get_dataset(68)
-X, y, categorical_indicator, attribute_names = dataset.get_data(
-    dataset_format='array',
-    target=dataset.default_target_attribute
-)
-clf = neighbors.KNeighborsClassifier(n_neighbors=1)
-clf.fit(X, y)
-
-############################################################################
-# You can also ask for meta-data to automatically preprocess the data.
-#
-# * e.g. categorical features -> do feature encoding
-dataset = openml.datasets.get_dataset(17)
-X, y, categorical_indicator, attribute_names = dataset.get_data(
-    dataset_format='array',
-    target=dataset.default_target_attribute
-)
-print("Categorical features: {}".format(categorical_indicator))
-transformer = compose.ColumnTransformer(
-    [('one_hot_encoder', preprocessing.OneHotEncoder(categories='auto'), categorical_indicator)])
-X = transformer.fit_transform(X)
-clf.fit(X, y)
-
-############################################################################
-# Runs: Easily explore models
-# ^^^^^^^^^^^^^^^^^^^^^^^^^^^
-# We can run (many) scikit-learn algorithms on (many) OpenML tasks.
-
-# Get a task
-task = openml.tasks.get_task(403)
-
-# Build any classifier or pipeline
-clf = tree.ExtraTreeClassifier()
-
-# Run the flow
-run = openml.runs.run_model_on_task(clf, task)
-
-print(run)
-
-############################################################################
-# Share the run on the OpenML server
-#
-# So far the run is only available locally. By calling the publish function,
-# the run is sent to the OpenML server:
-
-myrun = run.publish()
-# For this tutorial, our configuration publishes to the test server
-# as to not pollute the main server.
-print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id))
-
-############################################################################
-# We can now also inspect the flow object which was automatically created:
-
-flow = openml.flows.get_flow(run.flow_id)
-print(flow)
-
-############################################################################
-# It also works with pipelines
-# ############################
-#
-# When you need to handle 'dirty' data, build pipelines to model then automatically.
-task = openml.tasks.get_task(1)
-features = task.get_dataset().features
-nominal_feature_indices = [
-    i for i in range(len(features))
-    if features[i].name != task.target_name and features[i].data_type == 'nominal'
-]
-pipe = pipeline.Pipeline(steps=[
-    (
-        'Preprocessing',
-        compose.ColumnTransformer([
-            ('Nominal', pipeline.Pipeline(
-                [
-                    ('Imputer', impute.SimpleImputer(strategy='most_frequent')),
-                    (
-                        'Encoder',
-                        preprocessing.OneHotEncoder(
-                            sparse=False, handle_unknown='ignore',
-                        )
-                    ),
-                ]),
-                nominal_feature_indices,
-             ),
-        ]),
-    ),
-    ('Classifier', ensemble.RandomForestClassifier(n_estimators=10))
-])
-
-run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)
-myrun = run.publish()
-print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id))
-
-###############################################################################
-# Running flows on tasks offline for later upload
-# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-# For those scenarios where there is no access to internet, it is possible to run
-# a model on a task without uploading results or flows to the server immediately.
-
-# To perform the following line offline, it is required to have been called before
-# such that the task is cached on the local openml cache directory:
-task = openml.tasks.get_task(6)
-
-# The following lines can then be executed offline:
-run = openml.runs.run_model_on_task(
-    pipe,
-    task,
-    avoid_duplicate_runs=False,
-    upload_flow=False)
-
-# The run may be stored offline, and the flow will be stored along with it:
-run.to_filesystem(directory='myrun')
-
-# They made later be loaded and uploaded
-run = openml.runs.OpenMLRun.from_filesystem(directory='myrun')
-run.publish()
-
-# Publishing the run will automatically upload the related flow if
-# it does not yet exist on the server.
-
-############################################################################
-# Alternatively, one can also directly run flows.
-
-# Get a task
-task = openml.tasks.get_task(403)
-
-# Build any classifier or pipeline
-clf = tree.ExtraTreeClassifier()
-
-# Obtain the scikit-learn extension interface to convert the classifier
-# into a flow object.
-extension = openml.extensions.get_extension_by_model(clf)
-flow = extension.model_to_flow(clf)
-
-run = openml.runs.run_flow_on_task(flow, task)
-
-############################################################################
-# Challenge
-# ^^^^^^^^^
-#
-# Try to build the best possible models on several OpenML tasks,
-# compare your results with the rest of the class and learn from
-# them. Some tasks you could try (or browse openml.org):
-#
-# * EEG eye state: data_id:`1471 <http://www.openml.org/d/1471>`_,
-#   task_id:`14951 <http://www.openml.org/t/14951>`_
-# * Volcanoes on Venus: data_id:`1527 <http://www.openml.org/d/1527>`_,
-#   task_id:`10103 <http://www.openml.org/t/10103>`_
-# * Walking activity: data_id:`1509 <http://www.openml.org/d/1509>`_,
-#   task_id:`9945 <http://www.openml.org/t/9945>`_, 150k instances.
-# * Covertype (Satellite): data_id:`150 <http://www.openml.org/d/150>`_,
-#   task_id:`218 <http://www.openml.org/t/218>`_, 500k instances.
-# * Higgs (Physics): data_id:`23512 <http://www.openml.org/d/23512>`_,
-#   task_id:`52950 <http://www.openml.org/t/52950>`_, 100k instances, missing values.
-
-# Easy benchmarking:
-for task_id in [115, ]:  # Add further tasks. Disclaimer: they might take some time
-    task = openml.tasks.get_task(task_id)
-    data = openml.datasets.get_dataset(task.dataset_id)
-    clf = neighbors.KNeighborsClassifier(n_neighbors=5)
-
-    run = openml.runs.run_model_on_task(clf, task, avoid_duplicate_runs=False)
-    myrun = run.publish()
-    print("kNN on %s: http://test.openml.org/r/%d" % (data.name, myrun.run_id))
-
-
-############################################################################
-openml.config.stop_using_configuration_for_example()
diff --git a/examples/introduction.py b/examples/introduction.py
new file mode 100644
index 000000000..630c72f9d
--- /dev/null
+++ b/examples/introduction.py
@@ -0,0 +1,22 @@
+# %% [markdown]
+#
+# We provide a set of examples here to get started with OpenML-Python. These examples cover various aspects of using the
+# OpenML API, including downloading datasets, uploading results, and working with tasks.
+#
+# ## Basics
+#
+# 1. [Installing and setting up OpenML-Python](../Basics/introduction_tutorial/)
+# 2. [Downloading datasets](../Basics/simple_datasets_tutorial/)
+# 3. [Using tasks](../Basics/simple_tasks_tutorial/)
+# 4. [Uploading experiment results](../Basics/simple_flows_and_runs_tutorial/)
+# 5. [Working with collections of tasks](../Basics/simple_suites_tutorial/)
+#
+# ## Advanced
+# 1. [Getting splits for datasets from tasks](../Advanced/task_manual_iteration_tutorial/)
+# 2. [Creating and uploading datasets](../Advanced/create_upload_tutorial/)
+# 3. [Searching and editing datasets](../Advanced/datasets_tutorial/)
+# 4. [Searching and creating tasks](../Advanced/task_tutorial/)
+# 5. [Listing, downloading, and uploading suites](../Advanced/suites_tutorial/)
+# 6. [Listing, downloading, and uploading studies](../Advanced/study_tutorial/)
+# 7. [Downloading evaluation results](../Advanced/fetch_evaluations_tutorial/)
+# 8. [Configuring logging](../Advanced/configure_logging/)
diff --git a/examples/introduction_tutorial.py b/examples/introduction_tutorial.py
deleted file mode 100644
index 9cd88ceba..000000000
--- a/examples/introduction_tutorial.py
+++ /dev/null
@@ -1,102 +0,0 @@
-"""
-Introduction
-============
-
-An introduction to OpenML, followed up by a simple example.
-"""
-############################################################################
-# OpenML is an online collaboration platform for machine learning which allows
-# you to:
-#
-# * Find or share interesting, well-documented datasets
-# * Define research / modelling goals (tasks)
-# * Explore large amounts of machine learning algorithms, with APIs in Java, R, Python
-# * Log and share reproducible experiments, models, results
-# * Works seamlessly with scikit-learn and other libraries
-# * Large scale benchmarking, compare to state of the art
-#
-
-############################################################################
-# Installation
-# ^^^^^^^^^^^^
-# Installation is done via ``pip``:
-#
-# .. code:: bash
-#
-#     pip install openml
-#
-# For further information, please check out the installation guide at
-# https://openml.github.io/openml-python/master/contributing.html#installation
-#
-
-############################################################################
-# Authentication
-# ^^^^^^^^^^^^^^
-#
-# The OpenML server can only be accessed by users who have signed up on the
-# OpenML platform. If you don’t have an account yet, sign up now.
-# You will receive an API key, which will authenticate you to the server
-# and allow you to download and upload datasets, tasks, runs and flows.
-#
-# * Create an OpenML account (free) on http://www.openml.org.
-# * After logging in, open your account page (avatar on the top right)
-# * Open 'Account Settings', then 'API authentication' to find your API key.
-#
-# There are two ways to authenticate:
-#
-# * Create a plain text file **~/.openml/config** with the line
-#   **'apikey=MYKEY'**, replacing **MYKEY** with your API key. The config
-#   file must be in the directory ~/.openml/config and exist prior to
-#   importing the openml module.
-# * Run the code below, replacing 'YOURKEY' with your API key.
-#
-# .. warning:: This example uploads data. For that reason, this example
-#   connects to the test server instead. This prevents the live server from
-#   crowding with example datasets, tasks, studies, and so on.
-
-############################################################################
-import openml
-from sklearn import neighbors
-
-openml.config.start_using_configuration_for_example()
-
-############################################################################
-# When using the main server, instead make sure your apikey is configured.
-# This can be done with the following line of code (uncomment it!).
-# Never share your apikey with others.
-
-# openml.config.apikey = 'YOURKEY'
-
-############################################################################
-# Caching
-# ^^^^^^^
-# When downloading datasets, tasks, runs and flows, they will be cached to
-# retrieve them without calling the server later. As with the API key,
-# the cache directory can be either specified through the config file or
-# through the API:
-#
-# * Add the  line **cachedir = 'MYDIR'** to the config file, replacing
-#   'MYDIR' with the path to the cache directory. By default, OpenML
-#   will use **~/.openml/cache** as the cache directory.
-# * Run the code below, replacing 'YOURDIR' with the path to the cache directory.
-
-# Uncomment and set your OpenML cache directory
-# import os
-# openml.config.cache_directory = os.path.expanduser('YOURDIR')
-
-############################################################################
-# Simple Example
-# ^^^^^^^^^^^^^^
-# Download the OpenML task for the eeg-eye-state.
-task = openml.tasks.get_task(403)
-data = openml.datasets.get_dataset(task.dataset_id)
-clf = neighbors.KNeighborsClassifier(n_neighbors=5)
-run = openml.runs.run_model_on_task(clf, task, avoid_duplicate_runs=False)
-# Publish the experiment on OpenML (optional, requires an API key).
-# For this tutorial, our configuration publishes to the test server
-# as to not crowd the main server with runs created by examples.
-myrun = run.publish()
-print("kNN on %s: http://test.openml.org/r/%d" % (data.name, myrun.run_id))
-
-############################################################################
-openml.config.stop_using_configuration_for_example()
diff --git a/examples/run_setup_tutorial.py b/examples/run_setup_tutorial.py
deleted file mode 100644
index d64f27e62..000000000
--- a/examples/run_setup_tutorial.py
+++ /dev/null
@@ -1,110 +0,0 @@
-"""
-=========
-Run Setup
-=========
-
-By: Jan N. van Rijn
-
-One of the key features of the openml-python library is that is allows to
-reinstantiate flows with hyperparameter settings that were uploaded before.
-This tutorial uses the concept of setups. Although setups are not extensively
-described in the OpenML documentation (because most users will not directly
-use them), they form a important concept within OpenML distinguishing between
-hyperparameter configurations.
-A setup is the combination of a flow with all its hyperparameters set.
-
-A key requirement for reinstantiating a flow is to have the same scikit-learn
-version as the flow that was uploaded. However, this tutorial will upload the
-flow (that will later be reinstantiated) itself, so it can be ran with any
-scikit-learn version that is supported by this library. In this case, the
-requirement of the corresponding scikit-learn versions is automatically met.
-
-In this tutorial we will
-    1) Create a flow and use it to solve a task;
-    2) Download the flow, reinstantiate the model with same hyperparameters,
-       and solve the same task again;
-    3) We will verify that the obtained results are exactly the same.
-
-.. warning:: This example uploads data. For that reason, this example
-   connects to the test server at test.openml.org. This prevents the main
-   server from crowding with example datasets, tasks, runs, and so on.
-"""
-import logging
-import numpy as np
-import openml
-import sklearn.ensemble
-import sklearn.impute
-import sklearn.preprocessing
-
-
-root = logging.getLogger()
-root.setLevel(logging.INFO)
-openml.config.start_using_configuration_for_example()
-
-###############################################################################
-# 1) Create a flow and use it to solve a task
-###############################################################################
-
-# first, let's download the task that we are interested in
-task = openml.tasks.get_task(6)
-
-
-# we will create a fairly complex model, with many preprocessing components and
-# many potential hyperparameters. Of course, the model can be as complex and as
-# easy as you want it to be
-model_original = sklearn.pipeline.make_pipeline(
-    sklearn.impute.SimpleImputer(),
-    sklearn.ensemble.RandomForestClassifier()
-)
-
-
-# Let's change some hyperparameters. Of course, in any good application we
-# would tune them using, e.g., Random Search or Bayesian Optimization, but for
-# the purpose of this tutorial we set them to some specific values that might
-# or might not be optimal
-hyperparameters_original = {
-    'simpleimputer__strategy': 'median',
-    'randomforestclassifier__criterion': 'entropy',
-    'randomforestclassifier__max_features': 0.2,
-    'randomforestclassifier__min_samples_leaf': 1,
-    'randomforestclassifier__n_estimators': 16,
-    'randomforestclassifier__random_state': 42,
-}
-model_original.set_params(**hyperparameters_original)
-
-# solve the task and upload the result (this implicitly creates the flow)
-run = openml.runs.run_model_on_task(
-    model_original,
-    task,
-    avoid_duplicate_runs=False)
-run_original = run.publish()  # this implicitly uploads the flow
-
-###############################################################################
-# 2) Download the flow and solve the same task again.
-###############################################################################
-
-# obtain setup id (note that the setup id is assigned by the OpenML server -
-# therefore it was not yet available in our local copy of the run)
-run_downloaded = openml.runs.get_run(run_original.run_id)
-setup_id = run_downloaded.setup_id
-
-# after this, we can easily reinstantiate the model
-model_duplicate = openml.setups.initialize_model(setup_id)
-# it will automatically have all the hyperparameters set
-
-# and run the task again
-run_duplicate = openml.runs.run_model_on_task(
-    model_duplicate, task, avoid_duplicate_runs=False)
-
-
-###############################################################################
-# 3) We will verify that the obtained results are exactly the same.
-###############################################################################
-
-# the run has stored all predictions in the field data content
-np.testing.assert_array_equal(run_original.data_content,
-                              run_duplicate.data_content)
-
-###############################################################################
-
-openml.config.stop_using_configuration_for_example()
diff --git a/examples/sklearn/README.txt b/examples/sklearn/README.txt
deleted file mode 100644
index d61578cf1..000000000
--- a/examples/sklearn/README.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-Experiment Examples
-===================
-
-OpenML experiment examples using a sklearn classifier/pipeline.
diff --git a/examples/sklearn/openml_run_example.py b/examples/sklearn/openml_run_example.py
deleted file mode 100644
index 195a0aa77..000000000
--- a/examples/sklearn/openml_run_example.py
+++ /dev/null
@@ -1,40 +0,0 @@
-"""
-OpenML Run Example
-==================
-
-An example of an automated machine learning experiment.
-"""
-import openml
-from sklearn import impute, tree, pipeline
-
-############################################################################
-# .. warning:: This example uploads data. For that reason, this example
-#   connects to the test server at test.openml.org. This prevents the main
-#   server from crowding with example datasets, tasks, runs, and so on.
-
-openml.config.start_using_configuration_for_example()
-############################################################################
-
-# Uncomment and set your OpenML key. Don't share your key with others.
-# openml.config.apikey = 'YOURKEY'
-
-# Define a scikit-learn pipeline
-clf = pipeline.Pipeline(
-    steps=[
-        ('imputer', impute.SimpleImputer()),
-        ('estimator', tree.DecisionTreeClassifier())
-    ]
-)
-############################################################################
-# Download the OpenML task for the german credit card dataset.
-task = openml.tasks.get_task(97)
-############################################################################
-# Run the scikit-learn model on the task (requires an API key).
-run = openml.runs.run_model_on_task(clf, task)
-# Publish the experiment on OpenML (optional, requires an API key).
-run.publish()
-
-print('URL for run: %s/run/%d' % (openml.config.server, run.run_id))
-
-############################################################################
-openml.config.stop_using_configuration_for_example()
diff --git a/examples/tasks_tutorial.py b/examples/tasks_tutorial.py
deleted file mode 100644
index c54ecdbd9..000000000
--- a/examples/tasks_tutorial.py
+++ /dev/null
@@ -1,219 +0,0 @@
-"""
-Tasks
-=====
-
-A tutorial on how to list and download tasks.
-"""
-
-import openml
-import pandas as pd
-
-############################################################################
-#
-# Tasks are identified by IDs and can be accessed in two different ways:
-#
-# 1. In a list providing basic information on all tasks available on OpenML.
-# This function will not download the actual tasks, but will instead download
-# meta data that can be used to filter the tasks and retrieve a set of IDs.
-# We can filter this list, for example, we can only list tasks having a
-# special tag or only tasks for a specific target such as
-# *supervised classification*.
-#
-# 2. A single task by its ID. It contains all meta information, the target
-# metric, the splits and an iterator which can be used to access the
-# splits in a useful manner.
-
-############################################################################
-# Listing tasks
-# ^^^^^^^^^^^^^
-#
-# We will start by simply listing only *supervised classification* tasks:
-
-tasks = openml.tasks.list_tasks(task_type_id=1)
-
-############################################################################
-# **openml.tasks.list_tasks()** returns a dictionary of dictionaries, we convert it into a
-# `pandas dataframe <http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html>`_
-# to have better visualization and easier access:
-
-tasks = pd.DataFrame.from_dict(tasks, orient='index')
-print(tasks.columns)
-print("First 5 of %s tasks:" % len(tasks))
-print(tasks.head())
-
-# The same can be obtained through lesser lines of code
-tasks_df = openml.tasks.list_tasks(task_type_id=1, output_format='dataframe')
-print(tasks_df.head())
-
-############################################################################
-# We can filter the list of tasks to only contain datasets with more than
-# 500 samples, but less than 1000 samples:
-
-filtered_tasks = tasks.query('NumberOfInstances > 500 and NumberOfInstances < 1000')
-print(list(filtered_tasks.index))
-
-############################################################################
-
-# Number of tasks
-print(len(filtered_tasks))
-
-############################################################################
-# Then, we can further restrict the tasks to all have the same resampling strategy:
-
-filtered_tasks = filtered_tasks.query('estimation_procedure == "10-fold Crossvalidation"')
-print(list(filtered_tasks.index))
-
-############################################################################
-
-# Number of tasks
-print(len(filtered_tasks))
-
-############################################################################
-# Resampling strategies can be found on the
-# `OpenML Website <http://www.openml.org/search?type=measure&q=estimation%20procedure>`_.
-#
-# Similar to listing tasks by task type, we can list tasks by tags:
-
-tasks = openml.tasks.list_tasks(tag='OpenML100')
-tasks = pd.DataFrame.from_dict(tasks, orient='index')
-print("First 5 of %s tasks:" % len(tasks))
-print(tasks.head())
-
-############################################################################
-# Furthermore, we can list tasks based on the dataset id:
-
-tasks = openml.tasks.list_tasks(data_id=1471)
-tasks = pd.DataFrame.from_dict(tasks, orient='index')
-print("First 5 of %s tasks:" % len(tasks))
-print(tasks.head())
-
-############################################################################
-# In addition, a size limit and an offset can be applied both separately and simultaneously:
-
-tasks = openml.tasks.list_tasks(size=10, offset=50)
-tasks = pd.DataFrame.from_dict(tasks, orient='index')
-print(tasks)
-
-############################################################################
-#
-# **OpenML 100**
-# is a curated list of 100 tasks to start using OpenML. They are all
-# supervised classification tasks with more than 500 instances and less than 50000
-# instances per task. To make things easier, the tasks do not contain highly
-# unbalanced data and sparse data. However, the tasks include missing values and
-# categorical features. You can find out more about the *OpenML 100* on
-# `the OpenML benchmarking page <https://www.openml.org/guide/benchmark>`_.
-#
-# Finally, it is also possible to list all tasks on OpenML with:
-
-############################################################################
-tasks = openml.tasks.list_tasks()
-tasks = pd.DataFrame.from_dict(tasks, orient='index')
-print(len(tasks))
-
-############################################################################
-# Exercise
-# ########
-#
-# Search for the tasks on the 'eeg-eye-state' dataset.
-
-tasks.query('name=="eeg-eye-state"')
-
-############################################################################
-# Downloading tasks
-# ^^^^^^^^^^^^^^^^^
-#
-# We provide two functions to download tasks, one which downloads only a
-# single task by its ID, and one which takes a list of IDs and downloads
-# all of these tasks:
-
-task_id = 31
-task = openml.tasks.get_task(task_id)
-
-############################################################################
-# Properties of the task are stored as member variables:
-
-print(task)
-
-############################################################################
-# And:
-
-ids = [2, 1891, 31, 9983]
-tasks = openml.tasks.get_tasks(ids)
-print(tasks[0])
-
-############################################################################
-# Creating tasks
-# ^^^^^^^^^^^^^^
-#
-# You can also create new tasks. Take the following into account:
-#
-# * You can only create tasks on _active_ datasets
-# * For now, only the following tasks are supported: classification, regression,
-# clustering, and learning curve analysis.
-# * For now, tasks can only be created on a single dataset.
-# * The exact same task must not already exist.
-#
-# Creating a task requires the following input:
-#
-# * task_type_id: The task type ID, required (see below). Required.
-# * dataset_id: The dataset ID. Required.
-# * target_name: The name of the attribute you aim to predict.
-# Optional.
-# * estimation_procedure_id : The ID of the estimation procedure used to create train-test
-# splits. Optional.
-# * evaluation_measure: The name of the evaluation measure. Optional.
-# * Any additional inputs for specific tasks
-#
-# It is best to leave the evaluation measure open if there is no strong prerequisite for a
-# specific measure. OpenML will always compute all appropriate measures and you can filter
-# or sort results on your favourite measure afterwards. Only add an evaluation measure if
-# necessary (e.g. when other measure make no sense), since it will create a new task, which
-# scatters results across tasks.
-
-
-############################################################################
-# Example
-# #######
-#
-# Let's create a classification task on a dataset. In this example we will do this on the
-# Iris dataset (ID=128 (on test server)). We'll use 10-fold cross-validation (ID=1),
-# and _predictive accuracy_ as the predefined measure (this can also be left open).
-# If a task with these parameters exist, we will get an appropriate exception.
-# If such a task doesn't exist, a task will be created and the corresponding task_id
-# will be returned.
-
-
-# using test server for example uploads
-openml.config.start_using_configuration_for_example()
-
-try:
-    tasktypes = openml.tasks.TaskTypeEnum
-    my_task = openml.tasks.create_task(
-        task_type_id=tasktypes.SUPERVISED_CLASSIFICATION,
-        dataset_id=128,
-        target_name="class",
-        evaluation_measure="predictive_accuracy",
-        estimation_procedure_id=1)
-    my_task.publish()
-except openml.exceptions.OpenMLServerException as e:
-    # Error code for 'task already exists'
-    if e.code == 614:
-        # Lookup task
-        tasks = openml.tasks.list_tasks(data_id=128, output_format='dataframe').to_numpy()
-        tasks = tasks[tasks[:, 4] == "Supervised Classification"]
-        tasks = tasks[tasks[:, 6] == "10-fold Crossvalidation"]
-        tasks = tasks[tasks[:, 19] == "predictive_accuracy"]
-        task_id = tasks[0][0]
-        print("Task already exists. Task ID is", task_id)
-
-# reverting to prod server
-openml.config.stop_using_configuration_for_example()
-
-
-############################################################################
-# [Complete list of task types](https://www.openml.org/search?type=task_type)
-# [Complete list of model estimation procedures](
-# https://www.openml.org/search?q=%2520measure_type%3Aestimation_procedure&type=measure)
-# [Complete list of evaluation measures](
-# https://www.openml.org/search?q=measure_type%3Aevaluation_measure&type=measure)
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 000000000..419cc249e
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,165 @@
+site_name: openml-python
+repo_url: https://github.com/openml/openml-python
+repo_name: openml/openml-python
+theme:
+  logo: images/openml_icon.png
+  favicon: images/openml_icon.png
+  name: material
+  features:
+    - content.code.annotate
+    - content.code.copy
+    - navigation.footer
+    - navigation.sections
+    - toc.follow
+    - toc.integrate
+    - navigation.tabs
+    - navigation.tabs.sticky
+    - header.autohide
+    - header.social
+    - search.suggest
+    - search.highlight
+    - search.share
+  palette:
+    - scheme: slate
+      media: "(prefers-color-scheme: dark)"
+      primary: indigo
+      accent: deep purple
+      toggle:
+        icon: material/eye-outline
+        name: Switch to light mode
+
+    # Palette toggle for light mode
+    - scheme: default
+      media: "(prefers-color-scheme: light)"
+      primary: indigo
+      accent: deep purple
+      toggle:
+        icon: material/eye
+        name: Switch to dark mode
+
+extra_css:
+  - stylesheets/extra.css
+
+nav:
+  - index.md
+  - Examples:
+    - Overview: examples/introduction.py
+    - Basics:
+        - Setup: examples/Basics/introduction_tutorial.py
+        - Datasets: examples/Basics/simple_datasets_tutorial.py
+        - Tasks: examples/Basics/simple_tasks_tutorial.py
+        - Flows and Runs: examples/Basics/simple_flows_and_runs_tutorial.py
+        - Suites: examples/Basics/simple_suites_tutorial.py
+    - Advanced:
+      - Dataset Splits from Tasks: examples/Advanced/task_manual_iteration_tutorial.py
+      - Creating and Uploading Datasets: examples/Advanced/create_upload_tutorial.py
+      - Searching and Editing Datasets: examples/Advanced/datasets_tutorial.py
+      - Searching and Creating Tasks: examples/Advanced/tasks_tutorial.py
+      - List, Download, and Upload Suites: examples/Advanced/suites_tutorial.py
+      - List, Download, and Upload Studies: examples/Advanced/study_tutorial.py
+      - Downloading Evaluation Results: examples/Advanced/fetch_evaluations_tutorial.py
+      - Configuring Logging: examples/Advanced/configure_logging.py
+
+
+  - Extensions: extensions.md
+  - Advanced User Guide: details.md
+  - API: reference/
+  - Contributing: contributing.md
+  - Developer Setup: developer_setup.md
+
+markdown_extensions:
+  - pymdownx.highlight:
+      anchor_linenums: true
+  - pymdownx.superfences
+  - attr_list
+  - admonition
+  - tables
+  - attr_list
+  - md_in_html
+  - toc:
+      permalink: "#"
+  - pymdownx.highlight:
+      anchor_linenums: true
+  - pymdownx.magiclink:
+      hide_protocol: true
+      repo_url_shortener: true
+      repo_url_shorthand: true
+      user: openml
+      repo: openml-python
+  - pymdownx.highlight
+  - pymdownx.inlinehilite
+  - pymdownx.snippets
+  - pymdownx.details
+  - pymdownx.tabbed:
+      alternate_style: true
+  - pymdownx.superfences:
+      custom_fences:
+      - name: mermaid
+        class: mermaid
+        format: !!python/name:pymdownx.superfences.fence_code_format
+  - pymdownx.emoji:
+      emoji_index: !!python/name:material.extensions.emoji.twemoji
+      emoji_generator: !!python/name:material.extensions.emoji.to_svg
+  - pymdownx.tabbed:
+      alternate_style: true
+
+extra:
+  version:
+    provider: mike
+  social:
+    - icon: fontawesome/brands/github
+      link: https://github.com/openml
+    - icon: fontawesome/brands/twitter
+      link: https://x.com/open_ml
+
+plugins:
+  - search
+  - autorefs
+  - section-index
+  # - mkdocstrings:
+  - mkdocstrings:
+      default_handler: python
+      enable_inventory: true
+      handlers:
+        python:
+          # paths: [openml]
+          options:  # https://mkdocstrings.github.io/python/usage/
+            docstring_section_style: spacy
+            docstring_options:
+              ignore_init_summary: true
+              trim_doctest_flags: true
+            show_docstring_attributes: true
+            show_docstring_description: true
+            show_root_heading: true
+            show_root_toc_entry: true
+            show_object_full_path: false
+            show_root_members_full_path: false
+            signature_crossrefs: true
+            merge_init_into_class: true
+            show_symbol_type_heading: true
+            show_symbol_type_toc: true
+            docstring_style: numpy
+            inherited_members: true
+            show_if_no_docstring: false
+            show_bases: true
+            show_source: true
+            members_order: "alphabetical"
+            group_by_category: true
+            show_signature: true
+            separate_signature: true
+            show_signature_annotations: true
+            filters:
+              - "!^_[^_]"
+
+  - gen-files:
+      scripts:
+        - scripts/gen_ref_pages.py
+  - literate-nav:
+      nav_file: SUMMARY.md
+  - mkdocs-jupyter:
+      theme: light
+  - mike:
+      version_selector: true
+      css_dir: css
+      javascript_dir: js
+      canonical_version: latest
diff --git a/openml/__init__.py b/openml/__init__.py
index 94c46341f..9a457c146 100644
--- a/openml/__init__.py
+++ b/openml/__init__.py
@@ -12,43 +12,57 @@
 In particular, this module implements a python interface for the
 `OpenML REST API <https://www.openml.org/guide#!rest_services>`_
 (`REST on wikipedia
-<http://en.wikipedia.org/wiki/Representational_state_transfer>`_).
+<https://en.wikipedia.org/wiki/Representational_state_transfer>`_).
 """
 
-from . import _api_calls
-from . import config
-from .datasets import OpenMLDataset, OpenMLDataFeature
-from . import datasets
-from . import evaluations
+# License: BSD 3-Clause
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from . import (
+    _api_calls,
+    _config as _config_module,
+    datasets,
+    evaluations,
+    exceptions,
+    extensions,
+    flows,
+    runs,
+    setups,
+    study,
+    tasks,
+    utils,
+)
+from .__version__ import __version__
+from .datasets import OpenMLDataFeature, OpenMLDataset
 from .evaluations import OpenMLEvaluation
-from . import extensions
-from . import exceptions
-from . import tasks
+from .flows import OpenMLFlow
+from .runs import OpenMLRun
+from .setups import OpenMLParameter, OpenMLSetup
+from .study import OpenMLBenchmarkSuite, OpenMLStudy
 from .tasks import (
-    OpenMLTask,
-    OpenMLSplit,
-    OpenMLSupervisedTask,
     OpenMLClassificationTask,
-    OpenMLRegressionTask,
     OpenMLClusteringTask,
     OpenMLLearningCurveTask,
+    OpenMLRegressionTask,
+    OpenMLSplit,
+    OpenMLSupervisedTask,
+    OpenMLTask,
 )
-from . import runs
-from .runs import OpenMLRun
-from . import flows
-from .flows import OpenMLFlow
-from . import study
-from .study import OpenMLStudy, OpenMLBenchmarkSuite
-from . import utils
-from . import setups
-from .setups import OpenMLSetup, OpenMLParameter
 
+if TYPE_CHECKING:
+    from ._config import OpenMLConfigManager
 
-from .__version__ import __version__
+config: OpenMLConfigManager = _config_module.__config
 
 
-def populate_cache(task_ids=None, dataset_ids=None, flow_ids=None,
-                   run_ids=None):
+def populate_cache(
+    task_ids: list[int] | None = None,
+    dataset_ids: list[int | str] | None = None,
+    flow_ids: list[int] | None = None,
+    run_ids: list[int] | None = None,
+) -> None:
     """
     Populate a cache for offline and parallel usage of the OpenML connector.
 
@@ -84,36 +98,33 @@ def populate_cache(task_ids=None, dataset_ids=None, flow_ids=None,
 
 
 __all__ = [
-    'OpenMLDataset',
-    'OpenMLDataFeature',
-    'OpenMLRun',
-    'OpenMLSplit',
-    'OpenMLEvaluation',
-    'OpenMLSetup',
-    'OpenMLParameter',
-    'OpenMLTask',
-    'OpenMLSupervisedTask',
-    'OpenMLClusteringTask',
-    'OpenMLLearningCurveTask',
-    'OpenMLRegressionTask',
-    'OpenMLClassificationTask',
-    'OpenMLFlow',
-    'OpenMLStudy',
-    'OpenMLBenchmarkSuite',
-    'datasets',
-    'evaluations',
-    'exceptions',
-    'extensions',
-    'config',
-    'runs',
-    'flows',
-    'tasks',
-    'setups',
-    'study',
-    'utils',
-    '_api_calls',
-    '__version__',
+    "OpenMLBenchmarkSuite",
+    "OpenMLClassificationTask",
+    "OpenMLClusteringTask",
+    "OpenMLDataFeature",
+    "OpenMLDataset",
+    "OpenMLEvaluation",
+    "OpenMLFlow",
+    "OpenMLLearningCurveTask",
+    "OpenMLParameter",
+    "OpenMLRegressionTask",
+    "OpenMLRun",
+    "OpenMLSetup",
+    "OpenMLSplit",
+    "OpenMLStudy",
+    "OpenMLSupervisedTask",
+    "OpenMLTask",
+    "__version__",
+    "_api_calls",
+    "config",
+    "datasets",
+    "evaluations",
+    "exceptions",
+    "extensions",
+    "flows",
+    "runs",
+    "setups",
+    "study",
+    "tasks",
+    "utils",
 ]
-
-# Load the scikit-learn extension by default
-import openml.extensions.sklearn  # noqa: F401
diff --git a/openml/__version__.py b/openml/__version__.py
index bfb63854a..cf5a8535d 100644
--- a/openml/__version__.py
+++ b/openml/__version__.py
@@ -1,4 +1,8 @@
 """Version information."""
 
+# License: BSD 3-Clause
+
 # The following line *must* be the last in the module, exactly as formatted:
-__version__ = "0.9.0"
+from __future__ import annotations
+
+__version__ = "0.16.0"
diff --git a/openml/_api_calls.py b/openml/_api_calls.py
index 803dc6b42..179c814e7 100644
--- a/openml/_api_calls.py
+++ b/openml/_api_calls.py
@@ -1,15 +1,90 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
+import contextlib
+import hashlib
+import logging
+import math
+import random
+import shutil
 import time
-import requests
-import warnings
+import urllib.parse
+import xml
+import zipfile
+from pathlib import Path
+from typing import cast
 
+import minio
+import requests
+import requests.utils
 import xmltodict
+from urllib3 import ProxyManager
+
+import openml
+
+from .__version__ import __version__
+from .exceptions import (
+    OpenMLAuthenticationError,
+    OpenMLHashException,
+    OpenMLServerError,
+    OpenMLServerException,
+    OpenMLServerNoResult,
+)
+from .utils import ProgressBar
+
+_HEADERS = {"user-agent": f"openml-python/{__version__}"}
+
+DATA_TYPE = dict[str, str | int]
+FILE_ELEMENTS_TYPE = dict[str, str | tuple[str, str]]
+DATABASE_CONNECTION_ERRCODE = 107
 
-from . import config
-from .exceptions import (OpenMLServerError, OpenMLServerException,
-                         OpenMLServerNoResult)
+API_TOKEN_HELP_LINK = "https://openml.github.io/openml-python/latest/examples/Basics/introduction_tutorial/#authentication"  # noqa: S105
 
 
-def _perform_api_call(call, request_method, data=None, file_elements=None):
+def _robot_delay(n: int) -> float:
+    wait = (1 / (1 + math.exp(-(n * 0.5 - 4)))) * 60
+    variation = random.gauss(0, wait / 10)
+    return max(1.0, wait + variation)
+
+
+def _human_delay(n: int) -> float:
+    return max(1.0, n)
+
+
+def resolve_env_proxies(url: str) -> str | None:
+    """Attempt to find a suitable proxy for this url.
+
+    Relies on ``requests`` internals to remain consistent. To disable this from the
+    environment, please set the environment variable ``no_proxy="*"``.
+
+    Parameters
+    ----------
+    url : str
+        The url endpoint
+
+    Returns
+    -------
+    Optional[str]
+        The proxy url if found, else None
+    """
+    resolved_proxies = requests.utils.get_environ_proxies(url)
+    return requests.utils.select_proxy(url, resolved_proxies)  # type: ignore
+
+
+def _create_url_from_endpoint(endpoint: str) -> str:
+    url = cast("str", openml.config.server)
+    if not url.endswith("/"):
+        url += "/"
+    url += endpoint
+    return url.replace("=", "%3d")
+
+
+def _perform_api_call(
+    call: str,
+    request_method: str,
+    data: DATA_TYPE | None = None,
+    file_elements: FILE_ELEMENTS_TYPE | None = None,
+) -> str:
     """
     Perform an API call at the OpenML server.
 
@@ -31,133 +106,419 @@ def _perform_api_call(call, request_method, data=None, file_elements=None):
 
     Returns
     -------
-    return_code : int
-        HTTP return code
     return_value : str
         Return value of the OpenML server
     """
-    url = config.server
-    if not url.endswith("/"):
-        url += "/"
-    url += call
-
-    url = url.replace('=', '%3d')
+    url = _create_url_from_endpoint(call)
+    logging.info("Starting [%s] request for the URL %s", request_method, url)
+    start = time.time()
 
     if file_elements is not None:
-        if request_method != 'post':
-            raise ValueError('request method must be post when file elements '
-                             'are present')
-        return _read_url_files(url, data=data, file_elements=file_elements)
-    return _read_url(url, request_method, data)
+        if request_method != "post":
+            raise ValueError("request method must be post when file elements are present")
+        response = _read_url_files(url, data=data, file_elements=file_elements)
+    else:
+        response = __read_url(url, request_method, data)
+
+    __check_response(response, url, file_elements)
 
+    logging.info(
+        "%.7fs taken for [%s] request for the URL %s",
+        time.time() - start,
+        request_method,
+        url,
+    )
+    return response.text
 
-def _file_id_to_url(file_id, filename=None):
+
+def _download_minio_file(
+    source: str,
+    destination: str | Path,
+    exists_ok: bool = True,  # noqa: FBT002
+    proxy: str | None = "auto",
+) -> None:
+    """Download file ``source`` from a MinIO Bucket and store it at ``destination``.
+
+    Parameters
+    ----------
+    source : str
+        URL to a file in a MinIO bucket.
+    destination : str | Path
+        Path to store the file to, if a directory is provided the original filename is used.
+    exists_ok : bool, optional (default=True)
+        If False, raise FileExistsError if a file already exists in ``destination``.
+    proxy: str, optional (default = "auto")
+        The proxy server to use. By default it's "auto" which uses ``requests`` to
+        automatically find the proxy to use. Pass None or set the environment variable
+        ``no_proxy="*"`` to disable proxies.
     """
-     Presents the URL how to download a given file id
-     filename is optional
+    destination = Path(destination)
+    parsed_url = urllib.parse.urlparse(source)
+
+    # expect path format: /BUCKET/path/to/file.ext
+    bucket, object_name = parsed_url.path[1:].split("/", maxsplit=1)
+    if destination.is_dir():
+        destination = Path(destination, object_name)
+    if destination.is_file() and not exists_ok:
+        raise FileExistsError(f"File already exists in {destination}.")
+
+    if proxy == "auto":
+        proxy = resolve_env_proxies(parsed_url.geturl())
+
+    proxy_client = ProxyManager(proxy) if proxy else None
+
+    client = minio.Minio(endpoint=parsed_url.netloc, secure=False, http_client=proxy_client)
+    try:
+        client.fget_object(
+            bucket_name=bucket,
+            object_name=object_name,
+            file_path=str(destination),
+            progress=ProgressBar() if openml.config.show_progress else None,
+            request_headers=_HEADERS,
+        )
+        if destination.is_file() and destination.suffix == ".zip":
+            with zipfile.ZipFile(destination, "r") as zip_ref:
+                zip_ref.extractall(destination.parent)
+
+    except minio.error.S3Error as e:
+        if e.message is not None and e.message.startswith("Object does not exist"):
+            raise FileNotFoundError(f"Object at '{source}' does not exist.") from e
+        # e.g. permission error, or a bucket does not exist (which is also interpreted as a
+        # permission error on minio level).
+        raise FileNotFoundError("Bucket does not exist or is private.") from e
+
+
+def _download_minio_bucket(source: str, destination: str | Path) -> None:
+    """Download all files under ``source`` in a MinIO bucket and store them in ``destination``.
+
+    Does not redownload files which already exist.
+
+    Parameters
+    ----------
+    source : str
+        URL to a MinIO bucket.
+    destination : str | Path
+        Path to a directory to store the bucket content in.
     """
-    openml_url = config.server.split('/api/')
-    url = openml_url[0] + '/data/download/%s' % file_id
+    destination = Path(destination)
+    parsed_url = urllib.parse.urlparse(source)
+
+    # expect path format: /BUCKET/path/to/file.ext
+    _, bucket, *prefixes, _file = parsed_url.path.split("/")
+    prefix = "/".join(prefixes)
+
+    client = minio.Minio(endpoint=parsed_url.netloc, secure=False)
+
+    for file_object in client.list_objects(bucket, prefix=prefix, recursive=True):
+        if file_object.object_name is None:
+            raise ValueError(f"Object name is None for object {file_object!r}")
+        if file_object.etag is None:
+            raise ValueError(f"Object etag is None for object {file_object!r}")
+
+        marker = destination / file_object.etag
+        if marker.exists():
+            continue
+
+        file_destination = destination / file_object.object_name.rsplit("/", 1)[1]
+        if (file_destination.parent / file_destination.stem).exists():
+            # Marker is missing but archive exists means the server archive changed, force a refresh
+            shutil.rmtree(file_destination.parent / file_destination.stem)
+
+        with contextlib.suppress(FileExistsError):
+            _download_minio_file(
+                source=source.rsplit("/", 1)[0] + "/" + file_object.object_name.rsplit("/", 1)[1],
+                destination=file_destination,
+                exists_ok=False,
+            )
+
+        if file_destination.is_file() and file_destination.suffix == ".zip":
+            file_destination.unlink()
+            marker.touch()
+
+
+def _download_text_file(
+    source: str,
+    output_path: str | Path | None = None,
+    md5_checksum: str | None = None,
+    exists_ok: bool = True,  # noqa: FBT002
+    encoding: str = "utf8",
+) -> str | None:
+    """Download the text file at `source` and store it in `output_path`.
+
+    By default, do nothing if a file already exists in `output_path`.
+    The downloaded file can be checked against an expected md5 checksum.
+
+    Parameters
+    ----------
+    source : str
+        url of the file to be downloaded
+    output_path : str | Path | None (default=None)
+        full path, including filename, of where the file should be stored. If ``None``,
+        this function returns the downloaded file as string.
+    md5_checksum : str, optional (default=None)
+        If not None, should be a string of hexadecimal digits of the expected digest value.
+    exists_ok : bool, optional (default=True)
+        If False, raise a FileExistsError if there already exists a file at `output_path`.
+    encoding : str, optional (default='utf8')
+        The encoding with which the file should be stored.
+    """
+    if isinstance(output_path, str):
+        output_path = Path(output_path)
+
+    if output_path is not None and output_path.exists():
+        if not exists_ok:
+            raise FileExistsError
+
+        return None
+
+    logging.info("Starting [%s] request for the URL %s", "get", source)
+    start = time.time()
+    response = __read_url(source, request_method="get", md5_checksum=md5_checksum)
+    downloaded_file = response.text
+
+    if output_path is None:
+        logging.info(
+            "%.7fs taken for [%s] request for the URL %s",
+            time.time() - start,
+            "get",
+            source,
+        )
+        return downloaded_file
+
+    with output_path.open("w", encoding=encoding) as fh:
+        fh.write(downloaded_file)
+
+    logging.info(
+        "%.7fs taken for [%s] request for the URL %s",
+        time.time() - start,
+        "get",
+        source,
+    )
+    return None
+
+
+def _file_id_to_url(file_id: int, filename: str | None = None) -> str:
+    """
+    Return the URL from which the file with the given file id can be downloaded.
+    ``filename`` is optional; if given, it is appended to the URL.
+    """
+    openml_server = cast("str", openml.config.server)
+    openml_url = openml_server.split("/api/")
+    url = openml_url[0] + f"/data/download/{file_id!s}"
     if filename is not None:
-        url += '/' + filename
+        url += "/" + filename
     return url
 
 
-def _read_url_files(url, data=None, file_elements=None):
-    """do a post request to url with data
-    and sending file_elements as files"""
-
+def _read_url_files(
+    url: str,
+    data: DATA_TYPE | None = None,
+    file_elements: FILE_ELEMENTS_TYPE | None = None,
+) -> requests.Response:
+    """Do a POST request to ``url`` with ``data``,
+    sending ``file_elements`` as files.
+    """
     data = {} if data is None else data
-    data['api_key'] = config.apikey
+    data["api_key"] = openml.config.apikey
     if file_elements is None:
         file_elements = {}
     # Using requests.post sets header 'Accept-encoding' automatically to
     # 'gzip,deflate'
-    response = send_request(
-        request_method='post',
+    return _send_request(
+        request_method="post",
         url=url,
         data=data,
         files=file_elements,
     )
-    if response.status_code != 200:
-        raise _parse_server_exception(response, url)
-    if 'Content-Encoding' not in response.headers or \
-            response.headers['Content-Encoding'] != 'gzip':
-        warnings.warn('Received uncompressed content from OpenML for {}.'
-                      .format(url))
-    return response.text
 
 
-def _read_url(url, request_method, data=None):
+def __read_url(
+    url: str,
+    request_method: str,
+    data: DATA_TYPE | None = None,
+    md5_checksum: str | None = None,
+) -> requests.Response:
     data = {} if data is None else data
-    if config.apikey is not None:
-        data['api_key'] = config.apikey
+    if openml.config.apikey:
+        data["api_key"] = openml.config.apikey
+    return _send_request(
+        request_method=request_method,
+        url=url,
+        data=data,
+        md5_checksum=md5_checksum,
+    )
+
+
+def __is_checksum_equal(downloaded_file_binary: bytes, md5_checksum: str | None = None) -> bool:
+    if md5_checksum is None:
+        return True
+    md5 = hashlib.md5()  # noqa: S324
+    md5.update(downloaded_file_binary)
+    md5_checksum_download = md5.hexdigest()
+    return md5_checksum == md5_checksum_download
 
-    response = send_request(request_method=request_method, url=url, data=data)
-    if response.status_code != 200:
-        raise _parse_server_exception(response, url)
-    if 'Content-Encoding' not in response.headers or \
-            response.headers['Content-Encoding'] != 'gzip':
-        warnings.warn('Received uncompressed content from OpenML for {}.'
-                      .format(url))
-    return response.text
 
+def _send_request(  # noqa: C901, PLR0912
+    request_method: str,
+    url: str,
+    data: DATA_TYPE,
+    files: FILE_ELEMENTS_TYPE | None = None,
+    md5_checksum: str | None = None,
+) -> requests.Response:
+    n_retries = max(1, openml.config.connection_n_retries)
+
+    response: requests.Response | None = None
+    delay_method = _human_delay if openml.config.retry_policy == "human" else _robot_delay
+
+    # Error to raise in case of retrying too often. Will be set to the last observed exception.
+    retry_raise_e: Exception | None = None
 
-def send_request(
-    request_method,
-    url,
-    data,
-    files=None,
-):
-    n_retries = config.connection_n_retries
-    response = None
     with requests.Session() as session:
         # Start at one to have a non-zero multiplier for the sleep
-        for i in range(1, n_retries + 1):
+        for retry_counter in range(1, n_retries + 1):
             try:
-                if request_method == 'get':
-                    response = session.get(url, params=data)
-                elif request_method == 'delete':
-                    response = session.delete(url, params=data)
-                elif request_method == 'post':
-                    response = session.post(url, data=data, files=files)
+                if request_method == "get":
+                    response = session.get(url, params=data, headers=_HEADERS)
+                elif request_method == "delete":
+                    response = session.delete(url, params=data, headers=_HEADERS)
+                elif request_method == "post":
+                    response = session.post(url, data=data, files=files, headers=_HEADERS)
                 else:
                     raise NotImplementedError()
-                break
+
+                __check_response(response=response, url=url, file_elements=files)
+
+                if request_method == "get" and not __is_checksum_equal(
+                    response.text.encode("utf-8"), md5_checksum
+                ):
+                    # -- Check if encoding is not UTF-8 perhaps
+                    if __is_checksum_equal(response.content, md5_checksum):
+                        raise OpenMLHashException(
+                            f"Checksum of downloaded file is unequal to the expected checksum "
+                            f"{md5_checksum} because the text encoding is not UTF-8 when "
+                            f"downloading {url}. There might be a server-side issue with the file, "
+                            "see: https://github.com/openml/openml-python/issues/1180.",
+                        )
+
+                    raise OpenMLHashException(
+                        f"Checksum of downloaded file is unequal to the expected checksum "
+                        f"{md5_checksum} when downloading {url}.",
+                    )
+
+                return response
+            except OpenMLServerException as e:
+                # Propagate all server errors to the calling functions, except
+                # for 107 which represents a database connection error.
+                # These are typically caused by high server load,
+                # which means trying again might resolve the issue.
+                if e.code != DATABASE_CONNECTION_ERRCODE:
+                    raise e
+                retry_raise_e = e
+            except xml.parsers.expat.ExpatError as e:
+                if request_method != "get" or retry_counter >= n_retries:
+                    if response is not None:
+                        extra = f"Status code: {response.status_code}\n{response.text}"
+                    else:
+                        extra = "No response retrieved."
+
+                    raise OpenMLServerError(
+                        f"Unexpected server error when calling {url}. Please contact the "
+                        f"developers!\n{extra}"
+                    ) from e
+                retry_raise_e = e
             except (
-                    requests.exceptions.ConnectionError,
-                    requests.exceptions.SSLError,
+                requests.exceptions.ChunkedEncodingError,
+                requests.exceptions.ConnectionError,
+                requests.exceptions.SSLError,
+                OpenMLHashException,
             ) as e:
-                if i == n_retries:
-                    raise e
-                else:
-                    time.sleep(0.1 * i)
-    if response is None:
-        raise ValueError('This should never happen!')
+                retry_raise_e = e
+
+            # We can only be here if there was an exception
+            assert retry_raise_e is not None
+            if retry_counter >= n_retries:
+                raise retry_raise_e
+            delay = delay_method(retry_counter)
+            time.sleep(delay)
+
+    assert response is not None
     return response
 
 
-def _parse_server_exception(response, url):
-    # OpenML has a sophisticated error system
-    # where information about failures is provided. try to parse this
+def __check_response(
+    response: requests.Response,
+    url: str,
+    file_elements: FILE_ELEMENTS_TYPE | None,
+) -> None:
+    if response.status_code != 200:
+        raise __parse_server_exception(response, url, file_elements=file_elements)
+    if "Content-Encoding" not in response.headers or response.headers["Content-Encoding"] != "gzip":
+        logging.warning(f"Received uncompressed content from OpenML for {url}.")
+
+
+def __parse_server_exception(
+    response: requests.Response,
+    url: str,
+    file_elements: FILE_ELEMENTS_TYPE | None,
+) -> OpenMLServerError:
+    if response.status_code == requests.codes.URI_TOO_LONG:
+        raise OpenMLServerError(f"URI too long! ({url})")
+
+    # OpenML has a sophisticated error system where information about failures is provided,
+    # in the response body itself.
+    # First, we need to parse it out.
     try:
         server_exception = xmltodict.parse(response.text)
-    except Exception:
+    except xml.parsers.expat.ExpatError as e:
+        raise e
+    except Exception as e:
+        # If we failed to parse it out, then something has gone wrong in the body we have sent back
+        # from the server and there is little extra information we can capture.
         raise OpenMLServerError(
-            'Unexpected server error when calling {}. Please contact the developers!\n'
-            'Status code: {}\n{}'.format(url, response.status_code, response.text))
-
-    server_error = server_exception['oml:error']
-    code = int(server_error['oml:code'])
-    message = server_error['oml:message']
-    additional_information = server_error.get('oml:additional_information')
-    if code in [372, 512, 500, 482, 542, 674]:
+            f"Unexpected server error when calling {url}. Please contact the developers!\n"
+            f"Status code: {response.status_code}\n{response.text}",
+        ) from e
+
+    # Now we can parse out the specific error codes that we return. These
+    # are in addition to the typical HTTP error codes, but encode more
+    # specific information. You can find these codes here:
+    # https://github.com/openml/OpenML/blob/develop/openml_OS/views/pages/api_new/v1/xml/pre.php
+    server_error = server_exception["oml:error"]
+    code = int(server_error["oml:code"])
+    message = server_error["oml:message"]
+    additional_information = server_error.get("oml:additional_information")
+    if code in [111, 372, 512, 500, 482, 542, 674]:
+        if additional_information:
+            full_message = f"{message} - {additional_information}"
+        else:
+            full_message = message
+
         # 512 for runs, 372 for datasets, 500 for flows
         # 482 for tasks, 542 for evaluations, 674 for setups
-        return OpenMLServerNoResult(code, message, additional_information)
-    return OpenMLServerException(
-        code=code,
-        message=message,
-        additional=additional_information,
-        url=url
-    )
+        # 111 for dataset descriptions
+        return OpenMLServerNoResult(code=code, message=full_message, url=url)
+
+    # 163: failure to validate flow XML (https://www.openml.org/api_docs#!/flow/post_flow)
+    if code in [163] and file_elements is not None and "description" in file_elements:
+        # file_elements['description'] is the XML file description of the flow
+        full_message = "\n{}\n{} - {}".format(
+            file_elements["description"],
+            message,
+            additional_information,
+        )
+    else:
+        full_message = f"{message} - {additional_information}"
+
+    if code in [
+        102,  # flow/exists post
+        137,  # dataset post
+        350,  # dataset/42 delete
+        310,  # flow/<something> post
+        320,  # flow/42 delete
+        400,  # run/42 delete
+        460,  # task/42 delete
+    ]:
+        msg = f"The API call {url} requires authentication via an API key."
+        return OpenMLAuthenticationError(message=msg)
+
+    return OpenMLServerException(code=code, message=full_message, url=url)
diff --git a/openml/_config.py b/openml/_config.py
new file mode 100644
index 000000000..a7034b9b4
--- /dev/null
+++ b/openml/_config.py
@@ -0,0 +1,459 @@
+"""Store module level information like the API key, cache directory and the server"""
+
+# License: BSD 3-Clause
+from __future__ import annotations
+
+import configparser
+import logging
+import logging.handlers
+import os
+import platform
+import shutil
+import warnings
+from collections.abc import Iterator
+from contextlib import contextmanager
+from dataclasses import dataclass, field, fields, replace
+from io import StringIO
+from pathlib import Path
+from typing import Any, ClassVar, Literal, cast
+from urllib.parse import urlparse
+
+logger = logging.getLogger(__name__)
+openml_logger = logging.getLogger("openml")
+
+
+def _resolve_default_cache_dir() -> Path:
+    user_defined_cache_dir = os.environ.get("OPENML_CACHE_DIR")
+    if user_defined_cache_dir is not None:  # an explicit environment override always wins
+        return Path(user_defined_cache_dir)
+
+    if platform.system().lower() != "linux":  # the XDG lookup below is applied on Linux only
+        return Path("~", ".openml").expanduser()
+
+    xdg_cache_home = os.environ.get("XDG_CACHE_HOME")
+    if xdg_cache_home is None:
+        return Path("~", ".cache", "openml").expanduser()
+
+    cache_dir = Path(xdg_cache_home) / "openml"
+    if cache_dir.exists():
+        return cache_dir
+
+    heuristic_dir_for_backwards_compat = Path(xdg_cache_home) / "org" / "openml"
+    if not heuristic_dir_for_backwards_compat.exists():
+        return cache_dir
+
+    root_dir_to_delete = Path(xdg_cache_home) / "org"
+    openml_logger.warning(
+        "An old cache directory was found at '%s'. This directory is no longer used by "
+        "OpenML-Python. To silence this warning you would need to delete the old cache "
+        "directory. The cached files will then be located in '%s'.",
+        root_dir_to_delete,
+        cache_dir,
+    )
+    return Path(xdg_cache_home)  # old '<XDG_CACHE_HOME>/org/openml' exists: keep that root
+
+
+@dataclass
+class OpenMLConfig:
+    """Dataclass storing the OpenML configuration."""
+
+    apikey: str | None = ""
+    server: str = "https://www.openml.org/api/v1/xml"
+    cachedir: Path = field(default_factory=_resolve_default_cache_dir)
+    avoid_duplicate_runs: bool = False
+    retry_policy: Literal["human", "robot"] = "human"
+    connection_n_retries: int = 5
+    show_progress: bool = False
+
+    def __setattr__(self, name: str, value: Any) -> None:
+        if name == "apikey" and not isinstance(value, (type(None), str)):
+            raise TypeError("apikey must be a string or None")
+
+        super().__setattr__(name, value)
+
+
+class OpenMLConfigManager:
+    """The OpenMLConfigManager manages the configuration of the openml-python package."""
+
+    def __init__(self) -> None:
+        self.console_handler: logging.StreamHandler | None = None
+        self.file_handler: logging.handlers.RotatingFileHandler | None = None
+
+        self.OPENML_CACHE_DIR_ENV_VAR = "OPENML_CACHE_DIR"
+        self.OPENML_SKIP_PARQUET_ENV_VAR = "OPENML_SKIP_PARQUET"
+        self._TEST_SERVER_NORMAL_USER_KEY = "normaluser"
+        self.OPENML_TEST_SERVER_ADMIN_KEY_ENV_VAR = "OPENML_TEST_SERVER_ADMIN_KEY"
+        self.TEST_SERVER_URL = "https://test.openml.org"
+
+        self._config: OpenMLConfig = OpenMLConfig()
+        # for legacy test `test_non_writable_home`
+        self._defaults: dict[str, Any] = OpenMLConfig().__dict__.copy()
+        self._root_cache_directory: Path = self._config.cachedir
+
+        self.logger = logger
+        self.openml_logger = openml_logger
+
+        self._examples = ConfigurationForExamples(self)
+
+        self._setup()
+
+    def __getattr__(self, name: str) -> Any:
+        if name != "_config" and hasattr(self._config, name):  # guard: no recursion pre-init
+            return getattr(self._config, name)
+        raise AttributeError(f"{type(self).__name__!r} object has no attribute {name!r}")
+
+    _FIELDS: ClassVar[set[str]] = {f.name for f in fields(OpenMLConfig)}
+
+    def __setattr__(self, name: str, value: Any) -> None:
+        # during __init__ before _config exists
+        if name in {
+            "_config",
+            "_root_cache_directory",
+            "console_handler",
+            "file_handler",
+            "logger",
+            "openml_logger",
+            "_examples",
+            "OPENML_CACHE_DIR_ENV_VAR",
+            "OPENML_SKIP_PARQUET_ENV_VAR",
+            "_TEST_SERVER_NORMAL_USER_KEY",
+        }:
+            return object.__setattr__(self, name, value)
+
+        if name in self._FIELDS:
+            # write into dataclass, not manager (prevents shadowing)
+            if name == "cachedir":
+                object.__setattr__(self, "_root_cache_directory", Path(value))
+            object.__setattr__(self, "_config", replace(self._config, **{name: value}))
+            return None
+
+        object.__setattr__(self, name, value)
+        return None
+
+    def _create_log_handlers(self, create_file_handler: bool = True) -> None:  # noqa: FBT002
+        if self.console_handler is not None or self.file_handler is not None:
+            self.logger.debug("Requested to create log handlers, but they are already created.")
+            return
+
+        message_format = "[%(levelname)s] [%(asctime)s:%(name)s] %(message)s"
+        output_formatter = logging.Formatter(message_format, datefmt="%H:%M:%S")
+
+        self.console_handler = logging.StreamHandler()
+        self.console_handler.setFormatter(output_formatter)
+
+        if create_file_handler:
+            one_mb = 2**20
+            log_path = self._root_cache_directory / "openml_python.log"
+            self.file_handler = logging.handlers.RotatingFileHandler(
+                log_path,
+                maxBytes=one_mb,
+                backupCount=1,
+                delay=True,
+            )
+            self.file_handler.setFormatter(output_formatter)
+
+    def _convert_log_levels(self, log_level: int) -> tuple[int, int]:
+        openml_to_python = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG}
+        python_to_openml = {
+            logging.DEBUG: 2,
+            logging.INFO: 1,
+            logging.WARNING: 0,
+            logging.CRITICAL: 0,
+            logging.ERROR: 0,
+        }
+        openml_level = python_to_openml.get(log_level, log_level)
+        python_level = openml_to_python.get(log_level, log_level)
+        return openml_level, python_level
+
+    def _set_level_register_and_store(self, handler: logging.Handler, log_level: int) -> None:
+        _oml_level, py_level = self._convert_log_levels(log_level)
+        handler.setLevel(py_level)
+
+        if self.openml_logger.level > py_level or self.openml_logger.level == logging.NOTSET:
+            self.openml_logger.setLevel(py_level)
+
+        if handler not in self.openml_logger.handlers:
+            self.openml_logger.addHandler(handler)
+
+    def set_console_log_level(self, console_output_level: int) -> None:
+        """Set the log level for console output."""
+        assert self.console_handler is not None
+        self._set_level_register_and_store(self.console_handler, console_output_level)
+
+    def set_file_log_level(self, file_output_level: int) -> None:
+        """Set the log level for file output."""
+        assert self.file_handler is not None
+        self._set_level_register_and_store(self.file_handler, file_output_level)
+
+    def get_server_base_url(self) -> str:
+        """Get the base URL of the OpenML server (i.e., without /api)."""
+        domain, _ = self._config.server.split("/api", maxsplit=1)
+        return domain.replace("api", "www")
+
+    def set_retry_policy(
+        self, value: Literal["human", "robot"], n_retries: int | None = None
+    ) -> None:
+        """Set the retry policy for server connections."""
+        default_retries_by_policy = {"human": 5, "robot": 50}
+
+        if value not in default_retries_by_policy:
+            raise ValueError(
+                f"Detected retry_policy '{value}' but must be one of "
+                f"{list(default_retries_by_policy.keys())}",
+            )
+        if n_retries is not None and not isinstance(n_retries, int):
+            raise TypeError(
+                f"`n_retries` must be of type `int` or `None` but is `{type(n_retries)}`."
+            )
+
+        if isinstance(n_retries, int) and n_retries < 1:
+            raise ValueError(f"`n_retries` is '{n_retries}' but must be positive.")
+
+        self._config = replace(
+            self._config,
+            retry_policy=value,
+            connection_n_retries=(
+                default_retries_by_policy[value] if n_retries is None else n_retries
+            ),
+        )
+
+    def _handle_xdg_config_home_backwards_compatibility(self, xdg_home: str) -> Path:
+        """Return the config directory, migrating a legacy `$XDG_CONFIG_HOME/config` file."""
+        config_dir = Path(xdg_home) / "openml"
+        backwards_compat_config_file = Path(xdg_home) / "config"
+        if not backwards_compat_config_file.exists():
+            return config_dir
+
+        try:
+            self._parse_config(backwards_compat_config_file)
+        except Exception:  # noqa: BLE001
+            return config_dir
+
+        correct_config_location = config_dir / "config"
+        try:
+            shutil.copy(backwards_compat_config_file, correct_config_location)
+            self.openml_logger.warning(
+                "An openml configuration file was found at the old location "
+                f"at {backwards_compat_config_file}. We have copied it to the new "
+                f"location at {correct_config_location}. "
+                "\nTo silence this warning please verify that the configuration file "
+                f"at {correct_config_location} is correct and delete the file at "
+                f"{backwards_compat_config_file}."
+            )
+            return config_dir
+        except Exception as e:  # noqa: BLE001
+            # All fragments must form ONE message: a second positional argument would be
+            # treated by `logging` as a %-format argument and break the log call.
+            self.openml_logger.warning(
+                "While attempting to perform a backwards compatible fix, we failed to copy "
+                f"the openml config file at {backwards_compat_config_file} to "
+                f"{correct_config_location}\n{type(e)}: {e}"
+                "\n\nTo silence this warning, please copy the file to the new location and delete "
+                f"the old file at {backwards_compat_config_file}."
+            )
+            return Path(xdg_home)  # callers append "config", which is the legacy file
+
+    def determine_config_file_path(self) -> Path:
+        """Determine the path to the openml configuration file."""
+        if platform.system().lower() == "linux":
+            xdg_home = os.environ.get("XDG_CONFIG_HOME")
+            if xdg_home is not None:
+                config_dir = self._handle_xdg_config_home_backwards_compatibility(xdg_home)
+            else:
+                config_dir = Path("~", ".config", "openml")
+        else:
+            config_dir = Path("~") / ".openml"
+
+        config_dir = Path(config_dir).expanduser().resolve()
+        return config_dir / "config"
+
+    def _parse_config(self, config_file: str | Path) -> dict[str, Any]:
+        config_file = Path(config_file)
+        config = configparser.RawConfigParser(defaults=OpenMLConfig().__dict__)  # type: ignore
+
+        config_file_ = StringIO()
+        config_file_.write("[FAKE_SECTION]\n")
+        try:
+            with config_file.open("r") as fh:
+                for line in fh:
+                    config_file_.write(line)
+        except FileNotFoundError:
+            self.logger.info(
+                "No config file found at %s, using default configuration.", config_file
+            )
+        except OSError as e:
+            self.logger.info("Error opening file %s: %s", config_file, e.args[0])
+        config_file_.seek(0)
+        config.read_file(config_file_)
+        configuration = dict(config.items("FAKE_SECTION"))
+        for boolean_field in ["avoid_duplicate_runs", "show_progress"]:
+            if isinstance(config["FAKE_SECTION"][boolean_field], str):
+                configuration[boolean_field] = config["FAKE_SECTION"].getboolean(boolean_field)  # type: ignore
+        return configuration  # type: ignore
+
+    def start_using_configuration_for_example(self) -> None:
+        """Sets the configuration to connect to the test server with valid apikey."""
+        return self._examples.start_using_configuration_for_example()
+
+    def stop_using_configuration_for_example(self) -> None:
+        """Restore the configuration as it was before `start_using_configuration_for_example`."""
+        return self._examples.stop_using_configuration_for_example()
+
+    def _setup(self, config: dict[str, Any] | None = None) -> None:
+        config_file = self.determine_config_file_path()
+        config_dir = config_file.parent
+
+        try:
+            if not config_dir.exists():
+                config_dir.mkdir(exist_ok=True, parents=True)
+        except PermissionError:
+            self.openml_logger.warning(
+                f"No permission to create OpenML directory at {config_dir}!"
+                " This can result in OpenML-Python not working properly."
+            )
+
+        if config is None:
+            config = self._parse_config(config_file)
+
+        self._config = replace(
+            self._config,
+            apikey=config["apikey"],
+            server=config["server"],
+            show_progress=config["show_progress"],
+            avoid_duplicate_runs=config["avoid_duplicate_runs"],
+            retry_policy=config["retry_policy"],
+            connection_n_retries=int(config["connection_n_retries"]),
+        )
+
+        user_defined_cache_dir = os.environ.get(self.OPENML_CACHE_DIR_ENV_VAR)
+        if user_defined_cache_dir is not None:
+            short_cache_dir = Path(user_defined_cache_dir)
+        else:
+            short_cache_dir = Path(config["cachedir"])
+
+        self._root_cache_directory = short_cache_dir.expanduser().resolve()
+        self._config = replace(self._config, cachedir=self._root_cache_directory)
+
+        try:
+            cache_exists = self._root_cache_directory.exists()
+            if not cache_exists:
+                self._root_cache_directory.mkdir(exist_ok=True, parents=True)
+            self._create_log_handlers()
+        except PermissionError:
+            self.openml_logger.warning(
+                f"No permission to create OpenML directory at {self._root_cache_directory}!"
+                " This can result in OpenML-Python not working properly."
+            )
+            self._create_log_handlers(create_file_handler=False)
+
+    def set_field_in_config_file(self, field: str, value: Any) -> None:
+        """Set a field in the configuration file."""
+        if not hasattr(OpenMLConfig(), field):
+            raise ValueError(
+                f"Field '{field}' is not valid and must be one of "
+                f"'{OpenMLConfig().__dict__.keys()}'."
+            )
+
+        self._config = replace(self._config, **{field: value})
+        config_file = self.determine_config_file_path()
+        existing = self._parse_config(config_file)
+        with config_file.open("w") as fh:
+            for f in OpenMLConfig().__dict__:
+                v = value if f == field else existing.get(f)
+                if v is not None:
+                    fh.write(f"{f} = {v}\n")
+
+    def get_config_as_dict(self) -> dict[str, Any]:
+        """Get the current configuration as a dictionary."""
+        return self._config.__dict__.copy()
+
+    def get_cache_directory(self) -> str:
+        """Get the cache directory for the current server."""
+        url_suffix = urlparse(self._config.server).netloc
+        url_parts = url_suffix.replace(":", "_").split(".")[::-1]  # e.g. ["org", "openml", "www"]
+        reversed_url_suffix = os.sep.join(url_parts)  # noqa: PTH118
+        return os.path.join(self._root_cache_directory, reversed_url_suffix)  # noqa: PTH118
+
+    def set_root_cache_directory(self, root_cache_directory: str | Path) -> None:
+        """Set the root cache directory."""
+        self._root_cache_directory = Path(root_cache_directory)
+        self._config = replace(self._config, cachedir=self._root_cache_directory)
+
+    @contextmanager
+    def overwrite_config_context(self, config: dict[str, Any]) -> Iterator[dict[str, Any]]:
+        """Temporarily overwrite the configuration; the previous one is restored on exit."""
+        existing_config = self.get_config_as_dict()
+        self._setup(merged_config := {**existing_config, **config})
+        try:
+            yield merged_config
+        finally:  # restore the previous configuration even if the `with` body raised
+            self._setup(existing_config)
+
+
+class ConfigurationForExamples:
+    """Allows easy switching to and from a test configuration, used for examples."""
+
+    _last_used_server = None
+    _last_used_key = None
+    _start_last_called = False
+
+    def __init__(self, manager: OpenMLConfigManager):
+        self._manager = manager
+        self._test_apikey = manager._TEST_SERVER_NORMAL_USER_KEY
+        self._test_server = f"{manager.TEST_SERVER_URL}/api/v1/xml"
+
+    def start_using_configuration_for_example(self) -> None:
+        """Sets the configuration to connect to the test server with valid apikey.
+
+        The configuration as it was before this call is stored, and can be recovered
+        by using the `stop_using_configuration_for_example` method.
+        """
+        if (
+            self._start_last_called
+            and self._manager._config.server == self._test_server
+            and self._manager._config.apikey == self._test_apikey
+        ):
+            # Method is called more than once in a row without modifying the server or apikey.
+            # We don't want to save the current test configuration as a last used configuration.
+            return
+
+        self._last_used_server = self._manager._config.server
+        self._last_used_key = self._manager._config.apikey
+        type(self)._start_last_called = True
+
+        # Test server key for examples
+        self._manager._config = replace(
+            self._manager._config,
+            server=self._test_server,
+            apikey=self._test_apikey,
+        )
+        warnings.warn(
+            f"Switching to the test server {self._test_server} to not upload results to "
+            "the live server. Using the test server may result in reduced performance of the "
+            "API!",
+            stacklevel=2,
+        )
+
+    def stop_using_configuration_for_example(self) -> None:
+        """Return to configuration as it was before `start_using_configuration_for_example`."""
+        if not type(self)._start_last_called:
+            # We don't want to allow this because it will (likely) result in the `server` and
+            # `apikey` variables being set to None.
+            raise RuntimeError(
+                "`stop_use_example_configuration` called without a saved config. "
+                "`start_use_example_configuration` must be called first.",
+            )
+
+        self._manager._config = replace(
+            self._manager._config,
+            server=cast("str", self._last_used_server),
+            apikey=cast("str", self._last_used_key),
+        )
+        type(self)._start_last_called = False
+
+
+__config = OpenMLConfigManager()
+
+
+def __getattr__(name: str) -> Any:
+    return getattr(__config, name)
diff --git a/openml/base.py b/openml/base.py
new file mode 100644
index 000000000..ddee71196
--- /dev/null
+++ b/openml/base.py
@@ -0,0 +1,171 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
+import re
+import webbrowser
+from abc import ABC, abstractmethod
+from collections.abc import Iterable, Sequence
+
+import xmltodict
+
+import openml._api_calls
+
+from .utils import _get_rest_api_type_alias, _tag_openml_base
+
+
+class OpenMLBase(ABC):
+    """Base object for functionality that is shared across entities."""
+
+    def __repr__(self) -> str:
+        body_fields = self._get_repr_body_fields()
+        return self._apply_repr_template(body_fields)
+
+    @property
+    @abstractmethod
+    def id(self) -> int | None:
+        """The id of the entity, it is unique for its entity type."""
+
+    @property
+    def openml_url(self) -> str | None:
+        """The URL of the object on the server, if it was uploaded, else None."""
+        if self.id is None:
+            return None
+        return self.__class__.url_for_id(self.id)
+
+    @classmethod
+    def url_for_id(cls, id_: int) -> str:
+        """Return the OpenML URL for the object of the class entity with the given id."""
+        # Sample url for a flow: openml.org/f/123
+        return f"{openml.config.get_server_base_url()}/{cls._entity_letter()}/{id_}"
+
+    @classmethod
+    def _entity_letter(cls) -> str:
+        """Return the letter which represents the entity type in urls, e.g. 'f' for flow."""
+        # We take advantage of the class naming convention (OpenMLX),
+        # which holds for all entities except studies and tasks, which override this method.
+        return cls.__name__.lower()[len("OpenML") :][0]
+
+    # TODO(eddiebergman): This would be much cleaner as an iterator...
+    @abstractmethod
+    def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]:
+        """Collect all information to display in the __repr__ body.
+
+        Returns
+        -------
+        body_fields : List[Tuple[str, Union[str, int, List[str]]]]
+            A list of (name, value) pairs to display in the body of the __repr__.
+            E.g.: [('metric', 'accuracy'), ('dataset', 'iris')]
+            If value is a List of str, then each item of the list will appear in a separate row.
+        """
+        # Must be implemented by each derived class.
+
+    def _apply_repr_template(
+        self,
+        body_fields: Iterable[tuple[str, str | int | list[str] | None]],
+    ) -> str:
+        """Generates the header and formats the body for string representation of the object.
+
+        Parameters
+        ----------
+        body_fields: List[Tuple[str, str]]
+           A list of (name, value) pairs to display in the body of the __repr__.
+        """
+        # We add spaces between capitals, e.g. ClassificationTask -> Classification Task
+        name_with_spaces = re.sub(
+            r"(\w)([A-Z])",
+            r"\1 \2",
+            self.__class__.__name__[len("OpenML") :],
+        )
+        header_text = f"OpenML {name_with_spaces}"
+        header = f"{header_text}\n{'=' * len(header_text)}\n"
+
+        _body_fields: list[tuple[str, str | int | list[str]]] = [
+            (k, "None" if v is None else v) for k, v in body_fields
+        ]
+        longest_field_name_length = max(len(name) for name, _ in _body_fields)
+        field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}"
+        body = "\n".join(field_line_format.format(name, value) for name, value in _body_fields)
+        return header + body
+
+    @abstractmethod
+    def _to_dict(self) -> dict[str, dict]:
+        """Creates a dictionary representation of self.
+
+        The return value will be used to create the upload xml file.
+        The xml file must have the tags in exactly the order of the object's xsd.
+        (see https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/).
+
+        Returns
+        -------
+            Thing represented as dict.
+        """
+        # Must be implemented by each derived class.
+
+    def _to_xml(self) -> str:
+        """Generate xml representation of self for upload to server."""
+        dict_representation = self._to_dict()
+        xml_representation = xmltodict.unparse(dict_representation, pretty=True)
+
+        # A task may not be uploaded with the xml encoding specification:
+        # <?xml version="1.0" encoding="utf-8"?>
+        _encoding_specification, xml_body = xml_representation.split("\n", 1)
+        return str(xml_body)
+
+    def _get_file_elements(self) -> openml._api_calls.FILE_ELEMENTS_TYPE:
+        """Get file_elements to upload to the server, called during Publish.
+
+        Derived child classes should override this method as necessary.
+        The description field will be populated automatically if not provided.
+        """
+        return {}
+
+    @abstractmethod
+    def _parse_publish_response(self, xml_response: dict[str, str]) -> None:
+        """Parse the id from the xml_response and assign it to self."""
+
+    def publish(self) -> OpenMLBase:
+        """Publish the object on the OpenML server."""
+        file_elements = self._get_file_elements()
+
+        if "description" not in file_elements:
+            file_elements["description"] = self._to_xml()
+
+        call = f"{_get_rest_api_type_alias(self)}/"
+        response_text = openml._api_calls._perform_api_call(
+            call,
+            "post",
+            file_elements=file_elements,
+        )
+        xml_response = xmltodict.parse(response_text)
+
+        self._parse_publish_response(xml_response)
+        return self
+
+    def open_in_browser(self) -> None:
+        """Opens the OpenML web page corresponding to this object in your default browser."""
+        if self.openml_url is None:
+            raise ValueError(
+                "Cannot open element on OpenML.org when attribute `openml_url` is `None`",
+            )
+
+        webbrowser.open(self.openml_url)
+
+    def push_tag(self, tag: str) -> None:
+        """Annotates this entity with a tag on the server.
+
+        Parameters
+        ----------
+        tag : str
+            Tag to attach to the entity.
+        """
+        _tag_openml_base(self, tag)
+
+    def remove_tag(self, tag: str) -> None:
+        """Removes a tag from this entity on the server.
+
+        Parameters
+        ----------
+        tag : str
+            Tag to remove from the entity.
+        """
+        _tag_openml_base(self, tag, untag=True)
diff --git a/openml/cli.py b/openml/cli.py
new file mode 100644
index 000000000..838f774d1
--- /dev/null
+++ b/openml/cli.py
@@ -0,0 +1,379 @@
+"""Command Line Interface for `openml` to configure its settings."""
+
+from __future__ import annotations
+
+import argparse
+import string
+import sys
+from collections.abc import Callable
+from dataclasses import fields
+from pathlib import Path
+from urllib.parse import urlparse
+
+import openml
+from openml.__version__ import __version__
+
+
+def is_hex(string_: str) -> bool:
+    return all(c in string.hexdigits for c in string_)
+
+
+def looks_like_url(url: str) -> bool:
+    # There's no thorough url parser, but we only seem to use netloc.
+    try:
+        return bool(urlparse(url).netloc)
+    except Exception:  # noqa: BLE001
+        return False
+
+
+def wait_until_valid_input(
+    prompt: str,
+    check: Callable[[str], str],
+    sanitize: Callable[[str], str] | None,
+) -> str:
+    """Asks `prompt` until an input is received for which `check` reports no error.
+
+    Parameters
+    ----------
+    prompt: str
+        message to display
+    check: Callable[[str], str]
+        function to call with the given input, that provides an error message if the input is not
+        valid, and a False-like value otherwise.
+    sanitize: Callable[[str], str], optional
+        A function which attempts to sanitize the user input (e.g. auto-complete).
+
+    Returns
+    -------
+    valid input
+
+    """
+    while True:
+        response = input(prompt)
+        if sanitize:
+            response = sanitize(response)
+        error_message = check(response)
+        if error_message:
+            print(error_message, end="\n\n")
+        else:
+            return response
+
+
+def print_configuration() -> None:
+    file = openml.config.determine_config_file_path()
+    header = f"File '{file}' contains (or defaults to):"
+    print(header)
+
+    max_key_length = max(map(len, openml.config.get_config_as_dict()))
+    for field, value in openml.config.get_config_as_dict().items():
+        print(f"{field.ljust(max_key_length)}: {value}")
+
+
+def verbose_set(field: str, value: str) -> None:
+    openml.config.set_field_in_config_file(field, value)
+    print(f"{field} set to '{value}'.")
+
+
+def configure_apikey(value: str) -> None:
+    def check_apikey(apikey: str) -> str:
+        if len(apikey) != 32:
+            return f"The key should contain 32 characters but contains {len(apikey)}."
+        if not is_hex(apikey):
+            return "Some characters are not hexadecimal."
+        return ""
+
+    instructions = (
+        f"Your current API key is set to: '{openml.config.apikey}'. "
+        "You can get an API key at https://new.openml.org. "
+        "You must create an account if you don't have one yet:\n"
+        "  1. Log in with the account.\n"
+        "  2. Navigate to the profile page (top right circle > Your Profile). \n"
+        "  3. Click the API Key button to reach the page with your API key.\n"
+        "If you have any difficulty following these instructions, let us know on Github."
+    )
+
+    configure_field(
+        field="apikey",
+        value=value,
+        check_with_message=check_apikey,
+        intro_message=instructions,
+        input_message="Please enter your API key:",
+    )
+
+
+def configure_server(value: str) -> None:
+    def check_server(server: str) -> str:
+        is_shorthand = server in ["test", "production_server"]
+        if is_shorthand or looks_like_url(server):
+            return ""
+        return "Must be 'test', 'production_server' or a url."
+
+    def replace_shorthand(server: str) -> str:
+        if server == "test":
+            return f"{openml.config.TEST_SERVER_URL}/api/v1/xml"
+        if server == "production_server":
+            return "https://www.openml.org/api/v1/xml"
+        return server
+
+    configure_field(
+        field="server",
+        value=value,
+        check_with_message=check_server,
+        intro_message="Specify which server you wish to connect to.",
+        input_message="Specify a url or use 'test' or 'production_server' as a shorthand: ",
+        sanitize=replace_shorthand,
+    )
+
+
+def configure_cachedir(value: str) -> None:
+    def check_cache_dir(path: str) -> str:
+        _path = Path(path)
+        if _path.is_file():
+            return f"'{_path}' is a file, not a directory."
+
+        expanded = _path.expanduser()
+        if not expanded.is_absolute():
+            return f"'{_path}' is not absolute (even after expanding '~')."
+
+        if not expanded.exists():
+            try:
+                expanded.mkdir()
+            except PermissionError:
+                return f"'{path}' does not exist and there are not enough permissions to create it."
+
+        return ""
+
+    configure_field(
+        field="cachedir",
+        value=value,
+        check_with_message=check_cache_dir,
+        intro_message="Configuring the cache directory. It can not be a relative path.",
+        input_message="Specify the directory to use (or create) as cache directory: ",
+    )
+
+
+def configure_connection_n_retries(value: str) -> None:
+    def valid_connection_retries(n: str) -> str:
+        if not n.isdigit():
+            return f"'{n}' is not a valid positive integer."
+        if int(n) <= 0:
+            return "connection_n_retries must be positive."
+        return ""
+
+    configure_field(
+        field="connection_n_retries",
+        value=value,
+        check_with_message=valid_connection_retries,
+        intro_message="Configuring the number of times to attempt to connect to the OpenML Server",
+        input_message="Enter a positive integer: ",
+    )
+
+
+def configure_avoid_duplicate_runs(value: str) -> None:
+    def is_python_bool(bool_: str) -> str:
+        if bool_ in ["True", "False"]:
+            return ""
+        return "Must be 'True' or 'False' (mind the capital)."
+
+    def autocomplete_bool(bool_: str) -> str:
+        if bool_.lower() in ["n", "no", "f", "false", "0"]:
+            return "False"
+        if bool_.lower() in ["y", "yes", "t", "true", "1"]:
+            return "True"
+        return bool_
+
+    intro_message = (
+        "If set to True, when `run_flow_on_task` or similar methods are called a lookup is "
+        "performed to see if there already exists such a run on the server. "
+        "If so, download those results instead. "
+        "If set to False, runs will always be executed."
+    )
+
+    configure_field(
+        field="avoid_duplicate_runs",
+        value=value,
+        check_with_message=is_python_bool,
+        intro_message=intro_message,
+        input_message="Enter 'True' or 'False': ",
+        sanitize=autocomplete_bool,
+    )
+
+
+def configure_verbosity(value: str) -> None:
+    def is_zero_through_two(verbosity: str) -> str:
+        if verbosity in ["0", "1", "2"]:
+            return ""
+        return "Must be '0', '1' or '2'."
+
+    intro_message = (
+        "Set the verbosity of log messages which should be shown by openml-python."
+        " 0: normal output (warnings and errors)"
+        " 1: info output (some high-level progress output)"
+        " 2: debug output (detailed information (for developers))"
+    )
+
+    configure_field(
+        field="verbosity",
+        value=value,
+        check_with_message=is_zero_through_two,
+        intro_message=intro_message,
+        input_message="Enter '0', '1' or '2': ",
+    )
+
+
+def configure_retry_policy(value: str) -> None:
+    def is_known_policy(policy: str) -> str:
+        if policy in ["human", "robot"]:
+            return ""
+        return "Must be 'human' or 'robot'."
+
+    def autocomplete_policy(policy: str) -> str:
+        for option in ["human", "robot"]:
+            if option.startswith(policy.lower()):
+                return option
+        return policy
+
+    intro_message = (
+        "Set the retry policy which determines how to react if the server is unresponsive."
+        "We recommend 'human' for interactive usage and 'robot' for scripts."
+        "'human': try a few times in quick succession, less reliable but quicker response."
+        "'robot': try many times with increasing intervals, more reliable but slower response."
+    )
+
+    configure_field(
+        field="retry_policy",
+        value=value,
+        check_with_message=is_known_policy,
+        intro_message=intro_message,
+        input_message="Enter 'human' or 'robot': ",
+        sanitize=autocomplete_policy,
+    )
+
+
+def configure_field(  # noqa: PLR0913
+    field: str,
+    value: None | str,
+    check_with_message: Callable[[str], str],
+    intro_message: str,
+    input_message: str,
+    sanitize: Callable[[str], str] | None = None,
+) -> None:
+    """Configure `field` with `value`. If `value` is None ask the user for input.
+
+    `value` and user input are first corrected/auto-completed with `sanitize` if provided,
+    then validated with `check_with_message` function.
+    If the user input a wrong value in interactive mode, the user gets to input a new value.
+    The new valid value is saved in the openml configuration file.
+    In case an invalid `value` is supplied directly (non-interactive), no changes are made.
+
+    Parameters
+    ----------
+    field: str
+        Field to set.
+    value: str, None
+        Value to set the field to. If `None`, will ask the user for input.
+    check_with_message: Callable[[str], str]
+        Function which validates `value` or user input, and returns either an error message if it
+        is invalid, or a False-like value if `value` is valid.
+    intro_message: str
+        Message that is printed once if user input is requested (e.g. instructions).
+    input_message: str
+        Message that comes with the input prompt.
+    sanitize: Union[Callable[[str], str], None]
+        A function to convert user input to 'more acceptable' input, e.g. for auto-complete.
+        If no correction of user input is possible, return the original value.
+        If no function is provided, don't attempt to correct/auto-complete input.
+    """
+    if value is not None:
+        if sanitize:
+            value = sanitize(value)
+        malformed_input = check_with_message(value)
+        if malformed_input:
+            print(malformed_input)
+            sys.exit()
+    else:
+        print(intro_message)
+        value = wait_until_valid_input(
+            prompt=input_message,
+            check=check_with_message,
+            sanitize=sanitize,
+        )
+    verbose_set(field, value)
+
+
+def configure(args: argparse.Namespace) -> None:
+    """Calls the right submenu(s) to edit `args.field` in the configuration file."""
+    set_functions = {
+        "apikey": configure_apikey,
+        "server": configure_server,
+        "cachedir": configure_cachedir,
+        "retry_policy": configure_retry_policy,
+        "connection_n_retries": configure_connection_n_retries,
+        "avoid_duplicate_runs": configure_avoid_duplicate_runs,
+        "verbosity": configure_verbosity,
+    }
+
+    def not_supported_yet(_: str) -> None:
+        print(f"Setting '{args.field}' is not supported yet.")
+
+    if args.field not in ["all", "none"]:
+        set_functions.get(args.field, not_supported_yet)(args.value)
+    else:
+        if args.value is not None:
+            print(f"Can not set value ('{args.value}') when field is specified as '{args.field}'.")
+            sys.exit()
+        print_configuration()
+
+    if args.field == "all":
+        for set_field_function in set_functions.values():
+            set_field_function(args.value)
+
+
+def main() -> None:
+    subroutines = {"configure": configure}
+
+    parser = argparse.ArgumentParser()
+    # Add a global --version flag to display installed version and exit
+    parser.add_argument(
+        "--version",
+        action="version",
+        version=f"%(prog)s {__version__}",
+        help="Show the OpenML version and exit",
+    )
+    subparsers = parser.add_subparsers(dest="subroutine")
+
+    parser_configure = subparsers.add_parser(
+        "configure",
+        description="Set or read variables in your configuration file. For more help also see "
+        "'https://openml.github.io/openml-python/main/usage.html#configuration'.",
+    )
+
+    configurable_fields = [
+        f.name for f in fields(openml._config.OpenMLConfig) if f.name not in ["max_retries"]
+    ]
+
+    parser_configure.add_argument(
+        "field",
+        type=str,
+        choices=[*configurable_fields, "all", "none"],
+        default="all",
+        nargs="?",
+        help="The field you wish to edit. "
+        "Choosing 'all' lets you configure all fields one by one. "
+        "Choosing 'none' will print out the current configuration.",
+    )
+
+    parser_configure.add_argument(
+        "value",
+        type=str,
+        default=None,
+        nargs="?",
+        help="The value to set the FIELD to.",
+    )
+
+    args = parser.parse_args()
+    subroutines.get(args.subroutine, lambda _: parser.print_help())(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/openml/config.py b/openml/config.py
deleted file mode 100644
index 91d7345e0..000000000
--- a/openml/config.py
+++ /dev/null
@@ -1,205 +0,0 @@
-"""
-Store module level information like the API key, cache directory and the server
-"""
-import logging
-import os
-
-from io import StringIO
-import configparser
-from urllib.parse import urlparse
-
-
-logger = logging.getLogger(__name__)
-logging.basicConfig(
-    format='[%(levelname)s] [%(asctime)s:%(name)s] %('
-           'message)s', datefmt='%H:%M:%S')
-
-# Default values!
-_defaults = {
-    'apikey': None,
-    'server': "https://www.openml.org/api/v1/xml",
-    'verbosity': 0,
-    'cachedir': os.path.expanduser(os.path.join('~', '.openml', 'cache')),
-    'avoid_duplicate_runs': 'True',
-    'connection_n_retries': 2,
-}
-
-config_file = os.path.expanduser(os.path.join('~', '.openml', 'config'))
-
-# Default values are actually added here in the _setup() function which is
-# called at the end of this module
-server = _defaults['server']
-apikey = _defaults['apikey']
-# The current cache directory (without the server name)
-cache_directory = _defaults['cachedir']
-avoid_duplicate_runs = True if _defaults['avoid_duplicate_runs'] == 'True' else False
-
-# Number of retries if the connection breaks
-connection_n_retries = _defaults['connection_n_retries']
-
-
-class ConfigurationForExamples:
-    """ Allows easy switching to and from a test configuration, used for examples. """
-    _last_used_server = None
-    _last_used_key = None
-    _start_last_called = False
-    _test_server = "https://test.openml.org/api/v1/xml"
-    _test_apikey = "c0c42819af31e706efe1f4b88c23c6c1"
-
-    @classmethod
-    def start_using_configuration_for_example(cls):
-        """ Sets the configuration to connect to the test server with valid apikey.
-
-        To configuration as was before this call is stored, and can be recovered
-        by using the `stop_use_example_configuration` method.
-        """
-        global server
-        global apikey
-
-        if cls._start_last_called and server == cls._test_server and apikey == cls._test_apikey:
-            # Method is called more than once in a row without modifying the server or apikey.
-            # We don't want to save the current test configuration as a last used configuration.
-            return
-
-        cls._last_used_server = server
-        cls._last_used_key = apikey
-        cls._start_last_called = True
-
-        # Test server key for examples
-        server = cls._test_server
-        apikey = cls._test_apikey
-
-    @classmethod
-    def stop_using_configuration_for_example(cls):
-        """ Return to configuration as it was before `start_use_example_configuration`. """
-        if not cls._start_last_called:
-            # We don't want to allow this because it will (likely) result in the `server` and
-            # `apikey` variables being set to None.
-            raise RuntimeError("`stop_use_example_configuration` called without a saved config."
-                               "`start_use_example_configuration` must be called first.")
-
-        global server
-        global apikey
-
-        server = cls._last_used_server
-        apikey = cls._last_used_key
-        cls._start_last_called = False
-
-
-def _setup():
-    """Setup openml package. Called on first import.
-
-    Reads the config file and sets up apikey, server, cache appropriately.
-    key and server can be set by the user simply using
-    openml.config.apikey = THEIRKEY
-    openml.config.server = SOMESERVER
-    We could also make it a property but that's less clear.
-    """
-    global apikey
-    global server
-    global cache_directory
-    global avoid_duplicate_runs
-    global connection_n_retries
-    # read config file, create cache directory
-    try:
-        os.mkdir(os.path.expanduser(os.path.join('~', '.openml')))
-    except (IOError, OSError):
-        # TODO add debug information
-        pass
-    config = _parse_config()
-    apikey = config.get('FAKE_SECTION', 'apikey')
-    server = config.get('FAKE_SECTION', 'server')
-
-    short_cache_dir = config.get('FAKE_SECTION', 'cachedir')
-    cache_directory = os.path.expanduser(short_cache_dir)
-
-    avoid_duplicate_runs = config.getboolean('FAKE_SECTION',
-                                             'avoid_duplicate_runs')
-    connection_n_retries = config.get('FAKE_SECTION', 'connection_n_retries')
-    if connection_n_retries > 20:
-        raise ValueError(
-            'A higher number of retries than 20 is not allowed to keep the '
-            'server load reasonable'
-        )
-
-
-def _parse_config():
-    """Parse the config file, set up defaults.
-    """
-
-    config = configparser.RawConfigParser(defaults=_defaults)
-
-    if not os.path.exists(config_file):
-        # Create an empty config file if there was none so far
-        fh = open(config_file, "w")
-        fh.close()
-        logger.info("Could not find a configuration file at %s. Going to "
-                    "create an empty file there." % config_file)
-
-    try:
-        # Cheat the ConfigParser module by adding a fake section header
-        config_file_ = StringIO()
-        config_file_.write("[FAKE_SECTION]\n")
-        with open(config_file) as fh:
-            for line in fh:
-                config_file_.write(line)
-        config_file_.seek(0)
-        config.read_file(config_file_)
-    except OSError as e:
-        logging.info("Error opening file %s: %s", config_file, e.message)
-    return config
-
-
-def get_cache_directory():
-    """Get the current cache directory.
-
-    Returns
-    -------
-    cachedir : string
-        The current cache directory.
-
-    """
-    url_suffix = urlparse(server).netloc
-    reversed_url_suffix = os.sep.join(url_suffix.split('.')[::-1])
-    if not cache_directory:
-        _cachedir = _defaults(cache_directory)
-    else:
-        _cachedir = cache_directory
-    _cachedir = os.path.join(_cachedir, reversed_url_suffix)
-    return _cachedir
-
-
-def set_cache_directory(cachedir):
-    """Set module-wide cache directory.
-
-    Sets the cache directory into which to download datasets, tasks etc.
-
-    Parameters
-    ----------
-    cachedir : string
-         Path to use as cache directory.
-
-    See also
-    --------
-    get_cache_directory
-    """
-
-    global cache_directory
-    cache_directory = cachedir
-
-
-start_using_configuration_for_example = (
-    ConfigurationForExamples.start_using_configuration_for_example
-)
-stop_using_configuration_for_example = (
-    ConfigurationForExamples.stop_using_configuration_for_example
-)
-
-__all__ = [
-    'get_cache_directory',
-    'set_cache_directory',
-    'start_using_configuration_for_example',
-    'stop_using_configuration_for_example',
-]
-
-_setup()
diff --git a/openml/datasets/__init__.py b/openml/datasets/__init__.py
index 8f52e16fc..eb0932652 100644
--- a/openml/datasets/__init__.py
+++ b/openml/datasets/__init__.py
@@ -1,25 +1,33 @@
+# License: BSD 3-Clause
+
+from .data_feature import OpenMLDataFeature
+from .dataset import OpenMLDataset
 from .functions import (
     attributes_arff_from_df,
     check_datasets_active,
     create_dataset,
+    delete_dataset,
+    edit_dataset,
+    fork_dataset,
     get_dataset,
     get_datasets,
     list_datasets,
+    list_qualities,
     status_update,
-    list_qualities
 )
-from .dataset import OpenMLDataset
-from .data_feature import OpenMLDataFeature
 
 __all__ = [
-    'attributes_arff_from_df',
-    'check_datasets_active',
-    'create_dataset',
-    'get_dataset',
-    'get_datasets',
-    'list_datasets',
-    'OpenMLDataset',
-    'OpenMLDataFeature',
-    'status_update',
-    'list_qualities'
+    "OpenMLDataFeature",
+    "OpenMLDataset",
+    "attributes_arff_from_df",
+    "check_datasets_active",
+    "create_dataset",
+    "delete_dataset",
+    "edit_dataset",
+    "fork_dataset",
+    "get_dataset",
+    "get_datasets",
+    "list_datasets",
+    "list_qualities",
+    "status_update",
 ]
diff --git a/openml/datasets/data_feature.py b/openml/datasets/data_feature.py
index 077be639e..0598763b0 100644
--- a/openml/datasets/data_feature.py
+++ b/openml/datasets/data_feature.py
@@ -1,4 +1,14 @@
-class OpenMLDataFeature(object):
+# License: BSD 3-Clause
+from __future__ import annotations
+
+from collections.abc import Sequence
+from typing import TYPE_CHECKING, Any, ClassVar
+
+if TYPE_CHECKING:
+    from IPython.lib import pretty
+
+
+class OpenMLDataFeature:  # noqa: PLW1641
     """
     Data Feature (a.k.a. Attribute) object.
 
@@ -13,37 +23,62 @@ class OpenMLDataFeature(object):
     nominal_values : list(str)
         list of the possible values, in case of nominal attribute
     number_missing_values : int
+        Number of rows that have a missing value for this feature.
+    ontologies : list(str)
+        list of ontologies attached to this feature. An ontology describes the
+        concepts that are described in a feature. An ontology is defined by a
+        URL where the information is provided.
     """
-    LEGAL_DATA_TYPES = ['nominal', 'numeric', 'string', 'date']
 
-    def __init__(self, index, name, data_type, nominal_values,
-                 number_missing_values):
-        if type(index) != int:
-            raise ValueError('Index is of wrong datatype')
+    LEGAL_DATA_TYPES: ClassVar[Sequence[str]] = ["nominal", "numeric", "string", "date"]
+
+    def __init__(  # noqa: PLR0913
+        self,
+        index: int,
+        name: str,
+        data_type: str,
+        nominal_values: list[str],
+        number_missing_values: int,
+        ontologies: list[str] | None = None,
+    ):
+        if not isinstance(index, int):
+            raise TypeError(f"Index must be `int` but is {type(index)}")
+
         if data_type not in self.LEGAL_DATA_TYPES:
-            raise ValueError('data type should be in %s, found: %s' %
-                             (str(self.LEGAL_DATA_TYPES), data_type))
-        if data_type == 'nominal':
+            raise ValueError(
+                f"data type should be in {self.LEGAL_DATA_TYPES!s}, found: {data_type}",
+            )
+
+        if data_type == "nominal":
             if nominal_values is None:
-                raise TypeError('Dataset features require attribute `nominal_values` for nominal '
-                                'feature type.')
-            elif not isinstance(nominal_values, list):
-                raise TypeError('Argument `nominal_values` is of wrong datatype, should be list, '
-                                'but is {}'.format(type(nominal_values)))
-        else:
-            if nominal_values is not None:
-                raise TypeError('Argument `nominal_values` must be None for non-nominal feature.')
-        if type(number_missing_values) != int:
-            raise ValueError('number_missing_values is of wrong datatype')
+                raise TypeError(
+                    "Dataset features require attribute `nominal_values` for nominal feature type.",
+                )
+
+            if not isinstance(nominal_values, list):
+                raise TypeError(
+                    "Argument `nominal_values` is of wrong datatype, should be list, "
+                    f"but is {type(nominal_values)}",
+                )
+        elif nominal_values is not None:
+            raise TypeError("Argument `nominal_values` must be None for non-nominal feature.")
+
+        if not isinstance(number_missing_values, int):
+            msg = f"number_missing_values must be int but is {type(number_missing_values)}"
+            raise TypeError(msg)
 
         self.index = index
         self.name = str(name)
         self.data_type = str(data_type)
         self.nominal_values = nominal_values
         self.number_missing_values = number_missing_values
+        self.ontologies = ontologies
+
+    def __repr__(self) -> str:
+        return f"[{self.index} - {self.name} ({self.data_type})]"
 
-    def __repr__(self):
-        return "[%d - %s (%s)]" % (self.index, self.name, self.data_type)
+    def __eq__(self, other: Any) -> bool:
+        return isinstance(other, OpenMLDataFeature) and self.__dict__ == other.__dict__
 
-    def _repr_pretty_(self, pp, cycle):
+    def _repr_pretty_(self, pp: pretty.PrettyPrinter, cycle: bool) -> None:  # noqa: ARG002
         pp.text(str(self))
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
index 630fac35e..59d6205ba 100644
--- a/openml/datasets/dataset.py
+++ b/openml/datasets/dataset.py
@@ -1,28 +1,47 @@
-from collections import OrderedDict
+# License: BSD 3-Clause
+from __future__ import annotations
+
 import gzip
-import io
 import logging
 import os
 import pickle
-from typing import List, Optional, Union, Tuple, Iterable
+import re
+import warnings
+from collections.abc import Iterable, Sequence
+from pathlib import Path
+from typing import Any, Literal
 
 import arff
 import numpy as np
 import pandas as pd
 import scipy.sparse
 import xmltodict
-from warnings import warn
 
-import openml._api_calls
-from .data_feature import OpenMLDataFeature
-from ..exceptions import PyOpenMLError
-from ..utils import _tag_entity
+import openml
+from openml.base import OpenMLBase
 
+from .data_feature import OpenMLDataFeature
 
 logger = logging.getLogger(__name__)
 
 
-class OpenMLDataset(object):
+def _ensure_dataframe(
+    data: pd.DataFrame | pd.Series | np.ndarray | scipy.sparse.spmatrix,
+    attribute_names: list | None = None,
+) -> pd.DataFrame:
+    if isinstance(data, pd.DataFrame):
+        return data
+    if scipy.sparse.issparse(data):
+        return pd.DataFrame.sparse.from_spmatrix(data, columns=attribute_names)
+    if isinstance(data, np.ndarray):
+        return pd.DataFrame(data, columns=attribute_names)  # type: ignore
+    if isinstance(data, pd.Series):
+        return data.to_frame()
+
+    raise TypeError(f"Data type {type(data)} not supported.")
+
+
+class OpenMLDataset(OpenMLBase):  # noqa: PLW1641
     """Dataset object.
 
     Allows fetching and uploading datasets to OpenML.
@@ -33,8 +52,10 @@ class OpenMLDataset(object):
         Name of the dataset.
     description : str
         Description of the dataset.
-    format : str
+    data_format : str
         Format of the dataset which can be either 'arff' or 'sparse_arff'.
+    cache_format : str
+        Format for caching the dataset which can be either 'feather' or 'pickle'.
     dataset_id : int, optional
         Id autogenerated by the server.
     version : int, optional
@@ -82,32 +103,103 @@ class OpenMLDataset(object):
         Link to a paper describing the dataset.
     update_comment : str, optional
         An explanation for when the dataset is uploaded.
-    status : str, optional
-        Whether the dataset is active.
     md5_checksum : str, optional
         MD5 checksum to check if the dataset is downloaded without corruption.
     data_file : str, optional
         Path to where the dataset is located.
-    features : dict, optional
+    features_file : str, optional
         A dictionary of dataset features,
         which maps a feature index to a OpenMLDataFeature.
-    qualities : dict, optional
+    qualities_file : str, optional
         A dictionary of dataset qualities,
         which maps a quality name to a quality value.
     dataset: string, optional
         Serialized arff dataset string.
+    parquet_url: string, optional
+        This is the URL to the storage location where the dataset files are hosted.
+        This can be a MinIO bucket URL. If specified, the data will be accessed
+        from this URL when reading the files.
+    parquet_file: string, optional
+        Path to the local file.
     """
-    def __init__(self, name, description, format=None,
-                 data_format='arff', dataset_id=None, version=None,
-                 creator=None, contributor=None, collection_date=None,
-                 upload_date=None, language=None, licence=None,
-                 url=None, default_target_attribute=None,
-                 row_id_attribute=None, ignore_attribute=None,
-                 version_label=None, citation=None, tag=None,
-                 visibility=None, original_data_url=None,
-                 paper_url=None, update_comment=None,
-                 md5_checksum=None, data_file=None, features=None,
-                 qualities=None, dataset=None):
+
+    def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
+        self,
+        name: str,
+        description: str | None,
+        data_format: Literal["arff", "sparse_arff"] = "arff",
+        cache_format: Literal["feather", "pickle"] = "pickle",
+        dataset_id: int | None = None,
+        version: int | None = None,
+        creator: str | None = None,
+        contributor: str | None = None,
+        collection_date: str | None = None,
+        upload_date: str | None = None,
+        language: str | None = None,
+        licence: str | None = None,
+        url: str | None = None,
+        default_target_attribute: str | None = None,
+        row_id_attribute: str | None = None,
+        ignore_attribute: str | list[str] | None = None,
+        version_label: str | None = None,
+        citation: str | None = None,
+        tag: str | None = None,
+        visibility: str | None = None,
+        original_data_url: str | None = None,
+        paper_url: str | None = None,
+        update_comment: str | None = None,
+        md5_checksum: str | None = None,
+        data_file: str | None = None,
+        features_file: str | None = None,
+        qualities_file: str | None = None,
+        dataset: str | None = None,
+        parquet_url: str | None = None,
+        parquet_file: str | None = None,
+    ):
+        if cache_format not in ["feather", "pickle"]:
+            raise ValueError(
+                "cache_format must be one of 'feather' or 'pickle'. "
+                f"Invalid format specified: {cache_format}",
+            )
+
+        def find_invalid_characters(string: str, pattern: str) -> str:
+            """Return the distinct characters of `string` rejected by `pattern`, quoted."""
+            regex = re.compile(pattern)
+            # Sorted so that the error message is deterministic (set order is not).
+            invalid_chars = sorted({char for char in string if not regex.match(char)})
+            # A single quote is wrapped in double quotes so it stays readable.
+            return ",".join(
+                [f"'{char}'" if char != "'" else f'"{char}"' for char in invalid_chars],
+            )
+
+        if dataset_id is None:
+            pattern = "^[\x00-\x7f]*$"
+            if description and not re.match(pattern, description):
+                # not basiclatin (XSD complains)
+                invalid_characters = find_invalid_characters(description, pattern)
+                raise ValueError(
+                    f"Invalid symbols {invalid_characters} in description: {description}",
+                )
+            pattern = "^[\x00-\x7f]*$"
+            if citation and not re.match(pattern, citation):
+                # not basiclatin (XSD complains)
+                invalid_characters = find_invalid_characters(citation, pattern)
+                raise ValueError(
+                    f"Invalid symbols {invalid_characters} in citation: {citation}",
+                )
+            pattern = "^[a-zA-Z0-9_\\-\\.\\(\\),]+$"
+            if not re.match(pattern, name):
+                # regex given by server in error message
+                invalid_characters = find_invalid_characters(name, pattern)
+                raise ValueError(f"Invalid symbols {invalid_characters} in name: {name}")
+
+        self.ignore_attribute: list[str] | None = None
+        if isinstance(ignore_attribute, str):
+            self.ignore_attribute = [ignore_attribute]
+        elif isinstance(ignore_attribute, list) or ignore_attribute is None:
+            self.ignore_attribute = ignore_attribute
+        else:
+            raise ValueError("Wrong data type for ignore_attribute. Should be list.")
 
         # TODO add function to check if the name is casual_string128
         # Attributes received by querying the RESTful API
@@ -115,13 +207,9 @@ def __init__(self, name, description, format=None,
         self.name = name
         self.version = int(version) if version is not None else None
         self.description = description
-        if format is None:
-            self.format = data_format
-        else:
-            warn("The format parameter in the init will be deprecated "
-                 "in the future."
-                 "Please use data_format instead", DeprecationWarning)
-            self.format = format
+        self.cache_format = cache_format
+        # Has to be called format, otherwise there will be an XML upload error
+        self.format = data_format
         self.creator = creator
         self.contributor = contributor
         self.collection_date = collection_date
@@ -131,13 +219,7 @@ def __init__(self, name, description, format=None,
         self.url = url
         self.default_target_attribute = default_target_attribute
         self.row_id_attribute = row_id_attribute
-        if isinstance(ignore_attribute, str):
-            self.ignore_attribute = [ignore_attribute]
-        elif isinstance(ignore_attribute, list) or ignore_attribute is None:
-            self.ignore_attribute = ignore_attribute
-        else:
-            raise ValueError('Wrong data type for ignore_attribute. '
-                             'Should be list.')
+
         self.version_label = version_label
         self.citation = citation
         self.tag = tag
@@ -147,339 +229,506 @@ def __init__(self, name, description, format=None,
         self.update_comment = update_comment
         self.md5_checksum = md5_checksum
         self.data_file = data_file
-        self.features = None
-        self.qualities = None
+        self.parquet_file = parquet_file
         self._dataset = dataset
-
-        if features is not None:
-            self.features = {}
-            for idx, xmlfeature in enumerate(features['oml:feature']):
-                nr_missing = xmlfeature.get('oml:number_of_missing_values', 0)
-                feature = OpenMLDataFeature(int(xmlfeature['oml:index']),
-                                            xmlfeature['oml:name'],
-                                            xmlfeature['oml:data_type'],
-                                            xmlfeature.get('oml:nominal_value'),
-                                            int(nr_missing))
-                if idx != feature.index:
-                    raise ValueError('Data features not provided '
-                                     'in right order')
-                self.features[feature.index] = feature
-
-        self.qualities = _check_qualities(qualities)
+        self._parquet_url = parquet_url
+
+        self._features: dict[int, OpenMLDataFeature] | None = None
+        self._qualities: dict[str, float] | None = None
+        self._no_qualities_found = False
+
+        if features_file is not None:
+            self._features = _read_features(Path(features_file))
+
+        # "" was the old default value by `get_dataset` and maybe still used by some
+        if qualities_file == "":
+            # TODO(0.15): to switch to "qualities_file is not None" below and remove warning
+            warnings.warn(
+                "Starting from Version 0.15 `qualities_file` must be None and not an empty string "
+                "to avoid reading the qualities from file. Set `qualities_file` to None to avoid "
+                "this warning.",
+                FutureWarning,
+                stacklevel=2,
+            )
+            qualities_file = None
+
+        if qualities_file is not None:
+            self._qualities = _read_qualities(Path(qualities_file))
 
         if data_file is not None:
-            self.data_pickle_file = self._data_arff_to_pickle(data_file)
+            data_pickle, data_feather, feather_attribute = self._compressed_cache_file_paths(
+                Path(data_file)
+            )
+            self.data_pickle_file = data_pickle if Path(data_pickle).exists() else None
+            self.data_feather_file = data_feather if Path(data_feather).exists() else None
+            self.feather_attribute_file = feather_attribute if feather_attribute.exists() else None
         else:
             self.data_pickle_file = None
+            self.data_feather_file = None
+            self.feather_attribute_file = None
+
+    @property
+    def features(self) -> dict[int, OpenMLDataFeature]:
+        """Get the features of this dataset."""
+        if self._features is None:
+            # TODO(eddiebergman): These should return a value so we can set it to be not None
+            self._load_features()
 
-    def __repr__(self):
-        header = "OpenML Dataset"
-        header = '{}\n{}\n'.format(header, '=' * len(header))
-
-        base_url = "{}".format(openml.config.server[:-len('api/v1/xml')])
-        fields = {"Name": self.name,
-                  "Version": self.version,
-                  "Format": self.format,
-                  "Licence": self.licence,
-                  "Download URL": self.url,
-                  "Data file": self.data_file,
-                  "Pickle file": self.data_pickle_file,
-                  "# of features": len(self.features)}
+        assert self._features is not None
+        return self._features
+
+    @property
+    def qualities(self) -> dict[str, float] | None:
+        """Get the qualities of this dataset."""
+        # TODO(eddiebergman): Better docstring, I don't know what qualities means
+
+        # We have to check `_no_qualities_found` as there might not be qualities for a dataset
+        if self._qualities is None and (not self._no_qualities_found):
+            self._load_qualities()
+
+        return self._qualities
+
+    @property
+    def id(self) -> int | None:
+        """Get the dataset numeric id."""
+        return self.dataset_id
+
+    def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | None]]:
+        """Collect all information to display in the __repr__ body."""
+        # Obtain number of features in accordance with lazy loading.
+        n_features: int | None = None
+        if self._qualities is not None and self._qualities.get("NumberOfFeatures") is not None:
+            n_features = int(self._qualities["NumberOfFeatures"])
+        elif self._features is not None:
+            n_features = len(self._features)
+
+        fields: dict[str, int | str | None] = {
+            "Name": self.name,
+            "Version": self.version,
+            "Format": self.format,
+            "Licence": self.licence,
+            "Download URL": self.url,
+            "Data file": str(self.data_file) if self.data_file is not None else None,
+            "Pickle file": (
+                str(self.data_pickle_file) if self.data_pickle_file is not None else None
+            ),
+            "# of features": n_features,
+        }
         if self.upload_date is not None:
-            fields["Upload Date"] = self.upload_date.replace('T', ' ')
+            fields["Upload Date"] = self.upload_date.replace("T", " ")
         if self.dataset_id is not None:
-            fields["OpenML URL"] = "{}d/{}".format(base_url, self.dataset_id)
-        if self.qualities['NumberOfInstances'] is not None:
-            fields["# of instances"] = int(self.qualities['NumberOfInstances'])
+            fields["OpenML URL"] = self.openml_url
+        if self._qualities is not None and self._qualities.get("NumberOfInstances") is not None:
+            fields["# of instances"] = int(self._qualities["NumberOfInstances"])
 
         # determines the order in which the information will be printed
-        order = ["Name", "Version", "Format", "Upload Date", "Licence", "Download URL",
-                 "OpenML URL", "Data File", "Pickle File", "# of features", "# of instances"]
-        fields = [(key, fields[key]) for key in order if key in fields]
-
-        longest_field_name_length = max(len(name) for name, value in fields)
-        field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length)
-        body = '\n'.join(field_line_format.format(name, value) for name, value in fields)
-        return header + body
-
-    def _data_arff_to_pickle(self, data_file):
-        data_pickle_file = data_file.replace('.arff', '.pkl.py3')
-        if os.path.exists(data_pickle_file):
-            with open(data_pickle_file, "rb") as fh:
-                data, categorical, attribute_names = pickle.load(fh)
-
-            # Between v0.8 and v0.9 the format of pickled data changed from
-            # np.ndarray to pd.DataFrame. This breaks some backwards compatibility,
-            # e.g. for `run_model_on_task`. If a local file still exists with
-            # np.ndarray data, we reprocess the data file to store a pickled
-            # pd.DataFrame blob. See also #646.
-            if isinstance(data, pd.DataFrame) or scipy.sparse.issparse(data):
-                logger.debug("Data pickle file already exists.")
-                return data_pickle_file
+        order = [
+            "Name",
+            "Version",
+            "Format",
+            "Upload Date",
+            "Licence",
+            "Download URL",
+            "OpenML URL",
+            "Data file",
+            "Pickle file",
+            "# of features",
+            "# of instances",
+        ]
+        return [(key, fields[key]) for key in order if key in fields]
+
+    def __eq__(self, other: Any) -> bool:
+        if not isinstance(other, OpenMLDataset):
+            return False
+
+        server_fields = {
+            "dataset_id",
+            "version",
+            "upload_date",
+            "url",
+            "_parquet_url",
+            "dataset",
+            "data_file",
+            "format",
+            "cache_format",
+        }
+
+        cache_fields = {
+            "_dataset",
+            "data_file",
+            "data_pickle_file",
+            "data_feather_file",
+            "feather_attribute_file",
+            "parquet_file",
+        }
+
+        # check that common keys and values are identical
+        ignore_fields = server_fields | cache_fields
+        self_keys = set(self.__dict__.keys()) - ignore_fields
+        other_keys = set(other.__dict__.keys()) - ignore_fields
+        return self_keys == other_keys and all(
+            self.__dict__[key] == other.__dict__[key] for key in self_keys
+        )
+
+    def _download_data(self) -> None:
+        """Download ARFF data file to standard cache directory. Set `self.data_file`."""
+        # import required here to avoid circular import.
+        from .functions import _get_dataset_arff, _get_dataset_parquet
+
+        skip_parquet = (
+            os.environ.get(openml.config.OPENML_SKIP_PARQUET_ENV_VAR, "false").casefold() == "true"
+        )
+        if self._parquet_url is not None and not skip_parquet:
+            parquet_file = _get_dataset_parquet(self)
+            self.parquet_file = None if parquet_file is None else str(parquet_file)
+        if self.parquet_file is None:
+            self.data_file = str(_get_dataset_arff(self))
+
+    def _get_arff(self, format: str) -> dict:  # noqa: A002
+        """Read ARFF file and return decoded arff.
+
+        Reads the file referenced in self.data_file.
+
+        Parameters
+        ----------
+        format : str
+            Format of the ARFF file.
+            Must be one of 'arff' or 'sparse_arff' or a string that will be either of those
+            when converted to lower case.
+
+
+
+        Returns
+        -------
+        dict
+            Decoded arff.
 
+        """
+        # TODO: add a partial read method which only returns the attribute
+        # headers of the corresponding .arff file!
+        import struct
+
+        filename = self.data_file
+        assert filename is not None
+        filepath = Path(filename)
+
+        bits = 8 * struct.calcsize("P")
+
+        # Files can be considered too large on a 32-bit system,
+        # if it exceeds 120mb (slightly more than covtype dataset size)
+        # This number is somewhat arbitrary.
+        if bits != 64:
+            MB_120 = 120_000_000
+            file_size = filepath.stat().st_size
+            if file_size > MB_120:
+                raise NotImplementedError(
+                    f"File '{filename}' ({file_size / 1e6:.1f} MB) "
+                    f"exceeds the maximum supported size of 120 MB. "
+                    f"This limitation applies to {bits}-bit systems. "
+                    f"Large dataset handling is currently not fully supported. "
+                    f"Please consider using a smaller dataset"
+                )
+
+        if format.lower() == "arff":
+            return_type = arff.DENSE
+        elif format.lower() == "sparse_arff":
+            return_type = arff.COO
+        else:
+            raise ValueError(f"Unknown data format {format}")
+
+        def decode_arff(fh: Any) -> dict:
+            decoder = arff.ArffDecoder()
+            return decoder.decode(fh, encode_nominal=True, return_type=return_type)  # type: ignore
+
+        if filepath.suffix.endswith(".gz"):
+            with gzip.open(filename) as zipfile:
+                return decode_arff(zipfile)
+        else:
+            with filepath.open(encoding="utf8") as fh:
+                return decode_arff(fh)
+
+    def _parse_data_from_arff(  # noqa: C901, PLR0912, PLR0915
+        self,
+        arff_file_path: Path,
+    ) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool], list[str]]:
+        """Parse all required data from arff file.
+
+        Parameters
+        ----------
+        arff_file_path : Path
+            Path to the file on disk.
+
+        Returns
+        -------
+        Tuple[Union[pd.DataFrame, scipy.sparse.csr_matrix], List[bool], List[str]]
+            DataFrame or csr_matrix: dataset
+            List[bool]: List indicating which columns contain categorical variables.
+            List[str]: List of column names.
+        """
         try:
             data = self._get_arff(self.format)
         except OSError as e:
-            logger.critical("Please check that the data file %s is "
-                            "there and can be read.", data_file)
+            logger.critical(
+                f"Please check that the data file {arff_file_path} is there and can be read.",
+            )
             raise e
 
         ARFF_DTYPES_TO_PD_DTYPE = {
-            'INTEGER': 'integer',
-            'REAL': 'floating',
-            'NUMERIC': 'floating',
-            'STRING': 'string'
+            "INTEGER": "integer",
+            "REAL": "floating",
+            "NUMERIC": "floating",
+            "STRING": "string",
         }
         attribute_dtype = {}
         attribute_names = []
         categories_names = {}
         categorical = []
-        for name, type_ in data['attributes']:
-            # if the feature is nominal and the a sparse matrix is
+        for name, type_ in data["attributes"]:
+            # if the feature is nominal and a sparse matrix is
             # requested, the categories need to be numeric
-            if (isinstance(type_, list)
-                    and self.format.lower() == 'sparse_arff'):
+            if isinstance(type_, list) and self.format.lower() == "sparse_arff":
                 try:
-                    np.array(type_, dtype=np.float32)
-                except ValueError:
+                    # checks if the strings which should be the class labels
+                    # can be encoded into integers
+                    pd.factorize(np.array(type_))[0]
+                except ValueError as e:
                     raise ValueError(
-                        "Categorical data needs to be numeric when "
-                        "using sparse ARFF."
-                    )
+                        "Categorical data needs to be numeric when using sparse ARFF."
+                    ) from e
+
             # string can only be supported with pandas DataFrame
-            elif (type_ == 'STRING'
-                  and self.format.lower() == 'sparse_arff'):
-                raise ValueError(
-                    "Dataset containing strings is not supported "
-                    "with sparse ARFF."
-                )
+            elif type_ == "STRING" and self.format.lower() == "sparse_arff":
+                raise ValueError("Dataset containing strings is not supported with sparse ARFF.")
 
             # infer the dtype from the ARFF header
             if isinstance(type_, list):
                 categorical.append(True)
                 categories_names[name] = type_
                 if len(type_) == 2:
-                    type_norm = [cat.lower().capitalize()
-                                 for cat in type_]
-                    if set(['True', 'False']) == set(type_norm):
-                        categories_names[name] = [
-                            True if cat == 'True' else False
-                            for cat in type_norm
-                        ]
-                        attribute_dtype[name] = 'boolean'
+                    type_norm = [cat.lower().capitalize() for cat in type_]
+                    if {"True", "False"} == set(type_norm):
+                        categories_names[name] = [cat == "True" for cat in type_norm]
+                        attribute_dtype[name] = "boolean"
                     else:
-                        attribute_dtype[name] = 'categorical'
+                        attribute_dtype[name] = "categorical"
                 else:
-                    attribute_dtype[name] = 'categorical'
+                    attribute_dtype[name] = "categorical"
             else:
                 categorical.append(False)
                 attribute_dtype[name] = ARFF_DTYPES_TO_PD_DTYPE[type_]
             attribute_names.append(name)
 
-        if self.format.lower() == 'sparse_arff':
-            X = data['data']
+        if self.format.lower() == "sparse_arff":
+            X = data["data"]
             X_shape = (max(X[1]) + 1, max(X[2]) + 1)
-            X = scipy.sparse.coo_matrix(
-                (X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
+            X = scipy.sparse.coo_matrix((X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
             X = X.tocsr()
-
-        elif self.format.lower() == 'arff':
-            X = pd.DataFrame(data['data'], columns=attribute_names)
+        elif self.format.lower() == "arff":
+            X = pd.DataFrame(data["data"], columns=attribute_names)
 
             col = []
             for column_name in X.columns:
-                if attribute_dtype[column_name] in ('categorical',
-                                                    'boolean'):
-                    col.append(self._unpack_categories(
-                        X[column_name], categories_names[column_name]))
+                if attribute_dtype[column_name] in ("categorical", "boolean"):
+                    categories = self._unpack_categories(
+                        X[column_name],  # type: ignore
+                        categories_names[column_name],
+                    )
+                    col.append(categories)
+                elif attribute_dtype[column_name] in ("floating", "integer"):
+                    X_col = X[column_name]
+                    if X_col.min() >= 0 and X_col.max() <= 255:
+                        try:
+                            X_col_uint = X_col.astype("uint8")
+                            if (X_col == X_col_uint).all():
+                                col.append(X_col_uint)
+                                continue
+                        except ValueError:
+                            pass
+                    col.append(X[column_name])
                 else:
                     col.append(X[column_name])
             X = pd.concat(col, axis=1)
+        else:
+            raise ValueError(f"Dataset format '{self.format}' is not a valid format.")
 
-        # Pickle the dataframe or the sparse matrix.
-        with open(data_pickle_file, "wb") as fh:
-            pickle.dump((X, categorical, attribute_names), fh, -1)
-        logger.debug("Saved dataset {did}: {name} to file {path}"
-                     .format(did=int(self.dataset_id or -1),
-                             name=self.name,
-                             path=data_pickle_file)
-                     )
-        return data_pickle_file
+        return X, categorical, attribute_names  # type: ignore
 
-    def push_tag(self, tag):
-        """Annotates this data set with a tag on the server.
+    def _compressed_cache_file_paths(self, data_file: Path) -> tuple[Path, Path, Path]:
+        data_pickle_file = data_file.with_suffix(".pkl.py3")
+        data_feather_file = data_file.with_suffix(".feather")
+        feather_attribute_file = data_file.with_suffix(".feather.attributes.pkl.py3")
+        return data_pickle_file, data_feather_file, feather_attribute_file
 
-        Parameters
-        ----------
-        tag : str
-            Tag to attach to the dataset.
-        """
-        _tag_entity('data', self.dataset_id, tag)
-
-    def remove_tag(self, tag):
-        """Removes a tag from this dataset on the server.
+    def _cache_compressed_file_from_file(
+        self,
+        data_file: Path,
+    ) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool], list[str]]:
+        """Store data from the local file in compressed format.
 
-        Parameters
-        ----------
-        tag : str
-            Tag to attach to the dataset.
+        If a local parquet file is present it will be used instead of the arff file.
+        Sets cache_format to 'pickle' if data is sparse.
         """
-        _tag_entity('data', self.dataset_id, tag, untag=True)
+        (
+            data_pickle_file,
+            data_feather_file,
+            feather_attribute_file,
+        ) = self._compressed_cache_file_paths(data_file)
 
-    def __eq__(self, other):
+        attribute_names, categorical, data = self._parse_data_from_file(data_file)
 
-        if type(other) != OpenMLDataset:
-            return False
+        # Feather format does not work for sparse datasets, so we use pickle for sparse datasets
+        if scipy.sparse.issparse(data):
+            self.cache_format = "pickle"
 
-        server_fields = {
-            'dataset_id',
-            'version',
-            'upload_date',
-            'url',
-            'dataset',
-            'data_file',
-        }
-
-        # check that the keys are identical
-        self_keys = set(self.__dict__.keys()) - server_fields
-        other_keys = set(other.__dict__.keys()) - server_fields
-        if self_keys != other_keys:
-            return False
-
-        # check that values of the common keys are identical
-        return all(self.__dict__[key] == other.__dict__[key]
-                   for key in self_keys)
-
-    def _get_arff(self, format):
-        """Read ARFF file and return decoded arff.
+        logger.info(f"{self.cache_format} write {self.name}")
+        if self.cache_format == "feather":
+            assert isinstance(data, pd.DataFrame)
 
-        Reads the file referenced in self.data_file.
+            data.to_feather(data_feather_file)
+            with open(feather_attribute_file, "wb") as fh:  # noqa: PTH123
+                pickle.dump((categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL)
+            self.data_feather_file = data_feather_file
+            self.feather_attribute_file = feather_attribute_file
 
-        Returns
-        -------
-        dict
-            Decoded arff.
-
-        """
-
-        # TODO: add a partial read method which only returns the attribute
-        # headers of the corresponding .arff file!
-        import struct
-
-        filename = self.data_file
-        bits = (8 * struct.calcsize("P"))
-        # Files can be considered too large on a 32-bit system,
-        # if it exceeds 120mb (slightly more than covtype dataset size)
-        # This number is somewhat arbitrary.
-        if bits != 64 and os.path.getsize(filename) > 120000000:
-            return NotImplementedError("File too big")
-
-        if format.lower() == 'arff':
-            return_type = arff.DENSE
-        elif format.lower() == 'sparse_arff':
-            return_type = arff.COO
         else:
-            raise ValueError('Unknown data format %s' % format)
-
-        def decode_arff(fh):
-            decoder = arff.ArffDecoder()
-            return decoder.decode(fh, encode_nominal=True,
-                                  return_type=return_type)
-
-        if filename[-3:] == ".gz":
-            with gzip.open(filename) as fh:
-                return decode_arff(fh)
+            with open(data_pickle_file, "wb") as fh:  # noqa: PTH123
+                pickle.dump((data, categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL)
+            self.data_pickle_file = data_pickle_file
+
+        data_file = data_pickle_file if self.cache_format == "pickle" else data_feather_file
+        logger.debug(f"Saved dataset {int(self.dataset_id or -1)}: {self.name} to file {data_file}")
+
+        return data, categorical, attribute_names
+
+    def _parse_data_from_file(
+        self,
+        data_file: Path,
+    ) -> tuple[list[str], list[bool], pd.DataFrame | scipy.sparse.csr_matrix]:
+        if data_file.suffix == ".arff":
+            data, categorical, attribute_names = self._parse_data_from_arff(data_file)
+        elif data_file.suffix == ".pq":
+            attribute_names, categorical, data = self._parse_data_from_pq(data_file)
         else:
-            with io.open(filename, encoding='utf8') as fh:
-                return decode_arff(fh)
+            raise ValueError(f"Unknown file type for file '{data_file}'.")
 
-    @staticmethod
-    def _convert_array_format(data, array_format, attribute_names):
-        """Convert a dataset to a given array format.
+        return attribute_names, categorical, data
 
-        Converts to numpy array if data is non-sparse.
-        Converts to a sparse dataframe if data is sparse.
+    def _parse_data_from_pq(self, data_file: Path) -> tuple[list[str], list[bool], pd.DataFrame]:
+        try:
+            data = pd.read_parquet(data_file)
+        except Exception as e:
+            raise Exception(f"File: {data_file}") from e
+        categorical = [data[c].dtype.name == "category" for c in data.columns]
+        attribute_names = list(data.columns)
+        return attribute_names, categorical, data
+
+    def _load_data(self) -> tuple[pd.DataFrame, list[bool], list[str]]:  # noqa: PLR0912, C901, PLR0915
+        """Load data from compressed format or arff. Download data if not present on disk."""
+        need_to_create_pickle = self.cache_format == "pickle" and self.data_pickle_file is None
+        need_to_create_feather = self.cache_format == "feather" and self.data_feather_file is None
+
+        if need_to_create_pickle or need_to_create_feather:
+            if self.data_file is None:
+                self._download_data()
 
-        Parameters
-        ----------
-        array_format : str {'array', 'dataframe'}
-            Desired data type of the output
-            - If array_format='array'
-                If data is non-sparse
-                    Converts to numpy-array
-                    Enforces numeric encoding of categorical columns
-                    Missing values are represented as NaN in the numpy-array
-                else returns data as is
-            - If array_format='dataframe'
-                If data is sparse
-                    Works only on sparse data
-                    Converts sparse data to sparse dataframe
-                else returns data as is
+            file_to_load = self.data_file if self.parquet_file is None else self.parquet_file
+            assert file_to_load is not None
+            data, cats, attrs = self._cache_compressed_file_from_file(Path(file_to_load))
+            return _ensure_dataframe(data, attrs), cats, attrs
 
-        """
-        if array_format == "array" and not scipy.sparse.issparse(data):
-            # We encode the categories such that they are integer to be able
-            # to make a conversion to numeric for backward compatibility
-            def _encode_if_category(column):
-                if column.dtype.name == 'category':
-                    column = column.cat.codes.astype(np.float32)
-                    mask_nan = column == -1
-                    column[mask_nan] = np.nan
-                return column
-            if data.ndim == 2:
-                columns = {
-                    column_name: _encode_if_category(data.loc[:, column_name])
-                    for column_name in data.columns
-                }
-                data = pd.DataFrame(columns)
+        # helper variable to help identify where errors occur
+        fpath = self.data_feather_file if self.cache_format == "feather" else self.data_pickle_file
+        logger.info(f"{self.cache_format} load data {self.name}")
+        try:
+            if self.cache_format == "feather":
+                assert self.data_feather_file is not None
+                assert self.feather_attribute_file is not None
+
+                data = pd.read_feather(self.data_feather_file)
+                fpath = self.feather_attribute_file
+                with self.feather_attribute_file.open("rb") as fh:
+                    categorical, attribute_names = pickle.load(fh)  # noqa: S301
             else:
-                data = _encode_if_category(data)
-            try:
-                return np.asarray(data, dtype=np.float32)
-            except ValueError:
-                raise PyOpenMLError(
-                    'PyOpenML cannot handle string when returning numpy'
-                    ' arrays. Use dataset_format="dataframe".'
+                assert self.data_pickle_file is not None
+                with self.data_pickle_file.open("rb") as fh:
+                    data, categorical, attribute_names = pickle.load(fh)  # noqa: S301
+
+        except FileNotFoundError as e:
+            raise ValueError(
+                f"Cannot find file for dataset {self.name} at location '{fpath}'."
+            ) from e
+        except (EOFError, ModuleNotFoundError, ValueError, AttributeError) as e:
+            error_message = getattr(e, "message", e.args[0])
+            hint = ""
+
+            if isinstance(e, EOFError):
+                readable_error = "Detected a corrupt cache file"
+            elif isinstance(e, (ModuleNotFoundError, AttributeError)):
+                readable_error = "Detected likely dependency issues"
+                hint = (
+                    "This can happen if the cache was constructed with a different pandas version "
+                    "than the one that is used to load the data. See also "
                 )
-        elif array_format == "dataframe" and scipy.sparse.issparse(data):
-            return pd.SparseDataFrame(data, columns=attribute_names)
-        else:
-            data_type = "sparse-data" if scipy.sparse.issparse(data) else "non-sparse data"
-            warn("Cannot convert {} to '{}'. Returning input data.".format(data_type, array_format))
-        return data
+                if isinstance(e, ModuleNotFoundError):
+                    hint += "https://github.com/openml/openml-python/issues/918. "
+                elif isinstance(e, AttributeError):
+                    hint += "https://github.com/openml/openml-python/pull/1121. "
+
+            elif isinstance(e, ValueError) and "unsupported pickle protocol" in e.args[0]:
+                readable_error = "Encountered unsupported pickle protocol"
+            else:
+                raise e
+
+            logger.warning(
+                f"{readable_error} when loading dataset {self.id} from '{fpath}'. "
+                f"{hint}"
+                f"Error message was: {error_message}. "
+                "We will continue loading data from the arff-file, "
+                "but this will be much slower for big datasets. "
+                "Please manually delete the cache file if you want OpenML-Python "
+                "to attempt to reconstruct it.",
+            )
+            file_to_load = self.data_file if self.parquet_file is None else self.parquet_file
+            assert file_to_load is not None
+            attr, cat, df = self._parse_data_from_file(Path(file_to_load))
+            return _ensure_dataframe(df, attr), cat, attr
+
+        data_up_to_date = isinstance(data, pd.DataFrame) or scipy.sparse.issparse(data)
+        if self.cache_format == "pickle" and not data_up_to_date:
+            logger.info("Updating outdated pickle file.")
+            file_to_load = self.data_file if self.parquet_file is None else self.parquet_file
+            assert file_to_load is not None
+            refreshed = self._cache_compressed_file_from_file(Path(file_to_load))
+            data, categorical, attribute_names = refreshed  # adopt the re-parsed metadata too
+
+        return _ensure_dataframe(data, attribute_names), categorical, attribute_names
 
     @staticmethod
-    def _unpack_categories(series, categories):
+    def _unpack_categories(series: pd.Series, categories: list) -> pd.Series:
+        # nan-likes can not be explicitly specified as a category
+        def valid_category(cat: Any) -> bool:
+            return isinstance(cat, str) or (cat is not None and not np.isnan(cat))
+
+        filtered_categories = [c for c in categories if valid_category(c)]
         col = []
         for x in series:
             try:
                 col.append(categories[int(x)])
             except (TypeError, ValueError):
                 col.append(np.nan)
+
         # We require two lines to create a series of categories as detailed here:
-        # https://pandas.pydata.org/pandas-docs/version/0.24/user_guide/categorical.html#series-creation  # noqa E501
-        raw_cat = pd.Categorical(col, ordered=True, categories=categories)
+        # https://pandas.pydata.org/pandas-docs/version/0.24/user_guide/categorical.html#series-creation
+        raw_cat = pd.Categorical(col, ordered=True, categories=filtered_categories)
         return pd.Series(raw_cat, index=series.index, name=series.name)
 
-    def _download_data(self) -> None:
-        """ Download ARFF data file to standard cache directory. Set `self.data_file`. """
-        # import required here to avoid circular import.
-        from .functions import _get_dataset_arff
-        self.data_file = _get_dataset_arff(self)
-
-    def get_data(
-            self,
-            target: Optional[Union[List[str], str]] = None,
-            include_row_id: bool = False,
-            include_ignore_attribute: bool = False,
-            dataset_format: str = "dataframe",
-    ) -> Tuple[
-            Union[np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix],
-            Optional[Union[np.ndarray, pd.DataFrame]],
-            List[bool],
-            List[str]
-    ]:
-        """ Returns dataset content as dataframes or sparse matrices.
+    def get_data(  # noqa: C901
+        self,
+        target: list[str] | str | None = None,
+        include_row_id: bool = False,  # noqa: FBT002
+        include_ignore_attribute: bool = False,  # noqa: FBT002
+    ) -> tuple[pd.DataFrame, pd.Series | None, list[bool], list[str]]:
+        """Returns dataset content as dataframes.
 
         Parameters
         ----------
@@ -491,34 +740,20 @@ def get_data(
         include_ignore_attribute : boolean (default=False)
             Whether to include columns that are marked as "ignore"
             on the server in the dataset.
-        dataset_format : string (default='dataframe')
-            The format of returned dataset.
-            If ``array``, the returned dataset will be a NumPy array or a SciPy sparse matrix.
-            If ``dataframe``, the returned dataset will be a Pandas DataFrame or SparseDataFrame.
+
 
         Returns
         -------
-        X : ndarray, dataframe, or sparse matrix, shape (n_samples, n_columns)
-            Dataset
-        y : ndarray or pd.Series, shape (n_samples, ) or None
+        X : dataframe, shape (n_samples, n_columns)
+            Dataset, may have sparse dtypes in the columns if required.
+        y : pd.Series, shape (n_samples, ) or None
             Target column
-        categorical_indicator : boolean ndarray
+        categorical_indicator : list[bool]
             Mask that indicate categorical features.
-        attribute_names : List[str]
+        attribute_names : list[str]
             List of attribute names.
         """
-        if self.data_pickle_file is None:
-            if self.data_file is None:
-                self._download_data()
-            self.data_pickle_file = self._data_arff_to_pickle(self.data_file)
-
-        path = self.data_pickle_file
-        if not os.path.exists(path):
-            raise ValueError("Cannot find a pickle file for dataset %s at "
-                             "location %s " % (self.name, path))
-        else:
-            with open(path, "rb") as fh:
-                data, categorical, attribute_names = pickle.load(fh)
+        data, categorical_mask, attribute_names = self._load_data()
 
         to_exclude = []
         if not include_row_id and self.row_id_attribute is not None:
@@ -534,64 +769,76 @@ def get_data(
                 to_exclude.extend(self.ignore_attribute)
 
         if len(to_exclude) > 0:
-            logger.info("Going to remove the following attributes:"
-                        " %s" % to_exclude)
-            keep = np.array([True if column not in to_exclude else False
-                             for column in attribute_names])
-            if hasattr(data, 'iloc'):
-                data = data.iloc[:, keep]
-            else:
-                data = data[:, keep]
-            categorical = [cat for cat, k in zip(categorical, keep) if k]
-            attribute_names = [att for att, k in
-                               zip(attribute_names, keep) if k]
+            logger.info(f"Going to remove the following attributes: {to_exclude}")
+            keep = np.array([column not in to_exclude for column in attribute_names])
+            data = data.drop(columns=to_exclude)
+            categorical_mask = [cat for cat, k in zip(categorical_mask, keep, strict=False) if k]
+            attribute_names = [att for att, k in zip(attribute_names, keep, strict=False) if k]
 
         if target is None:
-            data = self._convert_array_format(data, dataset_format,
-                                              attribute_names)
-            targets = None
-        else:
-            if isinstance(target, str):
-                if ',' in target:
-                    target = target.split(',')
-                else:
-                    target = [target]
-            targets = np.array([True if column in target else False
-                                for column in attribute_names])
-            if np.sum(targets) > 1:
-                raise NotImplementedError(
-                    "Number of requested targets %d is not implemented." %
-                    np.sum(targets)
-                )
-            target_categorical = [
-                cat for cat, column in zip(categorical, attribute_names)
-                if column in target
-            ]
-            target_dtype = int if target_categorical[0] else float
-
-            if hasattr(data, 'iloc'):
-                x = data.iloc[:, ~targets]
-                y = data.iloc[:, targets]
-            else:
-                x = data[:, ~targets]
-                y = data[:, targets].astype(target_dtype)
-
-            categorical = [cat for cat, t in zip(categorical, targets)
-                           if not t]
-            attribute_names = [att for att, k in zip(attribute_names, targets)
-                               if not k]
+            return data, None, categorical_mask, attribute_names
 
-            x = self._convert_array_format(x, dataset_format, attribute_names)
-            if scipy.sparse.issparse(y):
-                y = np.asarray(y.todense()).astype(target_dtype).flatten()
-            y = y.squeeze()
-            y = self._convert_array_format(y, dataset_format, attribute_names)
-            y = y.astype(target_dtype) if dataset_format == 'array' else y
-            data, targets = x, y
-
-        return data, targets, categorical, attribute_names
+        if isinstance(target, str):
+            target_names = target.split(",") if "," in target else [target]
+        else:
+            target_names = target
+
+        # All the assumptions below for the target are dependent on the number of targets being 1
+        n_targets = len(target_names)
+        if n_targets > 1:
+            raise NotImplementedError(
+                "Multi-target prediction is not yet supported. "
+                f"Found {n_targets} target columns: {target_names}. "
+                "Currently, only single-target datasets are supported. "
+                "Please select a single target column."
+            )
+
+        target_name = target_names[0]
+        x = data.drop(columns=[target_name])
+        y = data[target_name].squeeze()
+
+        # Finally, remove the target from the list of attributes and categorical mask
+        target_index = attribute_names.index(target_name)
+        categorical_mask.pop(target_index)
+        attribute_names.remove(target_name)
+
+        assert isinstance(y, pd.Series)
+        return x, y, categorical_mask, attribute_names
+
+    def _load_features(self) -> None:
+        """Load the features metadata from the server and store it in the dataset object."""
+        # Delayed Import to avoid circular imports or having to import all of dataset.functions to
+        # import OpenMLDataset.
+        from openml.datasets.functions import _get_dataset_features_file
+
+        if self.dataset_id is None:
+            raise ValueError(
+                "No dataset id specified. Please set the dataset id. Otherwise we cannot load "
+                "metadata.",
+            )
+
+        features_file = _get_dataset_features_file(None, self.dataset_id)
+        self._features = _read_features(features_file)
+
+    def _load_qualities(self) -> None:
+        """Load qualities information from the server and store it in the dataset object."""
+        # same reason as above for _load_features
+        from openml.datasets.functions import _get_dataset_qualities_file
+
+        if self.dataset_id is None:
+            raise ValueError(
+                "No dataset id specified. Please set the dataset id. Otherwise we cannot load "
+                "metadata.",
+            )
+
+        qualities_file = _get_dataset_qualities_file(None, self.dataset_id)
+
+        if qualities_file is None:
+            self._no_qualities_found = True
+        else:
+            self._qualities = _read_qualities(qualities_file)
 
-    def retrieve_class_labels(self, target_name: str = 'class') -> Union[None, List[str]]:
+    def retrieve_class_labels(self, target_name: str = "class") -> None | list[str]:
         """Reads the datasets arff to determine the class-labels.
 
         If the task has no class labels (for example a regression problem)
@@ -609,13 +856,27 @@ def retrieve_class_labels(self, target_name: str = 'class') -> Union[None, List[
         list
         """
         for feature in self.features.values():
-            if (feature.name == target_name) and (feature.data_type == 'nominal'):
-                return feature.nominal_values
+            if feature.name == target_name:
+                if feature.data_type == "nominal":
+                    return feature.nominal_values
+
+                if feature.data_type == "string":
+                    # Rel.: #1311
+                    # The target is invalid for a classification task if the feature type is string
+                    # and not nominal. For such misconfigured tasks, we silently fix it here as
+                    # we can safely interpret string as nominal.
+                    df, *_ = self.get_data()
+                    return list(df[feature.name].unique())
+
         return None
 
-    def get_features_by_type(self, data_type, exclude=None,
-                             exclude_ignore_attribute=True,
-                             exclude_row_id_attribute=True):
+    def get_features_by_type(  # noqa: C901
+        self,
+        data_type: str,
+        exclude: list[str] | None = None,
+        exclude_ignore_attribute: bool = True,  # noqa: FBT002
+        exclude_row_id_attribute: bool = True,  # noqa: FBT002
+    ) -> list[int]:
         """
         Return indices of features of a given type, e.g. all nominal features.
         Optional parameters to exclude various features by index or ontology.
@@ -625,8 +886,7 @@ def get_features_by_type(self, data_type, exclude=None,
         data_type : str
             The data type to return (e.g., nominal, numeric, date, string)
         exclude : list(int)
-            Indices to exclude (and adapt the return values as if these indices
-                        are not present)
+            List of columns to exclude from the return value
         exclude_ignore_attribute : bool
             Whether to exclude the defined ignore attributes (and adapt the
             return values as if these indices are not present)
@@ -641,15 +901,12 @@ def get_features_by_type(self, data_type, exclude=None,
         """
         if data_type not in OpenMLDataFeature.LEGAL_DATA_TYPES:
             raise TypeError("Illegal feature type requested")
-        if self.ignore_attribute is not None:
-            if not isinstance(self.ignore_attribute, list):
-                raise TypeError("ignore_attribute should be a list")
-        if self.row_id_attribute is not None:
-            if not isinstance(self.row_id_attribute, str):
-                raise TypeError("row id attribute should be a str")
-        if exclude is not None:
-            if not isinstance(exclude, list):
-                raise TypeError("Exclude should be a list")
+        if self.ignore_attribute is not None and not isinstance(self.ignore_attribute, list):
+            raise TypeError("ignore_attribute should be a list")
+        if self.row_id_attribute is not None and not isinstance(self.row_id_attribute, str):
+            raise TypeError("row id attribute should be a str")
+        if exclude is not None and not isinstance(exclude, list):
+            raise TypeError("Exclude should be a list")
             # assert all(isinstance(elem, str) for elem in exclude),
             #            "Exclude should be a list of strings"
         to_exclude = []
@@ -668,101 +925,161 @@ def get_features_by_type(self, data_type, exclude=None,
             name = self.features[idx].name
             if name in to_exclude:
                 offset += 1
-            else:
-                if self.features[idx].data_type == data_type:
-                    result.append(idx - offset)
+            elif self.features[idx].data_type == data_type:
+                result.append(idx - offset)
         return result
 
-    def publish(self):
-        """Publish the dataset on the OpenML server.
+    def _get_file_elements(self) -> dict:
+        """Adds the 'dataset' to file elements."""
+        file_elements: dict = {}
+        path = None if self.data_file is None else Path(self.data_file).absolute()
 
-        Upload the dataset description and dataset content to openml.
+        if self._dataset is not None:
+            file_elements["dataset"] = self._dataset
+        elif path is not None and path.exists():
+            with path.open("rb") as fp:
+                file_elements["dataset"] = fp.read()
 
-        Returns
-        -------
-        dataset_id: int
-            Id of the dataset uploaded to the server.
-        """
-        file_elements = {'description': self._to_xml()}
+            try:
+                dataset_utf8 = str(file_elements["dataset"], encoding="utf8")
+                arff.ArffDecoder().decode(dataset_utf8, encode_nominal=True)
+            except arff.ArffException as e:
+                raise ValueError("The file you have provided is not a valid arff file.") from e
+
+        elif self.url is None:
+            raise ValueError("No valid url/path to the data file was given.")
+        return file_elements
+
+    def _parse_publish_response(self, xml_response: dict) -> None:
+        """Parse the id from the xml_response and assign it to self."""
+        self.dataset_id = int(xml_response["oml:upload_data_set"]["oml:id"])
+
+    def _to_dict(self) -> dict[str, dict]:
+        """Creates a dictionary representation of self."""
+        props = [
+            "id",
+            "name",
+            "version",
+            "description",
+            "format",
+            "creator",
+            "contributor",
+            "collection_date",
+            "upload_date",
+            "language",
+            "licence",
+            "url",
+            "default_target_attribute",
+            "row_id_attribute",
+            "ignore_attribute",
+            "version_label",
+            "citation",
+            "tag",
+            "visibility",
+            "original_data_url",
+            "paper_url",
+            "update_comment",
+            "md5_checksum",
+        ]
+
+        prop_values = {}
+        for prop in props:
+            content = getattr(self, prop, None)
+            if content is not None:
+                prop_values["oml:" + prop] = content
 
-        # the arff dataset string is available
-        if self._dataset is not None:
-            file_elements['dataset'] = self._dataset
-        else:
-            # the path to the arff dataset is given
-            if self.data_file is not None:
-                path = os.path.abspath(self.data_file)
-                if os.path.exists(path):
-                    try:
-
-                        with io.open(path, encoding='utf8') as fh:
-                            # check if arff is valid
-                            decoder = arff.ArffDecoder()
-                            decoder.decode(fh, encode_nominal=True)
-                    except arff.ArffException:
-                        raise ValueError("The file you have provided is not "
-                                         "a valid arff file.")
-
-                    with open(path, 'rb') as fp:
-                        file_elements['dataset'] = fp.read()
-            else:
-                if self.url is None:
-                    raise ValueError("No url/path to the data file was given")
+        return {
+            "oml:data_set_description": {
+                "@xmlns:oml": "http://openml.org/openml",
+                **prop_values,
+            }
+        }
 
-        return_value = openml._api_calls._perform_api_call(
-            "data/", 'post',
-            file_elements=file_elements,
-        )
-        response = xmltodict.parse(return_value)
-        self.dataset_id = int(response['oml:upload_data_set']['oml:id'])
-        return self.dataset_id
 
-    def _to_xml(self):
-        """ Serialize object to xml for upload
+def _read_features(features_file: Path) -> dict[int, OpenMLDataFeature]:
+    features_pickle_file = Path(_get_features_pickle_file(str(features_file)))
+    try:
+        with features_pickle_file.open("rb") as fh_binary:
+            return pickle.load(fh_binary)  # type: ignore  # noqa: S301
 
-        Returns
-        -------
-        xml_dataset : str
-            XML description of the data.
-        """
-        props = ['id', 'name', 'version', 'description', 'format', 'creator',
-                 'contributor', 'collection_date', 'upload_date', 'language',
-                 'licence', 'url', 'default_target_attribute',
-                 'row_id_attribute', 'ignore_attribute', 'version_label',
-                 'citation', 'tag', 'visibility', 'original_data_url',
-                 'paper_url', 'update_comment', 'md5_checksum']
+    except Exception:  # unreadable cache -> rebuild from XML  # noqa: BLE001
+        with Path(features_file).open("r", encoding="utf8") as fh:
+            features_xml_string = fh.read()
 
-        data_container = OrderedDict()
-        data_dict = OrderedDict([('@xmlns:oml', 'http://openml.org/openml')])
-        data_container['oml:data_set_description'] = data_dict
+        features = _parse_features_xml(features_xml_string)
 
-        for prop in props:
-            content = getattr(self, prop, None)
-            if content is not None:
-                data_dict["oml:" + prop] = content
+        with features_pickle_file.open("wb") as fh_binary:
+            pickle.dump(features, fh_binary)
 
-        xml_string = xmltodict.unparse(
-            input_dict=data_container,
-            pretty=True,
+        return features
+
+
+def _parse_features_xml(features_xml_string: str) -> dict[int, OpenMLDataFeature]:
+    xml_dict = xmltodict.parse(
+        features_xml_string, force_list=("oml:feature", "oml:nominal_value"), strip_whitespace=False
+    )
+    features_xml = xml_dict["oml:data_features"]
+
+    features: dict[int, OpenMLDataFeature] = {}
+    for idx, xmlfeature in enumerate(features_xml["oml:feature"]):
+        nr_missing = xmlfeature.get("oml:number_of_missing_values", 0)
+        feature = OpenMLDataFeature(
+            int(xmlfeature["oml:index"]),
+            xmlfeature["oml:name"],
+            xmlfeature["oml:data_type"],
+            xmlfeature.get("oml:nominal_value"),
+            int(nr_missing),
+            xmlfeature.get("oml:ontology"),
         )
-        # A flow may not be uploaded with the xml encoding specification:
-        # <?xml version="1.0" encoding="utf-8"?>
-        xml_string = xml_string.split('\n', 1)[-1]
-        return xml_string
-
-
-def _check_qualities(qualities):
-    if qualities is not None:
-        qualities_ = {}
-        for xmlquality in qualities:
-            name = xmlquality['oml:name']
-            if xmlquality.get('oml:value', None) is None:
-                value = float('NaN')
-            elif xmlquality['oml:value'] == 'null':
-                value = float('NaN')
-            else:
-                value = float(xmlquality['oml:value'])
-            qualities_[name] = value
-        return qualities_
-    else:
-        return None
+        if idx != feature.index:
+            raise ValueError("Data features not provided in right order")
+        features[feature.index] = feature
+
+    return features
+
+
+# TODO(eddiebergman): Should this really exist?
+def _get_features_pickle_file(features_file: str) -> str:
+    """Return the pickle-cache path for ``features_file``; a function so tests can mock it."""
+    return features_file + ".pkl"
+
+
+# TODO(eddiebergman): Should this really exist?
+def _get_qualities_pickle_file(qualities_file: str) -> str:
+    """Return the pickle-cache path for ``qualities_file``; a function so tests can mock it."""
+    return qualities_file + ".pkl"
+
+
+def _read_qualities(qualities_file: str | Path) -> dict[str, float]:
+    """Read qualities from the pickle cache, rebuilding the cache from XML if needed."""
+    qualities_file = Path(qualities_file)
+    qualities_pickle_file = Path(_get_qualities_pickle_file(str(qualities_file)))
+    try:
+        with qualities_pickle_file.open("rb") as fh_binary:
+            return pickle.load(fh_binary)  # type: ignore  # noqa: S301
+    except Exception:  # unreadable cache -> rebuild from XML  # noqa: BLE001
+        with qualities_file.open(encoding="utf8") as fh:
+            qualities_xml = fh.read()
+
+        qualities = _parse_qualities_xml(qualities_xml)
+        with qualities_pickle_file.open("wb") as fh_binary:
+            pickle.dump(qualities, fh_binary)
+        return qualities
+
+
+def _check_qualities(qualities: list[dict[str, str]]) -> dict[str, float]:
+    qualities_ = {}
+    for xmlquality in qualities:
+        name = xmlquality["oml:name"]
+        if xmlquality.get("oml:value", None) is None or xmlquality["oml:value"] == "null":
+            value = float("NaN")
+        else:
+            value = float(xmlquality["oml:value"])
+        qualities_[name] = value
+    return qualities_
+
+
+def _parse_qualities_xml(qualities_xml: str) -> dict[str, float]:
+    xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",))
+    qualities = xml_as_dict["oml:data_qualities"]["oml:quality"]
+    return _check_qualities(qualities)
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
index 1ed888ec1..432938520 100644
--- a/openml/datasets/functions.py
+++ b/openml/datasets/functions.py
@@ -1,172 +1,61 @@
-import io
+# License: BSD 3-Clause
+# ruff: noqa: PLR0913
+from __future__ import annotations
+
+import logging
 import os
-import re
-from typing import List, Dict, Union, Optional
+import warnings
+from collections import OrderedDict
+from functools import partial
+from pathlib import Path
+from pyexpat import ExpatError
+from typing import TYPE_CHECKING, Any, Literal
 
-import numpy as np
 import arff
+import minio.error
+import numpy as np
 import pandas as pd
-
+import urllib3
 import xmltodict
 from scipy.sparse import coo_matrix
-from collections import OrderedDict
 
-import openml.utils
+import openml
 import openml._api_calls
-from .dataset import OpenMLDataset
-from ..exceptions import (
-    OpenMLCacheException,
+import openml.utils
+from openml.exceptions import (
     OpenMLHashException,
-    OpenMLServerException,
     OpenMLPrivateDatasetError,
+    OpenMLServerError,
+    OpenMLServerException,
 )
-from ..utils import (
-    _create_cache_directory,
+from openml.utils import (
+    _create_cache_directory_for_id,
+    _get_cache_dir_for_id,
     _remove_cache_dir_for_id,
-    _create_cache_directory_for_id
 )
 
+from .dataset import OpenMLDataset
 
-DATASETS_CACHE_DIR_NAME = 'datasets'
-
-############################################################################
-# Local getters/accessors to the cache directory
-
-
-def _list_cached_datasets():
-    """ Return list with ids of all cached datasets.
-
-    Returns
-    -------
-    list
-        List with IDs of all cached datasets.
-    """
-    datasets = []
-
-    dataset_cache_dir = _create_cache_directory(DATASETS_CACHE_DIR_NAME)
-    directory_content = os.listdir(dataset_cache_dir)
-    directory_content.sort()
-
-    # Find all dataset ids for which we have downloaded the dataset
-    # description
-    for directory_name in directory_content:
-        # First check if the directory name could be an OpenML dataset id
-        if not re.match(r"[0-9]*", directory_name):
-            continue
-
-        dataset_id = int(directory_name)
-
-        directory_name = os.path.join(dataset_cache_dir,
-                                      directory_name)
-        dataset_directory_content = os.listdir(directory_name)
-
-        if ("dataset.arff" in dataset_directory_content
-           and "description.xml" in dataset_directory_content):
-            if dataset_id not in datasets:
-                datasets.append(dataset_id)
-
-    datasets.sort()
-    return datasets
-
-
-def _get_cached_datasets():
-    """Searches for all OpenML datasets in the OpenML cache dir.
-
-    Return a dictionary which maps dataset ids to dataset objects"""
-    dataset_list = _list_cached_datasets()
-    datasets = OrderedDict()
-
-    for dataset_id in dataset_list:
-        datasets[dataset_id] = _get_cached_dataset(dataset_id)
-
-    return datasets
-
-
-def _get_cached_dataset(
-    dataset_id: int
-) -> OpenMLDataset:
-    """Get cached dataset for ID.
-
-    Returns
-    -------
-    OpenMLDataset
-    """
-    description = _get_cached_dataset_description(dataset_id)
-    arff_file = _get_cached_dataset_arff(dataset_id)
-    features = _get_cached_dataset_features(dataset_id)
-    qualities = _get_cached_dataset_qualities(dataset_id)
-    dataset = _create_dataset_from_description(description,
-                                               features,
-                                               qualities,
-                                               arff_file)
-
-    return dataset
-
-
-def _get_cached_dataset_description(dataset_id):
-    did_cache_dir = _create_cache_directory_for_id(
-        DATASETS_CACHE_DIR_NAME, dataset_id,
-    )
-    description_file = os.path.join(did_cache_dir, "description.xml")
-    try:
-        with io.open(description_file, encoding='utf8') as fh:
-            dataset_xml = fh.read()
-        return xmltodict.parse(dataset_xml)["oml:data_set_description"]
-    except (IOError, OSError):
-        raise OpenMLCacheException(
-            "Dataset description for dataset id %d not "
-            "cached" % dataset_id)
-
+if TYPE_CHECKING:
+    import scipy
 
-def _get_cached_dataset_features(dataset_id):
-    did_cache_dir = _create_cache_directory_for_id(
-        DATASETS_CACHE_DIR_NAME, dataset_id,
-    )
-    features_file = os.path.join(did_cache_dir, "features.xml")
-    try:
-        return _load_features_from_file(features_file)
-    except (IOError, OSError):
-        raise OpenMLCacheException("Dataset features for dataset id %d not "
-                                   "cached" % dataset_id)
+DATASETS_CACHE_DIR_NAME = "datasets"
+logger = logging.getLogger(__name__)
 
+NO_ACCESS_GRANTED_ERRCODE = 112
 
-def _get_cached_dataset_qualities(dataset_id):
-    did_cache_dir = _create_cache_directory_for_id(
-        DATASETS_CACHE_DIR_NAME, dataset_id,
-    )
-    qualities_file = os.path.join(did_cache_dir, "qualities.xml")
-    try:
-        with io.open(qualities_file, encoding='utf8') as fh:
-            qualities_xml = fh.read()
-            qualities_dict = xmltodict.parse(qualities_xml)
-            return qualities_dict["oml:data_qualities"]['oml:quality']
-    except (IOError, OSError):
-        raise OpenMLCacheException("Dataset qualities for dataset id %d not "
-                                   "cached" % dataset_id)
-
-
-def _get_cached_dataset_arff(dataset_id):
-    did_cache_dir = _create_cache_directory_for_id(
-        DATASETS_CACHE_DIR_NAME, dataset_id,
-    )
-    output_file = os.path.join(did_cache_dir, "dataset.arff")
-
-    try:
-        with io.open(output_file, encoding='utf8'):
-            pass
-        return output_file
-    except (OSError, IOError):
-        raise OpenMLCacheException("ARFF file for dataset id %d not "
-                                   "cached" % dataset_id)
+############################################################################
+# Local getters/accessors to the cache directory
 
 
-def _get_cache_directory(dataset: OpenMLDataset) -> str:
-    """ Return the cache directory of the OpenMLDataset """
+def _get_cache_directory(dataset: OpenMLDataset) -> Path:
+    """Creates and returns the cache directory of the OpenMLDataset."""
+    assert dataset.dataset_id is not None
     return _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset.dataset_id)
 
 
-def list_qualities() -> List[str]:
-    """ Return list of data qualities available.
+def list_qualities() -> list[str]:
+    """Return list of data qualities available.
 
     The function performs an API call to retrieve the entire list of
     data qualities that are computed on the datasets uploaded.
@@ -176,34 +65,40 @@ def list_qualities() -> List[str]:
     list
     """
     api_call = "data/qualities/list"
-    xml_string = openml._api_calls._perform_api_call(api_call, 'get')
-    qualities = xmltodict.parse(xml_string, force_list=('oml:quality'))
+    xml_string = openml._api_calls._perform_api_call(api_call, "get")
+    qualities = xmltodict.parse(xml_string, force_list=("oml:quality",))
     # Minimalistic check if the XML is useful
-    if 'oml:data_qualities_list' not in qualities:
-        raise ValueError('Error in return XML, does not contain '
-                         '"oml:data_qualities_list"')
-    if not isinstance(qualities['oml:data_qualities_list']['oml:quality'], list):
-        raise TypeError('Error in return XML, does not contain '
-                        '"oml:quality" as a list')
-    qualities = qualities['oml:data_qualities_list']['oml:quality']
-    return qualities
+    if "oml:data_qualities_list" not in qualities:
+        raise ValueError('Error in return XML, does not contain "oml:data_qualities_list"')
+
+    if not isinstance(qualities["oml:data_qualities_list"]["oml:quality"], list):
+        raise TypeError('Error in return XML, does not contain "oml:quality" as a list')
+
+    return qualities["oml:data_qualities_list"]["oml:quality"]
 
 
 def list_datasets(
-    offset: Optional[int] = None,
-    size: Optional[int] = None,
-    status: Optional[str] = None,
-    tag: Optional[str] = None,
-    output_format: str = 'dict',
-    **kwargs
-) -> Union[Dict, pd.DataFrame]:
+    data_id: list[int] | None = None,
+    offset: int | None = None,
+    size: int | None = None,
+    status: str | None = None,
+    tag: str | None = None,
+    data_name: str | None = None,
+    data_version: int | None = None,
+    number_instances: int | str | None = None,
+    number_features: int | str | None = None,
+    number_classes: int | str | None = None,
+    number_missing_values: int | str | None = None,
+) -> pd.DataFrame:
+    """Return a dataframe of all datasets which are on OpenML.
 
-    """
-    Return a list of all dataset which are on OpenML.
     Supports large amount of results.
 
     Parameters
     ----------
+    data_id : list, optional
+        A list of data ids, to specify which datasets should be
+        listed
     offset : int, optional
         The number of datasets to skip, starting from the first.
     size : int, optional
@@ -213,63 +108,67 @@ def list_datasets(
         default active datasets are returned, but also datasets
         from another status can be requested.
     tag : str, optional
-    output_format: str, optional (default='dict')
-        The parameter decides the format of the output.
-        - If 'dict' the output is a dict of dict
-        - If 'dataframe' the output is a pandas DataFrame
-    kwargs : dict, optional
-        Legal filter operators (keys in the dict):
-        data_name, data_version, number_instances,
-        number_features, number_classes, number_missing_values.
+    data_name : str, optional
+    data_version : int, optional
+    number_instances : int | str, optional
+    number_features : int | str, optional
+    number_classes : int | str, optional
+    number_missing_values : int | str, optional
 
     Returns
     -------
-    datasets : dict of dicts, or dataframe
-        - If output_format='dict'
-            A mapping from dataset ID to dict.
-
-            Every dataset is represented by a dictionary containing
-            the following information:
-            - dataset id
-            - name
-            - format
-            - status
-            If qualities are calculated for the dataset, some of
-            these are also returned.
-
-        - If output_format='dataframe'
-            Each row maps to a dataset
-            Each column contains the following information:
-            - dataset id
-            - name
-            - format
-            - status
-            If qualities are calculated for the dataset, some of
-            these are also included as columns.
+    datasets: dataframe
+        Each row maps to a dataset
+        Each column contains the following information:
+        - dataset id
+        - name
+        - format
+        - status
+        If qualities are calculated for the dataset, some of
+        these are also included as columns.
     """
-    if output_format not in ['dataframe', 'dict']:
-        raise ValueError("Invalid output format selected. "
-                         "Only 'dict' or 'dataframe' applicable.")
+    listing_call = partial(
+        _list_datasets,
+        data_id=data_id,
+        status=status,
+        tag=tag,
+        data_name=data_name,
+        data_version=data_version,
+        number_instances=number_instances,
+        number_features=number_features,
+        number_classes=number_classes,
+        number_missing_values=number_missing_values,
+    )
+    batches = openml.utils._list_all(listing_call, offset=offset, limit=size)
+    if len(batches) == 0:
+        return pd.DataFrame()
 
-    return openml.utils._list_all(output_format=output_format,
-                                  listing_call=_list_datasets,
-                                  offset=offset,
-                                  size=size,
-                                  status=status,
-                                  tag=tag,
-                                  **kwargs)
+    return pd.concat(batches)
 
 
-def _list_datasets(output_format='dict', **kwargs):
+def _list_datasets(
+    limit: int,
+    offset: int,
+    *,
+    data_id: list[int] | None = None,
+    **kwargs: Any,
+) -> pd.DataFrame:
     """
     Perform api call to return a list of all datasets.
 
     Parameters
     ----------
-    output_format: str, optional (default='dict')
-        The parameter decides the format of the output.
-        - If 'dict' the output is a dict of dict
-        - If 'dataframe' the output is a pandas DataFrame
+    The arguments that are lists are separated from the single value
+    ones which are put into the kwargs.
+    limit and offset are also separated from the kwargs since they are
+    required positional arguments.
+
+    limit : int
+        The maximum number of datasets to show.
+    offset : int
+        The number of datasets to skip, starting from the first.
+    data_id : list, optional
+
     kwargs : dict, optional
         Legal filter operators (keys in the dict):
         tag, status, limit, offset, data_name, data_version, number_instances,
@@ -277,92 +176,125 @@ def _list_datasets(output_format='dict', **kwargs):
 
     Returns
     -------
-    datasets : dict of dicts, or dataframe
+    datasets : dataframe
     """
-
     api_call = "data/list"
 
+    if limit is not None:
+        api_call += f"/limit/{limit}"
+    if offset is not None:
+        api_call += f"/offset/{offset}"
+
     if kwargs is not None:
         for operator, value in kwargs.items():
-            api_call += "/%s/%s" % (operator, value)
-    return __list_datasets(api_call=api_call, output_format=output_format)
-
+            if value is not None:
+                api_call += f"/{operator}/{value}"
+    if data_id is not None:
+        api_call += f"/data_id/{','.join([str(int(i)) for i in data_id])}"
+    return __list_datasets(api_call=api_call)
 
-def __list_datasets(api_call, output_format='dict'):
 
-    xml_string = openml._api_calls._perform_api_call(api_call, 'get')
-    datasets_dict = xmltodict.parse(xml_string, force_list=('oml:dataset',))
+def __list_datasets(api_call: str) -> pd.DataFrame:
+    xml_string = openml._api_calls._perform_api_call(api_call, "get")
+    datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",))
 
     # Minimalistic check if the XML is useful
-    assert type(datasets_dict['oml:data']['oml:dataset']) == list, \
-        type(datasets_dict['oml:data'])
-    assert datasets_dict['oml:data']['@xmlns:oml'] == \
-        'http://openml.org/openml', datasets_dict['oml:data']['@xmlns:oml']
-
-    datasets = dict()
-    for dataset_ in datasets_dict['oml:data']['oml:dataset']:
-        ignore_attribute = ['oml:file_id', 'oml:quality']
-        dataset = {k.replace('oml:', ''): v
-                   for (k, v) in dataset_.items()
-                   if k not in ignore_attribute}
-        dataset['did'] = int(dataset['did'])
-        dataset['version'] = int(dataset['version'])
+    assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type(
+        datasets_dict["oml:data"],
+    )
+    assert datasets_dict["oml:data"]["@xmlns:oml"] == "http://openml.org/openml", datasets_dict[
+        "oml:data"
+    ]["@xmlns:oml"]
+
+    datasets = {}
+    for dataset_ in datasets_dict["oml:data"]["oml:dataset"]:
+        ignore_attribute = ["oml:file_id", "oml:quality"]
+        dataset = {
+            k.replace("oml:", ""): v for (k, v) in dataset_.items() if k not in ignore_attribute
+        }
+        dataset["did"] = int(dataset["did"])
+        dataset["version"] = int(dataset["version"])
 
         # The number of qualities can range from 0 to infinity
-        for quality in dataset_.get('oml:quality', list()):
+        for quality in dataset_.get("oml:quality", []):
             try:
-                dataset[quality['@name']] = int(quality['#text'])
+                dataset[quality["@name"]] = int(quality["#text"])
             except ValueError:
-                dataset[quality['@name']] = float(quality['#text'])
-        datasets[dataset['did']] = dataset
+                dataset[quality["@name"]] = float(quality["#text"])
+        datasets[dataset["did"]] = dataset
+
+    return pd.DataFrame.from_dict(datasets, orient="index").astype(
+        {
+            "did": int,
+            "version": int,
+            "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]),
+        }
+    )
 
-    if output_format == 'dataframe':
-        datasets = pd.DataFrame.from_dict(datasets, orient='index')
 
-    return datasets
+def _expand_parameter(parameter: str | list[str] | None) -> list[str]:
+    """Normalise ``parameter`` to a list: split strings on commas, pass lists through."""
+    if isinstance(parameter, str):
+        return [part.strip() for part in parameter.split(",")]
+    if isinstance(parameter, list):
+        return parameter
+    return []
 
 
-def _load_features_from_file(features_file: str) -> Dict:
-    with io.open(features_file, encoding='utf8') as fh:
-        features_xml = fh.read()
-        xml_dict = xmltodict.parse(features_xml,
-                                   force_list=('oml:feature', 'oml:nominal_value'))
-        return xml_dict["oml:data_features"]
+def _validated_data_attributes(
+    attributes: list[str],
+    data_attributes: list[tuple[str, Any]],
+    parameter_name: str,
+) -> None:
+    """Raise ``ValueError`` unless every entry of ``attributes`` names a data attribute."""
+    for attribute_ in attributes:
+        if any(dattr[0] == attribute_ for dattr in data_attributes):
+            continue
+        raise ValueError(
+            f"all attribute of '{parameter_name}' should be one of the data attribute. "
+            f" Got '{attribute_}' while candidates are {[dattr[0] for dattr in data_attributes]}.",
+        )
 
 
-def check_datasets_active(dataset_ids: List[int]) -> Dict[int, bool]:
+def check_datasets_active(
+    dataset_ids: list[int],
+    raise_error_if_not_exist: bool = True,  # noqa: FBT002
+) -> dict[int, bool]:
     """
     Check if the dataset ids provided are active.
 
+    Raises an error if a dataset_id in the given list
+    of dataset_ids does not exist on the server and
+    `raise_error_if_not_exist` is set to True (default).
+
     Parameters
     ----------
     dataset_ids : List[int]
         A list of integers representing dataset ids.
+    raise_error_if_not_exist : bool (default=True)
+        Flag that if activated can raise an error, if one or more of the
+        given dataset ids do not exist on the server.
 
     Returns
     -------
     dict
         A dictionary with items {did: bool}
     """
-    dataset_list = list_datasets(status='all')
-    active = {}
-
-    for did in dataset_ids:
-        dataset = dataset_list.get(did, None)
-        if dataset is None:
-            raise ValueError('Could not find dataset {} in OpenML dataset list.'.format(did))
-        else:
-            active[did] = (dataset['status'] == 'active')
-
-    return active
+    datasets = list_datasets(status="all", data_id=dataset_ids)
+    missing = set(dataset_ids) - set(datasets.index)
+    if raise_error_if_not_exist and missing:
+        missing_str = ", ".join(str(did) for did in missing)
+        raise ValueError(f"Could not find dataset(s) {missing_str} in OpenML dataset list.")
+    # An empty listing has no "status" column; return {} rather than raising KeyError.
+    return {} if datasets.empty else dict(datasets["status"] == "active")
 
 
 def _name_to_id(
     dataset_name: str,
-    version: Optional[int] = None,
-    error_if_multiple: bool = False
+    version: int | None = None,
+    error_if_multiple: bool = False,  # noqa: FBT002
 ) -> int:
-    """ Attempt to find the dataset id of the dataset with the given name.
+    """Attempt to find the dataset id of the dataset with the given name.
 
     If multiple datasets with the name exist, and ``error_if_multiple`` is ``False``,
     then return the least recent still active dataset.
@@ -374,34 +306,43 @@ def _name_to_id(
     ----------
     dataset_name : str
         The name of the dataset for which to find its id.
-    version : int
+    version : int, optional
         Version to retrieve. If not specified, the oldest active version is returned.
     error_if_multiple : bool (default=False)
         If `False`, if multiple datasets match, return the least recent active dataset.
         If `True`, if multiple datasets match, raise an error.
+    download_qualities : bool, optional (default=True)
+        If `True`, also download qualities.xml file. If False it skip the qualities.xml.
 
     Returns
     -------
     int
        The id of the dataset.
     """
-    status = None if version is not None else 'active'
-    candidates = list_datasets(data_name=dataset_name, status=status, data_version=version)
+    status = None if version is not None else "active"
+    candidates = list_datasets(
+        data_name=dataset_name,
+        status=status,
+        data_version=version,
+    )
     if error_if_multiple and len(candidates) > 1:
-        raise ValueError("Multiple active datasets exist with name {}".format(dataset_name))
-    if len(candidates) == 0:
-        no_dataset_for_name = "No active datasets exist with name {}".format(dataset_name)
-        and_version = " and version {}".format(version) if version is not None else ""
+        msg = f"Multiple active datasets exist with name '{dataset_name}'."
+        raise ValueError(msg)
+
+    if candidates.empty:
+        no_dataset_for_name = f"No active datasets exist with name '{dataset_name}'"
+        and_version = f" and version '{version}'." if version is not None else "."
         raise RuntimeError(no_dataset_for_name + and_version)
 
     # Dataset ids are chronological so we can just sort based on ids (instead of version)
-    return sorted(candidates)[0]
+    return candidates["did"].min()  # type: ignore
 
 
 def get_datasets(
-    dataset_ids: List[Union[str, int]],
-    download_data: bool = True,
-) -> List[OpenMLDataset]:
+    dataset_ids: list[str | int],
+    download_data: bool = False,  # noqa: FBT002
+    download_qualities: bool = False,  # noqa: FBT002
+) -> list[OpenMLDataset]:
     """Download datasets.
 
     This function iterates :meth:`openml.datasets.get_dataset`.
@@ -416,6 +357,8 @@ def get_datasets(
         make the operation noticeably slower. Metadata is also still retrieved.
         If False, create the OpenMLDataset and only populate it with the metadata.
         The data may later be retrieved through the `OpenMLDataset.get_data` method.
+    download_qualities : bool, optional (default=False)
+        If True, also download the qualities.xml file. If False, skip the qualities.xml file.
 
     Returns
     -------
@@ -424,33 +367,49 @@ def get_datasets(
     """
     datasets = []
     for dataset_id in dataset_ids:
-        datasets.append(get_dataset(dataset_id, download_data))
+        datasets.append(
+            get_dataset(dataset_id, download_data, download_qualities=download_qualities),
+        )
     return datasets
 
 
 @openml.utils.thread_safe_if_oslo_installed
-def get_dataset(
-    dataset_id: Union[int, str],
-    download_data: bool = True,
-    version: int = None,
-    error_if_multiple: bool = False
+def get_dataset(  # noqa: C901, PLR0912
+    dataset_id: int | str,
+    download_data: bool = False,  # noqa: FBT002
+    version: int | None = None,
+    error_if_multiple: bool = False,  # noqa: FBT002
+    cache_format: Literal["pickle", "feather"] = "pickle",
+    download_qualities: bool = False,  # noqa: FBT002
+    download_features_meta_data: bool = False,  # noqa: FBT002
+    download_all_files: bool = False,  # noqa: FBT002
+    force_refresh_cache: bool = False,  # noqa: FBT002
 ) -> OpenMLDataset:
-    """ Download the OpenML dataset representation, optionally also download actual data file.
+    """Download the OpenML dataset representation, optionally also download actual data file.
+
+    This function is by default NOT thread/multiprocessing safe, as this function uses caching.
+    A check will be performed to determine if the information has previously been downloaded to a
+    cache, and if so be loaded from disk instead of retrieved from the server.
+
+    To make this function thread safe, you can install the python package ``oslo.concurrency``.
+    If ``oslo.concurrency`` is installed `get_dataset` becomes thread safe.
 
-    This function is thread/multiprocessing safe.
-    This function uses caching. A check will be performed to determine if the information has
-    previously been downloaded, and if so be loaded from disk instead of retrieved from the server.
+    Alternatively, to make this function thread/multiprocessing safe initialize the cache first by
+    calling `get_dataset(args)` once before calling `get_dataset(args)` many times in parallel.
+    This will initialize the cache and later calls will use the cache in a thread/multiprocessing
+    safe way.
 
     If dataset is retrieved by name, a version may be specified.
     If no version is specified and multiple versions of the dataset exist,
     the earliest version of the dataset that is still active will be returned.
-    This scenario will raise an error instead if `exception_if_multiple` is `True`.
+    If no version is specified, multiple versions of the dataset exist and
+    ``error_if_multiple`` is set to ``True``, this function will raise an exception.
 
     Parameters
     ----------
     dataset_id : int or str
-        Dataset ID of the dataset to download
-    download_data : bool, optional (default=True)
+        Dataset ID (integer) or dataset name (string) of the dataset to download.
+    download_data : bool (default=False)
         If True, also download the data file. Beware that some datasets are large and it might
         make the operation noticeably slower. Metadata is also still retrieved.
         If False, create the OpenMLDataset and only populate it with the metadata.
@@ -458,56 +417,124 @@ def get_dataset(
     version : int, optional (default=None)
         Specifies the version if `dataset_id` is specified by name.
         If no version is specified, retrieve the least recent still active version.
-    error_if_multiple : bool, optional (default=False)
-        If `True` raise an error if multiple datasets are found with matching criteria.
+    error_if_multiple : bool (default=False)
+        If ``True`` raise an error if multiple datasets are found with matching criteria.
+    cache_format : str (default='pickle') in {'pickle', 'feather'}
+        Format for caching the dataset - may be feather or pickle
+        Note that the default 'pickle' option may load slower than feather when
+        no.of.rows is very high.
+    download_qualities : bool (default=False)
+        Option to download 'qualities' meta-data in addition to the minimal dataset description.
+        If True, download and cache the qualities file.
+        If False, create the OpenMLDataset without qualities metadata. The data may later be added
+        to the OpenMLDataset through the `OpenMLDataset.load_metadata(qualities=True)` method.
+    download_features_meta_data : bool (default=False)
+        Option to download 'features' meta-data in addition to the minimal dataset description.
+        If True, download and cache the features file.
+        If False, create the OpenMLDataset without features metadata. The data may later be added
+        to the OpenMLDataset through the `OpenMLDataset.load_metadata(features=True)` method.
+    download_all_files: bool (default=False)
+        EXPERIMENTAL. Download all files related to the dataset that reside on the server.
+        Useful for datasets which refer to auxiliary files (e.g., meta-album).
+    force_refresh_cache : bool (default=False)
+        Force the cache to be refreshed by deleting the cache directory and re-downloading the data.
+        Note, if `force_refresh_cache` is True, `get_dataset` is NOT thread/multiprocessing safe,
+        because this creates a race condition to creating and deleting the cache; as in general with
+        the cache.
 
     Returns
     -------
     dataset : :class:`openml.OpenMLDataset`
         The downloaded dataset.
     """
+    if download_all_files:
+        warnings.warn(
+            "``download_all_files`` is experimental and is likely to break with new releases.",
+            FutureWarning,
+            stacklevel=2,
+        )
+
+    if cache_format not in ["feather", "pickle"]:
+        raise ValueError(
+            "cache_format must be one of 'feather' or 'pickle'. "
+            f"Invalid format specified: {cache_format}",
+        )
+
     if isinstance(dataset_id, str):
         try:
             dataset_id = int(dataset_id)
         except ValueError:
             dataset_id = _name_to_id(dataset_id, version, error_if_multiple)  # type: ignore
     elif not isinstance(dataset_id, int):
-        raise TypeError("`dataset_id` must be one of `str` or `int`, not {}."
-                        .format(type(dataset_id)))
+        raise TypeError(
+            f"`dataset_id` must be one of `str` or `int`, not {type(dataset_id)}.",
+        )
+
+    if force_refresh_cache:
+        did_cache_dir = _get_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, dataset_id)
+        if did_cache_dir.exists():
+            _remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir)
 
     did_cache_dir = _create_cache_directory_for_id(
-        DATASETS_CACHE_DIR_NAME, dataset_id,
+        DATASETS_CACHE_DIR_NAME,
+        dataset_id,
     )
 
+    remove_dataset_cache = True
     try:
-        remove_dataset_cache = True
         description = _get_dataset_description(did_cache_dir, dataset_id)
-        features = _get_dataset_features(did_cache_dir, dataset_id)
-        qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
+        features_file = None
+        qualities_file = None
 
-        arff_file = _get_dataset_arff(description) if download_data else None
+        if download_features_meta_data:
+            features_file = _get_dataset_features_file(did_cache_dir, dataset_id)
+        if download_qualities:
+            qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id)
+
+        parquet_file = None
+        skip_parquet = (
+            os.environ.get(openml.config.OPENML_SKIP_PARQUET_ENV_VAR, "false").casefold() == "true"
+        )
+        download_parquet = "oml:parquet_url" in description and not skip_parquet
+        if download_parquet and (download_data or download_all_files):
+            try:
+                parquet_file = _get_dataset_parquet(
+                    description,
+                    download_all_files=download_all_files,
+                )
+            except urllib3.exceptions.MaxRetryError:
+                parquet_file = None
+
+        arff_file = None
+        if parquet_file is None and download_data:
+            if download_parquet:
+                logger.warning("Failed to download parquet, fallback on ARFF.")
+            arff_file = _get_dataset_arff(description)
 
         remove_dataset_cache = False
     except OpenMLServerException as e:
-        # if there was an exception,
+        # if there was an exception
         # check if the user had access to the dataset
-        if e.code == 112:
+        if e.code == NO_ACCESS_GRANTED_ERRCODE:
             raise OpenMLPrivateDatasetError(e.message) from None
-        else:
-            raise e
+
+        raise e
     finally:
         if remove_dataset_cache:
-            _remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME,
-                                     did_cache_dir)
-
-    dataset = _create_dataset_from_description(
-        description, features, qualities, arff_file
+            _remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir)
+
+    return _create_dataset_from_description(
+        description,
+        features_file,
+        qualities_file,
+        arff_file,
+        parquet_file,
+        cache_format,
     )
-    return dataset
 
 
-def attributes_arff_from_df(df):
-    """ Describe attributes of the dataframe according to ARFF specification.
+def attributes_arff_from_df(df: pd.DataFrame) -> list[tuple[str, list[str] | str]]:
+    """Describe attributes of the dataframe according to ARFF specification.
 
     Parameters
     ----------
@@ -516,57 +543,73 @@ def attributes_arff_from_df(df):
 
     Returns
     -------
-    attributes_arff : str
+    attributes_arff : list[tuple[str, list[str] | str]]
         The data set attributes as required by the ARFF format.
     """
-    PD_DTYPES_TO_ARFF_DTYPE = {
-        'integer': 'INTEGER',
-        'floating': 'REAL',
-        'string': 'STRING'
-    }
-    attributes_arff = []
+    PD_DTYPES_TO_ARFF_DTYPE = {"integer": "INTEGER", "floating": "REAL", "string": "STRING"}
+    attributes_arff: list[tuple[str, list[str] | str]] = []
+
+    if not all(isinstance(column_name, str) for column_name in df.columns):
+        logger.warning("Converting non-str column names to str.")
+        df.columns = [str(column_name) for column_name in df.columns]
+
     for column_name in df:
         # skipna=True does not infer properly the dtype. The NA values are
         # dropped before the inference instead.
-        column_dtype = pd.api.types.infer_dtype(df[column_name].dropna())
+        column_dtype = pd.api.types.infer_dtype(df[column_name].dropna(), skipna=False)
 
-        if column_dtype == 'categorical':
+        if column_dtype == "categorical":
             # for categorical feature, arff expects a list string. However, a
             # categorical column can contain mixed type and should therefore
             # raise an error asking to convert all entries to string.
             categories = df[column_name].cat.categories
             categories_dtype = pd.api.types.infer_dtype(categories)
-            if categories_dtype not in ('string', 'unicode'):
-                raise ValueError("The column '{}' of the dataframe is of "
-                                 "'category' dtype. Therefore, all values in "
-                                 "this columns should be string. Please "
-                                 "convert the entries which are not string. "
-                                 "Got {} dtype in this column."
-                                 .format(column_name, categories_dtype))
+            if categories_dtype not in ("string", "unicode"):
+                raise ValueError(
+                    f"The column '{column_name}' of the dataframe is of "
+                    "'category' dtype. Therefore, all values in "
+                    "this columns should be string. Please "
+                    "convert the entries which are not string. "
+                    f"Got {categories_dtype} dtype in this column.",
+                )
             attributes_arff.append((column_name, categories.tolist()))
-        elif column_dtype == 'boolean':
+        elif column_dtype == "boolean":
             # boolean are encoded as categorical.
-            attributes_arff.append((column_name, ['True', 'False']))
-        elif column_dtype in PD_DTYPES_TO_ARFF_DTYPE.keys():
-            attributes_arff.append((column_name,
-                                    PD_DTYPES_TO_ARFF_DTYPE[column_dtype]))
+            attributes_arff.append((column_name, ["True", "False"]))
+        elif column_dtype in PD_DTYPES_TO_ARFF_DTYPE:
+            attributes_arff.append((column_name, PD_DTYPES_TO_ARFF_DTYPE[column_dtype]))
         else:
-            raise ValueError("The dtype '{}' of the column '{}' is not "
-                             "currently supported by liac-arff. Supported "
-                             "dtypes are categorical, string, integer, "
-                             "floating, and boolean."
-                             .format(column_dtype, column_name))
+            raise ValueError(
+                f"The dtype '{column_dtype}' of the column '{column_name}' is not "
+                "currently supported by liac-arff. Supported "
+                "dtypes are categorical, string, integer, "
+                "floating, and boolean.",
+            )
     return attributes_arff
 
 
-def create_dataset(name, description, creator, contributor,
-                   collection_date, language,
-                   licence, attributes, data,
-                   default_target_attribute,
-                   ignore_attribute, citation,
-                   row_id_attribute=None,
-                   original_data_url=None, paper_url=None,
-                   update_comment=None, version_label=None):
+def create_dataset(  # noqa: C901, PLR0912, PLR0915
+    name: str,
+    description: str | None,
+    creator: str | None,
+    contributor: str | None,
+    collection_date: str | None,
+    language: str | None,
+    licence: str | None,
+    # TODO(eddiebergman): Docstring says `type` but I don't know what this is other than strings
+    # Edit: Found it could also be like ["True", "False"]
+    attributes: list[tuple[str, str | list[str]]] | dict[str, str | list[str]] | Literal["auto"],
+    data: pd.DataFrame | np.ndarray | scipy.sparse.coo_matrix,
+    # TODO(eddiebergman): Function requires `default_target_attribute` exist but API allows None
+    default_target_attribute: str,
+    ignore_attribute: str | list[str] | None,
+    citation: str,
+    row_id_attribute: str | None = None,
+    original_data_url: str | None = None,
+    paper_url: str | None = None,
+    update_comment: str | None = None,
+    version_label: str | None = None,
+) -> OpenMLDataset:
     """Create a dataset.
 
     This function creates an OpenMLDataset object.
@@ -607,6 +650,7 @@ def create_dataset(name, description, creator, contributor,
     ignore_attribute : str | list
         Attributes that should be excluded in modelling,
         such as identifiers and indexes.
+        Can have multiple values, comma separated.
     citation : str
         Reference(s) that should be cited when building on this data.
     version_label : str, optional
@@ -631,9 +675,9 @@ def create_dataset(name, description, creator, contributor,
     Returns
     -------
     class:`openml.OpenMLDataset`
-        Dataset description."""
-
-    if isinstance(data, (pd.DataFrame, pd.SparseDataFrame)):
+        Dataset description.
+    """
+    if isinstance(data, pd.DataFrame):
         # infer the row id from the index of the dataset
         if row_id_attribute is None:
             row_id_attribute = data.index.name
@@ -642,72 +686,76 @@ def create_dataset(name, description, creator, contributor,
         if data.index.name is not None:
             data = data.reset_index()
 
-    if attributes == 'auto' or isinstance(attributes, dict):
-        if not hasattr(data, "columns"):
-            raise ValueError("Automatically inferring attributes requires "
-                             "a pandas DataFrame or SparseDataFrame. "
-                             "A {!r} was given instead.".format(data))
+    if attributes == "auto" or isinstance(attributes, dict):
+        if not isinstance(data, pd.DataFrame):
+            raise ValueError(
+                "Automatically inferring attributes requires "
+                f"a pandas DataFrame. A {data!r} was given instead.",
+            )
         # infer the type of data for each column of the DataFrame
         attributes_ = attributes_arff_from_df(data)
         if isinstance(attributes, dict):
             # override the attributes which was specified by the user
             for attr_idx in range(len(attributes_)):
                 attr_name = attributes_[attr_idx][0]
-                if attr_name in attributes.keys():
+                if attr_name in attributes:
                     attributes_[attr_idx] = (attr_name, attributes[attr_name])
     else:
         attributes_ = attributes
+    ignore_attributes = _expand_parameter(ignore_attribute)
+    _validated_data_attributes(ignore_attributes, attributes_, "ignore_attribute")
+
+    default_target_attributes = _expand_parameter(default_target_attribute)
+    _validated_data_attributes(default_target_attributes, attributes_, "default_target_attribute")
 
     if row_id_attribute is not None:
-        is_row_id_an_attribute = any([attr[0] == row_id_attribute
-                                      for attr in attributes_])
+        is_row_id_an_attribute = any(attr[0] == row_id_attribute for attr in attributes_)
         if not is_row_id_an_attribute:
             raise ValueError(
                 "'row_id_attribute' should be one of the data attribute. "
-                " Got '{}' while candidates are {}."
-                .format(row_id_attribute, [attr[0] for attr in attributes_])
+                f" Got '{row_id_attribute}' while candidates are"
+                f" {[attr[0] for attr in attributes_]}.",
             )
 
-    if hasattr(data, "columns"):
-        if isinstance(data, pd.SparseDataFrame):
-            data = data.to_coo()
+    if isinstance(data, pd.DataFrame):
+        if all(isinstance(dtype, pd.SparseDtype) for dtype in data.dtypes):
+            data = data.sparse.to_coo()
             # liac-arff only support COO matrices with sorted rows
-            row_idx_sorted = np.argsort(data.row)
-            data.row = data.row[row_idx_sorted]
-            data.col = data.col[row_idx_sorted]
-            data.data = data.data[row_idx_sorted]
+            row_idx_sorted = np.argsort(data.row)  # type: ignore
+            data.row = data.row[row_idx_sorted]  # type: ignore
+            data.col = data.col[row_idx_sorted]  # type: ignore
+            data.data = data.data[row_idx_sorted]  # type: ignore
         else:
-            data = data.values
+            data = data.to_numpy()
 
+    data_format: Literal["arff", "sparse_arff"]
     if isinstance(data, (list, np.ndarray)):
         if isinstance(data[0], (list, np.ndarray)):
-            data_format = 'arff'
+            data_format = "arff"
         elif isinstance(data[0], dict):
-            data_format = 'sparse_arff'
+            data_format = "sparse_arff"
         else:
             raise ValueError(
-                'When giving a list or a numpy.ndarray, '
-                'they should contain a list/ numpy.ndarray '
-                'for dense data or a dictionary for sparse '
-                'data. Got {!r} instead.'
-                .format(data[0])
+                "When giving a list or a numpy.ndarray, "
+                "they should contain a list/ numpy.ndarray "
+                "for dense data or a dictionary for sparse "
+                f"data. Got {data[0]!r} instead.",
             )
     elif isinstance(data, coo_matrix):
-        data_format = 'sparse_arff'
+        data_format = "sparse_arff"
     else:
         raise ValueError(
-            'When giving a list or a numpy.ndarray, '
-            'they should contain a list/ numpy.ndarray '
-            'for dense data or a dictionary for sparse '
-            'data. Got {!r} instead.'
-            .format(data[0])
+            "When giving a list or a numpy.ndarray, "
+            "they should contain a list/ numpy.ndarray "
+            "for dense data or a dictionary for sparse "
+            f"data. Got {data[0]!r} instead.",
         )
 
     arff_object = {
-        'relation': name,
-        'description': description,
-        'attributes': attributes_,
-        'data': data
+        "relation": name,
+        "description": description,
+        "attributes": attributes_,
+        "data": data,
     }
 
     # serializes the ARFF dataset object and returns a string
@@ -715,15 +763,12 @@ def create_dataset(name, description, creator, contributor,
     try:
         # check if ARFF is valid
         decoder = arff.ArffDecoder()
-        return_type = arff.COO if data_format == 'sparse_arff' else arff.DENSE
-        decoder.decode(
-            arff_dataset,
-            encode_nominal=True,
-            return_type=return_type
-        )
-    except arff.ArffException:
-        raise ValueError("The arguments you have provided \
-                             do not construct a valid ARFF file")
+        return_type = arff.COO if data_format == "sparse_arff" else arff.DENSE
+        decoder.decode(arff_dataset, encode_nominal=True, return_type=return_type)
+    except arff.ArffException as e:
+        raise ValueError(
+            "The arguments you have provided do not construct a valid ARFF file"
+        ) from e
 
     return OpenMLDataset(
         name=name,
@@ -746,12 +791,12 @@ def create_dataset(name, description, creator, contributor,
     )
 
 
-def status_update(data_id, status):
+def status_update(data_id: int, status: Literal["active", "deactivated"]) -> None:
     """
     Updates the status of a dataset to either 'active' or 'deactivated'.
     Please see the OpenML API documentation for a description of the status
     and all legal status transitions:
-    https://docs.openml.org/#dataset-status
+    https://docs.openml.org/concepts/data/#dataset-status
 
     Parameters
     ----------
@@ -760,30 +805,280 @@ def status_update(data_id, status):
     status : str,
         'active' or 'deactivated'
     """
-    legal_status = {'active', 'deactivated'}
+    legal_status = {"active", "deactivated"}
     if status not in legal_status:
-        raise ValueError('Illegal status value. '
-                         'Legal values: %s' % legal_status)
-    data = {'data_id': data_id, 'status': status}
-    result_xml = openml._api_calls._perform_api_call("data/status/update",
-                                                     'post',
-                                                     data=data)
+        raise ValueError(f"Illegal status value. Legal values: {legal_status}")
+
+    data: openml._api_calls.DATA_TYPE = {"data_id": data_id, "status": status}
+    result_xml = openml._api_calls._perform_api_call("data/status/update", "post", data=data)
     result = xmltodict.parse(result_xml)
-    server_data_id = result['oml:data_status_update']['oml:id']
-    server_status = result['oml:data_status_update']['oml:status']
+    server_data_id = result["oml:data_status_update"]["oml:id"]
+    server_status = result["oml:data_status_update"]["oml:status"]
     if status != server_status or int(data_id) != int(server_data_id):
         # This should never happen
-        raise ValueError('Data id/status does not collide')
+        raise ValueError("Data id/status does not collide")
+
+
+def edit_dataset(
+    data_id: int,
+    description: str | None = None,
+    creator: str | None = None,
+    contributor: str | None = None,
+    collection_date: str | None = None,
+    language: str | None = None,
+    default_target_attribute: str | None = None,
+    ignore_attribute: str | list[str] | None = None,
+    citation: str | None = None,
+    row_id_attribute: str | None = None,
+    original_data_url: str | None = None,
+    paper_url: str | None = None,
+) -> int:
+    """Edits an OpenMLDataset.
+
+    In addition to providing the dataset id of the dataset to edit (through data_id),
+    you must specify a value for at least one of the optional function arguments,
+    i.e. one value for a field to edit.
+
+    This function allows editing of both non-critical and critical fields.
+    Critical fields are default_target_attribute, ignore_attribute, row_id_attribute.
+
+     - Editing non-critical data fields is allowed for all authenticated users.
+     - Editing critical fields is allowed only for the owner, provided there are no tasks
+       associated with this dataset.
+
+    If dataset has tasks or if the user is not the owner, the only way
+    to edit critical fields is to use fork_dataset followed by edit_dataset.
+
+    Parameters
+    ----------
+    data_id : int
+        ID of the dataset.
+    description : str
+        Description of the dataset.
+    creator : str
+        The person who created the dataset.
+    contributor : str
+        People who contributed to the current version of the dataset.
+    collection_date : str
+        The date the data was originally collected, given by the uploader.
+    language : str
+        Language in which the data is represented.
+        Starts with 1 upper case letter, rest lower case, e.g. 'English'.
+    default_target_attribute : str
+        The default target attribute, if it exists.
+        Can have multiple values, comma separated.
+    ignore_attribute : str | list
+        Attributes that should be excluded in modelling,
+        such as identifiers and indexes.
+    citation : str
+        Reference(s) that should be cited when building on this data.
+    row_id_attribute : str, optional
+        The attribute that represents the row-id column, if present in the
+        dataset. If ``data`` is a dataframe and ``row_id_attribute`` is not
+        specified, the index of the dataframe will be used as the
+        ``row_id_attribute``. If the name of the index is ``None``, it will
+        be discarded.
+
+        .. versionadded:: 0.8
+            Inference of ``row_id_attribute`` from a dataframe.
+    original_data_url : str, optional
+        For derived data, the url to the original dataset.
+    paper_url : str, optional
+        Link to a paper describing the dataset.
+
+    Returns
+    -------
+    Dataset id
+    """
+    if not isinstance(data_id, int):
+        raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.")
+
+    # compose data edit parameters as xml
+    form_data = {"data_id": data_id}  # type: openml._api_calls.DATA_TYPE
+    xml = OrderedDict()  # type: 'OrderedDict[str, OrderedDict]'
+    xml["oml:data_edit_parameters"] = OrderedDict()
+    xml["oml:data_edit_parameters"]["@xmlns:oml"] = "http://openml.org/openml"
+    xml["oml:data_edit_parameters"]["oml:description"] = description
+    xml["oml:data_edit_parameters"]["oml:creator"] = creator
+    xml["oml:data_edit_parameters"]["oml:contributor"] = contributor
+    xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date
+    xml["oml:data_edit_parameters"]["oml:language"] = language
+    xml["oml:data_edit_parameters"]["oml:default_target_attribute"] = default_target_attribute
+    xml["oml:data_edit_parameters"]["oml:row_id_attribute"] = row_id_attribute
+    xml["oml:data_edit_parameters"]["oml:ignore_attribute"] = ignore_attribute
+    xml["oml:data_edit_parameters"]["oml:citation"] = citation
+    xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url
+    xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url
+
+    # delete unset inputs (None or any other falsy value, e.g. an empty string)
+    for k in list(xml["oml:data_edit_parameters"]):
+        if not xml["oml:data_edit_parameters"][k]:
+            del xml["oml:data_edit_parameters"][k]
+
+    file_elements = {
+        "edit_parameters": ("description.xml", xmltodict.unparse(xml)),
+    }  # type: openml._api_calls.FILE_ELEMENTS_TYPE
+    result_xml = openml._api_calls._perform_api_call(
+        "data/edit",
+        "post",
+        data=form_data,
+        file_elements=file_elements,
+    )
+    result = xmltodict.parse(result_xml)
+    data_id = result["oml:data_edit"]["oml:id"]
+    return int(data_id)
+
+
+def fork_dataset(data_id: int) -> int:
+    """
+     Creates a new dataset version, with the authenticated user as the new owner.
+     The forked dataset can have distinct dataset meta-data,
+     but the actual data itself is shared with the original version.
+
+     This API is intended for use when a user is unable to edit the critical fields of a dataset
+     through the edit_dataset API.
+     (Critical fields are default_target_attribute, ignore_attribute, row_id_attribute.)
+
+     Specifically, this happens when the user is:
+            1. Not the owner of the dataset.
+            2. User is the owner of the dataset, but the dataset has tasks.
+
+     In these two cases the only way to edit critical fields is:
+            1. STEP 1: Fork the dataset using fork_dataset API
+            2. STEP 2: Call edit_dataset API on the forked version.
+
+
+    Parameters
+    ----------
+    data_id : int
+        id of the dataset to be forked
+
+    Returns
+    -------
+    Dataset id of the forked dataset
+
+    """
+    if not isinstance(data_id, int):
+        raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.")
+    # compose data fork parameters
+    form_data = {"data_id": data_id}  # type: openml._api_calls.DATA_TYPE
+    result_xml = openml._api_calls._perform_api_call("data/fork", "post", data=form_data)
+    result = xmltodict.parse(result_xml)
+    data_id = result["oml:data_fork"]["oml:id"]
+    return int(data_id)
+
+
+def data_feature_add_ontology(data_id: int, index: int, ontology: str) -> bool:
+    """
+    An ontology describes the concepts that are described in a feature. An
+    ontology is defined by a URL where the information is provided. Adds
+    an ontology (URL) to a given dataset feature (defined by a dataset id
+    and index). The dataset has to exist on OpenML and needs to have been
+    processed by the evaluation engine.
+
+    Parameters
+    ----------
+    data_id : int
+        id of the dataset to which the feature belongs
+    index : int
+        index of the feature in dataset (0-based)
+    ontology : str
+        URL to ontology (max. 256 characters)
+
+    Returns
+    -------
+    True or throws an OpenML server exception
+    """
+    upload_data: dict[str, int | str] = {"data_id": data_id, "index": index, "ontology": ontology}
+    openml._api_calls._perform_api_call("data/feature/ontology/add", "post", data=upload_data)
+    # an error will be thrown in case the request was unsuccessful
+    return True
+
+
+def data_feature_remove_ontology(data_id: int, index: int, ontology: str) -> bool:
+    """
+    Removes an existing ontology (URL) from a given dataset feature (defined
+    by a dataset id and index). The dataset has to exist on OpenML and needs
+    to have been processed by the evaluation engine. The ontology needs to be
+    attached to the specific feature.
+
+    Parameters
+    ----------
+    data_id : int
+        id of the dataset to which the feature belongs
+    index : int
+        index of the feature in dataset (0-based)
+    ontology : str
+        URL to ontology (max. 256 characters)
+
+    Returns
+    -------
+    True or throws an OpenML server exception
+    """
+    upload_data: dict[str, int | str] = {"data_id": data_id, "index": index, "ontology": ontology}
+    openml._api_calls._perform_api_call("data/feature/ontology/remove", "post", data=upload_data)
+    # an error will be thrown in case the request was unsuccessful
+    return True
+
+
+def _topic_add_dataset(data_id: int, topic: str) -> int:
+    """
+    Adds a topic for a dataset.
+    This API is not available for all OpenML users and is accessible only by admins.
+
+    Parameters
+    ----------
+    data_id : int
+        id of the dataset for which the topic needs to be added
+    topic : str
+        Topic to be added for the dataset
+
+    Returns
+    -------
+    Dataset id
+    """
+    if not isinstance(data_id, int):
+        raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.")
+    form_data = {"data_id": data_id, "topic": topic}  # type: openml._api_calls.DATA_TYPE
+    result_xml = openml._api_calls._perform_api_call("data/topicadd", "post", data=form_data)
+    result = xmltodict.parse(result_xml)
+    data_id = result["oml:data_topic"]["oml:id"]
+    return int(data_id)
+
+
+def _topic_delete_dataset(data_id: int, topic: str) -> int:
+    """
+    Removes a topic from a dataset.
+    This API is not available for all OpenML users and is accessible only by admins.
 
+    Parameters
+    ----------
+    data_id : int
+        id of the dataset from which the topic is to be removed
+    topic : str
+        Topic to be deleted
+
+    Returns
+    -------
+    Dataset id
+    """
+    if not isinstance(data_id, int):
+        raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.")
+    form_data = {"data_id": data_id, "topic": topic}  # type: openml._api_calls.DATA_TYPE
+    result_xml = openml._api_calls._perform_api_call("data/topicdelete", "post", data=form_data)
+    result = xmltodict.parse(result_xml)
+    data_id = result["oml:data_topic"]["oml:id"]
+    return int(data_id)
 
-def _get_dataset_description(did_cache_dir, dataset_id):
+
+def _get_dataset_description(did_cache_dir: Path, dataset_id: int) -> dict[str, Any]:
     """Get the dataset description as xml dictionary.
 
     This function is NOT thread/multiprocessing safe.
 
     Parameters
     ----------
-    did_cache_dir : str
+    did_cache_dir : Path
         Cache subdirectory for this dataset.
 
     dataset_id : int
@@ -795,29 +1090,105 @@ def _get_dataset_description(did_cache_dir, dataset_id):
         XML Dataset description parsed to a dict.
 
     """
-
     # TODO implement a cache for this that invalidates itself after some time
     # This can be saved on disk, but cannot be cached properly, because
     # it contains the information on whether a dataset is active.
-    description_file = os.path.join(did_cache_dir, "description.xml")
+    description_file = did_cache_dir / "description.xml"
 
     try:
-        return _get_cached_dataset_description(dataset_id)
-    except OpenMLCacheException:
-        url_extension = "data/{}".format(dataset_id)
-        dataset_xml = openml._api_calls._perform_api_call(url_extension, 'get')
-        with io.open(description_file, "w", encoding='utf8') as fh:
+        with description_file.open(encoding="utf8") as fh:
+            dataset_xml = fh.read()
+        description = xmltodict.parse(dataset_xml)["oml:data_set_description"]
+    except Exception:  # noqa: BLE001
+        url_extension = f"data/{dataset_id}"
+        dataset_xml = openml._api_calls._perform_api_call(url_extension, "get")
+        try:
+            description = xmltodict.parse(dataset_xml)["oml:data_set_description"]
+        except ExpatError as e:
+            url = openml._api_calls._create_url_from_endpoint(url_extension)
+            raise OpenMLServerError(f"Dataset description XML at '{url}' is malformed.") from e
+
+        with description_file.open("w", encoding="utf8") as fh:
             fh.write(dataset_xml)
 
-    description = xmltodict.parse(dataset_xml)[
-        "oml:data_set_description"]
+    return description  # type: ignore
+
+
+def _get_dataset_parquet(
+    description: dict | OpenMLDataset,
+    cache_directory: Path | None = None,
+    download_all_files: bool = False,  # noqa: FBT002
+) -> Path | None:
+    """Return the path to the local parquet file of the dataset, downloading it if not cached.
+
+    Checks if the file is in the cache, if yes, return the path to the file.
+    If not, downloads the file and caches it, then returns the file path.
+    The cache directory is generated based on dataset information, but can also be specified.
+
+    This function is NOT thread/multiprocessing safe.
+    Unlike the ARFF equivalent, checksums are not available/used (for now).
+
+    Parameters
+    ----------
+    description : dictionary or OpenMLDataset
+        Either a dataset description as dict or OpenMLDataset.
 
-    return description
+    cache_directory: Path, optional (default=None)
+        Folder to store the parquet file in.
+        If None, use the default cache directory for the dataset.
 
+    download_all_files: bool, optional (default=False)
+        If `True`, download all data found in the bucket to which the description's
+        ``parquet_url`` points, only download the parquet file otherwise.
 
-def _get_dataset_arff(description: Union[Dict, OpenMLDataset],
-                      cache_directory: str = None) -> str:
-    """ Return the path to the local arff file of the dataset. If is not cached, it is downloaded.
+    Returns
+    -------
+    output_filename : Path, optional
+        Location of the Parquet file if successfully downloaded, None otherwise.
+    """
+    if isinstance(description, dict):
+        url = str(description.get("oml:parquet_url"))
+        did = int(description.get("oml:id"))  # type: ignore
+    elif isinstance(description, OpenMLDataset):
+        url = str(description._parquet_url)
+        assert description.dataset_id is not None
+
+        did = int(description.dataset_id)
+    else:
+        raise TypeError("`description` should be either OpenMLDataset or Dict.")
+
+    if cache_directory is None:
+        cache_directory = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, did)
+
+    output_file_path = cache_directory / f"dataset_{did}.pq"
+
+    old_file_path = cache_directory / "dataset.pq"
+    if old_file_path.is_file():
+        old_file_path.rename(output_file_path)
+
+    # The call below skips files already on disk, so avoids downloading the parquet file twice.
+    # To force the old behavior of always downloading everything, use `force_refresh_cache`
+    # of `get_dataset`
+    if download_all_files:
+        openml._api_calls._download_minio_bucket(source=url, destination=cache_directory)
+
+    if not output_file_path.is_file():
+        try:
+            openml._api_calls._download_minio_file(
+                source=url,
+                destination=output_file_path,
+            )
+        except (FileNotFoundError, urllib3.exceptions.MaxRetryError, minio.error.ServerError) as e:
+            logger.warning(f"Could not download file from {url}: {e}")
+            return None
+    return output_file_path
+
+
+def _get_dataset_arff(
+    description: dict | OpenMLDataset,
+    cache_directory: Path | None = None,
+) -> Path:
+    """Return the path to the local arff file of the dataset, downloading it if not cached.
 
     Checks if the file is in the cache, if yes, return the path to the file.
     If not, downloads the file and caches it, then returns the file path.
@@ -830,46 +1201,57 @@ def _get_dataset_arff(description: Union[Dict, OpenMLDataset],
     description : dictionary or OpenMLDataset
         Either a dataset description as dict or OpenMLDataset.
 
-    cache_directory: str, optional (default=None)
+    cache_directory: Path, optional (default=None)
         Folder to store the arff file in.
         If None, use the default cache directory for the dataset.
 
     Returns
     -------
-    output_filename : string
+    output_filename : Path
         Location of ARFF file.
     """
     if isinstance(description, dict):
         md5_checksum_fixture = description.get("oml:md5_checksum")
-        url = description['oml:url']
-        did = description.get('oml:id')
+        url = str(description["oml:url"])
+        did = int(description.get("oml:id"))  # type: ignore
     elif isinstance(description, OpenMLDataset):
         md5_checksum_fixture = description.md5_checksum
+        assert description.url is not None
+        assert description.dataset_id is not None
+
         url = description.url
-        did = description.dataset_id
+        did = int(description.dataset_id)
     else:
         raise TypeError("`description` should be either OpenMLDataset or Dict.")
 
-    if cache_directory is None:
-        cache_directory = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, did)
-    output_file_path = os.path.join(cache_directory, "dataset.arff")
+    save_cache_directory = (
+        _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, did)
+        if cache_directory is None
+        else Path(cache_directory)
+    )
+    output_file_path = save_cache_directory / "dataset.arff"
 
     try:
-        openml.utils._download_text_file(
+        openml._api_calls._download_text_file(
             source=url,
             output_path=output_file_path,
-            md5_checksum=md5_checksum_fixture
+            md5_checksum=md5_checksum_fixture,
         )
     except OpenMLHashException as e:
-        additional_info = " Raised when downloading dataset {}.".format(did)
+        additional_info = f" Raised when downloading dataset {did}."
         e.args = (e.args[0] + additional_info,)
-        raise
+        raise e
 
     return output_file_path
 
 
-def _get_dataset_features(did_cache_dir, dataset_id):
-    """API call to get dataset features (cached)
+def _get_features_xml(dataset_id: int) -> str:
+    url_extension = f"data/features/{dataset_id}"
+    return openml._api_calls._perform_api_call(url_extension, "get")
+
+
+def _get_dataset_features_file(did_cache_dir: str | Path | None, dataset_id: int) -> Path:
+    """API call to load dataset features. Loads from cache or downloads them.
 
     Features are feature descriptions for each column.
     (name, index, categorical, ...)
@@ -878,7 +1260,7 @@ def _get_dataset_features(did_cache_dir, dataset_id):
 
     Parameters
     ----------
-    did_cache_dir : str
+    did_cache_dir : str, Path or None
         Cache subdirectory for this dataset
 
     dataset_id : int
@@ -886,31 +1268,43 @@ def _get_dataset_features(did_cache_dir, dataset_id):
 
     Returns
     -------
-    features : dict
-        Dictionary containing dataset feature descriptions, parsed from XML.
+    Path
+        Path of the cached dataset feature file
     """
-    features_file = os.path.join(did_cache_dir, "features.xml")
+    did_cache_dir = Path(did_cache_dir) if did_cache_dir is not None else None
+    if did_cache_dir is None:
+        did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id)
+
+    features_file = did_cache_dir / "features.xml"
 
     # Dataset features aren't subject to change...
-    if not os.path.isfile(features_file):
-        url_extension = "data/features/{}".format(dataset_id)
-        features_xml = openml._api_calls._perform_api_call(url_extension, 'get')
-        with io.open(features_file, "w", encoding='utf8') as fh:
+    if not features_file.is_file():
+        features_xml = _get_features_xml(dataset_id)
+        with features_file.open("w", encoding="utf8") as fh:
             fh.write(features_xml)
 
-    return _load_features_from_file(features_file)
+    return features_file
+
 
+def _get_qualities_xml(dataset_id: int) -> str:
+    url_extension = f"data/qualities/{dataset_id!s}"
+    return openml._api_calls._perform_api_call(url_extension, "get")
 
-def _get_dataset_qualities(did_cache_dir, dataset_id):
-    """API call to get dataset qualities (cached)
 
+def _get_dataset_qualities_file(
+    did_cache_dir: str | Path | None,
+    dataset_id: int,
+) -> Path | None:
+    """Get the path for the dataset qualities file, or None if no qualities exist.
+
+    Loads from cache or downloads them.
     Features are metafeatures (number of features, number of classes, ...)
 
     This function is NOT thread/multiprocessing safe.
 
     Parameters
     ----------
-    did_cache_dir : str
+    did_cache_dir : str, Path or None
         Cache subdirectory for this dataset
 
     dataset_id : int
@@ -918,32 +1312,43 @@ def _get_dataset_qualities(did_cache_dir, dataset_id):
 
     Returns
     -------
-    qualities : dict
-        Dictionary containing dataset qualities, parsed from XML.
+    Path or None
+        Path of the cached qualities file, or None if the server reports no qualities
     """
+    save_did_cache_dir = (
+        _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id)
+        if did_cache_dir is None
+        else Path(did_cache_dir)
+    )
+
     # Dataset qualities are subject to change and must be fetched every time
-    qualities_file = os.path.join(did_cache_dir, "qualities.xml")
+    qualities_file = save_did_cache_dir / "qualities.xml"
     try:
-        with io.open(qualities_file, encoding='utf8') as fh:
+        with qualities_file.open(encoding="utf8") as fh:
             qualities_xml = fh.read()
-    except (OSError, IOError):
-        url_extension = "data/qualities/{}".format(dataset_id)
-        qualities_xml = openml._api_calls._perform_api_call(url_extension, 'get')
-
-        with io.open(qualities_file, "w", encoding='utf8') as fh:
-            fh.write(qualities_xml)
+    except OSError:
+        try:
+            qualities_xml = _get_qualities_xml(dataset_id)
+            with qualities_file.open("w", encoding="utf8") as fh:
+                fh.write(qualities_xml)
+        except OpenMLServerException as e:
+            if e.code == 362 and str(e) == "No qualities found - None":
+                # quality file stays as None
+                logger.warning(f"No qualities found for dataset {dataset_id}")
+                return None
 
-    xml_as_dict = xmltodict.parse(qualities_xml, force_list=('oml:quality',))
-    qualities = xml_as_dict['oml:data_qualities']['oml:quality']
+            raise e
 
-    return qualities
+    return qualities_file
 
 
 def _create_dataset_from_description(
-        description: Dict[str, str],
-        features: Dict,
-        qualities: List,
-        arff_file: str = None,
+    description: dict[str, str],
+    features_file: Path | None = None,
+    qualities_file: Path | None = None,
+    arff_file: Path | None = None,
+    parquet_file: Path | None = None,
+    cache_format: Literal["pickle", "feather"] = "pickle",
 ) -> OpenMLDataset:
     """Create a dataset object from a description dict.
 
@@ -951,12 +1356,16 @@ def _create_dataset_from_description(
     ----------
     description : dict
         Description of a dataset in xml dict.
-    features : dict
-        Description of a dataset features.
-    qualities : list
-        Description of a dataset qualities.
+    features_file : Path, optional
+        Path of the dataset features as xml file.
+    qualities_file : Path, optional
+        Path of the dataset qualities as xml file.
     arff_file : string, optional
         Path of dataset ARFF file.
+    parquet_file : string, optional
+        Path of dataset Parquet file.
+    cache_format: string, optional
+        Caching option for datasets (feather/pickle)
 
     Returns
     -------
@@ -966,9 +1375,9 @@ def _create_dataset_from_description(
     return OpenMLDataset(
         description["oml:name"],
         description.get("oml:description"),
-        data_format=description["oml:format"],
-        dataset_id=description["oml:id"],
-        version=description["oml:version"],
+        data_format=description["oml:format"],  # type: ignore
+        dataset_id=int(description["oml:id"]),
+        version=int(description["oml:version"]),
         creator=description.get("oml:creator"),
         contributor=description.get("oml:contributor"),
         collection_date=description.get("oml:collection_date"),
@@ -987,13 +1396,16 @@ def _create_dataset_from_description(
         paper_url=description.get("oml:paper_url"),
         update_comment=description.get("oml:update_comment"),
         md5_checksum=description.get("oml:md5_checksum"),
-        data_file=arff_file,
-        features=features,
-        qualities=qualities,
+        data_file=str(arff_file) if arff_file is not None else None,
+        cache_format=cache_format,
+        features_file=str(features_file) if features_file is not None else None,
+        qualities_file=str(qualities_file) if qualities_file is not None else None,
+        parquet_url=description.get("oml:parquet_url"),
+        parquet_file=str(parquet_file) if parquet_file is not None else None,
     )
 
 
-def _get_online_dataset_arff(dataset_id):
+def _get_online_dataset_arff(dataset_id: int) -> str | None:
     """Download the ARFF file for a given dataset id
     from the OpenML website.
 
@@ -1004,22 +1416,19 @@ def _get_online_dataset_arff(dataset_id):
 
     Returns
     -------
-    str
-        A string representation of an ARFF file.
+    str or None
+        A string representation of an ARFF file. Or None if file already exists.
     """
-    dataset_xml = openml._api_calls._perform_api_call("data/%d" % dataset_id,
-                                                      'get')
+    dataset_xml = openml._api_calls._perform_api_call(f"data/{dataset_id}", "get")
     # build a dict from the xml.
     # use the url from the dataset description and return the ARFF string
-    return openml._api_calls._read_url(
-        xmltodict.parse(dataset_xml)['oml:data_set_description']['oml:url'],
-        request_method='get'
+    return openml._api_calls._download_text_file(
+        xmltodict.parse(dataset_xml)["oml:data_set_description"]["oml:url"],
     )
 
 
-def _get_online_dataset_format(dataset_id):
-    """Get the dataset format for a given dataset id
-    from the OpenML website.
+def _get_online_dataset_format(dataset_id: int) -> str:
+    """Get the dataset format for a given dataset id from the OpenML website.
 
     Parameters
     ----------
@@ -1031,9 +1440,25 @@ def _get_online_dataset_format(dataset_id):
     str
         Dataset format.
     """
-    dataset_xml = openml._api_calls._perform_api_call("data/%d" % dataset_id,
-                                                      'get')
+    dataset_xml = openml._api_calls._perform_api_call(f"data/{dataset_id}", "get")
     # build a dict from the xml and get the format from the dataset description
-    return xmltodict\
-        .parse(dataset_xml)['oml:data_set_description']['oml:format']\
-        .lower()
+    return xmltodict.parse(dataset_xml)["oml:data_set_description"]["oml:format"].lower()  # type: ignore
+
+
+def delete_dataset(dataset_id: int) -> bool:
+    """Delete dataset with id `dataset_id` from the OpenML server.
+
+    This can only be done if you are the owner of the dataset and
+    no tasks are attached to the dataset.
+
+    Parameters
+    ----------
+    dataset_id : int
+        OpenML id of the dataset
+
+    Returns
+    -------
+    bool
+        True if the deletion was successful. False otherwise.
+    """
+    return openml.utils._delete_entity("data", dataset_id)
diff --git a/openml/evaluations/__init__.py b/openml/evaluations/__init__.py
index 03a41375f..b56d0c2d5 100644
--- a/openml/evaluations/__init__.py
+++ b/openml/evaluations/__init__.py
@@ -1,4 +1,11 @@
+# License: BSD 3-Clause
+
 from .evaluation import OpenMLEvaluation
-from .functions import list_evaluations, list_evaluation_measures
+from .functions import list_evaluation_measures, list_evaluations, list_evaluations_setups
 
-__all__ = ['OpenMLEvaluation', 'list_evaluations', 'list_evaluation_measures']
+__all__ = [
+    "OpenMLEvaluation",
+    "list_evaluation_measures",
+    "list_evaluations",
+    "list_evaluations_setups",
+]
diff --git a/openml/evaluations/evaluation.py b/openml/evaluations/evaluation.py
index 48b407575..87df8454a 100644
--- a/openml/evaluations/evaluation.py
+++ b/openml/evaluations/evaluation.py
@@ -1,7 +1,16 @@
-import openml.config
+# License: BSD 3-Clause
+from __future__ import annotations
 
+from dataclasses import asdict, dataclass
 
-class OpenMLEvaluation(object):
+import openml.datasets
+import openml.flows
+import openml.runs
+import openml.tasks
+
+
+@dataclass
+class OpenMLEvaluation:
     """
     Contains all meta-information about a run / evaluation combination,
     according to the evaluation/list function
@@ -26,6 +35,10 @@ class OpenMLEvaluation(object):
         The evaluation metric of this item (e.g., accuracy).
     upload_time : str
         The time of evaluation.
+    uploader : int
+        Uploader ID (user ID)
+    uploader_name : str
+        Name of the uploader of this evaluation
     value : float
         The value (score) of this evaluation.
     values : List[float]
@@ -34,47 +47,63 @@ class OpenMLEvaluation(object):
         list of information per class.
         (e.g., in case of precision, auroc, recall)
     """
-    def __init__(self, run_id, task_id, setup_id, flow_id, flow_name,
-                 data_id, data_name, function, upload_time, value, values,
-                 array_data=None):
-        self.run_id = run_id
-        self.task_id = task_id
-        self.setup_id = setup_id
-        self.flow_id = flow_id
-        self.flow_name = flow_name
-        self.data_id = data_id
-        self.data_name = data_name
-        self.function = function
-        self.upload_time = upload_time
-        self.value = value
-        self.values = values
-        self.array_data = array_data
 
-    def __repr__(self):
+    run_id: int
+    task_id: int
+    setup_id: int
+    flow_id: int
+    flow_name: str
+    data_id: int
+    data_name: str
+    function: str
+    upload_time: str
+    uploader: int
+    uploader_name: str
+    value: float | None
+    values: list[float] | None
+    array_data: str | None = None
+
+    def _to_dict(self) -> dict:
+        return asdict(self)
+
+    def __repr__(self) -> str:
         header = "OpenML Evaluation"
-        header = '{}\n{}\n'.format(header, '=' * len(header))
+        header = f"{header}\n{'=' * len(header)}\n"
 
-        base_url = "{}".format(openml.config.server[:-len('api/v1/xml')])
-        fields = {"Upload Date": self.upload_time,
-                  "Run ID": self.run_id,
-                  "OpenML Run URL": "{}r/{}".format(base_url, self.run_id),
-                  "Task ID": self.task_id,
-                  "OpenML Task URL": "{}t/{}".format(base_url, self.task_id),
-                  "Flow ID": self.flow_id,
-                  "OpenML Flow URL": "{}f/{}".format(base_url, self.flow_id),
-                  "Setup ID": self.setup_id,
-                  "Data ID": self.data_id,
-                  "Data Name": self.data_name,
-                  "OpenML Data URL": "{}d/{}".format(base_url, self.data_id),
-                  "Metric Used": self.function,
-                  "Result": self.value}
+        fields = {
+            "Upload Date": self.upload_time,
+            "Run ID": self.run_id,
+            "OpenML Run URL": openml.runs.OpenMLRun.url_for_id(self.run_id),
+            "Task ID": self.task_id,
+            "OpenML Task URL": openml.tasks.OpenMLTask.url_for_id(self.task_id),
+            "Flow ID": self.flow_id,
+            "OpenML Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id),
+            "Setup ID": self.setup_id,
+            "Data ID": self.data_id,
+            "Data Name": self.data_name,
+            "OpenML Data URL": openml.datasets.OpenMLDataset.url_for_id(self.data_id),
+            "Metric Used": self.function,
+            "Result": self.value,
+        }
 
-        order = ["Uploader Date", "Run ID", "OpenML Run URL", "Task ID", "OpenML Task URL"
-                 "Flow ID", "OpenML Flow URL", "Setup ID", "Data ID", "Data Name",
-                 "OpenML Data URL", "Metric Used", "Result"]
-        fields = [(key, fields[key]) for key in order if key in fields]
+        order = [
+            "Upload Date",
+            "Run ID",
+            "OpenML Run URL",
+            "Task ID",
+            "OpenML Task URL",
+            "Flow ID",
+            "OpenML Flow URL",
+            "Setup ID",
+            "Data ID",
+            "Data Name",
+            "OpenML Data URL",
+            "Metric Used",
+            "Result",
+        ]
+        _fields = [(key, fields[key]) for key in order if key in fields]
 
-        longest_field_name_length = max(len(name) for name, value in fields)
-        field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length)
-        body = '\n'.join(field_line_format.format(name, value) for name, value in fields)
+        longest_field_name_length = max(len(name) for name, _ in _fields)
+        field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}"
+        body = "\n".join(field_line_format.format(name, value) for name, value in _fields)
         return header + body
diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py
index 37789a752..61c95a480 100644
--- a/openml/evaluations/functions.py
+++ b/openml/evaluations/functions.py
@@ -1,30 +1,76 @@
+# License: BSD 3-Clause
+# ruff: noqa: PLR0913
+from __future__ import annotations
+
 import json
-import xmltodict
+from functools import partial
+from itertools import chain
+from typing import Any, Literal
+from typing_extensions import overload
+
+import numpy as np
 import pandas as pd
-from typing import Union, List, Optional, Dict
-import collections
+import xmltodict
 
-import openml.utils
+import openml
 import openml._api_calls
-from ..evaluations import OpenMLEvaluation
+import openml.utils
+from openml.evaluations import OpenMLEvaluation
 
 
+@overload
 def list_evaluations(
     function: str,
-    offset: Optional[int] = None,
-    size: Optional[int] = None,
-    id: Optional[List] = None,
-    task: Optional[List] = None,
-    setup: Optional[List] = None,
-    flow: Optional[List] = None,
-    uploader: Optional[List] = None,
-    tag: Optional[str] = None,
-    per_fold: Optional[bool] = None,
-    sort_order: Optional[str] = None,
-    output_format: str = 'object'
-) -> Union[Dict, pd.DataFrame]:
-    """
-    List all run-evaluation pairs matching all of the given filters.
+    offset: int | None = None,
+    size: int | None = None,
+    tasks: list[str | int] | None = None,
+    setups: list[str | int] | None = None,
+    flows: list[str | int] | None = None,
+    runs: list[str | int] | None = None,
+    uploaders: list[str | int] | None = None,
+    tag: str | None = None,
+    study: int | None = None,
+    per_fold: bool | None = None,
+    sort_order: str | None = None,
+    output_format: Literal["dataframe"] = ...,
+) -> pd.DataFrame: ...
+
+
+@overload
+def list_evaluations(
+    function: str,
+    offset: int | None = None,
+    size: int | None = None,
+    tasks: list[str | int] | None = None,
+    setups: list[str | int] | None = None,
+    flows: list[str | int] | None = None,
+    runs: list[str | int] | None = None,
+    uploaders: list[str | int] | None = None,
+    tag: str | None = None,
+    study: int | None = None,
+    per_fold: bool | None = None,
+    sort_order: str | None = None,
+    output_format: Literal["object"] = "object",
+) -> dict[int, OpenMLEvaluation]: ...
+
+
+def list_evaluations(
+    function: str,
+    offset: int | None = None,
+    size: int | None = None,
+    tasks: list[str | int] | None = None,
+    setups: list[str | int] | None = None,
+    flows: list[str | int] | None = None,
+    runs: list[str | int] | None = None,
+    uploaders: list[str | int] | None = None,
+    tag: str | None = None,
+    study: int | None = None,
+    per_fold: bool | None = None,
+    sort_order: str | None = None,
+    output_format: Literal["object", "dataframe"] = "object",
+) -> dict[int, OpenMLEvaluation] | pd.DataFrame:
+    """List all run-evaluation pairs matching all of the given filters.
+
     (Supports large amount of results)
 
     Parameters
@@ -33,20 +79,24 @@ def list_evaluations(
         the evaluation function. e.g., predictive_accuracy
     offset : int, optional
         the number of runs to skip, starting from the first
-    size : int, optional
-        the maximum number of runs to show
-
-    id : list, optional
-
-    task : list, optional
-
-    setup: list, optional
-
-    flow : list, optional
-
-    uploader : list, optional
-
+    size : int, optional
+        The maximum number of runs to show.
+        If set to ``None``, it returns all the results.
+
+    tasks : list[int,str], optional
+        the list of task IDs
+    setups: list[int,str], optional
+        the list of setup IDs
+    flows : list[int,str], optional
+        the list of flow IDs
+    runs : list[int,str], optional
+        the list of run IDs
+    uploaders : list[int,str], optional
+        the list of uploader IDs
     tag : str, optional
+        filter evaluation based on given tag
+
+    study : int, optional
 
     per_fold : bool, optional
 
@@ -56,47 +106,56 @@ def list_evaluations(
     output_format: str, optional (default='object')
         The parameter decides the format of the output.
         - If 'object' the output is a dict of OpenMLEvaluation objects
-        - If 'dict' the output is a dict of dict
         - If 'dataframe' the output is a pandas DataFrame
 
     Returns
     -------
     dict or dataframe
     """
-    if output_format not in ['dataframe', 'dict', 'object']:
-        raise ValueError("Invalid output format selected. "
-                         "Only 'object', 'dataframe', or 'dict' applicable.")
+    if output_format not in ("dataframe", "object"):
+        raise ValueError("Invalid output format. Only 'object', 'dataframe'.")
 
     per_fold_str = None
     if per_fold is not None:
         per_fold_str = str(per_fold).lower()
 
-    return openml.utils._list_all(output_format=output_format,
-                                  listing_call=_list_evaluations,
-                                  function=function,
-                                  offset=offset,
-                                  size=size,
-                                  id=id,
-                                  task=task,
-                                  setup=setup,
-                                  flow=flow,
-                                  uploader=uploader,
-                                  tag=tag,
-                                  sort_order=sort_order,
-                                  per_fold=per_fold_str)
-
-
-def _list_evaluations(
+    listing_call = partial(
+        _list_evaluations,
+        function=function,
+        tasks=tasks,
+        setups=setups,
+        flows=flows,
+        runs=runs,
+        uploaders=uploaders,
+        tag=tag,
+        study=study,
+        sort_order=sort_order,
+        per_fold=per_fold_str,
+    )
+    eval_collection = openml.utils._list_all(listing_call, offset=offset, limit=size)
+
+    flattened = list(chain.from_iterable(eval_collection))
+    if output_format == "dataframe":
+        records = [item._to_dict() for item in flattened]
+        return pd.DataFrame.from_records(records)  # No index...
+
+    return {e.run_id: e for e in flattened}
+
+
+def _list_evaluations(  # noqa: C901
+    limit: int,
+    offset: int,
+    *,
     function: str,
-    id: Optional[List] = None,
-    task: Optional[List] = None,
-    setup: Optional[List] = None,
-    flow: Optional[List] = None,
-    uploader: Optional[List] = None,
-    sort_order: Optional[str] = None,
-    output_format: str = 'object',
-    **kwargs
-) -> Union[Dict, pd.DataFrame]:
+    tasks: list | None = None,
+    setups: list | None = None,
+    flows: list | None = None,
+    runs: list | None = None,
+    uploaders: list | None = None,
+    study: int | None = None,
+    sort_order: str | None = None,
+    **kwargs: Any,
+) -> list[OpenMLEvaluation]:
     """
     Perform API call ``/evaluation/function{function}/{filters}``
 
@@ -105,117 +164,118 @@ def _list_evaluations(
     The arguments that are lists are separated from the single value
     ones which are put into the kwargs.
 
+    limit : int
+        the number of evaluations to return
+    offset : int
+        the number of evaluations to skip, starting from the first
     function : str
         the evaluation function. e.g., predictive_accuracy
 
-    id : list, optional
-
-    task : list, optional
-
-    setup: list, optional
-
-    flow : list, optional
+    tasks : list[int,str], optional
+        the list of task IDs
+    setups: list[int,str], optional
+        the list of setup IDs
+    flows : list[int,str], optional
+        the list of flow IDs
+    runs : list[int,str], optional
+        the list of run IDs
+    uploaders : list[int,str], optional
+        the list of uploader IDs
 
-    uploader : list, optional
+    study : int, optional
 
     kwargs: dict, optional
-        Legal filter operators: tag, limit, offset.
+        Legal filter operators: tag, per_fold
 
     sort_order : str, optional
         order of sorting evaluations, ascending ("asc") or descending ("desc")
 
-    output_format: str, optional (default='dict')
-        The parameter decides the format of the output.
-        - If 'dict' the output is a dict of dict
-        The parameter decides the format of the output.
-        - If 'dict' the output is a dict of dict
-        - If 'dataframe' the output is a pandas DataFrame
-        - If 'dataframe' the output is a pandas DataFrame
-
     Returns
     -------
-    dict of objects, or dataframe
+    list of OpenMLEvaluation objects
     """
-
-    api_call = "evaluation/list/function/%s" % function
+    api_call = f"evaluation/list/function/{function}"
+    if limit is not None:
+        api_call += f"/limit/{limit}"
+    if offset is not None:
+        api_call += f"/offset/{offset}"
     if kwargs is not None:
         for operator, value in kwargs.items():
-            api_call += "/%s/%s" % (operator, value)
-    if id is not None:
-        api_call += "/run/%s" % ','.join([str(int(i)) for i in id])
-    if task is not None:
-        api_call += "/task/%s" % ','.join([str(int(i)) for i in task])
-    if setup is not None:
-        api_call += "/setup/%s" % ','.join([str(int(i)) for i in setup])
-    if flow is not None:
-        api_call += "/flow/%s" % ','.join([str(int(i)) for i in flow])
-    if uploader is not None:
-        api_call += "/uploader/%s" % ','.join([str(int(i)) for i in uploader])
+            if value is not None:
+                api_call += f"/{operator}/{value}"
+    if tasks is not None:
+        api_call += f"/task/{','.join([str(int(i)) for i in tasks])}"
+    if setups is not None:
+        api_call += f"/setup/{','.join([str(int(i)) for i in setups])}"
+    if flows is not None:
+        api_call += f"/flow/{','.join([str(int(i)) for i in flows])}"
+    if runs is not None:
+        api_call += f"/run/{','.join([str(int(i)) for i in runs])}"
+    if uploaders is not None:
+        api_call += f"/uploader/{','.join([str(int(i)) for i in uploaders])}"
+    if study is not None:
+        api_call += f"/study/{study}"
     if sort_order is not None:
-        api_call += "/sort_order/%s" % sort_order
+        api_call += f"/sort_order/{sort_order}"
 
-    return __list_evaluations(api_call, output_format=output_format)
+    return __list_evaluations(api_call)
 
 
-def __list_evaluations(api_call, output_format='object'):
+def __list_evaluations(api_call: str) -> list[OpenMLEvaluation]:
     """Helper function to parse API calls which are lists of runs"""
-    xml_string = openml._api_calls._perform_api_call(api_call, 'get')
-    evals_dict = xmltodict.parse(xml_string, force_list=('oml:evaluation',))
+    xml_string = openml._api_calls._perform_api_call(api_call, "get")
+    evals_dict = xmltodict.parse(xml_string, force_list=("oml:evaluation",))
     # Minimalistic check if the XML is useful
-    if 'oml:evaluations' not in evals_dict:
-        raise ValueError('Error in return XML, does not contain '
-                         '"oml:evaluations": %s' % str(evals_dict))
-
-    assert type(evals_dict['oml:evaluations']['oml:evaluation']) == list, \
-        type(evals_dict['oml:evaluations'])
-
-    evals = collections.OrderedDict()
-    for eval_ in evals_dict['oml:evaluations']['oml:evaluation']:
-        run_id = int(eval_['oml:run_id'])
-        value = None
-        values = None
-        array_data = None
-        if 'oml:value' in eval_:
-            value = float(eval_['oml:value'])
-        if 'oml:values' in eval_:
-            values = json.loads(eval_['oml:values'])
-        if 'oml:array_data' in eval_:
-            array_data = eval_['oml:array_data']
-
-        if output_format == 'object':
-            evals[run_id] = OpenMLEvaluation(int(eval_['oml:run_id']),
-                                             int(eval_['oml:task_id']),
-                                             int(eval_['oml:setup_id']),
-                                             int(eval_['oml:flow_id']),
-                                             eval_['oml:flow_name'],
-                                             eval_['oml:data_id'],
-                                             eval_['oml:data_name'],
-                                             eval_['oml:function'],
-                                             eval_['oml:upload_time'],
-                                             value, values, array_data)
-        else:
-            # for output_format in ['dict', 'dataframe']
-            evals[run_id] = {'run_id': int(eval_['oml:run_id']),
-                             'task_id': int(eval_['oml:task_id']),
-                             'setup_id': int(eval_['oml:setup_id']),
-                             'flow_id': int(eval_['oml:flow_id']),
-                             'flow_name': eval_['oml:flow_name'],
-                             'data_id': eval_['oml:data_id'],
-                             'data_name': eval_['oml:data_name'],
-                             'function': eval_['oml:function'],
-                             'upload_time': eval_['oml:upload_time'],
-                             'value': value,
-                             'values': values,
-                             'array_data': array_data}
-
-    if output_format == 'dataframe':
-        evals = pd.DataFrame.from_dict(evals, orient='index')
+    if "oml:evaluations" not in evals_dict:
+        raise ValueError(
+            f'Error in return XML, does not contain "oml:evaluations": {evals_dict!s}',
+        )
+
+    assert isinstance(evals_dict["oml:evaluations"]["oml:evaluation"], list), (
+        "Expected 'oml:evaluation' to be a list, but got"
+        f"{type(evals_dict['oml:evaluations']['oml:evaluation']).__name__}. "
+    )
+
+    uploader_ids = list(
+        {eval_["oml:uploader"] for eval_ in evals_dict["oml:evaluations"]["oml:evaluation"]},
+    )
+    api_users = "user/list/user_id/" + ",".join(uploader_ids)
+    xml_string_user = openml._api_calls._perform_api_call(api_users, "get")
+
+    users = xmltodict.parse(xml_string_user, force_list=("oml:user",))
+    user_dict = {user["oml:id"]: user["oml:username"] for user in users["oml:users"]["oml:user"]}
+
+    evals = []
+    for eval_ in evals_dict["oml:evaluations"]["oml:evaluation"]:
+        run_id = int(eval_["oml:run_id"])
+        value = float(eval_["oml:value"]) if "oml:value" in eval_ else None
+        values = json.loads(eval_["oml:values"]) if eval_.get("oml:values", None) else None
+        array_data = eval_.get("oml:array_data")
+
+        evals.append(
+            OpenMLEvaluation(
+                run_id=run_id,
+                task_id=int(eval_["oml:task_id"]),
+                setup_id=int(eval_["oml:setup_id"]),
+                flow_id=int(eval_["oml:flow_id"]),
+                flow_name=eval_["oml:flow_name"],
+                data_id=int(eval_["oml:data_id"]),
+                data_name=eval_["oml:data_name"],
+                function=eval_["oml:function"],
+                upload_time=eval_["oml:upload_time"],
+                uploader=int(eval_["oml:uploader"]),
+                uploader_name=user_dict[eval_["oml:uploader"]],
+                value=value,
+                values=values,
+                array_data=array_data,
+            )
+        )
 
     return evals
 
 
-def list_evaluation_measures() -> List[str]:
-    """ Return list of evaluation measures available.
+def list_evaluation_measures() -> list[str]:
+    """Return list of evaluation measures available.
 
     The function performs an API call to retrieve the entire list of
     evaluation measures that are available.
@@ -226,15 +286,153 @@ def list_evaluation_measures() -> List[str]:
 
     """
     api_call = "evaluationmeasure/list"
-    xml_string = openml._api_calls._perform_api_call(api_call, 'get')
-    qualities = xmltodict.parse(xml_string, force_list=('oml:measures'))
+    xml_string = openml._api_calls._perform_api_call(api_call, "get")
+    qualities = xmltodict.parse(xml_string, force_list=("oml:measures"))
+    # Minimalistic check if the XML is useful
+    if "oml:evaluation_measures" not in qualities:
+        raise ValueError('Error in return XML, does not contain "oml:evaluation_measures"')
+
+    if not isinstance(qualities["oml:evaluation_measures"]["oml:measures"][0]["oml:measure"], list):
+        raise TypeError('Error in return XML, does not contain "oml:measure" as a list')
+
+    return qualities["oml:evaluation_measures"]["oml:measures"][0]["oml:measure"]
+
+
+def list_estimation_procedures() -> list[str]:
+    """Return list of estimation procedures available.
+
+    The function performs an API call to retrieve the entire list of
+    estimation procedures' names that are available.
+
+    Returns
+    -------
+    list
+    """
+    api_call = "estimationprocedure/list"
+    xml_string = openml._api_calls._perform_api_call(api_call, "get")
+    api_results = xmltodict.parse(xml_string)
+
     # Minimalistic check if the XML is useful
-    if 'oml:evaluation_measures' not in qualities:
-        raise ValueError('Error in return XML, does not contain '
-                         '"oml:evaluation_measures"')
-    if not isinstance(qualities['oml:evaluation_measures']['oml:measures'][0]['oml:measure'],
-                      list):
-        raise TypeError('Error in return XML, does not contain '
-                        '"oml:measure" as a list')
-    qualities = qualities['oml:evaluation_measures']['oml:measures'][0]['oml:measure']
-    return qualities
+    if "oml:estimationprocedures" not in api_results:
+        raise ValueError('Error in return XML, does not contain "oml:estimationprocedures"')
+
+    if "oml:estimationprocedure" not in api_results["oml:estimationprocedures"]:
+        raise ValueError('Error in return XML, does not contain "oml:estimationprocedure"')
+
+    if not isinstance(api_results["oml:estimationprocedures"]["oml:estimationprocedure"], list):
+        raise TypeError('Error in return XML, does not contain "oml:estimationprocedure" as a list')
+
+    return [
+        prod["oml:name"]
+        for prod in api_results["oml:estimationprocedures"]["oml:estimationprocedure"]
+    ]
+
+
+def list_evaluations_setups(
+    function: str,
+    offset: int | None = None,
+    size: int | None = None,
+    tasks: list | None = None,
+    setups: list | None = None,
+    flows: list | None = None,
+    runs: list | None = None,
+    uploaders: list | None = None,
+    tag: str | None = None,
+    per_fold: bool | None = None,
+    sort_order: str | None = None,
+    parameters_in_separate_columns: bool = False,  # noqa: FBT002
+) -> pd.DataFrame:
+    """List all run-evaluation pairs matching all of the given filters
+    and their hyperparameter settings.
+
+    Parameters
+    ----------
+    function : str
+        the evaluation function. e.g., predictive_accuracy
+    offset : int, optional
+        the number of runs to skip, starting from the first
+    size : int, optional
+        the maximum number of runs to show
+    tasks : list[int], optional
+        the list of task IDs
+    setups: list[int], optional
+        the list of setup IDs
+    flows : list[int], optional
+        the list of flow IDs
+    runs : list[int], optional
+        the list of run IDs
+    uploaders : list[int], optional
+        the list of uploader IDs
+    tag : str, optional
+        filter evaluation based on given tag
+    per_fold : bool, optional
+    sort_order : str, optional
+       order of sorting evaluations, ascending ("asc") or descending ("desc")
+    parameters_in_separate_columns: bool, optional (default= False)
+        Returns hyperparameters in separate columns if set to True.
+        Valid only for a single flow
+
+    Returns
+    -------
+    dataframe of evaluations with hyperparameter settings as a dict of name to value per row.
+    """
+    if parameters_in_separate_columns and (flows is None or len(flows) != 1):
+        raise ValueError("Can set parameters_in_separate_columns to true only for single flow_id")
+
+    # List evaluations
+    evals = list_evaluations(
+        function=function,
+        offset=offset,
+        size=size,
+        runs=runs,
+        tasks=tasks,
+        setups=setups,
+        flows=flows,
+        uploaders=uploaders,
+        tag=tag,
+        per_fold=per_fold,
+        sort_order=sort_order,
+        output_format="dataframe",
+    )
+    # List setups
+    # list_setups by setup id does not support large sizes (exceeds URL length limit)
+    # Hence we split the list of unique setup ids returned by list_evaluations into chunks of size N
+    _df = pd.DataFrame()
+    if len(evals) != 0:
+        N = 100  # size of section
+        uniq = np.asarray(evals["setup_id"].unique())
+        length = len(uniq)
+
+        # array_split allows sections that do not equally divide the array: with k = split_size,
+        # length % k sub-arrays have size length // k + 1 and the rest have size length // k.
+        split_size = ((length - 1) // N) + 1
+        setup_chunks = np.array_split(uniq, split_size)
+
+        setup_data = pd.DataFrame()
+        for _setups in setup_chunks:
+            result = openml.setups.list_setups(setup=_setups, output_format="dataframe")
+            assert isinstance(result, pd.DataFrame)
+            result = result.drop("flow_id", axis=1)
+            # concat resulting setup chunks into single dataframe
+            setup_data = pd.concat([setup_data, result])
+
+        parameters = []
+        # Convert parameters of setup into dict of (hyperparameter, value)
+        for parameter_dict in setup_data["parameters"]:
+            if parameter_dict is not None:
+                parameters.append(
+                    {param["full_name"]: param["value"] for param in parameter_dict.values()},
+                )
+            else:
+                parameters.append({})
+        setup_data["parameters"] = parameters
+        # Merge setups with evaluations
+        _df = evals.merge(setup_data, on="setup_id", how="left")
+
+    if parameters_in_separate_columns:
+        _df = pd.concat(
+            [_df.drop("parameters", axis=1), _df["parameters"].apply(pd.Series)],
+            axis=1,
+        )
+
+    return _df
diff --git a/openml/exceptions.py b/openml/exceptions.py
index 492587adc..1c1343ff3 100644
--- a/openml/exceptions.py
+++ b/openml/exceptions.py
@@ -1,4 +1,10 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
+
 class PyOpenMLError(Exception):
+    """Base class for all exceptions in OpenML-Python."""
+
     def __init__(self, message: str):
         self.message = message
         super().__init__(message)
@@ -6,57 +12,79 @@ def __init__(self, message: str):
 
 class OpenMLServerError(PyOpenMLError):
     """class for when something is really wrong on the server
-       (result did not parse to dict), contains unparsed error."""
-
-    def __init__(self, message: str):
-        super().__init__(message)
+    (result did not parse to dict), contains unparsed error.
+    """
 
 
-class OpenMLServerException(OpenMLServerError):
+class OpenMLServerException(OpenMLServerError):  # noqa: N818
     """exception for when the result of the server was
-       not 200 (e.g., listing call w/o results). """
+    not 200 (e.g., listing call w/o results).
+    """
 
-    # Code needs to be optional to allow the exceptino to be picklable:
+    # Code needs to be optional to allow the exception to be picklable:
     # https://stackoverflow.com/questions/16244923/how-to-make-a-custom-exception-class-with-multiple-init-args-pickleable  # noqa: E501
-    def __init__(self, message: str, code: str = None, additional: str = None, url: str = None):
+    def __init__(self, message: str, code: int | None = None, url: str | None = None):
         self.message = message
         self.code = code
-        self.additional = additional
         self.url = url
         super().__init__(message)
 
-    def __repr__(self):
-        return '%s returned code %s: %s' % (
-            self.url, self.code, self.message,
-        )
+    def __str__(self) -> str:
+        return f"{self.url} returned code {self.code}: {self.message}"
 
 
 class OpenMLServerNoResult(OpenMLServerException):
-    """exception for when the result of the server is empty. """
-    pass
+    """Exception for when the result of the server is empty."""
 
 
-class OpenMLCacheException(PyOpenMLError):
+class OpenMLCacheException(PyOpenMLError):  # noqa: N818
     """Dataset / task etc not found in cache"""
-    def __init__(self, message: str):
-        super().__init__(message)
 
 
-class OpenMLHashException(PyOpenMLError):
+class OpenMLHashException(PyOpenMLError):  # noqa: N818
     """Locally computed hash is different than hash announced by the server."""
-    pass
 
 
 class OpenMLPrivateDatasetError(PyOpenMLError):
-    """ Exception thrown when the user has no rights to access the dataset. """
-    def __init__(self, message: str):
-        super().__init__(message)
+    """Exception thrown when the user has no rights to access the dataset."""
 
 
 class OpenMLRunsExistError(PyOpenMLError):
-    """ Indicates run(s) already exists on the server when they should not be duplicated. """
-    def __init__(self, run_ids: set, message: str):
+    """Indicates run(s) already exists on the server when they should not be duplicated."""
+
+    def __init__(self, run_ids: set[int], message: str) -> None:
         if len(run_ids) < 1:
             raise ValueError("Set of run ids must be non-empty.")
         self.run_ids = run_ids
         super().__init__(message)
+
+
+class OpenMLNotAuthorizedError(OpenMLServerError):
+    """Indicates an authenticated user is not authorized to execute the requested action."""
+
+
+class OpenMLAuthenticationError(OpenMLServerError):
+    """Exception raised when API authentication fails.
+
+    This typically occurs when:
+    - No API key is configured
+    - The API key is invalid or expired
+    - The API key format is incorrect
+
+    This is different from authorization (OpenMLNotAuthorizedError), which occurs
+    when a valid API key lacks permissions for the requested operation.
+    """
+
+    def __init__(self, message: str):
+        help_text = (
+            "\n\nTo fix this:\n"
+            "1. Get your API key from https://www.openml.org/\n"
+            "   (you'll need to register for a free account if you don't have one)\n"
+            "2. Configure your API key by following the authentication guide:\n"
+            "   https://openml.github.io/openml-python/latest/examples/Basics/introduction_tutorial/#authentication"
+        )
+        super().__init__(message + help_text)
+
+
+class ObjectNotPublishedError(PyOpenMLError):
+    """Indicates an object has not been published yet."""
diff --git a/openml/extensions/__init__.py b/openml/extensions/__init__.py
index 374e856e3..979986182 100644
--- a/openml/extensions/__init__.py
+++ b/openml/extensions/__init__.py
@@ -1,15 +1,15 @@
-from typing import List, Type  # noqa: F401
+# License: BSD 3-Clause
 
-from .extension_interface import Extension
-from .functions import register_extension, get_extension_by_model, get_extension_by_flow
 
+from .extension_interface import Extension
+from .functions import get_extension_by_flow, get_extension_by_model, register_extension
 
-extensions = []  # type: List[Type[Extension]]
+extensions: list[type[Extension]] = []
 
 
 __all__ = [
-    'Extension',
-    'register_extension',
-    'get_extension_by_model',
-    'get_extension_by_flow',
+    "Extension",
+    "get_extension_by_flow",
+    "get_extension_by_model",
+    "register_extension",
 ]
diff --git a/openml/extensions/extension_interface.py b/openml/extensions/extension_interface.py
index 6346cb0bf..e391d109a 100644
--- a/openml/extensions/extension_interface.py
+++ b/openml/extensions/extension_interface.py
@@ -1,19 +1,21 @@
-from abc import ABC, abstractmethod
-from collections import OrderedDict  # noqa: F401
-from typing import Any, Dict, List, Optional, Tuple, TYPE_CHECKING, Union
+# License: BSD 3-Clause
+from __future__ import annotations
 
-import numpy as np
-import scipy.sparse
+from abc import ABC, abstractmethod
+from collections import OrderedDict
+from typing import TYPE_CHECKING, Any
 
 # Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles
 if TYPE_CHECKING:
+    import numpy as np
+    import scipy.sparse
+
     from openml.flows import OpenMLFlow
+    from openml.runs.trace import OpenMLRunTrace, OpenMLTraceIteration  # F401
     from openml.tasks.task import OpenMLTask
-    from openml.runs.trace import OpenMLRunTrace, OpenMLTraceIteration  # noqa F401
 
 
 class Extension(ABC):
-
     """Defines the interface to connect machine learning libraries to OpenML-Python.
 
     See ``openml.extension.sklearn.extension`` for an implementation to bootstrap from.
@@ -24,7 +26,7 @@ class Extension(ABC):
 
     @classmethod
     @abstractmethod
-    def can_handle_flow(cls, flow: 'OpenMLFlow') -> bool:
+    def can_handle_flow(cls, flow: OpenMLFlow) -> bool:
         """Check whether a given flow can be handled by this extension.
 
         This is typically done by parsing the ``external_version`` field.
@@ -58,7 +60,12 @@ def can_handle_model(cls, model: Any) -> bool:
     # Abstract methods for flow serialization and de-serialization
 
     @abstractmethod
-    def flow_to_model(self, flow: 'OpenMLFlow', initialize_with_defaults: bool = False) -> Any:
+    def flow_to_model(
+        self,
+        flow: OpenMLFlow,
+        initialize_with_defaults: bool = False,  # noqa: FBT002
+        strict_version: bool = True,  # noqa: FBT002
+    ) -> Any:
         """Instantiate a model from the flow representation.
 
         Parameters
@@ -69,13 +76,16 @@ def flow_to_model(self, flow: 'OpenMLFlow', initialize_with_defaults: bool = Fal
             If this flag is set, the hyperparameter values of flows will be
             ignored and a flow with its defaults is returned.
 
+        strict_version : bool, default=True
+            Whether to fail if version requirements are not fulfilled.
+
         Returns
         -------
         Any
         """
 
     @abstractmethod
-    def model_to_flow(self, model: Any) -> 'OpenMLFlow':
+    def model_to_flow(self, model: Any) -> OpenMLFlow:
         """Transform a model to a flow for uploading it to OpenML.
 
         Parameters
@@ -88,7 +98,7 @@ def model_to_flow(self, model: Any) -> 'OpenMLFlow':
         """
 
     @abstractmethod
-    def get_version_information(self) -> List[str]:
+    def get_version_information(self) -> list[str]:
         """List versions of libraries required by the flow.
 
         Returns
@@ -129,7 +139,7 @@ def is_estimator(self, model: Any) -> bool:
         """
 
     @abstractmethod
-    def seed_model(self, model: Any, seed: Optional[int]) -> Any:
+    def seed_model(self, model: Any, seed: int | None) -> Any:
         """Set the seed of all the unseeded components of a model and return the seeded model.
 
         Required so that all seed information can be uploaded to OpenML for reproducible results.
@@ -146,17 +156,17 @@ def seed_model(self, model: Any, seed: Optional[int]) -> Any:
         """
 
     @abstractmethod
-    def _run_model_on_fold(
+    def _run_model_on_fold(  # noqa: PLR0913
         self,
         model: Any,
-        task: 'OpenMLTask',
-        X_train: Union[np.ndarray, scipy.sparse.spmatrix],
+        task: OpenMLTask,
+        X_train: np.ndarray | scipy.sparse.spmatrix,
         rep_no: int,
         fold_no: int,
-        y_train: Optional[np.ndarray] = None,
-        X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix]] = None,
-    ) -> Tuple[np.ndarray, np.ndarray, 'OrderedDict[str, float]', Optional['OpenMLRunTrace']]:
-        """Run a model on a repeat,fold,subsample triplet of the task and return prediction information.
+        y_train: np.ndarray | None = None,
+        X_test: np.ndarray | scipy.sparse.spmatrix | None = None,
+    ) -> tuple[np.ndarray, np.ndarray | None, OrderedDict[str, float], OpenMLRunTrace | None]:
+        """Run a model on a repeat, fold, subsample triplet of the task.
 
         Returns the data that is necessary to construct the OpenML Run object. Is used by
         :func:`openml.runs.run_flow_on_task`.
@@ -195,9 +205,9 @@ def _run_model_on_fold(
     @abstractmethod
     def obtain_parameter_values(
         self,
-        flow: 'OpenMLFlow',
+        flow: OpenMLFlow,
         model: Any = None,
-    ) -> List[Dict[str, Any]]:
+    ) -> list[dict[str, Any]]:
         """Extracts all parameter settings required for the flow from the model.
 
         If no explicit model is provided, the parameters will be extracted from `flow.model`
@@ -221,6 +231,19 @@ def obtain_parameter_values(
             - ``oml:component`` : int: flow id to which the parameter belongs
         """
 
+    @abstractmethod
+    def check_if_model_fitted(self, model: Any) -> bool:
+        """Returns True/False denoting if the model has already been fitted/trained.
+
+        Parameters
+        ----------
+        model : Any
+
+        Returns
+        -------
+        bool
+        """
+
     ################################################################################################
     # Abstract methods for hyperparameter optimization
 
@@ -228,7 +251,7 @@ def obtain_parameter_values(
     def instantiate_model_from_hpo_class(
         self,
         model: Any,
-        trace_iteration: 'OpenMLTraceIteration',
+        trace_iteration: OpenMLTraceIteration,
     ) -> Any:
         """Instantiate a base model which can be searched over by the hyperparameter optimization
         model.
diff --git a/openml/extensions/functions.py b/openml/extensions/functions.py
index 93fab5345..44df5ec69 100644
--- a/openml/extensions/functions.py
+++ b/openml/extensions/functions.py
@@ -1,5 +1,9 @@
-from typing import Any, Optional, Type, TYPE_CHECKING
-from . import Extension
+# License: BSD 3-Clause
+from __future__ import annotations
+
+import importlib.util
+from typing import TYPE_CHECKING, Any
+
 # Need to implement the following by its full path because otherwise it won't be possible to
 # access openml.extensions.extensions
 import openml.extensions
@@ -8,8 +12,18 @@
 if TYPE_CHECKING:
     from openml.flows import OpenMLFlow
 
+    from . import Extension
+
+SKLEARN_HINT = (  # install hint appended to "no extension registered" errors
+    "But it looks related to scikit-learn. "
+    "Please install the OpenML scikit-learn extension (openml-sklearn) and try again. "
+    "You can use `pip install openml-sklearn` for installation. "
+    "For more information, see "
+    "https://docs.openml.org/python/extensions/"
+)
 
-def register_extension(extension: Type[Extension]) -> None:
+
+def register_extension(extension: type[Extension]) -> None:
     """Register an extension.
 
     Registered extensions are considered by ``get_extension_by_flow`` and
@@ -27,9 +41,9 @@ def register_extension(extension: Type[Extension]) -> None:
 
 
 def get_extension_by_flow(
-    flow: 'OpenMLFlow',
-    raise_if_no_extension: bool = False,
-) -> Optional[Extension]:
+    flow: OpenMLFlow,
+    raise_if_no_extension: bool = False,  # noqa: FBT002
+) -> Extension | None:
     """Get an extension which can handle the given flow.
 
     Iterates all registered extensions and checks whether they can handle the presented flow.
@@ -46,28 +60,39 @@ def get_extension_by_flow(
     -------
     Extension or None
     """
+    # import openml_sklearn to register SklearnExtension
+    if importlib.util.find_spec("openml_sklearn"):
+        import openml_sklearn  # noqa: F401
+
     candidates = []
     for extension_class in openml.extensions.extensions:
         if extension_class.can_handle_flow(flow):
             candidates.append(extension_class())
     if len(candidates) == 0:
         if raise_if_no_extension:
-            raise ValueError('No extension registered which can handle flow: {}'.format(flow))
-        else:
-            return None
-    elif len(candidates) == 1:
+            install_instruction = ""
+            if flow.name.startswith("sklearn"):
+                install_instruction = SKLEARN_HINT
+            raise ValueError(
+                f"No extension registered which can handle flow: {flow.flow_id} ({flow.name}). "
+                f"{install_instruction}"
+            )
+
+        return None
+
+    if len(candidates) == 1:
         return candidates[0]
-    else:
-        raise ValueError(
-            'Multiple extensions registered which can handle flow: {}, but only one '
-            'is allowed ({}).'.format(flow, candidates)
-        )
+
+    raise ValueError(
+        f"Multiple extensions registered which can handle flow: {flow}, but only one "
+        f"is allowed ({candidates}).",
+    )
 
 
 def get_extension_by_model(
     model: Any,
-    raise_if_no_extension: bool = False,
-) -> Optional[Extension]:
+    raise_if_no_extension: bool = False,  # noqa: FBT002
+) -> Extension | None:
     """Get an extension which can handle the given flow.
 
     Iterates all registered extensions and checks whether they can handle the presented model.
@@ -84,19 +109,29 @@ def get_extension_by_model(
     -------
     Extension or None
     """
+    # import openml_sklearn to register SklearnExtension
+    if importlib.util.find_spec("openml_sklearn"):
+        import openml_sklearn  # noqa: F401
+
     candidates = []
     for extension_class in openml.extensions.extensions:
         if extension_class.can_handle_model(model):
             candidates.append(extension_class())
     if len(candidates) == 0:
         if raise_if_no_extension:
-            raise ValueError('No extension registered which can handle model: {}'.format(model))
-        else:
-            return None
-    elif len(candidates) == 1:
+            install_instruction = ""
+            if type(model).__module__.startswith("sklearn"):
+                install_instruction = SKLEARN_HINT
+            raise ValueError(
+                f"No extension registered which can handle model: {model}. {install_instruction}"
+            )
+
+        return None
+
+    if len(candidates) == 1:
         return candidates[0]
-    else:
-        raise ValueError(
-            'Multiple extensions registered which can handle model: {}, but only one '
-            'is allowed ({}).'.format(model, candidates)
-        )
+
+    raise ValueError(
+        f"Multiple extensions registered which can handle model: {model}, but only one "
+        f"is allowed ({candidates}).",
+    )
diff --git a/openml/extensions/sklearn/__init__.py b/openml/extensions/sklearn/__init__.py
deleted file mode 100644
index c125f51bd..000000000
--- a/openml/extensions/sklearn/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from .extension import SklearnExtension
-
-
-__all__ = ['SklearnExtension']
diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py
deleted file mode 100644
index d44b61ae7..000000000
--- a/openml/extensions/sklearn/extension.py
+++ /dev/null
@@ -1,1744 +0,0 @@
-from collections import OrderedDict  # noqa: F401
-import copy
-from distutils.version import LooseVersion
-import importlib
-import inspect
-import json
-import logging
-import re
-import sys
-import time
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
-import warnings
-
-import numpy as np
-import pandas as pd
-import scipy.stats
-import scipy.sparse
-import sklearn.base
-import sklearn.model_selection
-import sklearn.pipeline
-
-import openml
-from openml.exceptions import PyOpenMLError
-from openml.extensions import Extension, register_extension
-from openml.flows import OpenMLFlow
-from openml.runs.trace import OpenMLRunTrace, OpenMLTraceIteration, PREFIX
-from openml.tasks import (
-    OpenMLTask,
-    OpenMLSupervisedTask,
-    OpenMLClassificationTask,
-    OpenMLLearningCurveTask,
-    OpenMLClusteringTask,
-    OpenMLRegressionTask,
-)
-
-
-if sys.version_info >= (3, 5):
-    from json.decoder import JSONDecodeError
-else:
-    JSONDecodeError = ValueError
-
-
-DEPENDENCIES_PATTERN = re.compile(
-    r'^(?P<name>[\w\-]+)((?P<operation>==|>=|>)'
-    r'(?P<version>(\d+\.)?(\d+\.)?(\d+)?(dev)?[0-9]*))?$'
-)
-
-
-SIMPLE_NUMPY_TYPES = [nptype for type_cat, nptypes in np.sctypes.items()
-                      for nptype in nptypes if type_cat != 'others']
-SIMPLE_TYPES = tuple([bool, int, float, str] + SIMPLE_NUMPY_TYPES)
-
-
-class SklearnExtension(Extension):
-    """Connect scikit-learn to OpenML-Python."""
-
-    ################################################################################################
-    # General setup
-
-    @classmethod
-    def can_handle_flow(cls, flow: 'OpenMLFlow') -> bool:
-        """Check whether a given describes a scikit-learn estimator.
-
-        This is done by parsing the ``external_version`` field.
-
-        Parameters
-        ----------
-        flow : OpenMLFlow
-
-        Returns
-        -------
-        bool
-        """
-        return cls._is_sklearn_flow(flow)
-
-    @classmethod
-    def can_handle_model(cls, model: Any) -> bool:
-        """Check whether a model is an instance of ``sklearn.base.BaseEstimator``.
-
-        Parameters
-        ----------
-        model : Any
-
-        Returns
-        -------
-        bool
-        """
-        return isinstance(model, sklearn.base.BaseEstimator)
-
-    @classmethod
-    def trim_flow_name(
-            cls,
-            long_name: str,
-            extra_trim_length: int = 100,
-            _outer: bool = True
-    ) -> str:
-        """ Shorten generated sklearn flow name to at most `max_length` characters.
-
-        Flows are assumed to have the following naming structure:
-        (model_selection)? (pipeline)? (steps)+
-        and will be shortened to:
-        sklearn.(selection.)?(pipeline.)?(steps)+
-        e.g. (white spaces and newlines added for readability)
-        sklearn.pipeline.Pipeline(
-            columntransformer=sklearn.compose._column_transformer.ColumnTransformer(
-                numeric=sklearn.pipeline.Pipeline(
-                    imputer=sklearn.preprocessing.imputation.Imputer,
-                    standardscaler=sklearn.preprocessing.data.StandardScaler),
-                nominal=sklearn.pipeline.Pipeline(
-                    simpleimputer=sklearn.impute.SimpleImputer,
-                    onehotencoder=sklearn.preprocessing._encoders.OneHotEncoder)),
-            variancethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold,
-            svc=sklearn.svm.classes.SVC)
-        ->
-        sklearn.Pipeline(ColumnTransformer,VarianceThreshold,SVC)
-
-        Parameters
-        ----------
-        long_name : str
-            The full flow name generated by the scikit-learn extension.
-        extra_trim_length: int (default=100)
-            If the trimmed name would exceed `extra_trim_length` characters, additional trimming
-            of the short name is performed. This reduces the produced short name length.
-            There is no guarantee the end result will not exceed `extra_trim_length`.
-        _outer : bool (default=True)
-            For internal use only. Specifies if the function is called recursively.
-
-        Returns
-        -------
-        str
-
-        """
-        def remove_all_in_parentheses(string: str) -> str:
-            string, removals = re.subn(r"\([^()]*\)", "", string)
-            while removals > 0:
-                string, removals = re.subn(r"\([^()]*\)", "", string)
-            return string
-
-        # Generally, we want to trim all hyperparameters, the exception to that is for model
-        # selection, as the `estimator` hyperparameter is very indicative of what is in the flow.
-        # So we first trim name of the `estimator` specified in mode selection. For reference, in
-        # the example below, we want to trim `sklearn.tree.tree.DecisionTreeClassifier`, and
-        # keep it in the final trimmed flow name:
-        # sklearn.pipeline.Pipeline(Imputer=sklearn.preprocessing.imputation.Imputer,
-        # VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold,
-        # Estimator=sklearn.model_selection._search.RandomizedSearchCV(estimator=
-        # sklearn.tree.tree.DecisionTreeClassifier))
-        if 'sklearn.model_selection' in long_name:
-            start_index = long_name.index('sklearn.model_selection')
-            estimator_start = (start_index
-                               + long_name[start_index:].index('estimator=')
-                               + len('estimator='))
-
-            model_select_boilerplate = long_name[start_index:estimator_start]
-            # above is .g. "sklearn.model_selection._search.RandomizedSearchCV(estimator="
-            model_selection_class = model_select_boilerplate.split('(')[0].split('.')[-1]
-
-            # Now we want to also find and parse the `estimator`, for this we find the closing
-            # parenthesis to the model selection technique:
-            closing_parenthesis_expected = 1
-            for i, char in enumerate(long_name[estimator_start:], start=estimator_start):
-                if char == '(':
-                    closing_parenthesis_expected += 1
-                if char == ')':
-                    closing_parenthesis_expected -= 1
-                if closing_parenthesis_expected == 0:
-                    break
-
-            model_select_pipeline = long_name[estimator_start:i]
-            trimmed_pipeline = cls.trim_flow_name(model_select_pipeline, _outer=False)
-            _, trimmed_pipeline = trimmed_pipeline.split('.', maxsplit=1)  # trim module prefix
-            model_select_short = "sklearn.{}[{}]".format(model_selection_class, trimmed_pipeline)
-            name = long_name[:start_index] + model_select_short + long_name[i + 1:]
-        else:
-            name = long_name
-
-        module_name = long_name.split('.')[0]
-        short_name = module_name + '.{}'
-
-        if name.startswith('sklearn.pipeline'):
-            full_pipeline_class, pipeline = name[:-1].split('(', maxsplit=1)
-            pipeline_class = full_pipeline_class.split('.')[-1]
-            # We don't want nested pipelines in the short name, so we trim all complicated
-            # subcomponents, i.e. those with parentheses:
-            pipeline = remove_all_in_parentheses(pipeline)
-
-            # then the pipeline steps are formatted e.g.:
-            # step1name=sklearn.submodule.ClassName,step2name...
-            components = [component.split('.')[-1] for component in pipeline.split(',')]
-            pipeline = "{}({})".format(pipeline_class, ','.join(components))
-            if len(short_name.format(pipeline)) > extra_trim_length:
-                pipeline = "{}(...,{})".format(pipeline_class, components[-1])
-        else:
-            # Just a simple component: e.g. sklearn.tree.DecisionTreeClassifier
-            pipeline = remove_all_in_parentheses(name).split('.')[-1]
-
-        if not _outer:
-            # Anything from parenthesis in inner calls should not be culled, so we use brackets
-            pipeline = pipeline.replace('(', '[').replace(')', ']')
-        else:
-            # Square brackets may be introduced with nested model_selection
-            pipeline = pipeline.replace('[', '(').replace(']', ')')
-
-        return short_name.format(pipeline)
-
-    ################################################################################################
-    # Methods for flow serialization and de-serialization
-
-    def flow_to_model(self, flow: 'OpenMLFlow', initialize_with_defaults: bool = False) -> Any:
-        """Initializes a sklearn model based on a flow.
-
-        Parameters
-        ----------
-        flow : mixed
-            the object to deserialize (can be flow object, or any serialized
-            parameter value that is accepted by)
-
-        initialize_with_defaults : bool, optional (default=False)
-            If this flag is set, the hyperparameter values of flows will be
-            ignored and a flow with its defaults is returned.
-
-        Returns
-        -------
-        mixed
-        """
-        return self._deserialize_sklearn(flow, initialize_with_defaults=initialize_with_defaults)
-
-    def _deserialize_sklearn(
-        self,
-        o: Any,
-        components: Optional[Dict] = None,
-        initialize_with_defaults: bool = False,
-        recursion_depth: int = 0,
-    ) -> Any:
-        """Recursive function to deserialize a scikit-learn flow.
-
-        This function delegates all work to the respective functions to deserialize special data
-        structures etc.
-
-        Parameters
-        ----------
-        o : mixed
-            the object to deserialize (can be flow object, or any serialized
-            parameter value that is accepted by)
-
-        components : dict
-
-
-        initialize_with_defaults : bool, optional (default=False)
-            If this flag is set, the hyperparameter values of flows will be
-            ignored and a flow with its defaults is returned.
-
-        recursion_depth : int
-            The depth at which this flow is called, mostly for debugging
-            purposes
-
-        Returns
-        -------
-        mixed
-        """
-
-        logging.info('-%s flow_to_sklearn START o=%s, components=%s, '
-                     'init_defaults=%s' % ('-' * recursion_depth, o, components,
-                                           initialize_with_defaults))
-        depth_pp = recursion_depth + 1  # shortcut var, depth plus plus
-
-        # First, we need to check whether the presented object is a json string.
-        # JSON strings are used to encoder parameter values. By passing around
-        # json strings for parameters, we make sure that we can flow_to_sklearn
-        # the parameter values to the correct type.
-
-        if isinstance(o, str):
-            try:
-                o = json.loads(o)
-            except JSONDecodeError:
-                pass
-
-        if isinstance(o, dict):
-            # Check if the dict encodes a 'special' object, which could not
-            # easily converted into a string, but rather the information to
-            # re-create the object were stored in a dictionary.
-            if 'oml-python:serialized_object' in o:
-                serialized_type = o['oml-python:serialized_object']
-                value = o['value']
-                if serialized_type == 'type':
-                    rval = self._deserialize_type(value)
-                elif serialized_type == 'rv_frozen':
-                    rval = self._deserialize_rv_frozen(value)
-                elif serialized_type == 'function':
-                    rval = self._deserialize_function(value)
-                elif serialized_type == 'component_reference':
-                    assert components is not None  # Necessary for mypy
-                    value = self._deserialize_sklearn(value, recursion_depth=depth_pp)
-                    step_name = value['step_name']
-                    key = value['key']
-                    component = self._deserialize_sklearn(
-                        components[key],
-                        initialize_with_defaults=initialize_with_defaults,
-                        recursion_depth=depth_pp
-                    )
-                    # The component is now added to where it should be used
-                    # later. It should not be passed to the constructor of the
-                    # main flow object.
-                    del components[key]
-                    if step_name is None:
-                        rval = component
-                    elif 'argument_1' not in value:
-                        rval = (step_name, component)
-                    else:
-                        rval = (step_name, component, value['argument_1'])
-                elif serialized_type == 'cv_object':
-                    rval = self._deserialize_cross_validator(
-                        value, recursion_depth=recursion_depth
-                    )
-                else:
-                    raise ValueError('Cannot flow_to_sklearn %s' % serialized_type)
-
-            else:
-                rval = OrderedDict(
-                    (
-                        self._deserialize_sklearn(
-                            o=key,
-                            components=components,
-                            initialize_with_defaults=initialize_with_defaults,
-                            recursion_depth=depth_pp,
-                        ),
-                        self._deserialize_sklearn(
-                            o=value,
-                            components=components,
-                            initialize_with_defaults=initialize_with_defaults,
-                            recursion_depth=depth_pp,
-                        )
-                    )
-                    for key, value in sorted(o.items())
-                )
-        elif isinstance(o, (list, tuple)):
-            rval = [
-                self._deserialize_sklearn(
-                    o=element,
-                    components=components,
-                    initialize_with_defaults=initialize_with_defaults,
-                    recursion_depth=depth_pp,
-                )
-                for element in o
-            ]
-            if isinstance(o, tuple):
-                rval = tuple(rval)
-        elif isinstance(o, (bool, int, float, str)) or o is None:
-            rval = o
-        elif isinstance(o, OpenMLFlow):
-            if not self._is_sklearn_flow(o):
-                raise ValueError('Only sklearn flows can be reinstantiated')
-            rval = self._deserialize_model(
-                flow=o,
-                keep_defaults=initialize_with_defaults,
-                recursion_depth=recursion_depth,
-            )
-        else:
-            raise TypeError(o)
-        logging.info('-%s flow_to_sklearn END   o=%s, rval=%s'
-                     % ('-' * recursion_depth, o, rval))
-        return rval
-
-    def model_to_flow(self, model: Any) -> 'OpenMLFlow':
-        """Transform a scikit-learn model to a flow for uploading it to OpenML.
-
-        Parameters
-        ----------
-        model : Any
-
-        Returns
-        -------
-        OpenMLFlow
-        """
-        # Necessary to make pypy not complain about all the different possible return types
-        return self._serialize_sklearn(model)
-
-    def _serialize_sklearn(self, o: Any, parent_model: Optional[Any] = None) -> Any:
-        rval = None  # type: Any
-
-        # TODO: assert that only on first recursion lvl `parent_model` can be None
-        if self.is_estimator(o):
-            # is the main model or a submodel
-            rval = self._serialize_model(o)
-        elif isinstance(o, (list, tuple)):
-            # TODO: explain what type of parameter is here
-            rval = [self._serialize_sklearn(element, parent_model) for element in o]
-            if isinstance(o, tuple):
-                rval = tuple(rval)
-        elif isinstance(o, SIMPLE_TYPES) or o is None:
-            if isinstance(o, tuple(SIMPLE_NUMPY_TYPES)):
-                o = o.item()
-            # base parameter values
-            rval = o
-        elif isinstance(o, dict):
-            # TODO: explain what type of parameter is here
-            if not isinstance(o, OrderedDict):
-                o = OrderedDict([(key, value) for key, value in sorted(o.items())])
-
-            rval = OrderedDict()
-            for key, value in o.items():
-                if not isinstance(key, str):
-                    raise TypeError('Can only use string as keys, you passed '
-                                    'type %s for value %s.' %
-                                    (type(key), str(key)))
-                key = self._serialize_sklearn(key, parent_model)
-                value = self._serialize_sklearn(value, parent_model)
-                rval[key] = value
-            rval = rval
-        elif isinstance(o, type):
-            # TODO: explain what type of parameter is here
-            rval = self._serialize_type(o)
-        elif isinstance(o, scipy.stats.distributions.rv_frozen):
-            rval = self._serialize_rv_frozen(o)
-        # This only works for user-defined functions (and not even partial).
-        # I think this is exactly what we want here as there shouldn't be any
-        # built-in or functool.partials in a pipeline
-        elif inspect.isfunction(o):
-            # TODO: explain what type of parameter is here
-            rval = self._serialize_function(o)
-        elif self._is_cross_validator(o):
-            # TODO: explain what type of parameter is here
-            rval = self._serialize_cross_validator(o)
-        else:
-            raise TypeError(o, type(o))
-
-        return rval
-
-    def get_version_information(self) -> List[str]:
-        """List versions of libraries required by the flow.
-
-        Libraries listed are ``Python``, ``scikit-learn``, ``numpy`` and ``scipy``.
-
-        Returns
-        -------
-        List
-        """
-
-        # This can possibly be done by a package such as pyxb, but I could not get
-        # it to work properly.
-        import sklearn
-        import scipy
-        import numpy
-
-        major, minor, micro, _, _ = sys.version_info
-        python_version = 'Python_{}.'.format(
-            ".".join([str(major), str(minor), str(micro)]))
-        sklearn_version = 'Sklearn_{}.'.format(sklearn.__version__)
-        numpy_version = 'NumPy_{}.'.format(numpy.__version__)
-        scipy_version = 'SciPy_{}.'.format(scipy.__version__)
-
-        return [python_version, sklearn_version, numpy_version, scipy_version]
-
-    def create_setup_string(self, model: Any) -> str:
-        """Create a string which can be used to reinstantiate the given model.
-
-        Parameters
-        ----------
-        model : Any
-
-        Returns
-        -------
-        str
-        """
-        run_environment = " ".join(self.get_version_information())
-        # fixme str(model) might contain (...)
-        return run_environment + " " + str(model)
-
-    def _is_cross_validator(self, o: Any) -> bool:
-        return isinstance(o, sklearn.model_selection.BaseCrossValidator)
-
-    @classmethod
-    def _is_sklearn_flow(cls, flow: OpenMLFlow) -> bool:
-        return (
-            flow.external_version.startswith('sklearn==')
-            or ',sklearn==' in flow.external_version
-        )
-
-    def _serialize_model(self, model: Any) -> OpenMLFlow:
-        """Create an OpenMLFlow.
-
-        Calls `sklearn_to_flow` recursively to properly serialize the
-        parameters to strings and the components (other models) to OpenMLFlows.
-
-        Parameters
-        ----------
-        model : sklearn estimator
-
-        Returns
-        -------
-        OpenMLFlow
-
-        """
-
-        # Get all necessary information about the model objects itself
-        parameters, parameters_meta_info, subcomponents, subcomponents_explicit = \
-            self._extract_information_from_model(model)
-
-        # Check that a component does not occur multiple times in a flow as this
-        # is not supported by OpenML
-        self._check_multiple_occurence_of_component_in_flow(model, subcomponents)
-
-        # Create a flow name, which contains all components in brackets, e.g.:
-        # RandomizedSearchCV(Pipeline(StandardScaler,AdaBoostClassifier(DecisionTreeClassifier)),
-        # StandardScaler,AdaBoostClassifier(DecisionTreeClassifier))
-        class_name = model.__module__ + "." + model.__class__.__name__
-
-        # will be part of the name (in brackets)
-        sub_components_names = ""
-        for key in subcomponents:
-            if key in subcomponents_explicit:
-                sub_components_names += "," + key + "=" + subcomponents[key].name
-            else:
-                sub_components_names += "," + subcomponents[key].name
-
-        if sub_components_names:
-            # slice operation on string in order to get rid of leading comma
-            name = '%s(%s)' % (class_name, sub_components_names[1:])
-        else:
-            name = class_name
-        short_name = SklearnExtension.trim_flow_name(name)
-
-        # Get the external versions of all sub-components
-        external_version = self._get_external_version_string(model, subcomponents)
-
-        dependencies = '\n'.join([
-            self._format_external_version(
-                'sklearn',
-                sklearn.__version__,
-            ),
-            'numpy>=1.6.1',
-            'scipy>=0.9',
-        ])
-
-        sklearn_version = self._format_external_version('sklearn', sklearn.__version__)
-        sklearn_version_formatted = sklearn_version.replace('==', '_')
-        flow = OpenMLFlow(name=name,
-                          class_name=class_name,
-                          custom_name=short_name,
-                          description='Automatically created scikit-learn flow.',
-                          model=model,
-                          components=subcomponents,
-                          parameters=parameters,
-                          parameters_meta_info=parameters_meta_info,
-                          external_version=external_version,
-                          tags=['openml-python', 'sklearn', 'scikit-learn',
-                                'python', sklearn_version_formatted,
-                                # TODO: add more tags based on the scikit-learn
-                                # module a flow is in? For example automatically
-                                # annotate a class of sklearn.svm.SVC() with the
-                                # tag svm?
-                                ],
-                          extension=self,
-                          language='English',
-                          # TODO fill in dependencies!
-                          dependencies=dependencies)
-
-        return flow
-
-    def _get_external_version_string(
-        self,
-        model: Any,
-        sub_components: Dict[str, OpenMLFlow],
-    ) -> str:
-        # Create external version string for a flow, given the model and the
-        # already parsed dictionary of sub_components. Retrieves the external
-        # version of all subcomponents, which themselves already contain all
-        # requirements for their subcomponents. The external version string is a
-        # sorted concatenation of all modules which are present in this run.
-        model_package_name = model.__module__.split('.')[0]
-        module = importlib.import_module(model_package_name)
-        model_package_version_number = module.__version__  # type: ignore
-        external_version = self._format_external_version(
-            model_package_name, model_package_version_number,
-        )
-        openml_version = self._format_external_version('openml', openml.__version__)
-        sklearn_version = self._format_external_version('sklearn', sklearn.__version__)
-
-        external_versions = set()
-        external_versions.add(external_version)
-        external_versions.add(openml_version)
-        external_versions.add(sklearn_version)
-        for visitee in sub_components.values():
-            for external_version in visitee.external_version.split(','):
-                external_versions.add(external_version)
-        return ','.join(list(sorted(external_versions)))
-
-    def _check_multiple_occurence_of_component_in_flow(
-        self,
-        model: Any,
-        sub_components: Dict[str, OpenMLFlow],
-    ) -> None:
-        to_visit_stack = []  # type: List[OpenMLFlow]
-        to_visit_stack.extend(sub_components.values())
-        known_sub_components = set()  # type: Set[str]
-        while len(to_visit_stack) > 0:
-            visitee = to_visit_stack.pop()
-            if visitee.name in known_sub_components:
-                raise ValueError('Found a second occurence of component %s when '
-                                 'trying to serialize %s.' % (visitee.name, model))
-            else:
-                known_sub_components.add(visitee.name)
-                to_visit_stack.extend(visitee.components.values())
-
-    def _extract_information_from_model(
-        self,
-        model: Any,
-    ) -> Tuple[
-        'OrderedDict[str, Optional[str]]',
-        'OrderedDict[str, Optional[Dict]]',
-        'OrderedDict[str, OpenMLFlow]',
-        Set,
-    ]:
-        # This function contains four "global" states and is quite long and
-        # complicated. If it gets to complicated to ensure it's correctness,
-        # it would be best to make it a class with the four "global" states being
-        # the class attributes and the if/elif/else in the for-loop calls to
-        # separate class methods
-
-        # stores all entities that should become subcomponents
-        sub_components = OrderedDict()  # type: OrderedDict[str, OpenMLFlow]
-        # stores the keys of all subcomponents that should become
-        sub_components_explicit = set()
-        parameters = OrderedDict()  # type: OrderedDict[str, Optional[str]]
-        parameters_meta_info = OrderedDict()  # type: OrderedDict[str, Optional[Dict]]
-
-        model_parameters = model.get_params(deep=False)
-        for k, v in sorted(model_parameters.items(), key=lambda t: t[0]):
-            rval = self._serialize_sklearn(v, model)
-
-            def flatten_all(list_):
-                """ Flattens arbitrary depth lists of lists (e.g. [[1,2],[3,[1]]] -> [1,2,3,1]). """
-                for el in list_:
-                    if isinstance(el, (list, tuple)):
-                        yield from flatten_all(el)
-                    else:
-                        yield el
-
-            # In case rval is a list of lists (or tuples), we need to identify two situations:
-            # - sklearn pipeline steps, feature union or base classifiers in voting classifier.
-            #   They look like e.g. [("imputer", Imputer()), ("classifier", SVC())]
-            # - a list of lists with simple types (e.g. int or str), such as for an OrdinalEncoder
-            #   where all possible values for each feature are described: [[0,1,2], [1,2,5]]
-            is_non_empty_list_of_lists_with_same_type = (
-                isinstance(rval, (list, tuple))
-                and len(rval) > 0
-                and isinstance(rval[0], (list, tuple))
-                and all([isinstance(rval_i, type(rval[0])) for rval_i in rval])
-            )
-
-            # Check that all list elements are of simple types.
-            nested_list_of_simple_types = (
-                is_non_empty_list_of_lists_with_same_type
-                and all([isinstance(el, SIMPLE_TYPES) for el in flatten_all(rval)])
-            )
-
-            if is_non_empty_list_of_lists_with_same_type and not nested_list_of_simple_types:
-                # If a list of lists is identified that include 'non-simple' types (e.g. objects),
-                # we assume they are steps in a pipeline, feature union, or base classifiers in
-                # a voting classifier.
-                parameter_value = list()  # type: List
-                reserved_keywords = set(model.get_params(deep=False).keys())
-
-                for sub_component_tuple in rval:
-                    identifier = sub_component_tuple[0]
-                    sub_component = sub_component_tuple[1]
-                    sub_component_type = type(sub_component_tuple)
-                    if not 2 <= len(sub_component_tuple) <= 3:
-                        # length 2 is for {VotingClassifier.estimators,
-                        # Pipeline.steps, FeatureUnion.transformer_list}
-                        # length 3 is for ColumnTransformer
-                        msg = 'Length of tuple does not match assumptions'
-                        raise ValueError(msg)
-                    if not isinstance(sub_component, (OpenMLFlow, type(None))):
-                        msg = 'Second item of tuple does not match assumptions. ' \
-                              'Expected OpenMLFlow, got %s' % type(sub_component)
-                        raise TypeError(msg)
-
-                    if identifier in reserved_keywords:
-                        parent_model = "{}.{}".format(model.__module__,
-                                                      model.__class__.__name__)
-                        msg = 'Found element shadowing official ' \
-                              'parameter for %s: %s' % (parent_model,
-                                                        identifier)
-                        raise PyOpenMLError(msg)
-
-                    if sub_component is None:
-                        # In a FeatureUnion it is legal to have a None step
-
-                        pv = [identifier, None]
-                        if sub_component_type is tuple:
-                            parameter_value.append(tuple(pv))
-                        else:
-                            parameter_value.append(pv)
-
-                    else:
-                        # Add the component to the list of components, add a
-                        # component reference as a placeholder to the list of
-                        # parameters, which will be replaced by the real component
-                        # when deserializing the parameter
-                        sub_components_explicit.add(identifier)
-                        sub_components[identifier] = sub_component
-                        component_reference = OrderedDict()  # type: Dict[str, Union[str, Dict]]
-                        component_reference['oml-python:serialized_object'] = 'component_reference'
-                        cr_value = OrderedDict()  # type: Dict[str, Any]
-                        cr_value['key'] = identifier
-                        cr_value['step_name'] = identifier
-                        if len(sub_component_tuple) == 3:
-                            cr_value['argument_1'] = sub_component_tuple[2]
-                        component_reference['value'] = cr_value
-                        parameter_value.append(component_reference)
-
-                # Here (and in the elif and else branch below) are the only
-                # places where we encode a value as json to make sure that all
-                # parameter values still have the same type after
-                # deserialization
-                if isinstance(rval, tuple):
-                    parameter_json = json.dumps(tuple(parameter_value))
-                else:
-                    parameter_json = json.dumps(parameter_value)
-                parameters[k] = parameter_json
-
-            elif isinstance(rval, OpenMLFlow):
-
-                # A subcomponent, for example the base model in
-                # AdaBoostClassifier
-                sub_components[k] = rval
-                sub_components_explicit.add(k)
-                component_reference = OrderedDict()
-                component_reference['oml-python:serialized_object'] = 'component_reference'
-                cr_value = OrderedDict()
-                cr_value['key'] = k
-                cr_value['step_name'] = None
-                component_reference['value'] = cr_value
-                cr = self._serialize_sklearn(component_reference, model)
-                parameters[k] = json.dumps(cr)
-
-            else:
-                # a regular hyperparameter
-                if not (hasattr(rval, '__len__') and len(rval) == 0):
-                    rval = json.dumps(rval)
-                    parameters[k] = rval
-                else:
-                    parameters[k] = None
-
-            parameters_meta_info[k] = OrderedDict((('description', None), ('data_type', None)))
-
-        return parameters, parameters_meta_info, sub_components, sub_components_explicit
-
-    def _get_fn_arguments_with_defaults(self, fn_name: Callable) -> Tuple[Dict, Set]:
-        """
-        Returns:
-            i) a dict with all parameter names that have a default value, and
-            ii) a set with all parameter names that do not have a default
-
-        Parameters
-        ----------
-        fn_name : callable
-            The function of which we want to obtain the defaults
-
-        Returns
-        -------
-        params_with_defaults: dict
-            a dict mapping parameter name to the default value
-        params_without_defaults: set
-            a set with all parameters that do not have a default value
-        """
-        # parameters with defaults are optional, all others are required.
-        signature = inspect.getfullargspec(fn_name)
-        if signature.defaults:
-            optional_params = dict(zip(reversed(signature.args), reversed(signature.defaults)))
-        else:
-            optional_params = dict()
-        required_params = {arg for arg in signature.args if arg not in optional_params}
-        return optional_params, required_params
-
-    def _deserialize_model(
-        self,
-        flow: OpenMLFlow,
-        keep_defaults: bool,
-        recursion_depth: int,
-    ) -> Any:
-        logging.info('-%s deserialize %s' % ('-' * recursion_depth, flow.name))
-        model_name = flow.class_name
-        self._check_dependencies(flow.dependencies)
-
-        parameters = flow.parameters
-        components = flow.components
-        parameter_dict = OrderedDict()  # type: Dict[str, Any]
-
-        # Do a shallow copy of the components dictionary so we can remove the
-        # components from this copy once we added them into the pipeline. This
-        # allows us to not consider them any more when looping over the
-        # components, but keeping the dictionary of components untouched in the
-        # original components dictionary.
-        components_ = copy.copy(components)
-
-        for name in parameters:
-            value = parameters.get(name)
-            logging.info('--%s flow_parameter=%s, value=%s' %
-                         ('-' * recursion_depth, name, value))
-            rval = self._deserialize_sklearn(
-                value,
-                components=components_,
-                initialize_with_defaults=keep_defaults,
-                recursion_depth=recursion_depth + 1,
-            )
-            parameter_dict[name] = rval
-
-        for name in components:
-            if name in parameter_dict:
-                continue
-            if name not in components_:
-                continue
-            value = components[name]
-            logging.info('--%s flow_component=%s, value=%s'
-                         % ('-' * recursion_depth, name, value))
-            rval = self._deserialize_sklearn(
-                value,
-                recursion_depth=recursion_depth + 1,
-            )
-            parameter_dict[name] = rval
-
-        module_name = model_name.rsplit('.', 1)
-        model_class = getattr(importlib.import_module(module_name[0]),
-                              module_name[1])
-
-        if keep_defaults:
-            # obtain all params with a default
-            param_defaults, _ = \
-                self._get_fn_arguments_with_defaults(model_class.__init__)
-
-            # delete the params that have a default from the dict,
-            # so they get initialized with their default value
-            # except [...]
-            for param in param_defaults:
-                # [...] the ones that also have a key in the components dict.
-                # As OpenML stores different flows for ensembles with different
-                # (base-)components, in OpenML terms, these are not considered
-                # hyperparameters but rather constants (i.e., changing them would
-                # result in a different flow)
-                if param not in components.keys():
-                    del parameter_dict[param]
-        return model_class(**parameter_dict)
-
-    def _check_dependencies(self, dependencies: str) -> None:
-        if not dependencies:
-            return
-
-        dependencies_list = dependencies.split('\n')
-        for dependency_string in dependencies_list:
-            match = DEPENDENCIES_PATTERN.match(dependency_string)
-            if not match:
-                raise ValueError('Cannot parse dependency %s' % dependency_string)
-
-            dependency_name = match.group('name')
-            operation = match.group('operation')
-            version = match.group('version')
-
-            module = importlib.import_module(dependency_name)
-            required_version = LooseVersion(version)
-            installed_version = LooseVersion(module.__version__)  # type: ignore
-
-            if operation == '==':
-                check = required_version == installed_version
-            elif operation == '>':
-                check = installed_version > required_version
-            elif operation == '>=':
-                check = (installed_version > required_version
-                         or installed_version == required_version)
-            else:
-                raise NotImplementedError(
-                    'operation \'%s\' is not supported' % operation)
-            if not check:
-                raise ValueError('Trying to deserialize a model with dependency '
-                                 '%s not satisfied.' % dependency_string)
-
-    def _serialize_type(self, o: Any) -> 'OrderedDict[str, str]':
-        mapping = {float: 'float',
-                   np.float: 'np.float',
-                   np.float32: 'np.float32',
-                   np.float64: 'np.float64',
-                   int: 'int',
-                   np.int: 'np.int',
-                   np.int32: 'np.int32',
-                   np.int64: 'np.int64'}
-        ret = OrderedDict()  # type: 'OrderedDict[str, str]'
-        ret['oml-python:serialized_object'] = 'type'
-        ret['value'] = mapping[o]
-        return ret
-
-    def _deserialize_type(self, o: str) -> Any:
-        mapping = {'float': float,
-                   'np.float': np.float,
-                   'np.float32': np.float32,
-                   'np.float64': np.float64,
-                   'int': int,
-                   'np.int': np.int,
-                   'np.int32': np.int32,
-                   'np.int64': np.int64}
-        return mapping[o]
-
-    def _serialize_rv_frozen(self, o: Any) -> 'OrderedDict[str, Union[str, Dict]]':
-        args = o.args
-        kwds = o.kwds
-        a = o.a
-        b = o.b
-        dist = o.dist.__class__.__module__ + '.' + o.dist.__class__.__name__
-        ret = OrderedDict()  # type: 'OrderedDict[str, Union[str, Dict]]'
-        ret['oml-python:serialized_object'] = 'rv_frozen'
-        ret['value'] = OrderedDict((('dist', dist), ('a', a), ('b', b),
-                                    ('args', args), ('kwds', kwds)))
-        return ret
-
-    def _deserialize_rv_frozen(self, o: 'OrderedDict[str, str]') -> Any:
-        args = o['args']
-        kwds = o['kwds']
-        a = o['a']
-        b = o['b']
-        dist_name = o['dist']
-
-        module_name = dist_name.rsplit('.', 1)
-        try:
-            rv_class = getattr(importlib.import_module(module_name[0]),
-                               module_name[1])
-        except AttributeError:
-            warnings.warn('Cannot create model %s for flow.' % dist_name)
-            return None
-
-        dist = scipy.stats.distributions.rv_frozen(rv_class(), *args, **kwds)
-        dist.a = a
-        dist.b = b
-
-        return dist
-
-    def _serialize_function(self, o: Callable) -> 'OrderedDict[str, str]':
-        name = o.__module__ + '.' + o.__name__
-        ret = OrderedDict()  # type: 'OrderedDict[str, str]'
-        ret['oml-python:serialized_object'] = 'function'
-        ret['value'] = name
-        return ret
-
-    def _deserialize_function(self, name: str) -> Callable:
-        module_name = name.rsplit('.', 1)
-        function_handle = getattr(importlib.import_module(module_name[0]), module_name[1])
-        return function_handle
-
-    def _serialize_cross_validator(self, o: Any) -> 'OrderedDict[str, Union[str, Dict]]':
-        ret = OrderedDict()  # type: 'OrderedDict[str, Union[str, Dict]]'
-
-        parameters = OrderedDict()  # type: 'OrderedDict[str, Any]'
-
-        # XXX this is copied from sklearn.model_selection._split
-        cls = o.__class__
-        init = getattr(cls.__init__, 'deprecated_original', cls.__init__)
-        # Ignore varargs, kw and default values and pop self
-        init_signature = inspect.signature(init)
-        # Consider the constructor parameters excluding 'self'
-        if init is object.__init__:
-            args = []  # type: List
-        else:
-            args = sorted([p.name for p in init_signature.parameters.values()
-                           if p.name != 'self' and p.kind != p.VAR_KEYWORD])
-
-        for key in args:
-            # We need deprecation warnings to always be on in order to
-            # catch deprecated param values.
-            # This is set in utils/__init__.py but it gets overwritten
-            # when running under python3 somehow.
-            with warnings.catch_warnings(record=True) as w:
-                warnings.simplefilter("always", DeprecationWarning)
-                value = getattr(o, key, None)
-                if w is not None and len(w) and w[0].category == DeprecationWarning:
-                    # if the parameter is deprecated, don't show it
-                    continue
-
-            if not (hasattr(value, '__len__') and len(value) == 0):
-                value = json.dumps(value)
-                parameters[key] = value
-            else:
-                parameters[key] = None
-
-        ret['oml-python:serialized_object'] = 'cv_object'
-        name = o.__module__ + "." + o.__class__.__name__
-        value = OrderedDict([('name', name), ('parameters', parameters)])
-        ret['value'] = value
-
-        return ret
-
-    def _deserialize_cross_validator(
-        self,
-        value: 'OrderedDict[str, Any]',
-        recursion_depth: int,
-    ) -> Any:
-        model_name = value['name']
-        parameters = value['parameters']
-
-        module_name = model_name.rsplit('.', 1)
-        model_class = getattr(importlib.import_module(module_name[0]),
-                              module_name[1])
-        for parameter in parameters:
-            parameters[parameter] = self._deserialize_sklearn(
-                parameters[parameter],
-                recursion_depth=recursion_depth + 1,
-            )
-        return model_class(**parameters)
-
-    def _format_external_version(
-        self,
-        model_package_name: str,
-        model_package_version_number: str,
-    ) -> str:
-        return '%s==%s' % (model_package_name, model_package_version_number)
-
-    @staticmethod
-    def _get_parameter_values_recursive(param_grid: Union[Dict, List[Dict]],
-                                        parameter_name: str) -> List[Any]:
-        """
-        Returns a list of values for a given hyperparameter, encountered
-        recursively throughout the flow. (e.g., n_jobs can be defined
-        for various flows)
-
-        Parameters
-        ----------
-        param_grid: Union[Dict, List[Dict]]
-            Dict mapping from hyperparameter list to value, to a list of
-            such dicts
-
-        parameter_name: str
-            The hyperparameter that needs to be inspected
-
-        Returns
-        -------
-        List
-            A list of all values of hyperparameters with this name
-        """
-        if isinstance(param_grid, dict):
-            result = list()
-            for param, value in param_grid.items():
-                # n_jobs is scikit-learn parameter for parallelizing jobs
-                if param.split('__')[-1] == parameter_name:
-                    result.append(value)
-            return result
-        elif isinstance(param_grid, list):
-            result = list()
-            for sub_grid in param_grid:
-                result.extend(SklearnExtension._get_parameter_values_recursive(sub_grid,
-                                                                               parameter_name))
-            return result
-        else:
-            raise ValueError('Param_grid should either be a dict or list of dicts')
-
-    def _prevent_optimize_n_jobs(self, model):
-        """
-        Ensures that HPO classes will not optimize the n_jobs hyperparameter
-
-        Parameters:
-        -----------
-        model:
-            The model that will be fitted
-        """
-        if self._is_hpo_class(model):
-            if isinstance(model, sklearn.model_selection.GridSearchCV):
-                param_distributions = model.param_grid
-            elif isinstance(model, sklearn.model_selection.RandomizedSearchCV):
-                param_distributions = model.param_distributions
-            else:
-                if hasattr(model, 'param_distributions'):
-                    param_distributions = model.param_distributions
-                else:
-                    raise AttributeError('Using subclass BaseSearchCV other than '
-                                         '{GridSearchCV, RandomizedSearchCV}. '
-                                         'Could not find attribute '
-                                         'param_distributions.')
-                print('Warning! Using subclass BaseSearchCV other than '
-                      '{GridSearchCV, RandomizedSearchCV}. '
-                      'Should implement param check. ')
-            n_jobs_vals = SklearnExtension._get_parameter_values_recursive(param_distributions,
-                                                                           'n_jobs')
-            if len(n_jobs_vals) > 0:
-                raise PyOpenMLError('openml-python should not be used to '
-                                    'optimize the n_jobs parameter.')
-
-    def _can_measure_cputime(self, model: Any) -> bool:
-        """
-        Returns True if the parameter settings of model are chosen s.t. the model
-        will run on a single core (if so, openml-python can measure cpu-times)
-
-        Parameters:
-        -----------
-        model:
-            The model that will be fitted
-
-        Returns:
-        --------
-        bool:
-            True if all n_jobs parameters will be either set to None or 1, False otherwise
-        """
-        if not (
-                isinstance(model, sklearn.base.BaseEstimator) or self._is_hpo_class(model)
-        ):
-            raise ValueError('model should be BaseEstimator or BaseSearchCV')
-
-        # check the parameters for n_jobs
-        n_jobs_vals = SklearnExtension._get_parameter_values_recursive(model.get_params(), 'n_jobs')
-        for val in n_jobs_vals:
-            if val is not None and val != 1:
-                return False
-        return True
-
-    def _can_measure_wallclocktime(self, model: Any) -> bool:
-        """
-        Returns True if the parameter settings of model are chosen s.t. the model
-        will run on a preset number of cores (if so, openml-python can measure wall-clock time)
-
-        Parameters:
-        -----------
-        model:
-            The model that will be fitted
-
-        Returns:
-        --------
-        bool:
-            True if no n_jobs parameters is set to -1, False otherwise
-        """
-        if not (
-                isinstance(model, sklearn.base.BaseEstimator) or self._is_hpo_class(model)
-        ):
-            raise ValueError('model should be BaseEstimator or BaseSearchCV')
-
-        # check the parameters for n_jobs
-        n_jobs_vals = SklearnExtension._get_parameter_values_recursive(model.get_params(), 'n_jobs')
-        return -1 not in n_jobs_vals
-
-    ################################################################################################
-    # Methods for performing runs with extension modules
-
-    def is_estimator(self, model: Any) -> bool:
-        """Check whether the given model is a scikit-learn estimator.
-
-        This function is only required for backwards compatibility and will be removed in the
-        near future.
-
-        Parameters
-        ----------
-        model : Any
-
-        Returns
-        -------
-        bool
-        """
-        o = model
-        return hasattr(o, 'fit') and hasattr(o, 'get_params') and hasattr(o, 'set_params')
-
-    def seed_model(self, model: Any, seed: Optional[int] = None) -> Any:
-        """Set the random state of all the unseeded components of a model and return the seeded
-        model.
-
-        Required so that all seed information can be uploaded to OpenML for reproducible results.
-
-        Models that are already seeded will maintain the seed. In this case,
-        only integer seeds are allowed (An exception is raised when a RandomState was used as
-        seed).
-
-        Parameters
-        ----------
-        model : sklearn model
-            The model to be seeded
-        seed : int
-            The seed to initialize the RandomState with. Unseeded subcomponents
-            will be seeded with a random number from the RandomState.
-
-        Returns
-        -------
-        Any
-        """
-
-        def _seed_current_object(current_value):
-            if isinstance(current_value, int):  # acceptable behaviour
-                return False
-            elif isinstance(current_value, np.random.RandomState):
-                raise ValueError(
-                    'Models initialized with a RandomState object are not '
-                    'supported. Please seed with an integer. ')
-            elif current_value is not None:
-                raise ValueError(
-                    'Models should be seeded with int or None (this should never '
-                    'happen). ')
-            else:
-                return True
-
-        rs = np.random.RandomState(seed)
-        model_params = model.get_params()
-        random_states = {}
-        for param_name in sorted(model_params):
-            if 'random_state' in param_name:
-                current_value = model_params[param_name]
-                # important to draw the value at this point (and not in the if
-                # statement) this way we guarantee that if a different set of
-                # subflows is seeded, the same number of the random generator is
-                # used
-                new_value = rs.randint(0, 2 ** 16)
-                if _seed_current_object(current_value):
-                    random_states[param_name] = new_value
-
-            # Also seed CV objects!
-            elif isinstance(model_params[param_name], sklearn.model_selection.BaseCrossValidator):
-                if not hasattr(model_params[param_name], 'random_state'):
-                    continue
-
-                current_value = model_params[param_name].random_state
-                new_value = rs.randint(0, 2 ** 16)
-                if _seed_current_object(current_value):
-                    model_params[param_name].random_state = new_value
-
-        model.set_params(**random_states)
-        return model
-
-    def _run_model_on_fold(
-        self,
-        model: Any,
-        task: 'OpenMLTask',
-        X_train: Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame],
-        rep_no: int,
-        fold_no: int,
-        y_train: Optional[np.ndarray] = None,
-        X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame]] = None,
-    ) -> Tuple[np.ndarray, np.ndarray, 'OrderedDict[str, float]', Optional[OpenMLRunTrace]]:
-        """Run a model on a repeat,fold,subsample triplet of the task and return prediction
-        information.
-
-        Furthermore, it will measure run time measures in case multi-core behaviour allows this.
-        * exact user cpu time will be measured if the number of cores is set (recursive throughout
-        the model) exactly to 1
-        * wall clock time will be measured if the number of cores is set (recursive throughout the
-        model) to any given number (but not when it is set to -1)
-
-        Returns the data that is necessary to construct the OpenML Run object. Is used by
-        run_task_get_arff_content. Do not use this function unless you know what you are doing.
-
-        Parameters
-        ----------
-        model : Any
-            The UNTRAINED model to run. The model instance will be copied and not altered.
-        task : OpenMLTask
-            The task to run the model on.
-        X_train : array-like
-            Training data for the given repetition and fold.
-        rep_no : int
-            The repeat of the experiment (0-based; in case of 1 time CV, always 0)
-        fold_no : int
-            The fold nr of the experiment (0-based; in case of holdout, always 0)
-        y_train : Optional[np.ndarray] (default=None)
-            Target attributes for supervised tasks. In case of classification, these are integer
-            indices to the potential classes specified by dataset.
-        X_test : Optional, array-like (default=None)
-            Test attributes to test for generalization in supervised tasks.
-
-        Returns
-        -------
-        arff_datacontent : List[List]
-            Arff representation (list of lists) of the predictions that were
-            generated by this fold (required to populate predictions.arff)
-        arff_tracecontent :  List[List]
-            Arff representation (list of lists) of the trace data that was generated by this
-            fold
-            (will be used to populate trace.arff, leave it empty if the model did not perform
-            any
-            hyperparameter optimization).
-        user_defined_measures : OrderedDict[str, float]
-            User defined measures that were generated on this fold
-        model : Any
-            The model trained on this repeat,fold,subsample triple. Will be used to generate
-            trace
-            information later on (in ``obtain_arff_trace``).
-        """
-
-        def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarray:
-            """Transforms predicted probabilities to match with OpenML class indices.
-
-            Parameters
-            ----------
-            y : np.ndarray
-                Predicted probabilities (possibly omitting classes if they were not present in the
-                training data).
-            model_classes : list
-                List of classes known_predicted by the model, ordered by their index.
-
-            Returns
-            -------
-            np.ndarray
-            """
-            # y: list or numpy array of predictions
-            # model_classes: sklearn classifier mapping from original array id to
-            # prediction index id
-            if not isinstance(classes, list):
-                raise ValueError('please convert model classes to list prior to '
-                                 'calling this fn')
-            result = np.zeros((len(y), len(classes)), dtype=np.float32)
-            for obs, prediction_idx in enumerate(y):
-                result[obs][prediction_idx] = 1.0
-            return result
-
-        if isinstance(task, OpenMLSupervisedTask):
-            if y_train is None:
-                raise TypeError('argument y_train must not be of type None')
-            if X_test is None:
-                raise TypeError('argument X_test must not be of type None')
-
-        # TODO: if possible, give a warning if model is already fitted (acceptable
-        # in case of custom experimentation,
-        # but not desirable if we want to upload to OpenML).
-
-        model_copy = sklearn.base.clone(model, safe=True)
-        # sanity check: prohibit users from optimizing n_jobs
-        self._prevent_optimize_n_jobs(model_copy)
-        # Runtime can be measured if the model is run sequentially
-        can_measure_cputime = self._can_measure_cputime(model_copy)
-        can_measure_wallclocktime = self._can_measure_wallclocktime(model_copy)
-
-        user_defined_measures = OrderedDict()  # type: 'OrderedDict[str, float]'
-
-        try:
-            # for measuring runtime. Only available since Python 3.3
-            modelfit_start_cputime = time.process_time()
-            modelfit_start_walltime = time.time()
-
-            if isinstance(task, OpenMLSupervisedTask):
-                model_copy.fit(X_train, y_train)
-            elif isinstance(task, OpenMLClusteringTask):
-                model_copy.fit(X_train)
-
-            modelfit_dur_cputime = (time.process_time() - modelfit_start_cputime) * 1000
-            if can_measure_cputime:
-                user_defined_measures['usercpu_time_millis_training'] = modelfit_dur_cputime
-
-            modelfit_dur_walltime = (time.time() - modelfit_start_walltime) * 1000
-            if can_measure_wallclocktime:
-                user_defined_measures['wall_clock_time_millis_training'] = modelfit_dur_walltime
-
-        except AttributeError as e:
-            # typically happens when training a regressor on classification task
-            raise PyOpenMLError(str(e))
-
-        if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
-            # search for model classes_ (might differ depending on modeltype)
-            # first, pipelines are a special case (these don't have a classes_
-            # object, but rather borrows it from the last step. We do this manually,
-            # because of the BaseSearch check)
-            if isinstance(model_copy, sklearn.pipeline.Pipeline):
-                used_estimator = model_copy.steps[-1][-1]
-            else:
-                used_estimator = model_copy
-
-            if self._is_hpo_class(used_estimator):
-                model_classes = used_estimator.best_estimator_.classes_
-            else:
-                model_classes = used_estimator.classes_
-
-        modelpredict_start_cputime = time.process_time()
-        modelpredict_start_walltime = time.time()
-
-        # In supervised learning this returns the predictions for Y, in clustering
-        # it returns the clusters
-        if isinstance(task, OpenMLSupervisedTask):
-            pred_y = model_copy.predict(X_test)
-        elif isinstance(task, OpenMLClusteringTask):
-            pred_y = model_copy.predict(X_train)
-        else:
-            raise ValueError(task)
-
-        if can_measure_cputime:
-            modelpredict_duration_cputime = (time.process_time()
-                                             - modelpredict_start_cputime) * 1000
-            user_defined_measures['usercpu_time_millis_testing'] = modelpredict_duration_cputime
-            user_defined_measures['usercpu_time_millis'] = (modelfit_dur_cputime
-                                                            + modelpredict_duration_cputime)
-        if can_measure_wallclocktime:
-            modelpredict_duration_walltime = (time.time() - modelpredict_start_walltime) * 1000
-            user_defined_measures['wall_clock_time_millis_testing'] = modelpredict_duration_walltime
-            user_defined_measures['wall_clock_time_millis'] = (modelfit_dur_walltime
-                                                               + modelpredict_duration_walltime)
-
-        if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
-
-            try:
-                proba_y = model_copy.predict_proba(X_test)
-            except AttributeError:
-                if task.class_labels is not None:
-                    proba_y = _prediction_to_probabilities(pred_y, list(task.class_labels))
-                else:
-                    raise ValueError('The task has no class labels')
-
-            if task.class_labels is not None:
-                if proba_y.shape[1] != len(task.class_labels):
-                    # Remap the probabilities in case there was a class missing
-                    # at training time. By default, the classification targets
-                    # are mapped to be zero-based indices to the actual classes.
-                    # Therefore, the model_classes contain the correct indices to
-                    # the correct probability array. Example:
-                    # classes in the dataset: 0, 1, 2, 3, 4, 5
-                    # classes in the training set: 0, 1, 2, 4, 5
-                    # then we need to add a column full of zeros into the probabilities
-                    # for class 3 because the rest of the library expects that the
-                    # probabilities are ordered the same way as the classes are ordered).
-                    proba_y_new = np.zeros((proba_y.shape[0], len(task.class_labels)))
-                    for idx, model_class in enumerate(model_classes):
-                        proba_y_new[:, model_class] = proba_y[:, idx]
-                    proba_y = proba_y_new
-
-                if proba_y.shape[1] != len(task.class_labels):
-                    message = "Estimator only predicted for {}/{} classes!".format(
-                        proba_y.shape[1], len(task.class_labels),
-                    )
-                    warnings.warn(message)
-                    openml.config.logger.warn(message)
-            else:
-                raise ValueError('The task has no class labels')
-
-        elif isinstance(task, OpenMLRegressionTask):
-            proba_y = None
-
-        elif isinstance(task, OpenMLClusteringTask):
-            proba_y = None
-
-        else:
-            raise TypeError(type(task))
-
-        if self._is_hpo_class(model_copy):
-            trace_data = self._extract_trace_data(model_copy, rep_no, fold_no)
-            trace = self._obtain_arff_trace(model_copy, trace_data)  # type: Optional[OpenMLRunTrace]  # noqa E501
-        else:
-            trace = None
-
-        return pred_y, proba_y, user_defined_measures, trace
-
-    def obtain_parameter_values(
-        self,
-        flow: 'OpenMLFlow',
-        model: Any = None,
-    ) -> List[Dict[str, Any]]:
-        """Extracts all parameter settings required for the flow from the model.
-
-        If no explicit model is provided, the parameters will be extracted from `flow.model`
-        instead.
-
-        Parameters
-        ----------
-        flow : OpenMLFlow
-            OpenMLFlow object (containing flow ids, i.e., it has to be downloaded from the server)
-
-        model: Any, optional (default=None)
-            The model from which to obtain the parameter values. Must match the flow signature.
-            If None, use the model specified in ``OpenMLFlow.model``.
-
-        Returns
-        -------
-        list
-            A list of dicts, where each dict has the following entries:
-            - ``oml:name`` : str: The OpenML parameter name
-            - ``oml:value`` : mixed: A representation of the parameter value
-            - ``oml:component`` : int: flow id to which the parameter belongs
-        """
-        openml.flows.functions._check_flow_for_server_id(flow)
-
-        def get_flow_dict(_flow):
-            flow_map = {_flow.name: _flow.flow_id}
-            for subflow in _flow.components:
-                flow_map.update(get_flow_dict(_flow.components[subflow]))
-            return flow_map
-
-        def extract_parameters(_flow, _flow_dict, component_model,
-                               _main_call=False, main_id=None):
-            def is_subcomponent_specification(values):
-                # checks whether the current value can be a specification of
-                # subcomponents, as for example the value for steps parameter
-                # (in Pipeline) or transformers parameter (in
-                # ColumnTransformer). These are always lists/tuples of lists/
-                # tuples, size bigger than 2 and an OpenMLFlow item involved.
-                if not isinstance(values, (tuple, list)):
-                    return False
-                for item in values:
-                    if not isinstance(item, (tuple, list)):
-                        return False
-                    if len(item) < 2:
-                        return False
-                    if not isinstance(item[1], openml.flows.OpenMLFlow):
-                        return False
-                return True
-
-            # _flow is openml flow object, _param dict maps from flow name to flow
-            # id for the main call, the param dict can be overridden (useful for
-            # unit tests / sentinels) this way, for flows without subflows we do
-            # not have to rely on _flow_dict
-            exp_parameters = set(_flow.parameters)
-            exp_components = set(_flow.components)
-            model_parameters = set([mp for mp in component_model.get_params()
-                                    if '__' not in mp])
-            if len((exp_parameters | exp_components) ^ model_parameters) != 0:
-                flow_params = sorted(exp_parameters | exp_components)
-                model_params = sorted(model_parameters)
-                raise ValueError('Parameters of the model do not match the '
-                                 'parameters expected by the '
-                                 'flow:\nexpected flow parameters: '
-                                 '%s\nmodel parameters: %s' % (flow_params,
-                                                               model_params))
-
-            _params = []
-            for _param_name in _flow.parameters:
-                _current = OrderedDict()
-                _current['oml:name'] = _param_name
-
-                current_param_values = self.model_to_flow(component_model.get_params()[_param_name])
-
-                # Try to filter out components (a.k.a. subflows) which are
-                # handled further down in the code (by recursively calling
-                # this function)!
-                if isinstance(current_param_values, openml.flows.OpenMLFlow):
-                    continue
-
-                if is_subcomponent_specification(current_param_values):
-                    # complex parameter value, with subcomponents
-                    parsed_values = list()
-                    for subcomponent in current_param_values:
-                        # scikit-learn stores usually tuples in the form
-                        # (name (str), subcomponent (mixed), argument
-                        # (mixed)). OpenML replaces the subcomponent by an
-                        # OpenMLFlow object.
-                        if len(subcomponent) < 2 or len(subcomponent) > 3:
-                            raise ValueError('Component reference should be '
-                                             'size {2,3}. ')
-
-                        subcomponent_identifier = subcomponent[0]
-                        subcomponent_flow = subcomponent[1]
-                        if not isinstance(subcomponent_identifier, str):
-                            raise TypeError('Subcomponent identifier should be '
-                                            'string')
-                        if not isinstance(subcomponent_flow,
-                                          openml.flows.OpenMLFlow):
-                            raise TypeError('Subcomponent flow should be string')
-
-                        current = {
-                            "oml-python:serialized_object": "component_reference",
-                            "value": {
-                                "key": subcomponent_identifier,
-                                "step_name": subcomponent_identifier
-                            }
-                        }
-                        if len(subcomponent) == 3:
-                            if not isinstance(subcomponent[2], list):
-                                raise TypeError('Subcomponent argument should be'
-                                                'list')
-                            current['value']['argument_1'] = subcomponent[2]
-                        parsed_values.append(current)
-                    parsed_values = json.dumps(parsed_values)
-                else:
-                    # vanilla parameter value
-                    parsed_values = json.dumps(current_param_values)
-
-                _current['oml:value'] = parsed_values
-                if _main_call:
-                    _current['oml:component'] = main_id
-                else:
-                    _current['oml:component'] = _flow_dict[_flow.name]
-                _params.append(_current)
-
-            for _identifier in _flow.components:
-                subcomponent_model = component_model.get_params()[_identifier]
-                _params.extend(extract_parameters(_flow.components[_identifier],
-                                                  _flow_dict, subcomponent_model))
-            return _params
-
-        flow_dict = get_flow_dict(flow)
-        model = model if model is not None else flow.model
-        parameters = extract_parameters(flow, flow_dict, model, True, flow.flow_id)
-
-        return parameters
-
-    def _openml_param_name_to_sklearn(
-        self,
-        openml_parameter: openml.setups.OpenMLParameter,
-        flow: OpenMLFlow,
-    ) -> str:
-        """
-        Converts the name of an OpenMLParameter into the sklean name, given a flow.
-
-        Parameters
-        ----------
-        openml_parameter: OpenMLParameter
-            The parameter under consideration
-
-        flow: OpenMLFlow
-            The flow that provides context.
-
-        Returns
-        -------
-        sklearn_parameter_name: str
-            The name the parameter will have once used in scikit-learn
-        """
-        if not isinstance(openml_parameter, openml.setups.OpenMLParameter):
-            raise ValueError('openml_parameter should be an instance of OpenMLParameter')
-        if not isinstance(flow, OpenMLFlow):
-            raise ValueError('flow should be an instance of OpenMLFlow')
-
-        flow_structure = flow.get_structure('name')
-        if openml_parameter.flow_name not in flow_structure:
-            raise ValueError('Obtained OpenMLParameter and OpenMLFlow do not correspond. ')
-        name = openml_parameter.flow_name  # for PEP8
-        return '__'.join(flow_structure[name] + [openml_parameter.parameter_name])
-
-    ################################################################################################
-    # Methods for hyperparameter optimization
-
-    def _is_hpo_class(self, model: Any) -> bool:
-        """Check whether the model performs hyperparameter optimization.
-
-        Used to check whether an optimization trace can be extracted from the model after
-        running it.
-
-        Parameters
-        ----------
-        model : Any
-
-        Returns
-        -------
-        bool
-        """
-        return isinstance(model, sklearn.model_selection._search.BaseSearchCV)
-
-    def instantiate_model_from_hpo_class(
-        self,
-        model: Any,
-        trace_iteration: OpenMLTraceIteration,
-    ) -> Any:
-        """Instantiate a ``base_estimator`` which can be searched over by the hyperparameter
-        optimization model.
-
-        Parameters
-        ----------
-        model : Any
-            A hyperparameter optimization model which defines the model to be instantiated.
-        trace_iteration : OpenMLTraceIteration
-            Describing the hyperparameter settings to instantiate.
-
-        Returns
-        -------
-        Any
-        """
-        if not self._is_hpo_class(model):
-            raise AssertionError(
-                'Flow model %s is not an instance of sklearn.model_selection._search.BaseSearchCV'
-                % model
-            )
-        base_estimator = model.estimator
-        base_estimator.set_params(**trace_iteration.get_parameters())
-        return base_estimator
-
-    def _extract_trace_data(self, model, rep_no, fold_no):
-        arff_tracecontent = []
-        for itt_no in range(0, len(model.cv_results_['mean_test_score'])):
-            # we use the string values for True and False, as it is defined in
-            # this way by the OpenML server
-            selected = 'false'
-            if itt_no == model.best_index_:
-                selected = 'true'
-            test_score = model.cv_results_['mean_test_score'][itt_no]
-            arff_line = [rep_no, fold_no, itt_no, test_score, selected]
-            for key in model.cv_results_:
-                if key.startswith('param_'):
-                    value = model.cv_results_[key][itt_no]
-                    if value is not np.ma.masked:
-                        serialized_value = json.dumps(value)
-                    else:
-                        serialized_value = np.nan
-                    arff_line.append(serialized_value)
-            arff_tracecontent.append(arff_line)
-        return arff_tracecontent
-
-    def _obtain_arff_trace(
-        self,
-        model: Any,
-        trace_content: List,
-    ) -> 'OpenMLRunTrace':
-        """Create arff trace object from a fitted model and the trace content obtained by
-        repeatedly calling ``run_model_on_task``.
-
-        Parameters
-        ----------
-        model : Any
-            A fitted hyperparameter optimization model.
-
-        trace_content : List[List]
-            Trace content obtained by ``openml.runs.run_flow_on_task``.
-
-        Returns
-        -------
-        OpenMLRunTrace
-        """
-        if not self._is_hpo_class(model):
-            raise AssertionError(
-                'Flow model %s is not an instance of sklearn.model_selection._search.BaseSearchCV'
-                % model
-            )
-        if not hasattr(model, 'cv_results_'):
-            raise ValueError('model should contain `cv_results_`')
-
-        # attributes that will be in trace arff, regardless of the model
-        trace_attributes = [('repeat', 'NUMERIC'),
-                            ('fold', 'NUMERIC'),
-                            ('iteration', 'NUMERIC'),
-                            ('evaluation', 'NUMERIC'),
-                            ('selected', ['true', 'false'])]
-
-        # model dependent attributes for trace arff
-        for key in model.cv_results_:
-            if key.startswith('param_'):
-                # supported types should include all types, including bool,
-                # int float
-                supported_basic_types = (bool, int, float, str)
-                for param_value in model.cv_results_[key]:
-                    if isinstance(param_value, supported_basic_types) or \
-                            param_value is None or param_value is np.ma.masked:
-                        # basic string values
-                        type = 'STRING'
-                    elif isinstance(param_value, list) and \
-                            all(isinstance(i, int) for i in param_value):
-                        # list of integers
-                        type = 'STRING'
-                    else:
-                        raise TypeError('Unsupported param type in param grid: %s' % key)
-
-                # renamed the attribute param to parameter, as this is a required
-                # OpenML convention - this also guards against name collisions
-                # with the required trace attributes
-                attribute = (PREFIX + key[6:], type)
-                trace_attributes.append(attribute)
-
-        return OpenMLRunTrace.generate(
-            trace_attributes,
-            trace_content,
-        )
-
-
-register_extension(SklearnExtension)
diff --git a/openml/flows/__init__.py b/openml/flows/__init__.py
index 504c37c1a..d455249de 100644
--- a/openml/flows/__init__.py
+++ b/openml/flows/__init__.py
@@ -1,11 +1,21 @@
-from .flow import OpenMLFlow
+# License: BSD 3-Clause
 
-from .functions import get_flow, list_flows, flow_exists, assert_flows_equal
+from .flow import OpenMLFlow
+from .functions import (
+    assert_flows_equal,
+    delete_flow,
+    flow_exists,
+    get_flow,
+    get_flow_id,
+    list_flows,
+)
 
 __all__ = [
-    'OpenMLFlow',
-    'get_flow',
-    'list_flows',
-    'flow_exists',
-    'assert_flows_equal',
+    "OpenMLFlow",
+    "assert_flows_equal",
+    "delete_flow",
+    "flow_exists",
+    "get_flow",
+    "get_flow_id",
+    "list_flows",
 ]
diff --git a/openml/flows/flow.py b/openml/flows/flow.py
index 0db69d16f..7dd84fdee 100644
--- a/openml/flows/flow.py
+++ b/openml/flows/flow.py
@@ -1,24 +1,29 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
+import logging
 from collections import OrderedDict
-import os
-from typing import Dict, List, Union  # noqa: F401
+from collections.abc import Hashable, Sequence
+from pathlib import Path
+from typing import Any, cast
 
 import xmltodict
 
-from ..extensions import get_extension_by_flow
-from ..utils import extract_xml_tags, _tag_entity
-
-import openml.config
+from openml.base import OpenMLBase
+from openml.extensions import Extension, get_extension_by_flow
+from openml.utils import extract_xml_tags
 
 
-class OpenMLFlow(object):
+class OpenMLFlow(OpenMLBase):
     """OpenML Flow. Stores machine learning models.
 
     Flows should not be generated manually, but by the function
     :meth:`openml.flows.create_flow_from_model`. Using this helper function
     ensures that all relevant fields are filled in.
 
-    Implements https://github.com/openml/website/blob/master/openml_OS/ \
-        views/pages/api_new/v1/xsd/openml.implementation.upload.xsd.
+    Implements `openml.implementation.upload.xsd
+    <https://github.com/openml/openml/blob/master/openml_OS/views/pages/api_new/v1/xsd/
+    openml.implementation.upload.xsd>`_.
 
     Parameters
     ----------
@@ -56,10 +61,10 @@ class OpenMLFlow(object):
         A list of dependencies necessary to run the flow. This field should
         contain all libraries the flow depends on. To allow reproducibility
         it should also specify the exact version numbers.
-    class_name : str
+    class_name : str, optional
         The development language name of the class which is described by this
         flow.
-    custom_name : str
+    custom_name : str, optional
         Custom name of the flow given by the owner.
     binary_url : str, optional
         Url from which the binary can be downloaded. Added by the server.
@@ -78,27 +83,49 @@ class OpenMLFlow(object):
         Date the flow was uploaded. Filled in by the server.
     flow_id : int, optional
         Flow ID. Assigned by the server.
+    extension : Extension, optional
+        The extension for a flow (e.g., sklearn).
     version : str, optional
         OpenML version of the flow. Assigned by the server.
     """
 
-    def __init__(self, name, description, model, components, parameters,
-                 parameters_meta_info, external_version, tags, language,
-                 dependencies, class_name=None, custom_name=None,
-                 binary_url=None, binary_format=None,
-                 binary_md5=None, uploader=None, upload_date=None,
-                 flow_id=None, extension=None, version=None):
+    def __init__(  # noqa: PLR0913
+        self,
+        name: str,
+        description: str,
+        model: object,
+        components: dict,
+        parameters: dict,
+        parameters_meta_info: dict,
+        external_version: str,
+        tags: list,
+        language: str,
+        dependencies: str,
+        class_name: str | None = None,
+        custom_name: str | None = None,
+        binary_url: str | None = None,
+        binary_format: str | None = None,
+        binary_md5: str | None = None,
+        uploader: str | None = None,
+        upload_date: str | None = None,
+        flow_id: int | None = None,
+        extension: Extension | None = None,
+        version: str | None = None,
+    ):
         self.name = name
         self.description = description
         self.model = model
 
         for variable, variable_name in [
-                [components, 'components'],
-                [parameters, 'parameters'],
-                [parameters_meta_info, 'parameters_meta_info']]:
-            if not isinstance(variable, OrderedDict):
-                raise TypeError('%s must be of type OrderedDict, '
-                                'but is %s.' % (variable_name, type(variable)))
+            [components, "components"],
+            [parameters, "parameters"],
+            [parameters_meta_info, "parameters_meta_info"],
+        ]:
+            if not isinstance(variable, (OrderedDict, dict)):
+                raise TypeError(
+                    f"{variable_name} must be of type OrderedDict or dict, "
+                    f"but is {type(variable)}.",
+                )
 
         self.components = components
         self.parameters = parameters
@@ -108,15 +135,15 @@ def __init__(self, name, description, model, components, parameters,
         keys_parameters = set(parameters.keys())
         keys_parameters_meta_info = set(parameters_meta_info.keys())
         if len(keys_parameters.difference(keys_parameters_meta_info)) > 0:
-            raise ValueError('Parameter %s only in parameters, but not in '
-                             'parameters_meta_info.' %
-                             str(keys_parameters.difference(
-                                 keys_parameters_meta_info)))
+            raise ValueError(
+                f"Parameter {keys_parameters.difference(keys_parameters_meta_info)!s} only in "
+                "parameters, but not in parameters_meta_info.",
+            )
         if len(keys_parameters_meta_info.difference(keys_parameters)) > 0:
-            raise ValueError('Parameter %s only in parameters_meta_info, '
-                             'but not in parameters.' %
-                             str(keys_parameters_meta_info.difference(
-                                 keys_parameters)))
+            raise ValueError(
+                f"Parameter {keys_parameters_meta_info.difference(keys_parameters)!s} only in "
+                "parameters_meta_info, but not in parameters.",
+            )
 
         self.external_version = external_version
         self.uploader = uploader
@@ -131,156 +158,151 @@ def __init__(self, name, description, model, components, parameters,
         self.language = language
         self.dependencies = dependencies
         self.flow_id = flow_id
-        if extension is None:
-            self._extension = get_extension_by_flow(self)
-        else:
-            self._extension = extension
+        self._extension = extension
+
+    @property
+    def id(self) -> int | None:
+        """The ID of the flow."""
+        return self.flow_id
 
     @property
-    def extension(self):
-        if self._extension is not None:
-            return self._extension
-        else:
-            raise RuntimeError("No extension could be found for flow {}: {}"
-                               .format(self.flow_id, self.name))
-
-    def __repr__(self):
-        header = "OpenML Flow"
-        header = '{}\n{}\n'.format(header, '=' * len(header))
-
-        base_url = "{}".format(openml.config.server[:-len('api/v1/xml')])
-        fields = {"Flow Name": self.name,
-                  "Flow Description": self.description,
-                  "Dependencies": self.dependencies}
+    def extension(self) -> Extension:
+        """The extension of the flow (e.g., sklearn)."""
+        if self._extension is None:
+            self._extension = cast(
+                "Extension", get_extension_by_flow(self, raise_if_no_extension=True)
+            )
+
+        return self._extension
+
+    def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]:
+        """Collect all information to display in the __repr__ body."""
+        fields = {
+            "Flow Name": self.name,
+            "Flow Description": self.description,
+            "Dependencies": self.dependencies,
+        }
         if self.flow_id is not None:
+            fields["Flow URL"] = self.openml_url if self.openml_url is not None else "None"
+            fields["Flow ID"] = str(self.flow_id)
             if self.version is not None:
-                fields["Flow ID"] = "{} (version {})".format(self.flow_id, self.version)
-            else:
-                fields["Flow ID"] = self.flow_id
-            fields["Flow URL"] = "{}f/{}".format(base_url, self.flow_id)
+                fields["Flow ID"] += f" (version {self.version})"
         if self.upload_date is not None:
-            fields["Upload Date"] = self.upload_date.replace('T', ' ')
+            fields["Upload Date"] = self.upload_date.replace("T", " ")
         if self.binary_url is not None:
             fields["Binary URL"] = self.binary_url
 
         # determines the order in which the information will be printed
-        order = ["Flow ID", "Flow URL", "Flow Name", "Flow Description", "Binary URL",
-                 "Upload Date", "Dependencies"]
-        fields = [(key, fields[key]) for key in order if key in fields]
-
-        longest_field_name_length = max(len(name) for name, value in fields)
-        field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length)
-        body = '\n'.join(field_line_format.format(name, value) for name, value in fields)
-        return header + body
-
-    def _to_xml(self) -> str:
-        """Generate xml representation of self for upload to server.
-
-        Returns
-        -------
-        str
-            Flow represented as XML string.
-        """
-        flow_dict = self._to_dict()
-        flow_xml = xmltodict.unparse(flow_dict, pretty=True)
-
-        # A flow may not be uploaded with the xml encoding specification:
-        # <?xml version="1.0" encoding="utf-8"?>
-        flow_xml = flow_xml.split('\n', 1)[-1]
-        return flow_xml
-
-    def _to_dict(self) -> dict:
-        """ Helper function used by _to_xml and itself.
-
-        Creates a dictionary representation of self which can be serialized
-        to xml by the function _to_xml. Since a flow can contain subflows
-        (components) this helper function calls itself recursively to also
-        serialize these flows to dictionaries.
-
-        Uses OrderedDict to ensure consistent ordering when converting to xml.
-        The return value (OrderedDict) will be used to create the upload xml
-        file. The xml file must have the tags in exactly the order given in the
-        xsd schema of a flow (see class docstring).
-
-        Returns
-        -------
-        OrderedDict
-            Flow represented as OrderedDict.
-
-        """
-        flow_container = OrderedDict()  # type: 'OrderedDict[str, OrderedDict]'
-        flow_dict = OrderedDict([('@xmlns:oml', 'http://openml.org/openml')])  # type: 'OrderedDict[str, Union[List, str]]'  # noqa E501
-        flow_container['oml:flow'] = flow_dict
-        _add_if_nonempty(flow_dict, 'oml:id', self.flow_id)
+        order = [
+            "Flow ID",
+            "Flow URL",
+            "Flow Name",
+            "Flow Description",
+            "Binary URL",
+            "Upload Date",
+            "Dependencies",
+        ]
+        return [(key, fields[key]) for key in order if key in fields]
+
+    def _to_dict(self) -> dict[str, dict]:  # noqa: C901, PLR0912
+        """Creates a dictionary representation of self."""
+        flow_container = OrderedDict()  # type: 'dict[str, dict]'
+        flow_dict = OrderedDict(
+            [("@xmlns:oml", "http://openml.org/openml")],
+        )  # type: 'dict[str, list | str]'  # E501
+        flow_container["oml:flow"] = flow_dict
+        _add_if_nonempty(flow_dict, "oml:id", self.flow_id)
 
         for required in ["name", "external_version"]:
             if getattr(self, required) is None:
-                raise ValueError("self.{} is required but None".format(
-                    required))
-        for attribute in ["uploader", "name", "custom_name", "class_name",
-                          "version", "external_version", "description",
-                          "upload_date", "language", "dependencies"]:
-            _add_if_nonempty(flow_dict, 'oml:{}'.format(attribute),
-                             getattr(self, attribute))
+                raise ValueError(f"self.{required} is required but None")
+        for attribute in [
+            "uploader",
+            "name",
+            "custom_name",
+            "class_name",
+            "version",
+            "external_version",
+            "description",
+            "upload_date",
+            "language",
+            "dependencies",
+        ]:
+            _add_if_nonempty(flow_dict, f"oml:{attribute}", getattr(self, attribute))
+
+        if not self.description:
+            logger = logging.getLogger(__name__)
+            logger.warning("Flow %s has empty description", self.name)
 
         flow_parameters = []
         for key in self.parameters:
             param_dict = OrderedDict()  # type: 'OrderedDict[str, str]'
-            param_dict['oml:name'] = key
+            param_dict["oml:name"] = key
             meta_info = self.parameters_meta_info[key]
 
-            _add_if_nonempty(param_dict, 'oml:data_type',
-                             meta_info['data_type'])
-            param_dict['oml:default_value'] = self.parameters[key]
-            _add_if_nonempty(param_dict, 'oml:description',
-                             meta_info['description'])
+            _add_if_nonempty(param_dict, "oml:data_type", meta_info["data_type"])
+            param_dict["oml:default_value"] = self.parameters[key]
+            _add_if_nonempty(param_dict, "oml:description", meta_info["description"])
 
             for key_, value in param_dict.items():
                 if key_ is not None and not isinstance(key_, str):
-                    raise ValueError('Parameter name %s cannot be serialized '
-                                     'because it is of type %s. Only strings '
-                                     'can be serialized.' % (key_, type(key_)))
+                    raise ValueError(
+                        f"Parameter name {key_} cannot be serialized "
+                        f"because it is of type {type(key_)}. Only strings "
+                        "can be serialized.",
+                    )
                 if value is not None and not isinstance(value, str):
-                    raise ValueError('Parameter value %s cannot be serialized '
-                                     'because it is of type %s. Only strings '
-                                     'can be serialized.'
-                                     % (value, type(value)))
+                    raise ValueError(
+                        f"Parameter value {value} cannot be serialized "
+                        f"because it is of type {type(value)}. Only strings "
+                        "can be serialized.",
+                    )
 
             flow_parameters.append(param_dict)
 
-        flow_dict['oml:parameter'] = flow_parameters
+        flow_dict["oml:parameter"] = flow_parameters
 
         components = []
         for key in self.components:
-            component_dict = OrderedDict()  # type: 'OrderedDict[str, Dict]'
-            component_dict['oml:identifier'] = key
-            component_dict['oml:flow'] = self.components[key]._to_dict()['oml:flow']
+            component_dict = OrderedDict()  # type: 'OrderedDict[str, dict]'
+            component_dict["oml:identifier"] = key
+            if self.components[key] in ["passthrough", "drop"]:
+                component_dict["oml:flow"] = {
+                    "oml-python:serialized_object": "component_reference",
+                    "value": {"key": self.components[key], "step_name": self.components[key]},
+                }
+            else:
+                component_dict["oml:flow"] = self.components[key]._to_dict()["oml:flow"]
 
             for key_ in component_dict:
                 # We only need to check if the key is a string, because the
                 # value is a flow. The flow itself is valid by recursion
                 if key_ is not None and not isinstance(key_, str):
-                    raise ValueError('Parameter name %s cannot be serialized '
-                                     'because it is of type %s. Only strings '
-                                     'can be serialized.' % (key_, type(key_)))
+                    raise ValueError(
+                        f"Parameter name {key_} cannot be serialized "
+                        f"because it is of type {type(key_)}. Only strings "
+                        "can be serialized.",
+                    )
 
             components.append(component_dict)
 
-        flow_dict['oml:component'] = components
-        flow_dict['oml:tag'] = self.tags
+        flow_dict["oml:component"] = components
+        flow_dict["oml:tag"] = self.tags
         for attribute in ["binary_url", "binary_format", "binary_md5"]:
-            _add_if_nonempty(flow_dict, 'oml:{}'.format(attribute),
-                             getattr(self, attribute))
+            _add_if_nonempty(flow_dict, f"oml:{attribute}", getattr(self, attribute))
 
         return flow_container
 
     @classmethod
-    def _from_dict(cls, xml_dict):
+    def _from_dict(cls, xml_dict: dict) -> OpenMLFlow:
         """Create a flow from an xml description.
 
         Calls itself recursively to create :class:`OpenMLFlow` objects of
         subflows (components).
 
+        XML definition of a flow is available at
+        https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.implementation.upload.xsd
+
         Parameters
         ----------
         xml_dict : dict
@@ -290,24 +312,34 @@ def _from_dict(cls, xml_dict):
         -------
             OpenMLFlow
 
-        """
+        """  # E501
         arguments = OrderedDict()
         dic = xml_dict["oml:flow"]
 
         # Mandatory parts in the xml file
-        for key in ['name', 'external_version']:
+        for key in ["name"]:
             arguments[key] = dic["oml:" + key]
 
         # non-mandatory parts in the xml file
-        for key in ['uploader', 'description', 'upload_date', 'language',
-                    'dependencies', 'version', 'binary_url', 'binary_format',
-                    'binary_md5', 'class_name', 'custom_name']:
+        for key in [
+            "external_version",
+            "uploader",
+            "description",
+            "upload_date",
+            "language",
+            "dependencies",
+            "version",
+            "binary_url",
+            "binary_format",
+            "binary_md5",
+            "class_name",
+            "custom_name",
+        ]:
             arguments[key] = dic.get("oml:" + key)
 
         # has to be converted to an int if present and cannot parsed in the
         # two loops above
-        arguments['flow_id'] = (int(dic['oml:id']) if dic.get("oml:id")
-                                is not None else None)
+        arguments["flow_id"] = int(dic["oml:id"]) if dic.get("oml:id") is not None else None
 
         # Now parse parts of a flow which can occur multiple times like
         # parameters, components (subflows) and tags. These can't be tackled
@@ -320,59 +352,65 @@ def _from_dict(cls, xml_dict):
 
         parameters = OrderedDict()
         parameters_meta_info = OrderedDict()
-        if 'oml:parameter' in dic:
+        if "oml:parameter" in dic:
             # In case of a single parameter, xmltodict returns a dictionary,
             # otherwise a list.
-            oml_parameters = extract_xml_tags('oml:parameter', dic,
-                                              allow_none=False)
+            oml_parameters = extract_xml_tags("oml:parameter", dic, allow_none=False)
 
             for oml_parameter in oml_parameters:
-                parameter_name = oml_parameter['oml:name']
-                default_value = oml_parameter['oml:default_value']
+                parameter_name = oml_parameter["oml:name"]
+                default_value = oml_parameter["oml:default_value"]
                 parameters[parameter_name] = default_value
 
                 meta_info = OrderedDict()
-                meta_info['description'] = oml_parameter.get('oml:description')
-                meta_info['data_type'] = oml_parameter.get('oml:data_type')
+                meta_info["description"] = oml_parameter.get("oml:description")
+                meta_info["data_type"] = oml_parameter.get("oml:data_type")
                 parameters_meta_info[parameter_name] = meta_info
-        arguments['parameters'] = parameters
-        arguments['parameters_meta_info'] = parameters_meta_info
+        arguments["parameters"] = parameters
+        arguments["parameters_meta_info"] = parameters_meta_info
 
         components = OrderedDict()
-        if 'oml:component' in dic:
+        if "oml:component" in dic:
             # In case of a single component xmltodict returns a dict,
             # otherwise a list.
-            oml_components = extract_xml_tags('oml:component', dic,
-                                              allow_none=False)
+            oml_components = extract_xml_tags("oml:component", dic, allow_none=False)
 
             for component in oml_components:
                 flow = OpenMLFlow._from_dict(component)
-                components[component['oml:identifier']] = flow
-        arguments['components'] = components
-        arguments['tags'] = extract_xml_tags('oml:tag', dic)
+                components[component["oml:identifier"]] = flow
+        arguments["components"] = components
+        arguments["tags"] = extract_xml_tags("oml:tag", dic)
 
-        arguments['model'] = None
-        flow = cls(**arguments)
+        arguments["model"] = None
+        return cls(**arguments)
 
-        return flow
+    def to_filesystem(self, output_directory: str | Path) -> None:
+        """Write a flow to the filesystem as XML to output_directory."""
+        output_directory = Path(output_directory)
+        output_directory.mkdir(parents=True, exist_ok=True)
 
-    def to_filesystem(self, output_directory: str) -> None:
-        os.makedirs(output_directory, exist_ok=True)
-        if 'flow.xml' in os.listdir(output_directory):
-            raise ValueError('Output directory already contains a flow.xml file.')
+        output_path = output_directory / "flow.xml"
+        if output_path.exists():
+            raise ValueError("Output directory already contains a flow.xml file.")
 
         run_xml = self._to_xml()
-        with open(os.path.join(output_directory, 'flow.xml'), 'w') as f:
+        with output_path.open("w") as f:
             f.write(run_xml)
 
     @classmethod
-    def from_filesystem(cls, input_directory) -> 'OpenMLFlow':
-        with open(os.path.join(input_directory, 'flow.xml'), 'r') as f:
+    def from_filesystem(cls, input_directory: str | Path) -> OpenMLFlow:
+        """Read a flow from an XML in input_directory on the filesystem."""
+        input_directory = Path(input_directory) / "flow.xml"
+        with input_directory.open() as f:
             xml_string = f.read()
         return OpenMLFlow._from_dict(xmltodict.parse(xml_string))
 
-    def publish(self, raise_error_if_exists: bool = False) -> 'OpenMLFlow':
-        """ Publish this flow to OpenML server.
+    def _parse_publish_response(self, xml_response: dict) -> None:
+        """Parse the id from the xml_response and assign it to self."""
+        self.flow_id = int(xml_response["oml:upload_flow"]["oml:id"])
+
+    def publish(self, raise_error_if_exists: bool = False) -> OpenMLFlow:  # noqa: FBT002
+        """Publish this flow to OpenML server.
 
         Raises a PyOpenMLError if the flow exists on the server, but
         `self.flow_id` does not match the server known flow id.
@@ -397,40 +435,40 @@ def publish(self, raise_error_if_exists: bool = False) -> 'OpenMLFlow':
         flow_id = openml.flows.functions.flow_exists(self.name, self.external_version)
         if not flow_id:
             if self.flow_id:
-                raise openml.exceptions.PyOpenMLError("Flow does not exist on the server, "
-                                                      "but 'flow.flow_id' is not None.")
-            xml_description = self._to_xml()
-            file_elements = {'description': xml_description}
-            return_value = openml._api_calls._perform_api_call(
-                "flow/",
-                'post',
-                file_elements=file_elements,
-            )
-            server_response = xmltodict.parse(return_value)
-            flow_id = int(server_response['oml:upload_flow']['oml:id'])
+                raise openml.exceptions.PyOpenMLError(
+                    "Flow does not exist on the server, but 'flow.flow_id' is not None.",
+                )
+            super().publish()
+            assert self.flow_id is not None  # for mypy
+            flow_id = self.flow_id
         elif raise_error_if_exists:
-            error_message = "This OpenMLFlow already exists with id: {}.".format(flow_id)
+            error_message = f"This OpenMLFlow already exists with id: {flow_id}."
             raise openml.exceptions.PyOpenMLError(error_message)
         elif self.flow_id is not None and self.flow_id != flow_id:
-            raise openml.exceptions.PyOpenMLError("Local flow_id does not match server flow_id: "
-                                                  "'{}' vs '{}'".format(self.flow_id, flow_id))
+            raise openml.exceptions.PyOpenMLError(
+                f"Local flow_id does not match server flow_id: '{self.flow_id}' vs '{flow_id}'",
+            )
 
         flow = openml.flows.functions.get_flow(flow_id)
         _copy_server_fields(flow, self)
         try:
             openml.flows.functions.assert_flows_equal(
-                self, flow, flow.upload_date,
+                self,
+                flow,
+                flow.upload_date,
                 ignore_parameter_values=True,
-                ignore_custom_name_if_none=True
+                ignore_custom_name_if_none=True,
             )
         except ValueError as e:
             message = e.args[0]
-            raise ValueError("The flow on the server is inconsistent with the local flow. "
-                             "The server flow ID is {}. Please check manually and remove "
-                             "the flow if necessary! Error is:\n'{}'".format(flow_id, message))
+            raise ValueError(
+                "The flow on the server is inconsistent with the local flow. "
+                f"The server flow ID is {flow_id}. Please check manually and remove "
+                f"the flow if necessary! Error is:\n'{message}'",
+            ) from e
         return self
 
-    def get_structure(self, key_item: str) -> Dict[str, List[str]]:
+    def get_structure(self, key_item: str) -> dict[str, list[str]]:
         """
         Returns for each sub-component of the flow the path of identifiers
         that should be traversed to reach this component. The resulting dict
@@ -448,17 +486,17 @@ def get_structure(self, key_item: str) -> Dict[str, List[str]]:
         dict[str, List[str]]
             The flow structure
         """
-        if key_item not in ['flow_id', 'name']:
-            raise ValueError('key_item should be in {flow_id, name}')
-        structure = dict()
+        if key_item not in ["flow_id", "name"]:
+            raise ValueError("key_item should be in {flow_id, name}")
+        structure = {}
         for key, sub_flow in self.components.items():
             sub_structure = sub_flow.get_structure(key_item)
             for flow_name, flow_sub_structure in sub_structure.items():
-                structure[flow_name] = [key] + flow_sub_structure
+                structure[flow_name] = [key, *flow_sub_structure]
         structure[getattr(self, key_item)] = []
         return structure
 
-    def get_subflow(self, structure):
+    def get_subflow(self, structure: list[str]) -> OpenMLFlow:
         """
         Returns a subflow from the tree of dependencies.
 
@@ -476,41 +514,35 @@ def get_subflow(self, structure):
         # outer scope
         structure = list(structure)
         if len(structure) < 1:
-            raise ValueError('Please provide a structure list of size >= 1')
+            raise ValueError("Please provide a structure list of size >= 1")
         sub_identifier = structure[0]
         if sub_identifier not in self.components:
-            raise ValueError('Flow %s does not contain component with '
-                             'identifier %s' % (self.name, sub_identifier))
+            raise ValueError(
+                f"Flow {self.name} does not contain component with identifier {sub_identifier}",
+            )
         if len(structure) == 1:
-            return self.components[sub_identifier]
-        else:
-            structure.pop(0)
-            return self.components[sub_identifier].get_subflow(structure)
+            return self.components[sub_identifier]  # type: ignore
 
-    def push_tag(self, tag):
-        """Annotates this flow with a tag on the server.
+        structure.pop(0)
+        return self.components[sub_identifier].get_subflow(structure)  # type: ignore
 
-        Parameters
-        ----------
-        tag : str
-            Tag to attach to the flow.
-        """
-        _tag_entity('flow', self.flow_id, tag)
 
-    def remove_tag(self, tag):
-        """Removes a tag from this flow on the server.
+def _copy_server_fields(source_flow: OpenMLFlow, target_flow: OpenMLFlow) -> None:
+    """Recursively copies the fields added by the server
+    from the `source_flow` to the `target_flow`.
 
-        Parameters
-        ----------
-        tag : str
-            Tag to attach to the flow.
-        """
-        _tag_entity('flow', self.flow_id, tag, untag=True)
-
-
-def _copy_server_fields(source_flow, target_flow):
-    fields_added_by_the_server = ['flow_id', 'uploader', 'version',
-                                  'upload_date']
+    Parameters
+    ----------
+    source_flow : OpenMLFlow
+        To copy the fields from.
+    target_flow : OpenMLFlow
+        To copy the fields to.
+
+    Returns
+    -------
+    None
+    """
+    fields_added_by_the_server = ["flow_id", "uploader", "version", "upload_date"]
     for field in fields_added_by_the_server:
         setattr(target_flow, field, getattr(source_flow, field))
 
@@ -519,6 +551,21 @@ def _copy_server_fields(source_flow, target_flow):
         _copy_server_fields(component, target_flow.components[name])
 
 
-def _add_if_nonempty(dic, key, value):
+def _add_if_nonempty(dic: dict, key: Hashable, value: Any) -> None:
+    """Adds a key-value pair to a dictionary if the value is not None.
+
+    Parameters
+    ----------
+    dic: dict
+        To add the key-value pair to.
+    key: hashable
+        To add to the dictionary.
+    value: Any
+        To add to the dictionary.
+
+    Returns
+    -------
+    None
+    """
     if value is not None:
         dic[key] = value
diff --git a/openml/flows/functions.py b/openml/flows/functions.py
index d12bcfe91..0a2058890 100644
--- a/openml/flows/functions.py
+++ b/openml/flows/functions.py
@@ -1,19 +1,23 @@
-import dateutil.parser
-from collections import OrderedDict
+# License: BSD 3-Clause
+from __future__ import annotations
+
 import os
-import io
 import re
-import xmltodict
+from collections import OrderedDict
+from functools import partial
+from typing import Any
+
+import dateutil.parser
 import pandas as pd
-from typing import Union, Dict, Optional
+import xmltodict
 
-from ..exceptions import OpenMLCacheException
 import openml._api_calls
-from . import OpenMLFlow
 import openml.utils
+from openml.exceptions import OpenMLCacheException
 
+from . import OpenMLFlow
 
-FLOWS_CACHE_DIR_NAME = 'flows'
+FLOWS_CACHE_DIR_NAME = "flows"
 
 
 def _get_cached_flows() -> OrderedDict:
@@ -27,7 +31,7 @@ def _get_cached_flows() -> OrderedDict:
     flows = OrderedDict()  # type: 'OrderedDict[int, OpenMLFlow]'
 
     flow_cache_dir = openml.utils._create_cache_directory(FLOWS_CACHE_DIR_NAME)
-    directory_content = os.listdir(flow_cache_dir)
+    directory_content = os.listdir(flow_cache_dir)  # noqa: PTH208
     directory_content.sort()
     # Find all flow ids for which we have downloaded
     # the flow description
@@ -54,44 +58,83 @@ def _get_cached_flow(fid: int) -> OpenMLFlow:
     -------
     OpenMLFlow.
     """
-
-    fid_cache_dir = openml.utils._create_cache_directory_for_id(
-        FLOWS_CACHE_DIR_NAME,
-        fid
-    )
-    flow_file = os.path.join(fid_cache_dir, "flow.xml")
+    fid_cache_dir = openml.utils._create_cache_directory_for_id(FLOWS_CACHE_DIR_NAME, fid)
+    flow_file = fid_cache_dir / "flow.xml"
 
     try:
-        with io.open(flow_file, encoding='utf8') as fh:
+        with flow_file.open(encoding="utf8") as fh:
             return _create_flow_from_xml(fh.read())
-    except (OSError, IOError):
+    except OSError as e:
         openml.utils._remove_cache_dir_for_id(FLOWS_CACHE_DIR_NAME, fid_cache_dir)
-        raise OpenMLCacheException("Flow file for fid %d not "
-                                   "cached" % fid)
+        raise OpenMLCacheException(f"Flow file for fid {fid} not cached") from e
 
 
 @openml.utils.thread_safe_if_oslo_installed
-def get_flow(flow_id: int, reinstantiate: bool = False) -> OpenMLFlow:
-    """Download the OpenML flow for a given flow ID.
+def get_flow(flow_id: int, reinstantiate: bool = False, strict_version: bool = True) -> OpenMLFlow:  # noqa: FBT002
+    """Fetch an OpenMLFlow by its server-assigned ID.
+
+    Queries the OpenML REST API for the flow metadata and returns an
+    :class:`OpenMLFlow` instance. If the flow is already cached locally,
+    the cached copy is returned. Optionally the flow can be re-instantiated
+    into a concrete model instance using the registered extension.
 
     Parameters
     ----------
     flow_id : int
         The OpenML flow id.
-
-    reinstantiate: bool
-        Whether to reinstantiate the flow to a model instance.
+    reinstantiate : bool, optional (default=False)
+        If True, convert the flow description into a concrete model instance
+        using the flow's extension (e.g., sklearn). If conversion fails and
+        ``strict_version`` is True, an exception will be raised.
+    strict_version : bool, optional (default=True)
+        When ``reinstantiate`` is True, whether to enforce exact version
+        requirements for the extension/model. If False, a new flow may
+        be returned when versions differ.
 
     Returns
     -------
-    flow : OpenMLFlow
-        the flow
+    OpenMLFlow
+        The flow object with metadata; ``model`` may be populated when
+        ``reinstantiate=True``.
+
+    Raises
+    ------
+    OpenMLCacheException
+        When cached flow files are corrupted or cannot be read.
+    OpenMLServerException
+        When the REST API call fails.
+
+    Side Effects
+    ------------
+    - Writes to ``openml.config.cache_directory/flows/{flow_id}/flow.xml``
+      when the flow is downloaded from the server.
+
+    Preconditions
+    -------------
+    - Network access to the OpenML server is required unless the flow is cached.
+    - For private flows, ``openml.config.apikey`` must be set.
+
+    Notes
+    -----
+    Results are cached to speed up subsequent calls. When ``reinstantiate`` is
+    True and version mismatches occur, a new flow may be returned to reflect
+    the converted model (only when ``strict_version`` is False).
+
+    Examples
+    --------
+    >>> import openml
+    >>> flow = openml.flows.get_flow(5)  # doctest: +SKIP
     """
     flow_id = int(flow_id)
     flow = _get_flow_description(flow_id)
 
     if reinstantiate:
-        flow.model = flow.extension.flow_to_model(flow)
+        flow.model = flow.extension.flow_to_model(flow, strict_version=strict_version)
+        if not strict_version:
+            # check if we need to return a new flow b/c of version mismatch
+            new_flow = flow.extension.model_to_flow(flow.model)
+            if new_flow.dependencies != flow.dependencies:
+                return new_flow
     return flow
 
 
@@ -114,201 +157,301 @@ def _get_flow_description(flow_id: int) -> OpenMLFlow:
     try:
         return _get_cached_flow(flow_id)
     except OpenMLCacheException:
-
-        xml_file = os.path.join(
-            openml.utils._create_cache_directory_for_id(FLOWS_CACHE_DIR_NAME, flow_id),
-            "flow.xml",
+        xml_file = (
+            openml.utils._create_cache_directory_for_id(FLOWS_CACHE_DIR_NAME, flow_id) / "flow.xml"
         )
+        flow_xml = openml._api_calls._perform_api_call(f"flow/{flow_id}", request_method="get")
 
-        flow_xml = openml._api_calls._perform_api_call("flow/%d" % flow_id, request_method='get')
-        with io.open(xml_file, "w", encoding='utf8') as fh:
+        with xml_file.open("w", encoding="utf8") as fh:
             fh.write(flow_xml)
 
         return _create_flow_from_xml(flow_xml)
 
 
 def list_flows(
-    offset: Optional[int] = None,
-    size: Optional[int] = None,
-    tag: Optional[str] = None,
-    output_format: str = 'dict',
-    **kwargs
-) -> Union[Dict, pd.DataFrame]:
+    offset: int | None = None,
+    size: int | None = None,
+    tag: str | None = None,
+    uploader: str | None = None,
+) -> pd.DataFrame:
+    """List flows available on the OpenML server.
 
-    """
-    Return a list of all flows which are on OpenML.
-    (Supports large amount of results)
+    This function supports paging and filtering and returns a pandas
+    DataFrame with one row per flow and columns for id, name, version,
+    external_version, full_name and uploader.
 
     Parameters
     ----------
     offset : int, optional
-        the number of flows to skip, starting from the first
+        Number of flows to skip, starting from the first (for paging).
     size : int, optional
-        the maximum number of flows to return
+        Maximum number of flows to return.
     tag : str, optional
-        the tag to include
-    output_format: str, optional (default='dict')
-        The parameter decides the format of the output.
-        - If 'dict' the output is a dict of dict
-        - If 'dataframe' the output is a pandas DataFrame
-    kwargs: dict, optional
-        Legal filter operators: uploader.
+        Only return flows having this tag.
+    uploader : str, optional
+        Only return flows uploaded by this user.
 
     Returns
     -------
-    flows : dict of dicts, or dataframe
-        - If output_format='dict'
-            A mapping from flow_id to a dict giving a brief overview of the
-            respective flow.
-            Every flow is represented by a dictionary containing
-            the following information:
-            - flow id
-            - full name
-            - name
-            - version
-            - external version
-            - uploader
-
-        - If output_format='dataframe'
-            Each row maps to a dataset
-            Each column contains the following information:
-            - flow id
-            - full name
-            - name
-            - version
-            - external version
-            - uploader
+    pandas.DataFrame
+        Rows correspond to flows. Columns include ``id``, ``full_name``,
+        ``name``, ``version``, ``external_version``, and ``uploader``.
+
+    Raises
+    ------
+    OpenMLServerException
+        When the API call fails.
+
+    Side Effects
+    ------------
+    - None: results are fetched and returned; this is a read-only operation.
+
+    Preconditions
+    -------------
+    - Network access is required to list flows unless caching mechanisms are
+      used by the underlying API helper.
+
+    Examples
+    --------
+    >>> import openml
+    >>> flows = openml.flows.list_flows(size=100)  # doctest: +SKIP
     """
-    if output_format not in ['dataframe', 'dict']:
-        raise ValueError("Invalid output format selected. "
-                         "Only 'dict' or 'dataframe' applicable.")
+    listing_call = partial(_list_flows, tag=tag, uploader=uploader)
+    batches = openml.utils._list_all(listing_call, offset=offset, limit=size)
+    if len(batches) == 0:
+        return pd.DataFrame()
 
-    return openml.utils._list_all(output_format=output_format,
-                                  listing_call=_list_flows,
-                                  offset=offset,
-                                  size=size,
-                                  tag=tag,
-                                  **kwargs)
+    return pd.concat(batches)
 
 
-def _list_flows(output_format='dict', **kwargs) -> Union[Dict, pd.DataFrame]:
+def _list_flows(limit: int, offset: int, **kwargs: Any) -> pd.DataFrame:
     """
     Perform the api call that return a list of all flows.
 
     Parameters
     ----------
-    output_format: str, optional (default='dict')
-        The parameter decides the format of the output.
-        - If 'dict' the output is a dict of dict
-        - If 'dataframe' the output is a pandas DataFrame
-
+    limit : int
+        the maximum number of flows to return
+    offset : int
+        the number of flows to skip, starting from the first
     kwargs: dict, optional
-        Legal filter operators: uploader, tag, limit, offset.
+        Legal filter operators: uploader, tag
 
     Returns
     -------
-    flows : dict, or dataframe
+    flows : dataframe
     """
     api_call = "flow/list"
 
+    if limit is not None:
+        api_call += f"/limit/{limit}"
+    if offset is not None:
+        api_call += f"/offset/{offset}"
+
     if kwargs is not None:
         for operator, value in kwargs.items():
-            api_call += "/%s/%s" % (operator, value)
+            if value is not None:
+                api_call += f"/{operator}/{value}"
 
-    return __list_flows(api_call=api_call, output_format=output_format)
+    return __list_flows(api_call=api_call)
 
 
-def flow_exists(name: str, external_version: str) -> Union[int, bool]:
-    """Retrieves the flow id.
+def flow_exists(name: str, external_version: str) -> int | bool:
+    """Check whether a flow (name + external_version) exists on the server.
 
-    A flow is uniquely identified by name + external_version.
+    The OpenML server defines uniqueness of flows by the pair
+    ``(name, external_version)``. This helper queries the server and
+    returns the corresponding flow id when present.
 
     Parameters
     ----------
-    name : string
-        Name of the flow
-    external_version : string
+    name : str
+        Flow name (e.g., ``sklearn.tree._classes.DecisionTreeClassifier(1)``).
+    external_version : str
         Version information associated with flow.
 
     Returns
     -------
-    flow_exist : int or bool
-        flow id iff exists, False otherwise
-
-    Notes
-    -----
-    see http://www.openml.org/api_docs/#!/flow/get_flow_exists_name_version
+    int or bool
+        The flow id if the flow exists on the server, otherwise ``False``.
+
+    Raises
+    ------
+    ValueError
+        If ``name`` or ``external_version`` are empty or not strings.
+    OpenMLServerException
+        When the API request fails.
+
+    Examples
+    --------
+    >>> import openml
+    >>> openml.flows.flow_exists("weka.JRip", "Weka_3.9.0_10153")  # doctest: +SKIP
     """
     if not (isinstance(name, str) and len(name) > 0):
-        raise ValueError('Argument \'name\' should be a non-empty string')
+        raise ValueError("Argument 'name' should be a non-empty string")
     if not (isinstance(name, str) and len(external_version) > 0):
-        raise ValueError('Argument \'version\' should be a non-empty string')
+        raise ValueError("Argument 'version' should be a non-empty string")
 
     xml_response = openml._api_calls._perform_api_call(
         "flow/exists",
-        'post',
-        data={'name': name, 'external_version': external_version},
+        "post",
+        data={"name": name, "external_version": external_version},
     )
 
     result_dict = xmltodict.parse(xml_response)
-    flow_id = int(result_dict['oml:flow_exists']['oml:id'])
-    if flow_id > 0:
-        return flow_id
+    flow_id = int(result_dict["oml:flow_exists"]["oml:id"])
+    return flow_id if flow_id > 0 else False
+
+
+def get_flow_id(
+    model: Any | None = None,
+    name: str | None = None,
+    exact_version: bool = True,  # noqa: FBT002
+) -> int | bool | list[int]:
+    """Retrieve flow id(s) for a model instance or a flow name.
+
+    Provide either a concrete ``model`` (which will be converted to a flow by
+    the appropriate extension) or a flow ``name``. Behavior depends on
+    ``exact_version``:
+
+    - ``model`` + ``exact_version=True``: convert ``model`` to a flow and call
+        :func:`flow_exists` to get a single flow id (or False).
+    - ``model`` + ``exact_version=False``: convert ``model`` to a flow and
+        return all server flow ids with the same flow name.
+    - ``name``: ignore ``exact_version`` and return all server flow ids that
+        match ``name``.
+
+    Parameters
+    ----------
+    model : object, optional
+            A model instance that can be handled by a registered extension. Either
+            ``model`` or ``name`` must be provided.
+    name : str, optional
+            Flow name to query for. Either ``model`` or ``name`` must be provided.
+    exact_version : bool, optional (default=True)
+            When True and ``model`` is provided, only return the id for the exact
+            external version. When False, return a list of matching ids.
+
+    Returns
+    -------
+    int or bool or list[int]
+            If ``exact_version`` is True: the flow id if found, otherwise ``False``.
+            If ``exact_version`` is False: a list of matching flow ids (may be empty).
+
+    Raises
+    ------
+    ValueError
+            If neither ``model`` nor ``name`` is provided, or if both are provided.
+    OpenMLServerException
+            If underlying API calls fail.
+
+    Side Effects
+    ------------
+    - May call server APIs (``flow/exists``, ``flow/list``) and therefore
+        depends on network access and API keys for private flows.
+
+    Examples
+    --------
+    >>> import openml
+    >>> # Lookup by flow name
+    >>> openml.flows.get_flow_id(name="weka.JRip")  # doctest: +SKIP
+    >>> # Lookup by model instance (requires a registered extension)
+    >>> import sklearn
+    >>> import openml_sklearn
+    >>> clf = sklearn.tree.DecisionTreeClassifier()
+    >>> openml.flows.get_flow_id(model=clf)  # doctest: +SKIP
+    """
+    if model is not None and name is not None:
+        raise ValueError("Must provide either argument `model` or argument `name`, but not both.")
+
+    if model is not None:
+        extension = openml.extensions.get_extension_by_model(model, raise_if_no_extension=True)
+        if extension is None:
+            # This should never happen and is only here to please mypy. It will be gone
+            # once the whole function is removed.
+            raise TypeError(extension)
+        flow = extension.model_to_flow(model)
+        flow_name = flow.name
+        external_version = flow.external_version
+    elif name is not None:
+        flow_name = name
+        exact_version = False
+        external_version = None
     else:
-        return False
+        raise ValueError(
+            "Need to provide either argument `model` or argument `name`, but both are `None`."
+        )
 
+    if exact_version:
+        if external_version is None:
+            raise ValueError("exact_version should be False if model is None!")
+        return flow_exists(name=flow_name, external_version=external_version)
 
-def __list_flows(
-    api_call: str,
-    output_format: str = 'dict'
-) -> Union[Dict, pd.DataFrame]:
+    flows = list_flows()
+    flows = flows.query(f'name == "{flow_name}"')
+    return flows["id"].to_list()  # type: ignore[no-any-return]
+
+
+def __list_flows(api_call: str) -> pd.DataFrame:
+    """Retrieve information about flows from OpenML API
+    and parse it into a pandas DataFrame.
+
+    Parameters
+    ----------
+    api_call: str
+        Retrieves the information about flows.
 
-    xml_string = openml._api_calls._perform_api_call(api_call, 'get')
-    flows_dict = xmltodict.parse(xml_string, force_list=('oml:flow',))
+    Returns
+    -------
+        The flows information as a pandas DataFrame with one row per flow.
+    """
+    xml_string = openml._api_calls._perform_api_call(api_call, "get")
+    flows_dict = xmltodict.parse(xml_string, force_list=("oml:flow",))
 
     # Minimalistic check if the XML is useful
-    assert type(flows_dict['oml:flows']['oml:flow']) == list, \
-        type(flows_dict['oml:flows'])
-    assert flows_dict['oml:flows']['@xmlns:oml'] == \
-        'http://openml.org/openml', flows_dict['oml:flows']['@xmlns:oml']
-
-    flows = dict()
-    for flow_ in flows_dict['oml:flows']['oml:flow']:
-        fid = int(flow_['oml:id'])
-        flow = {'id': fid,
-                'full_name': flow_['oml:full_name'],
-                'name': flow_['oml:name'],
-                'version': flow_['oml:version'],
-                'external_version': flow_['oml:external_version'],
-                'uploader': flow_['oml:uploader']}
+    assert isinstance(flows_dict["oml:flows"]["oml:flow"], list), type(flows_dict["oml:flows"])
+    assert flows_dict["oml:flows"]["@xmlns:oml"] == "http://openml.org/openml", flows_dict[
+        "oml:flows"
+    ]["@xmlns:oml"]
+
+    flows = {}
+    for flow_ in flows_dict["oml:flows"]["oml:flow"]:
+        fid = int(flow_["oml:id"])
+        flow = {
+            "id": fid,
+            "full_name": flow_["oml:full_name"],
+            "name": flow_["oml:name"],
+            "version": flow_["oml:version"],
+            "external_version": flow_["oml:external_version"],
+            "uploader": flow_["oml:uploader"],
+        }
         flows[fid] = flow
 
-    if output_format == 'dataframe':
-        flows = pd.DataFrame.from_dict(flows, orient='index')
-
-    return flows
+    return pd.DataFrame.from_dict(flows, orient="index")
 
 
 def _check_flow_for_server_id(flow: OpenMLFlow) -> None:
-    """ Raises a ValueError if the flow or any of its subflows has no flow id. """
-
+    """Raises a ValueError if the flow or any of its subflows has no flow id."""
     # Depth-first search to check if all components were uploaded to the
     # server before parsing the parameters
-    stack = list()
-    stack.append(flow)
+    stack = [flow]
     while len(stack) > 0:
         current = stack.pop()
         if current.flow_id is None:
-            raise ValueError("Flow %s has no flow_id!" % current.name)
-        else:
-            for component in current.components.values():
-                stack.append(component)
+            raise ValueError(f"Flow {current.name} has no flow_id!")
+
+        for component in current.components.values():
+            stack.append(component)
 
 
-def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
-                       ignore_parameter_values_on_older_children: str = None,
-                       ignore_parameter_values: bool = False,
-                       ignore_custom_name_if_none: bool = False) -> None:
+def assert_flows_equal(  # noqa: C901, PLR0912, PLR0913, PLR0915
+    flow1: OpenMLFlow,
+    flow2: OpenMLFlow,
+    ignore_parameter_values_on_older_children: str | None = None,
+    ignore_parameter_values: bool = False,  # noqa: FBT002
+    ignore_custom_name_if_none: bool = False,  # noqa: FBT002
+    check_description: bool = True,  # noqa: FBT002
+) -> None:
     """Check equality of two flows.
 
     Two flows are equal if their all keys which are not set by the server
@@ -327,62 +470,98 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
     ignore_parameter_values : bool
         Whether to ignore parameter values when comparing flows.
 
-   ignore_custom_name_if_none : bool
+    ignore_custom_name_if_none : bool
         Whether to ignore the custom name field if either flow has `custom_name` equal to `None`.
+
+    check_description : bool
+        Whether to ignore matching of flow descriptions.
+
+    Raises
+    ------
+    TypeError
+        When either argument is not an :class:`OpenMLFlow`.
+    ValueError
+        When a relevant mismatch is found between the two flows.
+
+    Examples
+    --------
+    >>> import openml
+    >>> f1 = openml.flows.get_flow(5)  # doctest: +SKIP
+    >>> f2 = openml.flows.get_flow(5)  # doctest: +SKIP
+    >>> openml.flows.assert_flows_equal(f1, f2)  # doctest: +SKIP
+    >>> # If flows differ, a ValueError is raised
     """
     if not isinstance(flow1, OpenMLFlow):
-        raise TypeError('Argument 1 must be of type OpenMLFlow, but is %s' %
-                        type(flow1))
+        raise TypeError(f"Argument 1 must be of type OpenMLFlow, but is {type(flow1)}")
 
     if not isinstance(flow2, OpenMLFlow):
-        raise TypeError('Argument 2 must be of type OpenMLFlow, but is %s' %
-                        type(flow2))
+        raise TypeError(f"Argument 2 must be of type OpenMLFlow, but is {type(flow2)}")
 
     # TODO as they are actually now saved during publish, it might be good to
     # check for the equality of these as well.
-    generated_by_the_server = ['flow_id', 'uploader', 'version', 'upload_date',
-                               # Tags aren't directly created by the server,
-                               # but the uploader has no control over them!
-                               'tags']
-    ignored_by_python_api = ['binary_url', 'binary_format', 'binary_md5',
-                             'model']
+    generated_by_the_server = [
+        "flow_id",
+        "uploader",
+        "version",
+        "upload_date",
+        # Tags aren't directly created by the server,
+        # but the uploader has no control over them!
+        "tags",
+    ]
+    ignored_by_python_api = ["binary_url", "binary_format", "binary_md5", "model", "_entity_id"]
 
     for key in set(flow1.__dict__.keys()).union(flow2.__dict__.keys()):
         if key in generated_by_the_server + ignored_by_python_api:
             continue
         attr1 = getattr(flow1, key, None)
         attr2 = getattr(flow2, key, None)
-        if key == 'components':
+        if key == "components":
+            if not (isinstance(attr1, dict) and isinstance(attr2, dict)):
+                raise TypeError("Cannot compare components because they are not dictionary.")
+
             for name in set(attr1.keys()).union(attr2.keys()):
                 if name not in attr1:
-                    raise ValueError('Component %s only available in '
-                                     'argument2, but not in argument1.' % name)
+                    raise ValueError(
+                        f"Component {name} only available in argument2, but not in argument1.",
+                    )
                 if name not in attr2:
-                    raise ValueError('Component %s only available in '
-                                     'argument2, but not in argument1.' % name)
-                assert_flows_equal(attr1[name], attr2[name],
-                                   ignore_parameter_values_on_older_children,
-                                   ignore_parameter_values,
-                                   ignore_custom_name_if_none)
-        elif key == '_extension':
+                    raise ValueError(
+                        f"Component {name} only available in argument2, but not in argument1.",
+                    )
+                assert_flows_equal(
+                    attr1[name],
+                    attr2[name],
+                    ignore_parameter_values_on_older_children,
+                    ignore_parameter_values,
+                    ignore_custom_name_if_none,
+                )
+        elif key == "_extension":
+            continue
+        elif check_description and key == "description":
+            # skip comparing descriptions: sklearn-based flows may have changing
+            # docstrings, so the description is not guaranteed to be consistent
             continue
         else:
-            if key == 'parameters':
-                if ignore_parameter_values or \
-                        ignore_parameter_values_on_older_children:
+            if key == "parameters":
+                if ignore_parameter_values or ignore_parameter_values_on_older_children:
                     params_flow_1 = set(flow1.parameters.keys())
                     params_flow_2 = set(flow2.parameters.keys())
                     symmetric_difference = params_flow_1 ^ params_flow_2
                     if len(symmetric_difference) > 0:
-                        raise ValueError('Flow %s: parameter set of flow '
-                                         'differs from the parameters stored '
-                                         'on the server.' % flow1.name)
+                        raise ValueError(
+                            f"Flow {flow1.name}: parameter set of flow "
+                            "differs from the parameters stored "
+                            "on the server.",
+                        )
 
                 if ignore_parameter_values_on_older_children:
-                    upload_date_current_flow = dateutil.parser.parse(
-                        flow1.upload_date)
+                    assert flow1.upload_date is not None, (
+                        "Flow1 has no upload date that allows us to compare age of children."
+                    )
+                    upload_date_current_flow = dateutil.parser.parse(flow1.upload_date)
                     upload_date_parent_flow = dateutil.parser.parse(
-                        ignore_parameter_values_on_older_children)
+                        ignore_parameter_values_on_older_children,
+                    )
                     if upload_date_current_flow < upload_date_parent_flow:
                         continue
 
@@ -390,18 +569,55 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
                     # Continue needs to be done here as the first if
                     # statement triggers in both special cases
                     continue
-            elif (key == 'custom_name'
-                  and ignore_custom_name_if_none
-                  and (attr1 is None or attr2 is None)):
+            elif (
+                key == "custom_name"
+                and ignore_custom_name_if_none
+                and (attr1 is None or attr2 is None)
+            ):
                 # If specified, we allow `custom_name` inequality if one flow's name is None.
                 # Helps with backwards compatibility as `custom_name` is now auto-generated, but
                 # before it used to be `None`.
                 continue
+            elif key == "parameters_meta_info":
+                # this value is a dictionary where each key is a parameter name, containing another
+                # dictionary with keys specifying the parameter's 'description' and 'data_type'
+                # checking parameter descriptions can be ignored since that might change
+                # data type check can also be ignored if one of them is not defined, i.e., None
+                params1 = set(flow1.parameters_meta_info)
+                params2 = set(flow2.parameters_meta_info)
+                if params1 != params2:
+                    raise ValueError(
+                        "Parameter list in meta info for parameters differ in the two flows.",
+                    )
+                # iterating over the parameter's meta info list
+                for param in params1:
+                    if (
+                        isinstance(flow1.parameters_meta_info[param], dict)
+                        and isinstance(flow2.parameters_meta_info[param], dict)
+                        and "data_type" in flow1.parameters_meta_info[param]
+                        and "data_type" in flow2.parameters_meta_info[param]
+                    ):
+                        value1 = flow1.parameters_meta_info[param]["data_type"]
+                        value2 = flow2.parameters_meta_info[param]["data_type"]
+                    else:
+                        value1 = flow1.parameters_meta_info[param]
+                        value2 = flow2.parameters_meta_info[param]
+                    if value1 is None or value2 is None:
+                        continue
+
+                    if value1 != value2:
+                        raise ValueError(
+                            f"Flow {flow1.name}: data type for parameter {param} in {key} differ "
+                            f"as {value1}\nvs\n{value2}",
+                        )
+                # the continue is to avoid the 'attr != attr2' check at end of function
+                continue
 
             if attr1 != attr2:
-                raise ValueError("Flow %s: values for attribute '%s' differ: "
-                                 "'%s'\nvs\n'%s'." %
-                                 (str(flow1.name), str(key), str(attr1), str(attr2)))
+                raise ValueError(
+                    f"Flow {flow1.name!s}: values for attribute '{key!s}' differ: "
+                    f"'{attr1!s}'\nvs\n'{attr2!s}'.",
+                )
 
 
 def _create_flow_from_xml(flow_xml: str) -> OpenMLFlow:
@@ -415,5 +631,38 @@ def _create_flow_from_xml(flow_xml: str) -> OpenMLFlow:
     -------
     OpenMLFlow
     """
-
     return OpenMLFlow._from_dict(xmltodict.parse(flow_xml))
+
+
+def delete_flow(flow_id: int) -> bool:
+    """Delete flow with id `flow_id` from the OpenML server.
+
+    You can only delete flows which you uploaded and which
+    are not linked to runs.
+
+    Parameters
+    ----------
+    flow_id : int
+        OpenML id of the flow
+
+    Returns
+    -------
+    bool
+        True if the deletion was successful. False otherwise.
+
+    Raises
+    ------
+    OpenMLServerException
+        If the server-side deletion fails due to permissions or other errors.
+
+    Side Effects
+    ------------
+    - Removes the flow from the OpenML server (if permitted).
+
+    Examples
+    --------
+    >>> import openml
+    >>> # Deletes flow 23 if you are the uploader and it's not linked to runs
+    >>> openml.flows.delete_flow(23)  # doctest: +SKIP
+    """
+    return openml.utils._delete_entity("flow", flow_id)
diff --git a/.nojekyll b/openml/py.typed
similarity index 100%
rename from .nojekyll
rename to openml/py.typed
diff --git a/openml/runs/__init__.py b/openml/runs/__init__.py
index 76aabcbc4..2f068a2e6 100644
--- a/openml/runs/__init__.py
+++ b/openml/runs/__init__.py
@@ -1,28 +1,32 @@
-from .run import OpenMLRun
-from .trace import OpenMLRunTrace, OpenMLTraceIteration
+# License: BSD 3-Clause
+
 from .functions import (
-    run_model_on_task,
-    run_flow_on_task,
+    delete_run,
     get_run,
-    list_runs,
-    get_runs,
     get_run_trace,
-    run_exists,
+    get_runs,
     initialize_model_from_run,
     initialize_model_from_trace,
+    list_runs,
+    run_exists,
+    run_flow_on_task,
+    run_model_on_task,
 )
+from .run import OpenMLRun
+from .trace import OpenMLRunTrace, OpenMLTraceIteration
 
 __all__ = [
-    'OpenMLRun',
-    'OpenMLRunTrace',
-    'OpenMLTraceIteration',
-    'run_model_on_task',
-    'run_flow_on_task',
-    'get_run',
-    'list_runs',
-    'get_runs',
-    'get_run_trace',
-    'run_exists',
-    'initialize_model_from_run',
-    'initialize_model_from_trace'
+    "OpenMLRun",
+    "OpenMLRunTrace",
+    "OpenMLTraceIteration",
+    "delete_run",
+    "get_run",
+    "get_run_trace",
+    "get_runs",
+    "initialize_model_from_run",
+    "initialize_model_from_trace",
+    "list_runs",
+    "run_exists",
+    "run_flow_on_task",
+    "run_model_on_task",
 ]
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
index 767a4a48a..d87bd3e18 100644
--- a/openml/runs/functions.py
+++ b/openml/runs/functions.py
@@ -1,62 +1,85 @@
-from collections import OrderedDict
-import io
+# License: BSD 3-Clause
+from __future__ import annotations
+
 import itertools
-import os
-from typing import Any, List, Dict, Optional, Set, Tuple, Union, TYPE_CHECKING  # noqa F401
+import time
 import warnings
+from collections import OrderedDict
+from functools import partial
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
 
+import numpy as np
+import pandas as pd
 import sklearn.metrics
 import xmltodict
-import pandas as pd
+from joblib.parallel import Parallel, delayed
 
 import openml
-import openml.utils
 import openml._api_calls
-from openml.exceptions import PyOpenMLError
+import openml.utils
+from openml.exceptions import (
+    OpenMLCacheException,
+    OpenMLRunsExistError,
+    OpenMLServerException,
+    PyOpenMLError,
+)
 from openml.extensions import get_extension_by_model
-from openml import config
+from openml.flows import OpenMLFlow, flow_exists, get_flow
 from openml.flows.flow import _copy_server_fields
-from ..flows import get_flow, flow_exists, OpenMLFlow
-from ..setups import setup_exists, initialize_model
-from ..exceptions import OpenMLCacheException, OpenMLServerException, OpenMLRunsExistError
-from ..tasks import OpenMLTask, OpenMLClassificationTask, OpenMLClusteringTask, \
-    OpenMLRegressionTask, OpenMLSupervisedTask, OpenMLLearningCurveTask
+from openml.setups import initialize_model, setup_exists
+from openml.tasks import (
+    OpenMLClassificationTask,
+    OpenMLClusteringTask,
+    OpenMLLearningCurveTask,
+    OpenMLRegressionTask,
+    OpenMLSupervisedTask,
+    OpenMLTask,
+    TaskType,
+    get_task,
+)
+
 from .run import OpenMLRun
 from .trace import OpenMLRunTrace
-from ..tasks import TaskTypeEnum
 
 # Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles
 if TYPE_CHECKING:
+    from openml._config import _Config
     from openml.extensions.extension_interface import Extension
 
 # get_dict is in run.py to avoid circular imports
 
-RUNS_CACHE_DIR_NAME = 'runs'
+RUNS_CACHE_DIR_NAME = "runs"
+ERROR_CODE = 512
 
 
-def run_model_on_task(
+# TODO(eddiebergman): Could potentially overload this but
+# it seems very big to do so
+def run_model_on_task(  # noqa: PLR0913
     model: Any,
-    task: OpenMLTask,
-    avoid_duplicate_runs: bool = True,
-    flow_tags: List[str] = None,
-    seed: int = None,
-    add_local_measures: bool = True,
-    upload_flow: bool = False,
-    return_flow: bool = False,
-) -> Union[OpenMLRun, Tuple[OpenMLRun, OpenMLFlow]]:
+    task: int | str | OpenMLTask,
+    avoid_duplicate_runs: bool | None = None,
+    flow_tags: list[str] | None = None,
+    seed: int | None = None,
+    add_local_measures: bool = True,  # noqa: FBT002
+    upload_flow: bool = False,  # noqa: FBT002
+    return_flow: bool = False,  # noqa: FBT002
+    n_jobs: int | None = None,
+) -> OpenMLRun | tuple[OpenMLRun, OpenMLFlow]:
     """Run the model on the dataset defined by the task.
 
     Parameters
     ----------
     model : sklearn model
         A model which has a function fit(X,Y) and predict(X),
-        all supervised estimators of scikit learn follow this definition of a model [1]
-        [1](http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html)
-    task : OpenMLTask
-        Task to perform. This may be a model instead if the first argument is an OpenMLTask.
-    avoid_duplicate_runs : bool, optional (default=True)
+        all supervised estimators of scikit learn follow this definition of a model.
+    task : OpenMLTask or int or str
+        Task to perform or Task id.
+        This may be a model instead if the first argument is an OpenMLTask.
+    avoid_duplicate_runs : bool, optional (default=None)
         If True, the run will throw an error if the setup/task combination is already present on
         the server. This feature requires an internet connection.
+        If not set, it will use the default from your openml configuration (False if unset).
     flow_tags : List[str], optional (default=None)
         A list of tags that the flow should have at creation.
     seed: int, optional (default=None)
@@ -69,6 +92,10 @@ def run_model_on_task(
         If False, do not upload the flow to OpenML.
     return_flow : bool (default=False)
         If True, returns the OpenMLFlow generated from the model in addition to the OpenMLRun.
+    n_jobs : int (default=None)
+        The number of processes/threads to distribute the evaluation asynchronously.
+        If `None` or `1`, then the evaluation is treated as synchronous and processed sequentially.
+        If `-1`, then the job uses as many cores as are available.
 
     Returns
     -------
@@ -77,15 +104,30 @@ def run_model_on_task(
     flow : OpenMLFlow (optional, only if `return_flow` is True).
         Flow generated from the model.
     """
+    if avoid_duplicate_runs is None:
+        avoid_duplicate_runs = openml.config.avoid_duplicate_runs
+    if avoid_duplicate_runs and not openml.config.apikey:
+        warnings.warn(
+            "avoid_duplicate_runs is set to True, but no API key is set. "
+            "Please set your API key in the OpenML configuration file, see "
+            "https://openml.github.io/openml-python/main/examples/20_basic/introduction_tutorial"
+            ".html#authentication for more information on authentication.",
+            RuntimeWarning,
+            stacklevel=2,
+        )
 
     # TODO: At some point in the future do not allow for arguments in old order (6-2018).
     # Flexibility currently still allowed due to code-snippet in OpenML100 paper (3-2019).
     # When removing this please also remove the method `is_estimator` from the extension
     # interface as it is only used here (MF, 3-2019)
-    if isinstance(model, OpenMLTask):
-        warnings.warn("The old argument order (task, model) is deprecated and "
-                      "will not be supported in the future. Please use the "
-                      "order (model, task).", DeprecationWarning)
+    if isinstance(model, (int, str, OpenMLTask)):
+        warnings.warn(
+            "The old argument order (task, model) is deprecated and "
+            "will not be supported in the future. Please use the "
+            "order (model, task).",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         task, model = model, task
 
     extension = get_extension_by_model(model, raise_if_no_extension=True)
@@ -96,6 +138,27 @@ def run_model_on_task(
 
     flow = extension.model_to_flow(model)
 
+    def get_task_and_type_conversion(_task: int | str | OpenMLTask) -> OpenMLTask:
+        """Retrieve an OpenMLTask object from either an integer or string ID,
+        or directly from an OpenMLTask object.
+
+        Parameters
+        ----------
+        _task : Union[int, str, OpenMLTask]
+            The task ID or the OpenMLTask object.
+
+        Returns
+        -------
+        OpenMLTask
+            The OpenMLTask object.
+        """
+        if isinstance(_task, (int, str)):
+            return get_task(int(_task))  # type: ignore
+
+        return _task
+
+    task = get_task_and_type_conversion(task)
+
     run = run_flow_on_task(
         task=task,
         flow=flow,
@@ -104,22 +167,23 @@ def run_model_on_task(
         seed=seed,
         add_local_measures=add_local_measures,
         upload_flow=upload_flow,
+        n_jobs=n_jobs,
     )
     if return_flow:
         return run, flow
     return run
 
 
-def run_flow_on_task(
+def run_flow_on_task(  # noqa: C901, PLR0912, PLR0915, PLR0913
     flow: OpenMLFlow,
     task: OpenMLTask,
-    avoid_duplicate_runs: bool = True,
-    flow_tags: List[str] = None,
-    seed: int = None,
-    add_local_measures: bool = True,
-    upload_flow: bool = False,
+    avoid_duplicate_runs: bool | None = None,
+    flow_tags: list[str] | None = None,
+    seed: int | None = None,
+    add_local_measures: bool = True,  # noqa: FBT002
+    upload_flow: bool = False,  # noqa: FBT002
+    n_jobs: int | None = None,
 ) -> OpenMLRun:
-
     """Run the model provided by the flow on the dataset defined by task.
 
     Takes the flow and repeat information into account.
@@ -130,16 +194,13 @@ def run_flow_on_task(
     flow : OpenMLFlow
         A flow wraps a machine learning model together with relevant information.
         The model has a function fit(X,Y) and predict(X),
-        all supervised estimators of scikit learn follow this definition of a model [1]
-        [1](http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html)
+        all supervised estimators of scikit learn follow this definition of a model.
     task : OpenMLTask
         Task to perform. This may be an OpenMLFlow instead if the first argument is an OpenMLTask.
-    avoid_duplicate_runs : bool, optional (default=True)
-        If True, the run will throw an error if the setup/task combination is already present on
-        the server. This feature requires an internet connection.
-    avoid_duplicate_runs : bool, optional (default=True)
+    avoid_duplicate_runs : bool, optional (default=None)
         If True, the run will throw an error if the setup/task combination is already present on
         the server. This feature requires an internet connection.
+        If not set, it will use the default from your openml configuration (False if unset).
     flow_tags : List[str], optional (default=None)
         A list of tags that the flow should have at creation.
     seed: int, optional (default=None)
@@ -150,6 +211,10 @@ def run_flow_on_task(
     upload_flow : bool (default=False)
         If True, upload the flow to OpenML if it does not exist yet.
         If False, do not upload the flow to OpenML.
+    n_jobs : int (default=None)
+        The number of processes/threads to distribute the evaluation asynchronously.
+        If `None` or `1`, then the evaluation is treated as synchronous and processed sequentially.
+        If `-1`, then the job uses as many cores as are available.
 
     Returns
     -------
@@ -159,13 +224,20 @@ def run_flow_on_task(
     if flow_tags is not None and not isinstance(flow_tags, list):
         raise ValueError("flow_tags should be a list")
 
+    if avoid_duplicate_runs is None:
+        avoid_duplicate_runs = openml.config.avoid_duplicate_runs
+
     # TODO: At some point in the future do not allow for arguments in old order (changed 6-2018).
     # Flexibility currently still allowed due to code-snippet in OpenML100 paper (3-2019).
     if isinstance(flow, OpenMLTask) and isinstance(task, OpenMLFlow):
         # We want to allow either order of argument (to avoid confusion).
-        warnings.warn("The old argument order (Flow, model) is deprecated and "
-                      "will not be supported in the future. Please use the "
-                      "order (model, Flow).", DeprecationWarning)
+        warnings.warn(
+            "The old argument order (Flow, model) is deprecated and "
+            "will not be supported in the future. Please use the "
+            "order (model, Flow).",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         task, flow = flow, task
 
     if task.task_id is None:
@@ -173,6 +245,7 @@ def run_flow_on_task(
 
     if flow.model is None:
         flow.model = flow.extension.flow_to_model(flow)
+
     flow.model = flow.extension.seed_model(flow.model, seed=seed)
 
     # We only need to sync with the server right now if we want to upload the flow,
@@ -181,14 +254,14 @@ def run_flow_on_task(
     if upload_flow or avoid_duplicate_runs:
         flow_id = flow_exists(flow.name, flow.external_version)
         if isinstance(flow.flow_id, int) and flow_id != flow.flow_id:
-            if flow_id:
-                raise PyOpenMLError("Local flow_id does not match server flow_id: "
-                                    "'{}' vs '{}'".format(flow.flow_id, flow_id))
-            else:
-                raise PyOpenMLError("Flow does not exist on the server, "
-                                    "but 'flow.flow_id' is not None.")
-
-        if upload_flow and not flow_id:
+            if flow_id is not False:
+                raise PyOpenMLError(
+                    f"Local flow_id does not match server flow_id: '{flow.flow_id}' vs '{flow_id}'",
+                )
+            raise PyOpenMLError(
+                "Flow does not exist on the server, but 'flow.flow_id' is not None."
+            )
+        if upload_flow and flow_id is False:
             flow.publish()
             flow_id = flow.flow_id
         elif flow_id:
@@ -199,31 +272,39 @@ def run_flow_on_task(
                 setup_id = setup_exists(flow_from_server)
                 ids = run_exists(task.task_id, setup_id)
                 if ids:
-                    error_message = ("One or more runs of this setup were "
-                                     "already performed on the task.")
+                    error_message = (
+                        "One or more runs of this setup were already performed on the task."
+                    )
                     raise OpenMLRunsExistError(ids, error_message)
         else:
             # Flow does not exist on server and we do not want to upload it.
             # No sync with the server happens.
             flow_id = None
-            pass
 
     dataset = task.get_dataset()
 
     run_environment = flow.extension.get_version_information()
-    tags = ['openml-python', run_environment[1]]
+    tags = ["openml-python", run_environment[1]]
+
+    if flow.extension.check_if_model_fitted(flow.model):
+        warnings.warn(
+            "The model is already fitted! This might cause inconsistency in comparison of results.",
+            RuntimeWarning,
+            stacklevel=2,
+        )
 
     # execute the run
     res = _run_task_get_arffcontent(
-        flow=flow,
         model=flow.model,
         task=task,
         extension=flow.extension,
         add_local_measures=add_local_measures,
+        n_jobs=n_jobs,
     )
 
     data_content, trace, fold_evaluations, sample_evaluations = res
-
+    fields = [*run_environment, time.strftime("%c"), "Created by run_flow_on_task"]
+    generated_description = "\n".join(fields)
     run = OpenMLRun(
         task_id=task.task_id,
         flow_id=flow_id,
@@ -235,6 +316,7 @@ def run_flow_on_task(
         data_content=data_content,
         flow=flow,
         setup_string=flow.extension.create_setup_string(flow.model),
+        description_text=generated_description,
     )
 
     if (upload_flow or avoid_duplicate_runs) and flow.flow_id is not None:
@@ -244,16 +326,16 @@ def run_flow_on_task(
         run.parameter_settings = flow.extension.obtain_parameter_values(flow)
 
     # now we need to attach the detailed evaluations
-    if task.task_type_id == TaskTypeEnum.LEARNING_CURVE:
+    if task.task_type_id == TaskType.LEARNING_CURVE:
         run.sample_evaluations = sample_evaluations
     else:
         run.fold_evaluations = fold_evaluations
 
     if flow_id:
-        message = 'Executed Task {} with Flow id:{}'.format(task.task_id, run.flow_id)
+        message = f"Executed Task {task.task_id} with Flow id:{run.flow_id}"
     else:
-        message = 'Executed Task {} on local Flow with name {}.'.format(task.task_id, flow.name)
-    config.logger.info(message)
+        message = f"Executed Task {task.task_id} on local Flow with name {flow.name}."
+    openml.config.logger.info(message)
 
     return run
 
@@ -270,13 +352,11 @@ def get_run_trace(run_id: int) -> OpenMLRunTrace:
     -------
     openml.runs.OpenMLTrace
     """
-    trace_xml = openml._api_calls._perform_api_call('run/trace/%d' % run_id,
-                                                    'get')
-    run_trace = OpenMLRunTrace.trace_from_xml(trace_xml)
-    return run_trace
+    trace_xml = openml._api_calls._perform_api_call(f"run/trace/{run_id}", "get")
+    return OpenMLRunTrace.trace_from_xml(trace_xml)
 
 
-def initialize_model_from_run(run_id: int) -> Any:
+def initialize_model_from_run(run_id: int, *, strict_version: bool = True) -> Any:
     """
     Initialized a model based on a run_id (i.e., using the exact
     same parameter settings)
@@ -285,20 +365,26 @@ def initialize_model_from_run(run_id: int) -> Any:
     ----------
     run_id : int
         The Openml run_id
+    strict_version: bool (default=True)
+        See `flow_to_model` strict_version.
 
     Returns
     -------
     model
     """
     run = get_run(run_id)
-    return initialize_model(run.setup_id)
+    # TODO(eddiebergman): I imagine this is None if it's not published,
+    # might need to raise an explicit error for that
+    if run.setup_id is None:
+        raise ValueError(f"Run {run_id} has no associated setup_id. Cannot initialize model.")
+    return initialize_model(setup_id=run.setup_id, strict_version=strict_version)
 
 
 def initialize_model_from_trace(
     run_id: int,
     repeat: int,
     fold: int,
-    iteration: Optional[int] = None,
+    iteration: int | None = None,
 ) -> Any:
     """
     Initialize a model based on the parameters that were set
@@ -328,6 +414,11 @@ def initialize_model_from_trace(
     model
     """
     run = get_run(run_id)
+    # TODO(eddiebergman): I imagine this is None if it's not published,
+    # might need to raise an explicit error for that
+    if run.flow_id is None:
+        raise ValueError(f"Run {run_id} has no associated flow_id. Cannot initialize model.")
+
     flow = get_flow(run.flow_id)
     run_trace = get_run_trace(run_id)
 
@@ -336,15 +427,14 @@ def initialize_model_from_trace(
 
     request = (repeat, fold, iteration)
     if request not in run_trace.trace_iterations:
-        raise ValueError('Combination repeat, fold, iteration not available')
+        raise ValueError("Combination repeat, fold, iteration not available")
     current = run_trace.trace_iterations[(repeat, fold, iteration)]
 
     search_model = initialize_model_from_run(run_id)
-    model = flow.extension.instantiate_model_from_hpo_class(search_model, current)
-    return model
+    return flow.extension.instantiate_model_from_hpo_class(search_model, current)
 
 
-def run_exists(task_id: int, setup_id: int) -> Set[int]:
+def run_exists(task_id: int, setup_id: int) -> set[int]:
     """Checks whether a task/setup combination is already present on the
     server.
 
@@ -365,30 +455,53 @@ def run_exists(task_id: int, setup_id: int) -> Set[int]:
 
     try:
         result = list_runs(task=[task_id], setup=[setup_id])
-        if len(result) > 0:
-            return set(result.keys())
-        else:
-            return set()
+        return set() if result.empty else set(result["run_id"])
     except OpenMLServerException as exception:
-        # error code 512 implies no results. The run does not exist yet
-        assert (exception.code == 512)
+        # ERROR_CODE (512) means the query matched no results: the run does not exist yet
+        if exception.code != ERROR_CODE:
+            raise exception
         return set()
 
 
-def _run_task_get_arffcontent(
-    flow: OpenMLFlow,
+def _run_task_get_arffcontent(  # noqa: PLR0915, PLR0912, C901
+    *,
     model: Any,
     task: OpenMLTask,
-    extension: 'Extension',
+    extension: Extension,
     add_local_measures: bool,
-) -> Tuple[
-    List[List],
-    Optional[OpenMLRunTrace],
-    'OrderedDict[str, OrderedDict]',
-    'OrderedDict[str, OrderedDict]',
+    n_jobs: int | None = None,
+) -> tuple[
+    list[list],
+    OpenMLRunTrace | None,
+    OrderedDict[str, OrderedDict],
+    OrderedDict[str, OrderedDict],
 ]:
-    arff_datacontent = []  # type: List[List]
-    traces = []  # type: List[OpenMLRunTrace]
+    """Runs the hyperparameter optimization on the given task
+    and returns the arfftrace content.
+
+    Parameters
+    ----------
+    model : Any
+        The model that is to be evaluated.
+    task : OpenMLTask
+        The OpenMLTask to evaluate.
+    extension : Extension
+        The OpenML extension object.
+    add_local_measures : bool
+        Whether to compute additional local evaluation measures.
+    n_jobs : int
+        Number of jobs to run in parallel.
+        If None, use 1 core by default. If -1, use all available cores.
+
+    Returns
+    -------
+    Tuple[List[List], Optional[OpenMLRunTrace],
+        OrderedDict[str, OrderedDict], OrderedDict[str, OrderedDict]]
+    A tuple containing the arfftrace content,
+    the OpenML run trace, the global and local evaluation measures.
+    """
+    arff_datacontent = []  # type: list[list]
+    traces = []  # type: list[OpenMLRunTrace]
     # stores fold-based evaluation measures. In case of a sample based task,
     # this information is multiple times overwritten, but due to the ordering
     # of tne loops, eventually it contains the information based on the full
@@ -403,92 +516,138 @@ def _run_task_get_arffcontent(
     # methods, less maintenance, less confusion)
     num_reps, num_folds, num_samples = task.get_split_dimensions()
 
-    for n_fit, (rep_no, fold_no, sample_no) in enumerate(itertools.product(
-        range(num_reps),
-        range(num_folds),
-        range(num_samples),
-    ), start=1):
-
-        train_indices, test_indices = task.get_train_test_split_indices(
-            repeat=rep_no, fold=fold_no, sample=sample_no)
-        if isinstance(task, OpenMLSupervisedTask):
-            x, y = task.get_X_and_y(dataset_format='array')
-            train_x = x[train_indices]
-            train_y = y[train_indices]
-            test_x = x[test_indices]
-            test_y = y[test_indices]
-        elif isinstance(task, OpenMLClusteringTask):
-            x = task.get_X(dataset_format='array')
-            train_x = x[train_indices]
-            train_y = None
-            test_x = None
-            test_y = None
-        else:
-            raise NotImplementedError(task.task_type)
-
-        config.logger.info(
-            "Going to execute flow '%s' on task %d for repeat %d fold %d sample %d.",
-            flow.name, task.task_id, rep_no, fold_no, sample_no,
-        )
-
-        (
-            pred_y,
-            proba_y,
-            user_defined_measures_fold,
-            trace,
-        ) = extension._run_model_on_fold(
+    jobs = []
+    for n_fit, (rep_no, fold_no, sample_no) in enumerate(
+        itertools.product(
+            range(num_reps),
+            range(num_folds),
+            range(num_samples),
+        ),
+        start=1,
+    ):
+        jobs.append((n_fit, rep_no, fold_no, sample_no))
+
+    # The forked child process may not copy the configuration state of OpenML from the parent.
+    # Current configuration setup needs to be copied and passed to the child processes.
+    _config = openml.config.get_config_as_dict()
+    # Execute runs in parallel
+    # assuming the same number of tasks as workers (n_jobs), the total compute time for this
+    # statement will be similar to the slowest run
+    # TODO(eddiebergman): Simplify this
+    job_rvals: list[
+        tuple[
+            np.ndarray,
+            pd.DataFrame | None,
+            np.ndarray,
+            pd.DataFrame | None,
+            OpenMLRunTrace | None,
+            OrderedDict[str, float],
+        ],
+    ]
+    job_rvals = Parallel(verbose=0, n_jobs=n_jobs)(  # type: ignore
+        delayed(_run_task_get_arffcontent_parallel_helper)(
+            extension=extension,
+            fold_no=fold_no,
             model=model,
-            task=task,
-            X_train=train_x,
-            y_train=train_y,
             rep_no=rep_no,
-            fold_no=fold_no,
-            X_test=test_x,
+            sample_no=sample_no,
+            task=task,
+            configuration=_config,
         )
-        if trace is not None:
-            traces.append(trace)
+        for _n_fit, rep_no, fold_no, sample_no in jobs
+    )  # job_rvals contain the output of all the runs with one-to-one correspondence with `jobs`
+
+    for n_fit, rep_no, fold_no, sample_no in jobs:
+        pred_y, proba_y, test_indices, test_y, inner_trace, user_defined_measures_fold = job_rvals[
+            n_fit - 1
+        ]
+
+        if inner_trace is not None:
+            traces.append(inner_trace)
 
         # add client-side calculated metrics. These is used on the server as
         # consistency check, only useful for supervised tasks
-        def _calculate_local_measure(sklearn_fn, openml_name):
-            user_defined_measures_fold[openml_name] = sklearn_fn(test_y, pred_y)
+        def _calculate_local_measure(  # type: ignore
+            sklearn_fn,
+            openml_name,
+            _test_y=test_y,
+            _pred_y=pred_y,
+            _user_defined_measures_fold=user_defined_measures_fold,
+        ):
+            _user_defined_measures_fold[openml_name] = sklearn_fn(_test_y, _pred_y)
 
         if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
+            if test_y is None:
+                raise ValueError("test_y cannot be None for classification tasks.")
+            if proba_y is None:
+                raise ValueError("proba_y cannot be None for classification tasks.")
 
             for i, tst_idx in enumerate(test_indices):
-
-                arff_line = [rep_no, fold_no, sample_no, tst_idx]  # type: List[Any]
                 if task.class_labels is not None:
-                    for j, class_label in enumerate(task.class_labels):
-                        arff_line.append(proba_y[i][j])
-
-                    arff_line.append(task.class_labels[pred_y[i]])
-                    arff_line.append(task.class_labels[test_y[i]])
+                    prediction = (
+                        task.class_labels[pred_y[i]]
+                        if isinstance(pred_y[i], (int, np.integer))
+                        else pred_y[i]
+                    )
+                    if isinstance(test_y, pd.Series):
+                        truth = (
+                            task.class_labels[test_y.iloc[i]]
+                            if isinstance(test_y.iloc[i], int)
+                            else test_y.iloc[i]
+                        )
+                    else:
+                        truth = (
+                            task.class_labels[test_y[i]]
+                            if isinstance(test_y[i], (int, np.integer))
+                            else test_y[i]
+                        )
+                    pred_prob = proba_y.iloc[i] if isinstance(proba_y, pd.DataFrame) else proba_y[i]
+
+                    arff_line = format_prediction(
+                        task=task,
+                        repeat=rep_no,
+                        fold=fold_no,
+                        sample=sample_no,
+                        index=tst_idx,
+                        prediction=prediction,
+                        truth=truth,
+                        proba=dict(zip(task.class_labels, pred_prob, strict=False)),
+                    )
                 else:
-                    raise ValueError('The task has no class labels')
+                    raise ValueError("The task has no class labels")
 
                 arff_datacontent.append(arff_line)
 
             if add_local_measures:
                 _calculate_local_measure(
                     sklearn.metrics.accuracy_score,
-                    'predictive_accuracy',
+                    "predictive_accuracy",
                 )
 
         elif isinstance(task, OpenMLRegressionTask):
+            if test_y is None:
+                raise ValueError("test_y cannot be None for regression tasks.")
+            for i, _ in enumerate(test_indices):
+                truth = test_y.iloc[i] if isinstance(test_y, pd.Series) else test_y[i]
+                arff_line = format_prediction(
+                    task=task,
+                    repeat=rep_no,
+                    fold=fold_no,
+                    index=test_indices[i],
+                    prediction=pred_y[i],
+                    truth=truth,
+                )
 
-            for i in range(0, len(test_indices)):
-                arff_line = [rep_no, fold_no, test_indices[i], pred_y[i], test_y[i]]
                 arff_datacontent.append(arff_line)
 
             if add_local_measures:
                 _calculate_local_measure(
                     sklearn.metrics.mean_absolute_error,
-                    'mean_absolute_error',
+                    "mean_absolute_error",
                 )
 
         elif isinstance(task, OpenMLClusteringTask):
-            for i in range(0, len(test_indices)):
+            for i, _ in enumerate(test_indices):
                 arff_line = [test_indices[i], pred_y[i]]  # row_id, cluster ID
                 arff_datacontent.append(arff_line)
 
@@ -496,7 +655,6 @@ def _calculate_local_measure(sklearn_fn, openml_name):
             raise TypeError(type(task))
 
         for measure in user_defined_measures_fold:
-
             if measure not in user_defined_measures_per_fold:
                 user_defined_measures_per_fold[measure] = OrderedDict()
             if rep_no not in user_defined_measures_per_fold[measure]:
@@ -509,22 +667,21 @@ def _calculate_local_measure(sklearn_fn, openml_name):
             if fold_no not in user_defined_measures_per_sample[measure][rep_no]:
                 user_defined_measures_per_sample[measure][rep_no][fold_no] = OrderedDict()
 
-            user_defined_measures_per_fold[measure][rep_no][fold_no] = (
-                user_defined_measures_fold[measure]
-            )
+            user_defined_measures_per_fold[measure][rep_no][fold_no] = user_defined_measures_fold[
+                measure
+            ]
             user_defined_measures_per_sample[measure][rep_no][fold_no][sample_no] = (
                 user_defined_measures_fold[measure]
             )
 
+    trace: OpenMLRunTrace | None = None
     if len(traces) > 0:
-        if len(traces) != n_fit:
+        if len(traces) != len(jobs):
             raise ValueError(
-                'Did not find enough traces (expected {}, found {})'.format(n_fit, len(traces))
+                f"Did not find enough traces (expected {len(jobs)}, found {len(traces)})",
             )
-        else:
-            trace = OpenMLRunTrace.merge_traces(traces)
-    else:
-        trace = None
+
+        trace = OpenMLRunTrace.merge_traces(traces)
 
     return (
         arff_datacontent,
@@ -534,7 +691,105 @@ def _calculate_local_measure(sklearn_fn, openml_name):
     )
 
 
-def get_runs(run_ids):
+def _run_task_get_arffcontent_parallel_helper(  # noqa: PLR0913
+    extension: Extension,
+    fold_no: int,
+    model: Any,
+    rep_no: int,
+    sample_no: int,
+    task: OpenMLTask,
+    configuration: _Config | None = None,
+) -> tuple[
+    np.ndarray,
+    pd.DataFrame | None,
+    np.ndarray,
+    pd.DataFrame | None,
+    OpenMLRunTrace | None,
+    OrderedDict[str, float],
+]:
+    """Helper function that runs a single model on a single task fold sample.
+
+    Parameters
+    ----------
+    extension : Extension
+        An OpenML extension instance.
+    fold_no : int
+        The fold number to be run.
+    model : Any
+        The model that is to be evaluated.
+    rep_no : int
+        Repetition number to be run.
+    sample_no : int
+        Sample number to be run.
+    task : OpenMLTask
+        The task object from OpenML.
+    configuration : _Config, optional
+        OpenML configuration to apply in this process; if None, the default is loaded.
+
+    Returns
+    -------
+    Tuple[np.ndarray, Optional[pd.DataFrame], np.ndarray, Optional[pd.DataFrame],
+           Optional[OpenMLRunTrace], OrderedDict[str, float]]
+    A tuple containing the predictions, probability estimates (if applicable),
+    the test-set row indices, the true test-set target values (if applicable),
+    the trace object of the OpenML run (if applicable),
+    and a dictionary of local measures for this particular fold.
+    """
+    # Sets up the OpenML instantiated in the child process to match that of the parent's
+    # if configuration=None, loads the default
+    openml.config._setup(configuration)
+
+    train_indices, test_indices = task.get_train_test_split_indices(
+        repeat=rep_no,
+        fold=fold_no,
+        sample=sample_no,
+    )
+
+    if isinstance(task, OpenMLSupervisedTask):
+        x, y = task.get_X_and_y()
+        if not isinstance(y, (pd.Series, pd.DataFrame)):
+            raise TypeError(f"y must be a pandas Series or DataFrame, got {type(y).__name__}")
+        train_x = x.iloc[train_indices]
+        train_y = y.iloc[train_indices]
+        test_x = x.iloc[test_indices]
+        test_y = y.iloc[test_indices]
+    elif isinstance(task, OpenMLClusteringTask):
+        x = task.get_X()
+        train_x = x.iloc[train_indices]
+        train_y = None
+        test_x = None
+        test_y = None
+    else:
+        raise NotImplementedError(
+            f"Task type '{task.task_type}' is not supported. "
+            f"Only OpenMLSupervisedTask and OpenMLClusteringTask are currently implemented. "
+            f"Task details: task_id={getattr(task, 'task_id', 'unknown')}, "
+            f"task_class={task.__class__.__name__}"
+        )
+
+    openml.config.logger.info(
+        f"Going to run model {model!s} on "
+        f"dataset {openml.datasets.get_dataset(task.dataset_id).name} "
+        f"for repeat {rep_no} fold {fold_no} sample {sample_no}"
+    )
+    (
+        pred_y,
+        proba_y,
+        user_defined_measures_fold,
+        trace,
+    ) = extension._run_model_on_fold(
+        model=model,
+        task=task,
+        X_train=train_x,
+        y_train=train_y,
+        rep_no=rep_no,
+        fold_no=fold_no,
+        X_test=test_x,
+    )
+    return pred_y, proba_y, test_indices, test_y, trace, user_defined_measures_fold  # type: ignore
+
+
+def get_runs(run_ids: list[int]) -> list[OpenMLRun]:
     """Gets all runs in run_ids list.
 
     Parameters
@@ -546,7 +801,6 @@ def get_runs(run_ids):
     runs : list of OpenMLRun
         List of runs corresponding to IDs, fetched from the server.
     """
-
     runs = []
     for run_id in run_ids:
         runs.append(get_run(run_id))
@@ -554,7 +808,7 @@ def get_runs(run_ids):
 
 
 @openml.utils.thread_safe_if_oslo_installed
-def get_run(run_id: int, ignore_cache: bool = False) -> OpenMLRun:
+def get_run(run_id: int, ignore_cache: bool = False) -> OpenMLRun:  # noqa: FBT002
     """Gets run corresponding to run_id.
 
     Parameters
@@ -572,30 +826,26 @@ def get_run(run_id: int, ignore_cache: bool = False) -> OpenMLRun:
     run : OpenMLRun
         Run corresponding to ID, fetched from the server.
     """
-    run_dir = openml.utils._create_cache_directory_for_id(RUNS_CACHE_DIR_NAME,
-                                                          run_id)
-    run_file = os.path.join(run_dir, "description.xml")
+    run_dir = Path(openml.utils._create_cache_directory_for_id(RUNS_CACHE_DIR_NAME, run_id))
+    run_file = run_dir / "description.xml"
 
-    if not os.path.exists(run_dir):
-        os.makedirs(run_dir)
+    run_dir.mkdir(parents=True, exist_ok=True)
 
     try:
         if not ignore_cache:
             return _get_cached_run(run_id)
-        else:
-            raise OpenMLCacheException(message='dummy')
+
+        raise OpenMLCacheException(message="dummy")
 
     except OpenMLCacheException:
-        run_xml = openml._api_calls._perform_api_call("run/%d" % run_id, 'get')
-        with io.open(run_file, "w", encoding='utf8') as fh:
+        run_xml = openml._api_calls._perform_api_call(f"run/{run_id}", "get")
+        with run_file.open("w", encoding="utf8") as fh:
             fh.write(run_xml)
 
-    run = _create_run_from_xml(run_xml)
+    return _create_run_from_xml(run_xml)
 
-    return run
 
-
-def _create_run_from_xml(xml, from_server=True):
+def _create_run_from_xml(xml: str, from_server: bool = True) -> OpenMLRun:  # noqa: PLR0915, PLR0912, C901, FBT002
     """Create a run object from xml returned from server.
 
     Parameters
@@ -613,7 +863,7 @@ def _create_run_from_xml(xml, from_server=True):
         New run object representing run_xml.
     """
 
-    def obtain_field(xml_obj, fieldname, from_server, cast=None):
+    def obtain_field(xml_obj, fieldname, from_server, cast=None):  # type: ignore
         # this function can be used to check whether a field is present in an
         # object. if it is not present, either returns None or throws an error
         # (this is usually done if the xml comes from the server)
@@ -621,175 +871,191 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None):
             if cast is not None:
                 return cast(xml_obj[fieldname])
             return xml_obj[fieldname]
-        elif not from_server:
+
+        if not from_server:
             return None
-        else:
-            raise AttributeError('Run XML does not contain required (server) '
-                                 'field: ', fieldname)
 
-    run = xmltodict.parse(xml, force_list=['oml:file', 'oml:evaluation',
-                                           'oml:parameter_setting'])["oml:run"]
-    run_id = obtain_field(run, 'oml:run_id', from_server, cast=int)
-    uploader = obtain_field(run, 'oml:uploader', from_server, cast=int)
-    uploader_name = obtain_field(run, 'oml:uploader_name', from_server)
-    task_id = int(run['oml:task_id'])
-    task_type = obtain_field(run, 'oml:task_type', from_server)
+        raise AttributeError("Run XML does not contain required (server) field: ", fieldname)
+
+    run = xmltodict.parse(xml, force_list=["oml:file", "oml:evaluation", "oml:parameter_setting"])[
+        "oml:run"
+    ]
+    run_id = obtain_field(run, "oml:run_id", from_server, cast=int)
+    uploader = obtain_field(run, "oml:uploader", from_server, cast=int)
+    uploader_name = obtain_field(run, "oml:uploader_name", from_server)
+    task_id = int(run["oml:task_id"])
+    task_type = obtain_field(run, "oml:task_type", from_server)
 
     # even with the server requirement this field may be empty.
-    if 'oml:task_evaluation_measure' in run:
-        task_evaluation_measure = run['oml:task_evaluation_measure']
-    else:
-        task_evaluation_measure = None
+    task_evaluation_measure = run.get("oml:task_evaluation_measure", None)
 
-    if not from_server and run['oml:flow_id'] is None:
+    if not from_server and run["oml:flow_id"] is None:
         # This can happen for a locally stored run of which the flow is not yet published.
         flow_id = None
         parameters = None
     else:
-        flow_id = obtain_field(run, 'oml:flow_id', from_server, cast=int)
+        flow_id = obtain_field(run, "oml:flow_id", from_server, cast=int)
         # parameters are only properly formatted once the flow is established on the server.
         # thus they are also not stored for runs with local flows.
         parameters = []
-        if 'oml:parameter_setting' in run:
-            obtained_parameter_settings = run['oml:parameter_setting']
+        if "oml:parameter_setting" in run:
+            obtained_parameter_settings = run["oml:parameter_setting"]
             for parameter_dict in obtained_parameter_settings:
-                current_parameter = OrderedDict()
-                current_parameter['oml:name'] = parameter_dict['oml:name']
-                current_parameter['oml:value'] = parameter_dict['oml:value']
-                if 'oml:component' in parameter_dict:
-                    current_parameter['oml:component'] = \
-                        parameter_dict['oml:component']
+                current_parameter = {
+                    "oml:name": parameter_dict["oml:name"],
+                    "oml:value": parameter_dict["oml:value"],
+                }
+                if "oml:component" in parameter_dict:
+                    current_parameter["oml:component"] = parameter_dict["oml:component"]
                 parameters.append(current_parameter)
 
-    flow_name = obtain_field(run, 'oml:flow_name', from_server)
-    setup_id = obtain_field(run, 'oml:setup_id', from_server, cast=int)
-    setup_string = obtain_field(run, 'oml:setup_string', from_server)
+    flow_name = obtain_field(run, "oml:flow_name", from_server)
+    setup_id = obtain_field(run, "oml:setup_id", from_server, cast=int)
+    setup_string = obtain_field(run, "oml:setup_string", from_server)
+    # run_details is currently not sent by the server, so we need to retrieve it safely.
+    # whenever that's resolved, we can enforce it being present (OpenML#1087)
+    run_details = obtain_field(run, "oml:run_details", from_server=False)
 
-    if 'oml:input_data' in run:
-        dataset_id = int(run['oml:input_data']['oml:dataset']['oml:did'])
+    if "oml:input_data" in run:
+        dataset_id = int(run["oml:input_data"]["oml:dataset"]["oml:did"])
     elif not from_server:
         dataset_id = None
     else:
         # fetching the task to obtain dataset_id
         t = openml.tasks.get_task(task_id, download_data=False)
-        if not hasattr(t, 'dataset_id'):
-            raise ValueError("Unable to fetch dataset_id from the task({}) "
-                             "linked to run({})".format(task_id, run_id))
+        if not hasattr(t, "dataset_id"):
+            raise ValueError(
+                f"Unable to fetch dataset_id from the task({task_id}) linked to run({run_id})",
+            )
         dataset_id = t.dataset_id
 
-    files = OrderedDict()
-    evaluations = OrderedDict()
-    fold_evaluations = OrderedDict()
-    sample_evaluations = OrderedDict()
-    if 'oml:output_data' not in run:
+    files: dict[str, int] = {}
+    evaluations: dict[str, float | Any] = {}
+    fold_evaluations: dict[str, dict[int, dict[int, float | Any]]] = {}
+    sample_evaluations: dict[str, dict[int, dict[int, dict[int, float | Any]]]] = {}
+    if "oml:output_data" not in run:
         if from_server:
-            raise ValueError('Run does not contain output_data '
-                             '(OpenML server error?)')
+            raise ValueError("Run does not contain output_data (OpenML server error?)")
+        predictions_url = None
     else:
-        output_data = run['oml:output_data']
-        if 'oml:file' in output_data:
+        output_data = run["oml:output_data"]
+        predictions_url = None
+        if "oml:file" in output_data:
             # multiple files, the normal case
-            for file_dict in output_data['oml:file']:
-                files[file_dict['oml:name']] = int(file_dict['oml:file_id'])
-        if 'oml:evaluation' in output_data:
+            for file_dict in output_data["oml:file"]:
+                files[file_dict["oml:name"]] = int(file_dict["oml:file_id"])
+                if file_dict["oml:name"] == "predictions":
+                    predictions_url = file_dict["oml:url"]
+        if "oml:evaluation" in output_data:
             # in normal cases there should be evaluations, but in case there
             # was an error these could be absent
-            for evaluation_dict in output_data['oml:evaluation']:
-                key = evaluation_dict['oml:name']
-                if 'oml:value' in evaluation_dict:
-                    value = float(evaluation_dict['oml:value'])
-                elif 'oml:array_data' in evaluation_dict:
-                    value = evaluation_dict['oml:array_data']
+            for evaluation_dict in output_data["oml:evaluation"]:
+                key = evaluation_dict["oml:name"]
+                if "oml:value" in evaluation_dict:
+                    value = float(evaluation_dict["oml:value"])
+                elif "oml:array_data" in evaluation_dict:
+                    value = evaluation_dict["oml:array_data"]
                 else:
-                    raise ValueError('Could not find keys "value" or '
-                                     '"array_data" in %s' %
-                                     str(evaluation_dict.keys()))
-                if '@repeat' in evaluation_dict and '@fold' in \
-                        evaluation_dict and '@sample' in evaluation_dict:
-                    repeat = int(evaluation_dict['@repeat'])
-                    fold = int(evaluation_dict['@fold'])
-                    sample = int(evaluation_dict['@sample'])
+                    raise ValueError(
+                        'Could not find keys "value" or '
+                        f'"array_data" in {evaluation_dict.keys()!s}',
+                    )
+                if (
+                    "@repeat" in evaluation_dict
+                    and "@fold" in evaluation_dict
+                    and "@sample" in evaluation_dict
+                ):
+                    repeat = int(evaluation_dict["@repeat"])
+                    fold = int(evaluation_dict["@fold"])
+                    sample = int(evaluation_dict["@sample"])
                     if key not in sample_evaluations:
-                        sample_evaluations[key] = OrderedDict()
+                        sample_evaluations[key] = {}
                     if repeat not in sample_evaluations[key]:
-                        sample_evaluations[key][repeat] = OrderedDict()
+                        sample_evaluations[key][repeat] = {}
                     if fold not in sample_evaluations[key][repeat]:
-                        sample_evaluations[key][repeat][fold] = OrderedDict()
+                        sample_evaluations[key][repeat][fold] = {}
                     sample_evaluations[key][repeat][fold][sample] = value
-                elif '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
-                    repeat = int(evaluation_dict['@repeat'])
-                    fold = int(evaluation_dict['@fold'])
+                elif "@repeat" in evaluation_dict and "@fold" in evaluation_dict:
+                    repeat = int(evaluation_dict["@repeat"])
+                    fold = int(evaluation_dict["@fold"])
                     if key not in fold_evaluations:
-                        fold_evaluations[key] = OrderedDict()
+                        fold_evaluations[key] = {}
                     if repeat not in fold_evaluations[key]:
-                        fold_evaluations[key][repeat] = OrderedDict()
+                        fold_evaluations[key][repeat] = {}
                     fold_evaluations[key][repeat][fold] = value
                 else:
                     evaluations[key] = value
 
-    if 'description' not in files and from_server is True:
-        raise ValueError('No description file for run %d in run '
-                         'description XML' % run_id)
+    if "description" not in files and from_server is True:
+        raise ValueError(f"No description file for run {run_id} in run description XML")
 
-    if 'predictions' not in files and from_server is True:
+    if "predictions" not in files and from_server is True:
         task = openml.tasks.get_task(task_id)
-        if task.task_type_id == TaskTypeEnum.SUBGROUP_DISCOVERY:
+        if task.task_type_id == TaskType.SUBGROUP_DISCOVERY:
             raise NotImplementedError(
-                'Subgroup discovery tasks are not yet supported.'
+                f"Subgroup discovery tasks are not yet supported. "
+                f"Task ID: {task_id}. Please check the OpenML documentation "
+                f"for supported task types. "
+                f"Currently supported task types: Classification, Regression, "
+                f"Clustering, and Learning Curve."
             )
-        else:
-            # JvR: actually, I am not sure whether this error should be raised.
-            # a run can consist without predictions. But for now let's keep it
-            # Matthias: yes, it should stay as long as we do not really handle
-            # this stuff
-            raise ValueError('No prediction files for run %d in run '
-                             'description XML' % run_id)
-
-    tags = openml.utils.extract_xml_tags('oml:tag', run)
-
-    return OpenMLRun(run_id=run_id, uploader=uploader,
-                     uploader_name=uploader_name, task_id=task_id,
-                     task_type=task_type,
-                     task_evaluation_measure=task_evaluation_measure,
-                     flow_id=flow_id, flow_name=flow_name,
-                     setup_id=setup_id, setup_string=setup_string,
-                     parameter_settings=parameters,
-                     dataset_id=dataset_id, output_files=files,
-                     evaluations=evaluations,
-                     fold_evaluations=fold_evaluations,
-                     sample_evaluations=sample_evaluations,
-                     tags=tags)
-
-
-def _get_cached_run(run_id):
-    """Load a run from the cache."""
-    run_cache_dir = openml.utils._create_cache_directory_for_id(
-        RUNS_CACHE_DIR_NAME, run_id,
+
+        # JvR: actually, I am not sure whether this error should be raised.
+        # a run can consist without predictions. But for now let's keep it
+        # Matthias: yes, it should stay as long as we do not really handle
+        # this stuff
+        raise ValueError(f"No prediction files for run {run_id} in run description XML")
+
+    tags = openml.utils.extract_xml_tags("oml:tag", run)
+
+    return OpenMLRun(
+        run_id=run_id,
+        uploader=uploader,
+        uploader_name=uploader_name,
+        task_id=task_id,
+        task_type=task_type,
+        task_evaluation_measure=task_evaluation_measure,
+        flow_id=flow_id,
+        flow_name=flow_name,
+        setup_id=setup_id,
+        setup_string=setup_string,
+        parameter_settings=parameters,
+        dataset_id=dataset_id,
+        output_files=files,
+        # Make sure default values are used where needed to keep run objects identical
+        evaluations=evaluations or None,
+        fold_evaluations=fold_evaluations or None,
+        sample_evaluations=sample_evaluations or None,
+        tags=tags,
+        predictions_url=predictions_url,
+        run_details=run_details,
     )
+
+
+def _get_cached_run(run_id: int) -> OpenMLRun:
+    """Load a run from the cache."""
+    run_cache_dir = openml.utils._create_cache_directory_for_id(RUNS_CACHE_DIR_NAME, run_id)
+    run_file = run_cache_dir / "description.xml"
     try:
-        run_file = os.path.join(run_cache_dir, "description.xml")
-        with io.open(run_file, encoding='utf8') as fh:
-            run = _create_run_from_xml(xml=fh.read())
-        return run
-
-    except (OSError, IOError):
-        raise OpenMLCacheException("Run file for run id %d not "
-                                   "cached" % run_id)
-
-
-def list_runs(
-    offset: Optional[int] = None,
-    size: Optional[int] = None,
-    id: Optional[List] = None,
-    task: Optional[List[int]] = None,
-    setup: Optional[List] = None,
-    flow: Optional[List] = None,
-    uploader: Optional[List] = None,
-    tag: Optional[str] = None,
-    display_errors: bool = False,
-    output_format: str = 'dict',
-    **kwargs
-) -> Union[Dict, pd.DataFrame]:
+        with run_file.open(encoding="utf8") as fh:
+            return _create_run_from_xml(xml=fh.read())
+    except OSError as e:
+        raise OpenMLCacheException(f"Run file for run id {run_id} not cached") from e
+
+
+def list_runs(  # noqa: PLR0913
+    offset: int | None = None,
+    size: int | None = None,
+    id: list | None = None,  # noqa: A002
+    task: list[int] | None = None,
+    setup: list | None = None,
+    flow: list | None = None,
+    uploader: list | None = None,
+    tag: str | None = None,
+    study: int | None = None,
+    display_errors: bool = False,  # noqa: FBT002
+    task_type: TaskType | int | None = None,
+) -> pd.DataFrame:
     """
     List all runs matching all of the given filters.
     (Supports large amount of results)
@@ -813,61 +1079,62 @@ def list_runs(
 
     tag : str, optional
 
+    study : int, optional
+
     display_errors : bool, optional (default=None)
         Whether to list runs which have an error (for example a missing
         prediction file).
 
-    output_format: str, optional (default='dict')
-        The parameter decides the format of the output.
-        - If 'dict' the output is a dict of dict
-        - If 'dataframe' the output is a pandas DataFrame
-
-    kwargs : dict, optional
-        Legal filter operators: task_type.
+    task_type : TaskType or int, optional
 
     Returns
     -------
-    dict of dicts, or dataframe
+    dataframe
     """
-    if output_format not in ['dataframe', 'dict']:
-        raise ValueError("Invalid output format selected. "
-                         "Only 'dict' or 'dataframe' applicable.")
-
     if id is not None and (not isinstance(id, list)):
-        raise TypeError('id must be of type list.')
+        raise TypeError("id must be of type list.")
     if task is not None and (not isinstance(task, list)):
-        raise TypeError('task must be of type list.')
+        raise TypeError("task must be of type list.")
     if setup is not None and (not isinstance(setup, list)):
-        raise TypeError('setup must be of type list.')
+        raise TypeError("setup must be of type list.")
     if flow is not None and (not isinstance(flow, list)):
-        raise TypeError('flow must be of type list.')
+        raise TypeError("flow must be of type list.")
     if uploader is not None and (not isinstance(uploader, list)):
-        raise TypeError('uploader must be of type list.')
-
-    return openml.utils._list_all(output_format=output_format,
-                                  listing_call=_list_runs,
-                                  offset=offset,
-                                  size=size,
-                                  id=id,
-                                  task=task,
-                                  setup=setup,
-                                  flow=flow,
-                                  uploader=uploader,
-                                  tag=tag,
-                                  display_errors=display_errors,
-                                  **kwargs)
-
-
-def _list_runs(
-    id: Optional[List] = None,
-    task: Optional[List] = None,
-    setup: Optional[List] = None,
-    flow: Optional[List] = None,
-    uploader: Optional[List] = None,
+        raise TypeError("uploader must be of type list.")
+
+    listing_call = partial(
+        _list_runs,
+        id=id,
+        task=task,
+        setup=setup,
+        flow=flow,
+        uploader=uploader,
+        tag=tag,
+        study=study,
+        display_errors=display_errors,
+        task_type=task_type,
+    )
+    batches = openml.utils._list_all(listing_call, offset=offset, limit=size)
+    if len(batches) == 0:
+        return pd.DataFrame()
+
+    return pd.concat(batches)
+
+
+def _list_runs(  # noqa: PLR0913, C901
+    limit: int,
+    offset: int,
+    *,
+    id: list | None = None,  # noqa: A002
+    task: list | None = None,
+    setup: list | None = None,
+    flow: list | None = None,
+    uploader: list | None = None,
+    study: int | None = None,
+    tag: str | None = None,
     display_errors: bool = False,
-    output_format: str = 'dict',
-    **kwargs
-) -> Union[Dict, pd.DataFrame]:
+    task_type: TaskType | int | None = None,
+) -> pd.DataFrame:
     """
     Perform API call `/run/list/{filters}'
     <https://www.openml.org/api_docs/#!/run/get_run_list_filters>`
@@ -887,80 +1154,175 @@ def _list_runs(
 
     flow : list, optional
 
+    tag: str, optional
+
     uploader : list, optional
 
+    study : int, optional
+
     display_errors : bool, optional (default=None)
         Whether to list runs which have an error (for example a missing
         prediction file).
 
-    output_format: str, optional (default='dict')
-        The parameter decides the format of the output.
-        - If 'dict' the output is a dict of dict
-        - If 'dataframe' the output is a pandas DataFrame
-
-    kwargs : dict, optional
-        Legal filter operators: task_type.
+    task_type : TaskType or int, optional
 
     Returns
     -------
     dict, or dataframe
         List of found runs.
     """
-
     api_call = "run/list"
-    if kwargs is not None:
-        for operator, value in kwargs.items():
-            api_call += "/%s/%s" % (operator, value)
+    if limit is not None:
+        api_call += f"/limit/{limit}"
+    if offset is not None:
+        api_call += f"/offset/{offset}"
     if id is not None:
-        api_call += "/run/%s" % ','.join([str(int(i)) for i in id])
+        api_call += f"/run/{','.join([str(int(i)) for i in id])}"
     if task is not None:
-        api_call += "/task/%s" % ','.join([str(int(i)) for i in task])
+        api_call += f"/task/{','.join([str(int(i)) for i in task])}"
     if setup is not None:
-        api_call += "/setup/%s" % ','.join([str(int(i)) for i in setup])
+        api_call += f"/setup/{','.join([str(int(i)) for i in setup])}"
     if flow is not None:
-        api_call += "/flow/%s" % ','.join([str(int(i)) for i in flow])
+        api_call += f"/flow/{','.join([str(int(i)) for i in flow])}"
     if uploader is not None:
-        api_call += "/uploader/%s" % ','.join([str(int(i)) for i in uploader])
+        api_call += f"/uploader/{','.join([str(int(i)) for i in uploader])}"
+    if study is not None:
+        api_call += f"/study/{study}"
     if display_errors:
         api_call += "/show_errors/true"
-    return __list_runs(api_call=api_call, output_format=output_format)
+    if tag is not None:
+        api_call += f"/tag/{tag}"
+    if task_type is not None:
+        tvalue = task_type.value if isinstance(task_type, TaskType) else task_type
+        api_call += f"/task_type/{tvalue}"
+    return __list_runs(api_call=api_call)
 
 
-def __list_runs(api_call, output_format='dict'):
+def __list_runs(api_call: str) -> pd.DataFrame:
     """Helper function to parse API calls which are lists of runs"""
-    xml_string = openml._api_calls._perform_api_call(api_call, 'get')
-    runs_dict = xmltodict.parse(xml_string, force_list=('oml:run',))
+    xml_string = openml._api_calls._perform_api_call(api_call, "get")
+    runs_dict = xmltodict.parse(xml_string, force_list=("oml:run",))
     # Minimalistic check if the XML is useful
-    if 'oml:runs' not in runs_dict:
-        raise ValueError('Error in return XML, does not contain "oml:runs": %s'
-                         % str(runs_dict))
-    elif '@xmlns:oml' not in runs_dict['oml:runs']:
-        raise ValueError('Error in return XML, does not contain '
-                         '"oml:runs"/@xmlns:oml: %s'
-                         % str(runs_dict))
-    elif runs_dict['oml:runs']['@xmlns:oml'] != 'http://openml.org/openml':
-        raise ValueError('Error in return XML, value of  '
-                         '"oml:runs"/@xmlns:oml is not '
-                         '"http://openml.org/openml": %s'
-                         % str(runs_dict))
-
-    assert type(runs_dict['oml:runs']['oml:run']) == list, \
-        type(runs_dict['oml:runs'])
-
-    runs = OrderedDict()
-    for run_ in runs_dict['oml:runs']['oml:run']:
-        run_id = int(run_['oml:run_id'])
-        run = {'run_id': run_id,
-               'task_id': int(run_['oml:task_id']),
-               'setup_id': int(run_['oml:setup_id']),
-               'flow_id': int(run_['oml:flow_id']),
-               'uploader': int(run_['oml:uploader']),
-               'upload_time': str(run_['oml:upload_time']),
-               'error_message': str((run_['oml:error_message']) or '')}
-
-        runs[run_id] = run
-
-    if output_format == 'dataframe':
-        runs = pd.DataFrame.from_dict(runs, orient='index')
+    if "oml:runs" not in runs_dict:
+        raise ValueError(f'Error in return XML, does not contain "oml:runs": {runs_dict}')
 
-    return runs
+    if "@xmlns:oml" not in runs_dict["oml:runs"]:
+        raise ValueError(
+            f'Error in return XML, does not contain "oml:runs"/@xmlns:oml: {runs_dict}'
+        )
+
+    if runs_dict["oml:runs"]["@xmlns:oml"] != "http://openml.org/openml":
+        raise ValueError(
+            "Error in return XML, value of  "
+            '"oml:runs"/@xmlns:oml is not '
+            f'"http://openml.org/openml": {runs_dict}',
+        )
+
+    if not isinstance(runs_dict["oml:runs"]["oml:run"], list):
+        raise TypeError(
+            f"Expected runs_dict['oml:runs']['oml:run'] to be a list, "
+            f"got {type(runs_dict['oml:runs']['oml:run']).__name__}"
+        )
+
+    runs = {
+        int(r["oml:run_id"]): {
+            "run_id": int(r["oml:run_id"]),
+            "task_id": int(r["oml:task_id"]),
+            "setup_id": int(r["oml:setup_id"]),
+            "flow_id": int(r["oml:flow_id"]),
+            "uploader": int(r["oml:uploader"]),
+            "task_type": TaskType(int(r["oml:task_type_id"])),
+            "upload_time": str(r["oml:upload_time"]),
+            "error_message": str((r["oml:error_message"]) or ""),
+        }
+        for r in runs_dict["oml:runs"]["oml:run"]
+    }
+    return pd.DataFrame.from_dict(runs, orient="index")
+
+
+def format_prediction(  # noqa: PLR0913
+    task: OpenMLSupervisedTask,
+    repeat: int,
+    fold: int,
+    index: int,
+    prediction: str | int | float,
+    truth: str | int | float,
+    sample: int | None = None,
+    proba: dict[str, float] | None = None,
+) -> list[str | int | float]:
+    """Format the predictions in the specific order as required for the run results.
+
+    Parameters
+    ----------
+    task: OpenMLSupervisedTask
+        Task for which to format the predictions.
+    repeat: int
+        From which repeat this prediction is made.
+    fold: int
+        From which fold this prediction is made.
+    index: int
+        For which index this prediction is made.
+    prediction: str, int or float
+        The predicted class label or value.
+    truth: str, int or float
+        The true class label or value.
+    sample: int, optional (default=None)
+        From which sample set this prediction is made.
+        Required only for LearningCurve tasks.
+    proba: Dict[str, float], optional (default=None)
+        For classification tasks only.
+        A mapping from each class label to their predicted probability.
+        The dictionary should contain an entry for each of the `task.class_labels`.
+        E.g.: {"Iris-Setosa": 0.2, "Iris-Versicolor": 0.7, "Iris-Virginica": 0.1}
+
+    Returns
+    -------
+    A list with elements for the prediction results of a run.
+
+    The returned order of the elements is (if available):
+        [repeat, fold, sample, index, prediction, truth, *probabilities]
+
+    This order follows the R Client API.
+    """
+    if isinstance(task, OpenMLClassificationTask):
+        if proba is None:
+            raise ValueError("`proba` is required for classification task")
+        if task.class_labels is None:
+            raise ValueError("The classification task must have class labels set")
+        if not set(task.class_labels) == set(proba):
+            raise ValueError("Each class should have a predicted probability")
+        if sample is None:
+            if isinstance(task, OpenMLLearningCurveTask):
+                raise ValueError("`sample` can not be none for LearningCurveTask")
+
+            sample = 0
+        probabilities = [proba[c] for c in task.class_labels]
+        return [repeat, fold, sample, index, prediction, truth, *probabilities]
+
+    if isinstance(task, OpenMLRegressionTask):
+        return [repeat, fold, index, prediction, truth]
+
+    raise NotImplementedError(
+        f"Formatting for {type(task)} is not supported. "
+        f"Supported task types: OpenMLClassificationTask, OpenMLRegressionTask, "
+        f"and OpenMLLearningCurveTask. "
+        f"Please ensure your task is one of these types."
+    )
+
+
+def delete_run(run_id: int) -> bool:
+    """Delete run with id `run_id` from the OpenML server.
+
+    You can only delete runs which you uploaded.
+
+    Parameters
+    ----------
+    run_id : int
+        OpenML id of the run
+
+    Returns
+    -------
+    bool
+        True if the deletion was successful. False otherwise.
+    """
+    return openml.utils._delete_entity("run", run_id)
diff --git a/openml/runs/run.py b/openml/runs/run.py
index 6a4818f30..086e9c046 100644
--- a/openml/runs/run.py
+++ b/openml/runs/run.py
@@ -1,47 +1,127 @@
-from collections import OrderedDict
+# License: BSD 3-Clause
+from __future__ import annotations
+
 import pickle
 import time
-from typing import Any, IO, TextIO  # noqa F401
-import os
+from collections import OrderedDict
+from collections.abc import Callable, Sequence
+from pathlib import Path
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
 
 import arff
 import numpy as np
-import xmltodict
+import pandas as pd
 
 import openml
 import openml._api_calls
-from ..exceptions import PyOpenMLError
-from ..flows import get_flow
-from ..tasks import (get_task,
-                     TaskTypeEnum,
-                     OpenMLClassificationTask,
-                     OpenMLLearningCurveTask,
-                     OpenMLClusteringTask,
-                     OpenMLRegressionTask
-                     )
-from ..utils import _tag_entity
-
-
-class OpenMLRun(object):
-    """OpenML Run: result of running a model on an openml dataset.
-
-       Parameters
-       ----------
-       task_id : int
-           Refers to the task.
-       flow_id : int
-           Refers to the flow.
-       dataset_id: int
-           Refers to the data.
+from openml.base import OpenMLBase
+from openml.exceptions import PyOpenMLError
+from openml.flows import OpenMLFlow, get_flow
+from openml.tasks import (
+    OpenMLClassificationTask,
+    OpenMLClusteringTask,
+    OpenMLLearningCurveTask,
+    OpenMLRegressionTask,
+    OpenMLTask,
+    TaskType,
+    get_task,
+)
+
+if TYPE_CHECKING:
+    from openml.runs.trace import OpenMLRunTrace
+
+
+class OpenMLRun(OpenMLBase):
+    """OpenML Run: result of running a model on an OpenML dataset.
+
+    Parameters
+    ----------
+    task_id: int
+        The ID of the OpenML task associated with the run.
+    flow_id: int
+        The ID of the OpenML flow associated with the run.
+    dataset_id: int
+        The ID of the OpenML dataset used for the run.
+    setup_string: str
+        The setup string of the run.
+    output_files: Dict[str, int]
+        Specifies where each related file can be found.
+    setup_id: int
+        An integer representing the ID of the setup used for the run.
+    tags: List[str]
+        Representing the tags associated with the run.
+    uploader: int
+        User ID of the uploader.
+    uploader_name: str
+        The name of the person who uploaded the run.
+    evaluations: Dict
+        Representing the evaluations of the run.
+    fold_evaluations: Dict
+        The evaluations of the run for each fold.
+    sample_evaluations: Dict
+        The evaluations of the run for each sample.
+    data_content: List[List]
+        The predictions generated from executing this run.
+    trace: OpenMLRunTrace
+        The trace containing information on internal model evaluations of this run.
+    model: object
+        The untrained model that was evaluated in the run.
+    task_type: str
+        The type of the OpenML task associated with the run.
+    task_evaluation_measure: str
+        The evaluation measure used for the task.
+    flow_name: str
+        The name of the OpenML flow associated with the run.
+    parameter_settings: list[OrderedDict]
+        Representing the parameter settings used for the run.
+    predictions_url: str
+        The URL of the predictions file.
+    task: OpenMLTask
+        An instance of the OpenMLTask class, representing the OpenML task associated
+        with the run.
+    flow: OpenMLFlow
+        An instance of the OpenMLFlow class, representing the OpenML flow associated
+        with the run.
+    run_id: int
+        The ID of the run.
+    description_text: str, optional
+        Description text to add to the predictions file. If left None, is set to the
+        time the arff file is generated.
+    run_details: str, optional (default=None)
+        Description of the run stored in the run meta-data.
     """
 
-    def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
-                 output_files=None, setup_id=None, tags=None, uploader=None,
-                 uploader_name=None, evaluations=None, fold_evaluations=None,
-                 sample_evaluations=None, data_content=None, trace=None,
-                 model=None, task_type=None, task_evaluation_measure=None,
-                 flow_name=None, parameter_settings=None, predictions_url=None,
-                 task=None, flow=None, run_id=None):
+    def __init__(  # noqa: PLR0913
+        self,
+        task_id: int,
+        flow_id: int | None,
+        dataset_id: int | None,
+        setup_string: str | None = None,
+        output_files: dict[str, int] | None = None,
+        setup_id: int | None = None,
+        tags: list[str] | None = None,
+        uploader: int | None = None,
+        uploader_name: str | None = None,
+        evaluations: dict | None = None,
+        fold_evaluations: dict | None = None,
+        sample_evaluations: dict | None = None,
+        data_content: list[list] | None = None,
+        trace: OpenMLRunTrace | None = None,
+        model: object | None = None,
+        task_type: str | None = None,
+        task_evaluation_measure: str | None = None,
+        flow_name: str | None = None,
+        parameter_settings: list[dict[str, Any]] | None = None,
+        predictions_url: str | None = None,
+        task: OpenMLTask | None = None,
+        flow: OpenMLFlow | None = None,
+        run_id: int | None = None,
+        description_text: str | None = None,
+        run_details: str | None = None,
+    ):
         self.uploader = uploader
         self.uploader_name = uploader_name
         self.task_id = task_id
@@ -66,48 +146,140 @@ def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
         self.model = model
         self.tags = tags
         self.predictions_url = predictions_url
+        self.description_text = description_text
+        self.run_details = run_details
+        self._predictions = None
+
+    @property
+    def predictions(self) -> pd.DataFrame:
+        """Return this run's predictions as a DataFrame (built on first access, then cached)."""
+        if self._predictions is None:
+            if self.data_content:  # local predictions take precedence over the server copy
+                arff_dict = self._generate_arff_dict()
+            elif self.predictions_url:  # otherwise download and parse the predictions ARFF
+                arff_text = openml._api_calls._download_text_file(self.predictions_url)
+                arff_dict = arff.loads(arff_text)
+            else:
+                raise RuntimeError("Run has no predictions.")
+            self._predictions = pd.DataFrame(
+                arff_dict["data"],
+                columns=[name for name, _ in arff_dict["attributes"]],  # (name, type) pairs
+            )
+        return self._predictions
+
+    @property
+    def id(self) -> int | None:
+        """The ID of the run (same as `run_id`), None if not uploaded to the server yet."""
+        return self.run_id
+
+    def _evaluation_summary(self, metric: str) -> str:
+        """Summarizes the evaluation of a metric over all folds.
+
+        The fold scores for the metric must exist already. During run creation,
+        by default, the MAE for OpenMLRegressionTask and the accuracy for
+        OpenMLClassificationTask/OpenMLLearningCurveTask tasks are computed.
+
+        If repetitions exist, the per-repetition mean and std are each averaged over them.
+
+        Parameters
+        ----------
+        metric: str
+            Name of an evaluation metric that was used to compute fold scores.
+
+        Returns
+        -------
+        metric_summary: str
+            A formatted string that displays the metric's evaluation summary.
+            The summary consists of the mean and std, formatted as "mean +- std".
+        """
+        if self.fold_evaluations is None:
+            raise ValueError("No fold evaluations available.")
+        fold_score_lists = self.fold_evaluations[metric].values()  # KeyError if metric is absent
+
+        # Get the mean and std of the fold scores within each repetition
+        rep_means = [np.mean(list(x.values())) for x in fold_score_lists]
+        rep_stds = [np.std(list(x.values())) for x in fold_score_lists]
+
+        return f"{np.mean(rep_means):.4f} +- {np.mean(rep_stds):.4f}"
+
+    def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]:
+        """Collect all information to display in the __repr__ body."""
+        # Set up fields
+        fields = {
+            "Uploader Name": self.uploader_name,
+            "Metric": self.task_evaluation_measure,
+            "Run ID": self.run_id,
+            "Task ID": self.task_id,
+            "Task Type": self.task_type,
+            "Task URL": openml.tasks.OpenMLTask.url_for_id(self.task_id),
+            "Flow ID": self.flow_id,
+            "Flow Name": self.flow_name,
+            "Flow URL": (
+                openml.flows.OpenMLFlow.url_for_id(self.flow_id)
+                if self.flow_id is not None
+                else None
+            ),
+            "Setup ID": self.setup_id,
+            "Setup String": self.setup_string,
+            "Dataset ID": self.dataset_id,
+            "Dataset URL": (
+                openml.datasets.OpenMLDataset.url_for_id(self.dataset_id)
+                if self.dataset_id is not None
+                else None
+            ),
+        }
+
+        # determines the order of the initial fields in which the information will be printed
+        order = ["Uploader Name", "Uploader Profile", "Metric", "Result"]
 
-    def __repr__(self):
-        header = "OpenML Run"
-        header = '{}\n{}\n'.format(header, '=' * len(header))
-
-        base_url = "{}".format(openml.config.server[:-len('api/v1/xml')])
-        fields = {"Uploader Name": self.uploader_name,
-                  "Metric": self.task_evaluation_measure,
-                  "Run ID": self.run_id,
-                  "Task ID": self.task_id,
-                  "Task Type": self.task_type,
-                  "Task URL": "{}t/{}".format(base_url, self.task_id),
-                  "Flow ID": self.flow_id,
-                  "Flow Name": self.flow_name,
-                  "Flow URL": "{}f/{}".format(base_url, self.flow_id),
-                  "Setup ID": self.setup_id,
-                  "Setup String": self.setup_string,
-                  "Dataset ID": self.dataset_id,
-                  "Dataset URL": "{}d/{}".format(base_url, self.dataset_id)}
         if self.uploader is not None:
-            fields["Uploader Profile"] = "{}u/{}".format(base_url, self.uploader)
+            fields["Uploader Profile"] = f"{openml.config.get_server_base_url()}/u/{self.uploader}"
         if self.run_id is not None:
-            fields["Run URL"] = "{}r/{}".format(base_url, self.run_id)
+            fields["Run URL"] = self.openml_url
         if self.evaluations is not None and self.task_evaluation_measure in self.evaluations:
             fields["Result"] = self.evaluations[self.task_evaluation_measure]
-
-        # determines the order in which the information will be printed
-        order = ["Uploader Name", "Uploader Profile", "Metric", "Result", "Run ID", "Run URL",
-                 "Task ID", "Task Type", "Task URL", "Flow ID", "Flow Name", "Flow URL",
-                 "Setup ID", "Setup String", "Dataset ID", "Dataset URL"]
-        fields = [(key, fields[key]) for key in order if key in fields]
-
-        longest_field_name_length = max(len(name) for name, value in fields)
-        field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length)
-        body = '\n'.join(field_line_format.format(name, value) for name, value in fields)
-        return header + body
-
-    def _repr_pretty_(self, pp, cycle):
-        pp.text(str(self))
+        elif self.fold_evaluations is not None:
+            # -- Add locally computed summary values if possible
+            if "predictive_accuracy" in self.fold_evaluations:
+                # OpenMLClassificationTask; OpenMLLearningCurveTask
+                result_field = "Local Result - Accuracy (+- STD)"
+                fields[result_field] = self._evaluation_summary("predictive_accuracy")
+                order.append(result_field)
+            elif "mean_absolute_error" in self.fold_evaluations:
+                # OpenMLRegressionTask
+                result_field = "Local Result - MAE (+- STD)"
+                fields[result_field] = self._evaluation_summary("mean_absolute_error")
+                order.append(result_field)
+
+            if "usercpu_time_millis" in self.fold_evaluations:
+                # Runtime should be available for most tasks types
+                rt_field = "Local Runtime - ms (+- STD)"
+                fields[rt_field] = self._evaluation_summary("usercpu_time_millis")
+                order.append(rt_field)
+
+        # determines the remaining order
+        order += [
+            "Run ID",
+            "Run URL",
+            "Task ID",
+            "Task Type",
+            "Task URL",
+            "Flow ID",
+            "Flow Name",
+            "Flow URL",
+            "Setup ID",
+            "Setup String",
+            "Dataset ID",
+            "Dataset URL",
+        ]
+        return [
+            (key, "None" if fields[key] is None else fields[key])  # type: ignore
+            for key in order
+            if key in fields
+        ]
 
     @classmethod
-    def from_filesystem(cls, directory: str, expect_model: bool = True) -> 'OpenMLRun':
+    def from_filesystem(cls, directory: str | Path, expect_model: bool = True) -> OpenMLRun:  # noqa: FBT002
         """
         The inverse of the to_filesystem method. Instantiates an OpenMLRun
         object based on files stored on the file system.
@@ -128,26 +300,26 @@ def from_filesystem(cls, directory: str, expect_model: bool = True) -> 'OpenMLRu
         run : OpenMLRun
             the re-instantiated run object
         """
-
         # Avoiding cyclic imports
         import openml.runs.functions
 
-        if not os.path.isdir(directory):
-            raise ValueError('Could not find folder')
+        directory = Path(directory)
+        if not directory.is_dir():
+            raise ValueError("Could not find folder")
 
-        description_path = os.path.join(directory, 'description.xml')
-        predictions_path = os.path.join(directory, 'predictions.arff')
-        trace_path = os.path.join(directory, 'trace.arff')
-        model_path = os.path.join(directory, 'model.pkl')
+        description_path = directory / "description.xml"
+        predictions_path = directory / "predictions.arff"
+        trace_path = directory / "trace.arff"
+        model_path = directory / "model.pkl"
 
-        if not os.path.isfile(description_path):
-            raise ValueError('Could not find description.xml')
-        if not os.path.isfile(predictions_path):
-            raise ValueError('Could not find predictions.arff')
-        if not os.path.isfile(model_path) and expect_model:
-            raise ValueError('Could not find model.pkl')
+        if not description_path.is_file():
+            raise ValueError("Could not find description.xml")
+        if not predictions_path.is_file():
+            raise ValueError("Could not find predictions.arff")
+        if (not model_path.is_file()) and expect_model:
+            raise ValueError("Could not find model.pkl")
 
-        with open(description_path, 'r') as fht:
+        with description_path.open() as fht:
             xml_string = fht.read()
         run = openml.runs.functions._create_run_from_xml(xml_string, from_server=False)
 
@@ -156,25 +328,25 @@ def from_filesystem(cls, directory: str, expect_model: bool = True) -> 'OpenMLRu
             run.flow = flow
             run.flow_name = flow.name
 
-        with open(predictions_path, 'r') as fht:
+        with predictions_path.open() as fht:
             predictions = arff.load(fht)
-            run.data_content = predictions['data']
+            run.data_content = predictions["data"]
 
-        if os.path.isfile(model_path):
+        if model_path.is_file():
             # note that it will load the model if the file exists, even if
             # expect_model is False
-            with open(model_path, 'rb') as fhb:
-                run.model = pickle.load(fhb)
+            with model_path.open("rb") as fhb:
+                run.model = pickle.load(fhb)  # noqa: S301
 
-        if os.path.isfile(trace_path):
+        if trace_path.is_file():
             run.trace = openml.runs.OpenMLRunTrace._from_filesystem(trace_path)
 
         return run
 
     def to_filesystem(
         self,
-        directory: str,
-        store_model: bool = True,
+        directory: str | Path,
+        store_model: bool = True,  # noqa: FBT002
     ) -> None:
         """
         The inverse of the from_filesystem method. Serializes a run
@@ -192,39 +364,90 @@ def to_filesystem(
             model.
         """
         if self.data_content is None or self.model is None:
-            raise ValueError('Run should have been executed (and contain '
-                             'model / predictions)')
+            raise ValueError("Run should have been executed (and contain model / predictions)")
+        directory = Path(directory)
+        directory.mkdir(exist_ok=True, parents=True)
 
-        os.makedirs(directory, exist_ok=True)
-        if not os.listdir(directory) == []:
-            raise ValueError(
-                'Output directory {} should be empty'.format(os.path.abspath(directory))
-            )
+        if any(directory.iterdir()):
+            raise ValueError(f"Output directory {directory.expanduser().resolve()} should be empty")
 
-        run_xml = self._create_description_xml()
+        run_xml = self._to_xml()
         predictions_arff = arff.dumps(self._generate_arff_dict())
 
         # It seems like typing does not allow to define the same variable multiple times
-        with open(os.path.join(directory, 'description.xml'), 'w') as fh:  # type: TextIO
+        with (directory / "description.xml").open("w") as fh:
             fh.write(run_xml)
-        with open(os.path.join(directory, 'predictions.arff'), 'w') as fh:
+        with (directory / "predictions.arff").open("w") as fh:
             fh.write(predictions_arff)
         if store_model:
-            with open(os.path.join(directory, 'model.pkl'), 'wb') as fh_b:  # type: IO[bytes]
+            with (directory / "model.pkl").open("wb") as fh_b:
                 pickle.dump(self.model, fh_b)
 
-        if self.flow_id is None:
+        if self.flow_id is None and self.flow is not None:
             self.flow.to_filesystem(directory)
 
         if self.trace is not None:
             self.trace._to_filesystem(directory)
 
-    def _generate_arff_dict(self) -> 'OrderedDict[str, Any]':
+    def _get_arff_attributes_for_task(self, task: OpenMLTask) -> list[tuple[str, Any]]:
+        """Build the ARFF attribute list (column names and types) for the given task's type.
+
+        Parameters
+        ----------
+        task : OpenMLTask
+            The task for which to generate attributes.
+
+        Returns
+        -------
+        list[tuple[str, Any]]
+            List of (name, type) tuples; type is "NUMERIC" or the list of class labels.
+        """
+        instance_specifications = [
+            ("repeat", "NUMERIC"),
+            ("fold", "NUMERIC"),
+        ]
+
+        if isinstance(task, (OpenMLLearningCurveTask, OpenMLClassificationTask)):
+            instance_specifications.append(("sample", "NUMERIC"))  # placed before "row_id"
+
+        instance_specifications.append(("row_id", "NUMERIC"))
+
+        if isinstance(task, (OpenMLLearningCurveTask, OpenMLClassificationTask)):
+            class_labels = task.class_labels
+            if class_labels is None:
+                raise ValueError("The task has no class labels")
+
+            prediction_confidences = [
+                ("confidence." + class_labels[i], "NUMERIC") for i in range(len(class_labels))
+            ]
+            prediction_and_true = [("prediction", class_labels), ("correct", class_labels)]
+            return instance_specifications + prediction_and_true + prediction_confidences
+
+        if isinstance(task, OpenMLRegressionTask):
+            return [*instance_specifications, ("prediction", "NUMERIC"), ("truth", "NUMERIC")]
+
+        if isinstance(task, OpenMLClusteringTask):
+            return [*instance_specifications, ("cluster", "NUMERIC")]
+
+        supported_task_types = [
+            TaskType.SUPERVISED_CLASSIFICATION,
+            TaskType.SUPERVISED_REGRESSION,
+            TaskType.CLUSTERING,
+            TaskType.LEARNING_CURVE,
+        ]
+        raise NotImplementedError(
+            f"Task type {task.task_type!s} for task_id {getattr(task, 'task_id', None)!s} "
+            f"is not yet supported. Supported task types are: {supported_task_types!r}"
+        )
+
+    def _generate_arff_dict(self) -> OrderedDict[str, Any]:
         """Generates the arff dictionary for uploading predictions to the
         server.
 
         Assumes that the run has been executed.
 
+        The order of the attributes follows the order defined by the Client API for R.
+
         Returns
         -------
         arf_dict : dict
@@ -232,82 +455,25 @@ def _generate_arff_dict(self) -> 'OrderedDict[str, Any]':
             Contains predictions and information about the run environment.
         """
         if self.data_content is None:
-            raise ValueError('Run has not been executed.')
+            raise ValueError("Run has not been executed.")
         if self.flow is None:
+            if self.flow_id is None:
+                raise ValueError("Run has no associated flow id!")
             self.flow = get_flow(self.flow_id)
 
-        run_environment = (self.flow.extension.get_version_information()
-                           + [time.strftime("%c")]
-                           + ['Created by run_task()'])
+        if self.description_text is None:
+            self.description_text = time.strftime("%c")
         task = get_task(self.task_id)
 
         arff_dict = OrderedDict()  # type: 'OrderedDict[str, Any]'
-        arff_dict['data'] = self.data_content
-        arff_dict['description'] = "\n".join(run_environment)
-        arff_dict['relation'] =\
-            'openml_task_{}_predictions'.format(task.task_id)
-
-        if isinstance(task, OpenMLLearningCurveTask):
-            class_labels = task.class_labels
-            instance_specifications = [
-                ('repeat', 'NUMERIC'),
-                ('fold', 'NUMERIC'),
-                ('sample', 'NUMERIC'),
-                ('row_id', 'NUMERIC')
-            ]
-
-            arff_dict['attributes'] = instance_specifications
-            if class_labels is not None:
-                arff_dict['attributes'] = arff_dict['attributes'] + \
-                    [('confidence.' + class_labels[i],
-                      'NUMERIC')
-                     for i in range(len(class_labels))] + \
-                    [('prediction', class_labels),
-                     ('correct', class_labels)]
-            else:
-                raise ValueError('The task has no class labels')
-
-        elif isinstance(task, OpenMLClassificationTask):
-            class_labels = task.class_labels
-            instance_specifications = [('repeat', 'NUMERIC'),
-                                       ('fold', 'NUMERIC'),
-                                       ('sample', 'NUMERIC'),  # Legacy
-                                       ('row_id', 'NUMERIC')]
-
-            arff_dict['attributes'] = instance_specifications
-            if class_labels is not None:
-                prediction_confidences = [('confidence.' + class_labels[i],
-                                           'NUMERIC')
-                                          for i in range(len(class_labels))]
-                prediction_and_true = [('prediction', class_labels),
-                                       ('correct', class_labels)]
-                arff_dict['attributes'] = arff_dict['attributes'] + \
-                    prediction_confidences + \
-                    prediction_and_true
-            else:
-                raise ValueError('The task has no class labels')
-
-        elif isinstance(task, OpenMLRegressionTask):
-            arff_dict['attributes'] = [('repeat', 'NUMERIC'),
-                                       ('fold', 'NUMERIC'),
-                                       ('row_id', 'NUMERIC'),
-                                       ('prediction', 'NUMERIC'),
-                                       ('truth', 'NUMERIC')]
-
-        elif isinstance(task, OpenMLClusteringTask):
-            arff_dict['attributes'] = [('repeat', 'NUMERIC'),
-                                       ('fold', 'NUMERIC'),
-                                       ('row_id', 'NUMERIC'),
-                                       ('cluster', 'NUMERIC')]
-
-        else:
-            raise NotImplementedError(
-                'Task type %s is not yet supported.' % str(task.task_type)
-            )
+        arff_dict["data"] = self.data_content
+        arff_dict["description"] = self.description_text
+        arff_dict["relation"] = f"openml_task_{task.task_id}_predictions"
+        arff_dict["attributes"] = self._get_arff_attributes_for_task(task)
 
         return arff_dict
 
-    def get_metric_fn(self, sklearn_fn, kwargs=None):
+    def get_metric_fn(self, sklearn_fn: Callable, kwargs: dict | None = None) -> np.ndarray:  # noqa: PLR0915, PLR0912, C901
         """Calculates metric scores based on predicted values. Assumes the
         run has been executed locally (and contains run_data). Furthermore,
         it assumes that the 'correct' or 'truth' attribute is specified in
@@ -319,46 +485,45 @@ def get_metric_fn(self, sklearn_fn, kwargs=None):
         sklearn_fn : function
             a function pointer to a sklearn function that
             accepts ``y_true``, ``y_pred`` and ``**kwargs``
+        kwargs : dict
+            kwargs for the function
 
         Returns
         -------
-        scores : list
-            a list of floats, of length num_folds * num_repeats
+        scores : np.ndarray
+            metric scores, of length num_folds * num_repeats
         """
-        kwargs = kwargs if kwargs else dict()
+        kwargs = kwargs if kwargs else {}
         if self.data_content is not None and self.task_id is not None:
             predictions_arff = self._generate_arff_dict()
-        elif 'predictions' in self.output_files:
+        elif (self.output_files is not None) and ("predictions" in self.output_files):
             predictions_file_url = openml._api_calls._file_id_to_url(
-                self.output_files['predictions'], 'predictions.arff',
+                self.output_files["predictions"],
+                "predictions.arff",
             )
-            response = openml._api_calls._read_url(predictions_file_url,
-                                                   request_method='get')
+            response = openml._api_calls._download_text_file(predictions_file_url)
             predictions_arff = arff.loads(response)
             # TODO: make this a stream reader
         else:
-            raise ValueError('Run should have been locally executed or '
-                             'contain outputfile reference.')
+            raise ValueError(
+                "Run should have been locally executed or contain outputfile reference.",
+            )
 
         # Need to know more about the task to compute scores correctly
         task = get_task(self.task_id)
 
-        attribute_names = [att[0] for att in predictions_arff['attributes']]
-        if (task.task_type_id in [TaskTypeEnum.SUPERVISED_CLASSIFICATION,
-                                  TaskTypeEnum.LEARNING_CURVE]
-                and 'correct' not in attribute_names):
-            raise ValueError('Attribute "correct" should be set for '
-                             'classification task runs')
-        if (task.task_type_id == TaskTypeEnum.SUPERVISED_REGRESSION
-                and 'truth' not in attribute_names):
-            raise ValueError('Attribute "truth" should be set for '
-                             'regression task runs')
-        if (task.task_type_id != TaskTypeEnum.CLUSTERING
-                and 'prediction' not in attribute_names):
-            raise ValueError('Attribute "predict" should be set for '
-                             'supervised task runs')
-
-        def _attribute_list_to_dict(attribute_list):
+        attribute_names = [att[0] for att in predictions_arff["attributes"]]
+        if (
+            task.task_type_id in [TaskType.SUPERVISED_CLASSIFICATION, TaskType.LEARNING_CURVE]
+            and "correct" not in attribute_names
+        ):
+            raise ValueError('Attribute "correct" should be set for classification task runs')
+        if task.task_type_id == TaskType.SUPERVISED_REGRESSION and "truth" not in attribute_names:
+            raise ValueError('Attribute "truth" should be set for regression task runs')
+        if task.task_type_id != TaskType.CLUSTERING and "prediction" not in attribute_names:
+            raise ValueError('Attribute "prediction" should be set for supervised task runs')
+
+        def _attribute_list_to_dict(attribute_list):  # type: ignore
             # convenience function: Creates a mapping to map from the name of
             # attributes present in the arff prediction file to their index.
             # This is necessary because the number of classes can be different
@@ -368,48 +533,48 @@ def _attribute_list_to_dict(attribute_list):
                 res[attribute_list[idx][0]] = idx
             return res
 
-        attribute_dict = \
-            _attribute_list_to_dict(predictions_arff['attributes'])
+        attribute_dict = _attribute_list_to_dict(predictions_arff["attributes"])
 
-        repeat_idx = attribute_dict['repeat']
-        fold_idx = attribute_dict['fold']
-        predicted_idx = attribute_dict['prediction']  # Assume supervised task
+        repeat_idx = attribute_dict["repeat"]
+        fold_idx = attribute_dict["fold"]
+        predicted_idx = attribute_dict["prediction"]  # Assume supervised task
 
-        if task.task_type_id == TaskTypeEnum.SUPERVISED_CLASSIFICATION or \
-                task.task_type_id == TaskTypeEnum.LEARNING_CURVE:
-            correct_idx = attribute_dict['correct']
-        elif task.task_type_id == TaskTypeEnum.SUPERVISED_REGRESSION:
-            correct_idx = attribute_dict['truth']
+        if task.task_type_id in (TaskType.SUPERVISED_CLASSIFICATION, TaskType.LEARNING_CURVE):
+            correct_idx = attribute_dict["correct"]
+        elif task.task_type_id == TaskType.SUPERVISED_REGRESSION:
+            correct_idx = attribute_dict["truth"]
         has_samples = False
-        if 'sample' in attribute_dict:
-            sample_idx = attribute_dict['sample']
+        if "sample" in attribute_dict:
+            sample_idx = attribute_dict["sample"]
             has_samples = True
 
-        if predictions_arff['attributes'][predicted_idx][1] != \
-                predictions_arff['attributes'][correct_idx][1]:
-            pred = predictions_arff['attributes'][predicted_idx][1]
-            corr = predictions_arff['attributes'][correct_idx][1]
-            raise ValueError('Predicted and Correct do not have equal values:'
-                             ' %s Vs. %s' % (str(pred), str(corr)))
+        if (
+            predictions_arff["attributes"][predicted_idx][1]
+            != predictions_arff["attributes"][correct_idx][1]
+        ):
+            pred = predictions_arff["attributes"][predicted_idx][1]
+            corr = predictions_arff["attributes"][correct_idx][1]
+            raise ValueError(
+                f"Predicted and Correct do not have equal values: {pred!s} Vs. {corr!s}",
+            )
 
         # TODO: these could be cached
-        values_predict = {}
-        values_correct = {}
-        for line_idx, line in enumerate(predictions_arff['data']):
+        values_predict: dict[int, dict[int, dict[int, list[float]]]] = {}
+        values_correct: dict[int, dict[int, dict[int, list[float]]]] = {}
+        for _line_idx, line in enumerate(predictions_arff["data"]):
             rep = line[repeat_idx]
             fold = line[fold_idx]
-            if has_samples:
-                samp = line[sample_idx]
-            else:
-                samp = 0  # No learning curve sample, always 0
-
-            if task.task_type_id in [TaskTypeEnum.SUPERVISED_CLASSIFICATION,
-                                     TaskTypeEnum.LEARNING_CURVE]:
-                prediction = predictions_arff['attributes'][predicted_idx][
-                    1].index(line[predicted_idx])
-                correct = predictions_arff['attributes'][predicted_idx][1]. \
-                    index(line[correct_idx])
-            elif task.task_type_id == TaskTypeEnum.SUPERVISED_REGRESSION:
+            samp = line[sample_idx] if has_samples else 0
+
+            if task.task_type_id in [
+                TaskType.SUPERVISED_CLASSIFICATION,
+                TaskType.LEARNING_CURVE,
+            ]:
+                prediction = predictions_arff["attributes"][predicted_idx][1].index(
+                    line[predicted_idx],
+                )
+                correct = predictions_arff["attributes"][predicted_idx][1].index(line[correct_idx])
+            elif task.task_type_id == TaskType.SUPERVISED_REGRESSION:
                 prediction = line[predicted_idx]
                 correct = line[correct_idx]
             if rep not in values_predict:
@@ -426,176 +591,111 @@ def _attribute_list_to_dict(attribute_list):
             values_correct[rep][fold][samp].append(correct)
 
         scores = []
-        for rep in values_predict.keys():
-            for fold in values_predict[rep].keys():
+        for rep in values_predict:  # noqa: PLC0206
+            for fold in values_predict[rep]:
                 last_sample = len(values_predict[rep][fold]) - 1
                 y_pred = values_predict[rep][fold][last_sample]
                 y_true = values_correct[rep][fold][last_sample]
                 scores.append(sklearn_fn(y_true, y_pred, **kwargs))
         return np.array(scores)
 
-    def publish(self) -> 'OpenMLRun':
-        """ Publish a run (and if necessary, its flow) to the OpenML server.
+    def _parse_publish_response(self, xml_response: dict) -> None:
+        """Parse the id from the xml_response and assign it to self."""
+        self.run_id = int(xml_response["oml:upload_run"]["oml:run_id"])
 
-        Uploads the results of a run to OpenML.
-        If the run is of an unpublished OpenMLFlow, the flow will be uploaded too.
-        Sets the run_id on self.
+    def _get_file_elements(self) -> dict:
+        """Get file_elements to upload to the server.
 
-        Returns
-        -------
-        self : OpenMLRun
+        Derived child classes should override this method as necessary.
+        The description field will be populated automatically if not provided.
         """
-        if self.model is None:
+        if self.parameter_settings is None and self.model is None:
             raise PyOpenMLError(
-                "OpenMLRun obj does not contain a model. "
-                "(This should never happen.) "
+                "OpenMLRun must contain a model or be initialized with parameter_settings.",
             )
         if self.flow_id is None:
             if self.flow is None:
                 raise PyOpenMLError(
                     "OpenMLRun object does not contain a flow id or reference to OpenMLFlow "
-                    "(these should have been added while executing the task). "
+                    "(these should have been added while executing the task). ",
                 )
-            else:
-                # publish the linked Flow before publishing the run.
-                self.flow.publish()
-                self.flow_id = self.flow.flow_id
+
+            # publish the linked Flow before publishing the run.
+            self.flow.publish()
+            self.flow_id = self.flow.flow_id
 
         if self.parameter_settings is None:
             if self.flow is None:
+                if self.flow_id is None:
+                    raise ValueError(
+                        "Run has no associated flow_id and cannot obtain parameter values."
+                    )
                 self.flow = openml.flows.get_flow(self.flow_id)
             self.parameter_settings = self.flow.extension.obtain_parameter_values(
                 self.flow,
                 self.model,
             )
 
-        description_xml = self._create_description_xml()
-        file_elements = {'description': ("description.xml", description_xml)}
+        file_elements = {"description": ("description.xml", self._to_xml())}
 
         if self.error_message is None:
             predictions = arff.dumps(self._generate_arff_dict())
-            file_elements['predictions'] = ("predictions.arff", predictions)
+            file_elements["predictions"] = ("predictions.arff", predictions)
 
         if self.trace is not None:
             trace_arff = arff.dumps(self.trace.trace_to_arff())
-            file_elements['trace'] = ("trace.arff", trace_arff)
-
-        return_value = openml._api_calls._perform_api_call(
-            "/run/", 'post', file_elements=file_elements
-        )
-        result = xmltodict.parse(return_value)
-        self.run_id = int(result['oml:upload_run']['oml:run_id'])
-        return self
-
-    def _create_description_xml(self):
-        """Create xml representation of run for upload.
-
-        Returns
-        -------
-        xml_string : string
-            XML description of run.
-        """
-
-        # as a tag, it must be of the form ([a-zA-Z0-9_\-\.])+
-        # so we format time from 'mm/dd/yy hh:mm:ss' to 'mm-dd-yy_hh.mm.ss'
-        # well_formatted_time = time.strftime("%c").replace(
-        #     ' ', '_').replace('/', '-').replace(':', '.')
-        # tags = run_environment + [well_formatted_time] + ['run_task'] + \
-        #     [self.model.__module__ + "." + self.model.__class__.__name__]
-        description = _to_dict(taskid=self.task_id, flow_id=self.flow_id,
-                               setup_string=self.setup_string,
-                               parameter_settings=self.parameter_settings,
-                               error_message=self.error_message,
-                               fold_evaluations=self.fold_evaluations,
-                               sample_evaluations=self.sample_evaluations,
-                               tags=self.tags)
-        description_xml = xmltodict.unparse(description, pretty=True)
-        return description_xml
-
-    def push_tag(self, tag: str) -> None:
-        """Annotates this run with a tag on the server.
-
-        Parameters
-        ----------
-        tag : str
-            Tag to attach to the run.
-        """
-        _tag_entity('run', self.run_id, tag)
-
-    def remove_tag(self, tag: str) -> None:
-        """Removes a tag from this run on the server.
-
-        Parameters
-        ----------
-        tag : str
-            Tag to attach to the run.
-        """
-        _tag_entity('run', self.run_id, tag, untag=True)
-
-
-###############################################################################
-# Functions which cannot be in runs/functions due to circular imports
-
-def _to_dict(taskid, flow_id, setup_string, error_message, parameter_settings,
-             tags=None, fold_evaluations=None, sample_evaluations=None):
-    """ Creates a dictionary corresponding to the desired xml desired by openML
-
-    Parameters
-    ----------
-    taskid : int
-        the identifier of the task
-    setup_string : string
-        a CLI string which can invoke the learning with the correct parameter
-        settings
-    parameter_settings : array of dicts
-        each dict containing keys name, value and component, one per parameter
-        setting
-    tags : array of strings
-        information that give a description of the run, must conform to
-        regex ``([a-zA-Z0-9_\-\.])+``
-    fold_evaluations : dict mapping from evaluation measure to a dict mapping
-        repeat_nr to a dict mapping from fold nr to a value (double)
-    sample_evaluations : dict mapping from evaluation measure to a dict
-        mapping repeat_nr to a dict mapping from fold nr to a dict mapping to
-        a sample nr to a value (double)
-    sample_evaluations :
-    Returns
-    -------
-    result : an array with version information of the above packages
-    """  # noqa: W605
-    description = OrderedDict()
-    description['oml:run'] = OrderedDict()
-    description['oml:run']['@xmlns:oml'] = 'http://openml.org/openml'
-    description['oml:run']['oml:task_id'] = taskid
-    description['oml:run']['oml:flow_id'] = flow_id
-    if error_message is not None:
-        description['oml:run']['oml:error_message'] = error_message
-    description['oml:run']['oml:parameter_setting'] = parameter_settings
-    if tags is not None:
-        description['oml:run']['oml:tag'] = tags  # Tags describing the run
-    if (fold_evaluations is not None and len(fold_evaluations) > 0) or \
-            (sample_evaluations is not None and len(sample_evaluations) > 0):
-        description['oml:run']['oml:output_data'] = OrderedDict()
-        description['oml:run']['oml:output_data']['oml:evaluation'] = list()
-    if fold_evaluations is not None:
-        for measure in fold_evaluations:
-            for repeat in fold_evaluations[measure]:
-                for fold, value in fold_evaluations[measure][repeat].items():
-                    current = OrderedDict([
-                        ('@repeat', str(repeat)), ('@fold', str(fold)),
-                        ('oml:name', measure), ('oml:value', str(value))])
-                    description['oml:run']['oml:output_data'][
-                        'oml:evaluation'].append(current)
-    if sample_evaluations is not None:
-        for measure in sample_evaluations:
-            for repeat in sample_evaluations[measure]:
-                for fold in sample_evaluations[measure][repeat]:
-                    for sample, value in sample_evaluations[measure][repeat][
-                            fold].items():
-                        current = OrderedDict([
-                            ('@repeat', str(repeat)), ('@fold', str(fold)),
-                            ('@sample', str(sample)), ('oml:name', measure),
-                            ('oml:value', str(value))])
-                        description['oml:run']['oml:output_data'][
-                            'oml:evaluation'].append(current)
-    return description
+            file_elements["trace"] = ("trace.arff", trace_arff)
+        return file_elements
+
+    def _to_dict(self) -> dict[str, dict]:  # noqa: PLR0912, C901
+        """Creates a dictionary representation of self."""
+        description = OrderedDict()  # type: 'OrderedDict'
+        description["oml:run"] = OrderedDict()
+        description["oml:run"]["@xmlns:oml"] = "http://openml.org/openml"
+        description["oml:run"]["oml:task_id"] = self.task_id
+        description["oml:run"]["oml:flow_id"] = self.flow_id
+        if self.setup_string is not None:
+            description["oml:run"]["oml:setup_string"] = self.setup_string
+        if self.error_message is not None:
+            description["oml:run"]["oml:error_message"] = self.error_message
+        if self.run_details is not None:
+            description["oml:run"]["oml:run_details"] = self.run_details
+        description["oml:run"]["oml:parameter_setting"] = self.parameter_settings
+        if self.tags is not None:
+            description["oml:run"]["oml:tag"] = self.tags
+        if (self.fold_evaluations is not None and len(self.fold_evaluations) > 0) or (
+            self.sample_evaluations is not None and len(self.sample_evaluations) > 0
+        ):
+            description["oml:run"]["oml:output_data"] = OrderedDict()
+            description["oml:run"]["oml:output_data"]["oml:evaluation"] = []
+        if self.fold_evaluations is not None:
+            for measure in self.fold_evaluations:
+                for repeat in self.fold_evaluations[measure]:
+                    for fold, value in self.fold_evaluations[measure][repeat].items():
+                        current = OrderedDict(
+                            [
+                                ("@repeat", str(repeat)),
+                                ("@fold", str(fold)),
+                                ("oml:name", measure),
+                                ("oml:value", str(value)),
+                            ],
+                        )
+                        description["oml:run"]["oml:output_data"]["oml:evaluation"].append(current)
+        if self.sample_evaluations is not None:
+            for measure in self.sample_evaluations:
+                for repeat in self.sample_evaluations[measure]:
+                    for fold in self.sample_evaluations[measure][repeat]:
+                        for sample, value in self.sample_evaluations[measure][repeat][fold].items():
+                            current = OrderedDict(
+                                [
+                                    ("@repeat", str(repeat)),
+                                    ("@fold", str(fold)),
+                                    ("@sample", str(sample)),
+                                    ("oml:name", measure),
+                                    ("oml:value", str(value)),
+                                ],
+                            )
+                            description["oml:run"]["oml:output_data"]["oml:evaluation"].append(
+                                current,
+                            )
+        return description
diff --git a/openml/runs/trace.py b/openml/runs/trace.py
index 1786120e8..f76bd04e8 100644
--- a/openml/runs/trace.py
+++ b/openml/runs/trace.py
@@ -1,22 +1,105 @@
-from collections import OrderedDict
+# License: BSD 3-Clause
+from __future__ import annotations
+
 import json
-import os
-from typing import List, Tuple  # noqa F401
+from collections import OrderedDict
+from collections.abc import Iterator
+from dataclasses import dataclass
+from pathlib import Path
+from typing import IO, Any
+from typing_extensions import Self
 
 import arff
 import xmltodict
 
-PREFIX = 'parameter_'
+PREFIX = "parameter_"
 REQUIRED_ATTRIBUTES = [
-    'repeat',
-    'fold',
-    'iteration',
-    'evaluation',
-    'selected',
+    "repeat",
+    "fold",
+    "iteration",
+    "evaluation",
+    "selected",
 ]
 
 
-class OpenMLRunTrace(object):
+@dataclass
+class OpenMLTraceIteration:
+    """
+    OpenML Trace Iteration: parsed output from Run Trace call
+    Exactly one of `setup_string` or `parameters` must be provided.
+
+    Parameters
+    ----------
+    repeat : int
+        repeat number (in case of no repeats: 0)
+
+    fold : int
+        fold number (in case of no folds: 0)
+
+    iteration : int
+        iteration number of optimization procedure
+
+    setup_string : dict[str, str], optional
+        Mapping from prefixed parameter name to its json-encoded value.
+        If not provided, ``parameters`` should be set.
+
+    evaluation : double
+        The evaluation that was awarded to this trace iteration.
+        Measure is defined by the task
+
+    selected : bool
+        Whether this was the best of all iterations, and hence
+        selected for making predictions. Per fold/repeat there
+        should be only one iteration selected
+
+    parameters : dict, optional
+        Dictionary specifying parameter names and their values.
+        If not provided, ``setup_string`` should be set.
+    """
+
+    repeat: int
+    fold: int
+    iteration: int
+
+    evaluation: float
+    selected: bool
+
+    setup_string: dict[str, str] | None = None
+    parameters: dict[str, str | int | float] | None = None
+
+    def __post_init__(self) -> None:
+        # TODO: refactor into one argument of type <str | OrderedDict>
+        if self.setup_string and self.parameters:
+            raise ValueError(
+                "Can only be instantiated with either `setup_string` or `parameters` argument.",
+            )
+
+        if not (self.setup_string or self.parameters):
+            raise ValueError(
+                "Either `setup_string` or `parameters` needs to be passed as argument.",
+            )
+
+        if self.parameters is not None and not isinstance(self.parameters, dict):
+            raise TypeError(
+                f"argument parameters is not an instance of OrderedDict, but"
+                f" {type(self.parameters)!s}",
+            )
+
+    def get_parameters(self) -> dict[str, Any]:
+        """Get the parameters of this trace iteration."""
+        # parameters have prefix 'parameter_'
+        if self.setup_string:
+            return {
+                param[len(PREFIX) :]: json.loads(value)
+                for param, value in self.setup_string.items()
+            }
+
+        if self.parameters is None:
+            raise ValueError("Parameters must be set before calling get_parameters().")
+        return {param[len(PREFIX) :]: value for param, value in self.parameters.items()}
+
+
+class OpenMLRunTrace:
     """OpenML Run Trace: parsed output from Run Trace call
 
     Parameters
@@ -30,7 +113,20 @@ class OpenMLRunTrace(object):
 
     """
 
-    def __init__(self, run_id, trace_iterations):
+    def __init__(
+        self,
+        run_id: int | None,
+        trace_iterations: dict[tuple[int, int, int], OpenMLTraceIteration],
+    ):
+        """Object to hold the trace content of a run.
+
+        Parameters
+        ----------
+        run_id : int | None
+            Id of the run the trace belongs to, or None if no run id is available.
+        trace_iterations : dict[tuple[int, int, int], OpenMLTraceIteration]
+            Trace iterations keyed by (repeat, fold, iteration).
+        """
         self.run_id = run_id
         self.trace_iterations = trace_iterations
 
@@ -47,25 +143,22 @@ def get_selected_iteration(self, fold: int, repeat: int) -> int:
         repeat: int
 
         Returns
-        ----------
+        -------
         int
             The trace iteration from the given fold and repeat that was
             selected as the best iteration by the search procedure
         """
-        for (r, f, i) in self.trace_iterations:
-            if (
-                r == repeat
-                and f == fold
-                and self.trace_iterations[(r, f, i)].selected is True
-            ):
+        for r, f, i in self.trace_iterations:
+            if r == repeat and f == fold and self.trace_iterations[(r, f, i)].selected is True:
                 return i
-        raise ValueError(
-            'Could not find the selected iteration for rep/fold %d/%d' %
-            (repeat, fold)
-        )
+        raise ValueError(f"Could not find the selected iteration for rep/fold {repeat}/{fold}")
 
     @classmethod
-    def generate(cls, attributes, content):
+    def generate(
+        cls,
+        attributes: list[tuple[str, str]],
+        content: list[list[int | float | str]],
+    ) -> OpenMLRunTrace:
         """Generates an OpenMLRunTrace.
 
         Generates the trace object from the attributes and content extracted
@@ -73,7 +166,6 @@ def generate(cls, attributes, content):
 
         Parameters
         ----------
-
         attributes : list
             List of tuples describing the arff attributes.
 
@@ -85,72 +177,73 @@ def generate(cls, attributes, content):
         -------
         OpenMLRunTrace
         """
-
         if content is None:
-            raise ValueError('Trace content not available.')
-        elif attributes is None:
-            raise ValueError('Trace attributes not available.')
-        elif len(content) == 0:
-            raise ValueError('Trace content is empty.')
-        elif len(attributes) != len(content[0]):
+            raise ValueError("Trace content not available.")
+        if attributes is None:
+            raise ValueError("Trace attributes not available.")
+        if len(content) == 0:
+            raise ValueError("Trace content is empty.")
+        if len(attributes) != len(content[0]):
             raise ValueError(
-                'Trace_attributes and trace_content not compatible:'
-                ' %s vs %s' % (attributes, content[0])
+                f"Trace_attributes and trace_content not compatible: {attributes} vs {content[0]}",
             )
 
         return cls._trace_from_arff_struct(
             attributes=attributes,
             content=content,
-            error_message='setup_string not allowed when constructing a '
-                          'trace object from run results.'
+            error_message="setup_string not allowed when constructing a "
+            "trace object from run results.",
         )
 
     @classmethod
-    def _from_filesystem(cls, file_path: str) -> 'OpenMLRunTrace':
+    def _from_filesystem(cls, file_path: str | Path) -> OpenMLRunTrace:
         """
         Logic to deserialize the trace from the filesystem.
 
         Parameters
         ----------
-        file_path: str
+        file_path: str | Path
             File path where the trace arff is stored.
 
         Returns
-        ----------
+        -------
         OpenMLRunTrace
         """
-        if not os.path.isfile(file_path):
-            raise ValueError('Trace file doesn\'t exist')
+        file_path = Path(file_path)
+
+        if not file_path.exists():
+            raise ValueError("Trace file doesn't exist")
 
-        with open(file_path, 'r') as fp:
+        with file_path.open("r") as fp:
             trace_arff = arff.load(fp)
 
-        for trace_idx in range(len(trace_arff['data'])):
+        for trace_idx in range(len(trace_arff["data"])):
             # iterate over first three entrees of a trace row
             # (fold, repeat, trace_iteration) these should be int
             for line_idx in range(3):
-                trace_arff['data'][trace_idx][line_idx] = int(
-                    trace_arff['data'][trace_idx][line_idx]
+                trace_arff["data"][trace_idx][line_idx] = int(
+                    trace_arff["data"][trace_idx][line_idx],
                 )
 
         return cls.trace_from_arff(trace_arff)
 
-    def _to_filesystem(self, file_path):
+    def _to_filesystem(self, file_path: str | Path) -> None:
         """Serialize the trace object to the filesystem.
 
         Serialize the trace object as an arff.
 
         Parameters
         ----------
-        file_path: str
+        file_path: str | Path
             File path where the trace arff will be stored.
         """
+        trace_path = Path(file_path) / "trace.arff"
 
         trace_arff = arff.dumps(self.trace_to_arff())
-        with open(os.path.join(file_path, 'trace.arff'), 'w') as f:
+        with trace_path.open("w") as f:
             f.write(trace_arff)
 
-    def trace_to_arff(self):
+    def trace_to_arff(self) -> dict[str, Any]:
         """Generate the arff dictionary for uploading predictions to the server.
 
         Uses the trace object to generate an arff dictionary representation.
@@ -166,44 +259,45 @@ def trace_to_arff(self):
 
         # attributes that will be in trace arff
         trace_attributes = [
-            ('repeat', 'NUMERIC'),
-            ('fold', 'NUMERIC'),
-            ('iteration', 'NUMERIC'),
-            ('evaluation', 'NUMERIC'),
-            ('selected', ['true', 'false']),
+            ("repeat", "NUMERIC"),
+            ("fold", "NUMERIC"),
+            ("iteration", "NUMERIC"),
+            ("evaluation", "NUMERIC"),
+            ("selected", ["true", "false"]),
         ]
-        trace_attributes.extend([
-            (PREFIX + parameter, 'STRING') for parameter in
-            next(iter(self.trace_iterations.values())).get_parameters()
-        ])
+        trace_attributes.extend(
+            [
+                (PREFIX + parameter, "STRING")
+                for parameter in next(iter(self.trace_iterations.values())).get_parameters()
+            ],
+        )
 
-        arff_dict = OrderedDict()
+        arff_dict: dict[str, Any] = {}
         data = []
         for trace_iteration in self.trace_iterations.values():
             tmp_list = []
-            for attr, _ in trace_attributes:
-                if attr.startswith(PREFIX):
-                    attr = attr[len(PREFIX):]
+            for _attr, _ in trace_attributes:
+                if _attr.startswith(PREFIX):
+                    attr = _attr[len(PREFIX) :]
                     value = trace_iteration.get_parameters()[attr]
                 else:
+                    attr = _attr
                     value = getattr(trace_iteration, attr)
-                if attr == 'selected':
-                    if value:
-                        tmp_list.append('true')
-                    else:
-                        tmp_list.append('false')
+
+                if attr == "selected":
+                    tmp_list.append("true" if value else "false")
                 else:
                     tmp_list.append(value)
             data.append(tmp_list)
 
-        arff_dict['attributes'] = trace_attributes
-        arff_dict['data'] = data
+        arff_dict["attributes"] = trace_attributes
+        arff_dict["data"] = data
         # TODO allow to pass a trace description when running a flow
-        arff_dict['relation'] = "Trace"
+        arff_dict["relation"] = "Trace"
         return arff_dict
 
     @classmethod
-    def trace_from_arff(cls, arff_obj):
+    def trace_from_arff(cls, arff_obj: dict[str, Any]) -> OpenMLRunTrace:
         """Generate trace from arff trace.
 
         Creates a trace file from arff object (for example, generated by a
@@ -218,63 +312,82 @@ def trace_from_arff(cls, arff_obj):
         -------
         OpenMLRunTrace
         """
-        attributes = arff_obj['attributes']
-        content = arff_obj['data']
+        attributes = arff_obj["attributes"]
+        content = arff_obj["data"]
         return cls._trace_from_arff_struct(
             attributes=attributes,
             content=content,
-            error_message='setup_string not supported for arff serialization'
+            error_message="setup_string not supported for arff serialization",
         )
 
     @classmethod
-    def _trace_from_arff_struct(cls, attributes, content, error_message):
+    def _trace_from_arff_struct(
+        cls,
+        attributes: list[tuple[str, str]],
+        content: list[list[int | float | str]],
+        error_message: str,
+    ) -> Self:
+        """Generate a trace object from an ARFF structure.
+
+        Parameters
+        ----------
+        cls : type
+            The trace object to be created.
+        attributes : list[tuple[str, str]]
+            Attribute descriptions.
+        content : list[list[int | float | str]]
+            List of instances.
+        error_message : str
+            Error message to raise if `setup_string` is in `attributes`.
+
+        Returns
+        -------
+        Self
+            A trace object (run id set to None) holding the parsed iterations.
+        """
         trace = OrderedDict()
         attribute_idx = {att[0]: idx for idx, att in enumerate(attributes)}
 
         for required_attribute in REQUIRED_ATTRIBUTES:
             if required_attribute not in attribute_idx:
-                raise ValueError(
-                    'arff misses required attribute: %s' % required_attribute
-                )
-        if 'setup_string' in attribute_idx:
+                raise ValueError(f"arff misses required attribute: {required_attribute}")
+        if "setup_string" in attribute_idx:
             raise ValueError(error_message)
 
         # note that the required attributes can not be duplicated because
         # they are not parameters
         parameter_attributes = []
         for attribute in attribute_idx:
-            if attribute in REQUIRED_ATTRIBUTES:
+            if attribute in REQUIRED_ATTRIBUTES or attribute == "setup_string":
                 continue
-            elif attribute == 'setup_string':
-                continue
-            elif not attribute.startswith(PREFIX):
+
+            if not attribute.startswith(PREFIX):
                 raise ValueError(
-                    'Encountered unknown attribute %s that does not start '
-                    'with prefix %s' % (attribute, PREFIX)
+                    f"Encountered unknown attribute {attribute} that does not start "
+                    f"with prefix {PREFIX}",
                 )
-            else:
-                parameter_attributes.append(attribute)
+
+            parameter_attributes.append(attribute)
 
         for itt in content:
-            repeat = int(itt[attribute_idx['repeat']])
-            fold = int(itt[attribute_idx['fold']])
-            iteration = int(itt[attribute_idx['iteration']])
-            evaluation = float(itt[attribute_idx['evaluation']])
-            selected_value = itt[attribute_idx['selected']]
-            if selected_value == 'true':
+            repeat = int(itt[attribute_idx["repeat"]])
+            fold = int(itt[attribute_idx["fold"]])
+            iteration = int(itt[attribute_idx["iteration"]])
+            evaluation = float(itt[attribute_idx["evaluation"]])
+            selected_value = itt[attribute_idx["selected"]]
+            if selected_value == "true":
                 selected = True
-            elif selected_value == 'false':
+            elif selected_value == "false":
                 selected = False
             else:
                 raise ValueError(
                     'expected {"true", "false"} value for selected field, '
-                    'received: %s' % selected_value
+                    f"received: {selected_value}",
                 )
 
-            parameters = OrderedDict([
-                (attribute, itt[attribute_idx[attribute]])
-                for attribute in parameter_attributes
-            ])
+            parameters = {
+                attribute: itt[attribute_idx[attribute]] for attribute in parameter_attributes
+            }
 
             current = OpenMLTraceIteration(
                 repeat=repeat,
@@ -290,7 +403,7 @@ def _trace_from_arff_struct(cls, attributes, content, error_message):
         return cls(None, trace)
 
     @classmethod
-    def trace_from_xml(cls, xml):
+    def trace_from_xml(cls, xml: str | Path | IO) -> OpenMLRunTrace:
         """Generate trace from xml.
 
         Creates a trace file from the xml description.
@@ -307,72 +420,104 @@ def trace_from_xml(cls, xml):
             Object containing the run id and a dict containing the trace
             iterations.
         """
-        result_dict = xmltodict.parse(
-            xml, force_list=('oml:trace_iteration',)
-        )['oml:trace']
+        if isinstance(xml, Path):
+            xml = str(xml.absolute())
+
+        result_dict = xmltodict.parse(xml, force_list=("oml:trace_iteration",))["oml:trace"]
 
-        run_id = result_dict['oml:run_id']
+        run_id = result_dict["oml:run_id"]
         trace = OrderedDict()
 
-        if 'oml:trace_iteration' not in result_dict:
-            raise ValueError('Run does not contain valid trace. ')
-        if not isinstance(result_dict['oml:trace_iteration'], list):
-            raise TypeError(type(result_dict['oml:trace_iteration']))
-
-        for itt in result_dict['oml:trace_iteration']:
-            repeat = int(itt['oml:repeat'])
-            fold = int(itt['oml:fold'])
-            iteration = int(itt['oml:iteration'])
-            setup_string = json.loads(itt['oml:setup_string'])
-            evaluation = float(itt['oml:evaluation'])
-            selected_value = itt['oml:selected']
-            if selected_value == 'true':
+        if "oml:trace_iteration" not in result_dict:
+            raise ValueError("Run does not contain valid trace. ")
+        if not isinstance(result_dict["oml:trace_iteration"], list):
+            raise TypeError(type(result_dict["oml:trace_iteration"]))
+
+        for itt in result_dict["oml:trace_iteration"]:
+            repeat = int(itt["oml:repeat"])
+            fold = int(itt["oml:fold"])
+            iteration = int(itt["oml:iteration"])
+            setup_string = json.loads(itt["oml:setup_string"])
+            evaluation = float(itt["oml:evaluation"])
+            selected_value = itt["oml:selected"]
+            if selected_value == "true":
                 selected = True
-            elif selected_value == 'false':
+            elif selected_value == "false":
                 selected = False
             else:
                 raise ValueError(
                     'expected {"true", "false"} value for '
-                    'selected field, received: %s' % selected_value
+                    f"selected field, received: {selected_value}",
                 )
 
             current = OpenMLTraceIteration(
-                repeat,
-                fold,
-                iteration,
-                setup_string,
-                evaluation,
-                selected,
+                repeat=repeat,
+                fold=fold,
+                iteration=iteration,
+                setup_string=setup_string,
+                evaluation=evaluation,
+                selected=selected,
             )
             trace[(repeat, fold, iteration)] = current
 
         return cls(run_id, trace)
 
     @classmethod
-    def merge_traces(cls, traces: List['OpenMLRunTrace']) -> 'OpenMLRunTrace':
+    def merge_traces(cls, traces: list[OpenMLRunTrace]) -> OpenMLRunTrace:
+        """Merge multiple traces into a single trace.
 
-        merged_trace = OrderedDict()  # type: OrderedDict[Tuple[int, int, int], OpenMLTraceIteration]  # noqa E501
+        Parameters
+        ----------
+        cls : type
+            Type of the trace object to be created.
+        traces : List[OpenMLRunTrace]
+            List of traces to merge.
+
+        Returns
+        -------
+        OpenMLRunTrace
+            A trace object representing the merged traces.
+
+        Raises
+        ------
+        ValueError
+            If the parameters in the iterations of the traces being merged are not equal.
+            If a key (repeat, fold, iteration) is encountered twice while merging the traces.
+        """
+        merged_trace: dict[tuple[int, int, int], OpenMLTraceIteration] = {}
 
         previous_iteration = None
         for trace in traces:
             for iteration in trace:
                 key = (iteration.repeat, iteration.fold, iteration.iteration)
+
+                if iteration.parameters is None:
+                    raise ValueError(
+                        f"Iteration parameters cannot be None for repeat {iteration.repeat}, "
+                        f"fold {iteration.fold}, iteration {iteration.iteration}"
+                    )
+                param_keys = iteration.parameters.keys()
+
                 if previous_iteration is not None:
-                    if (
-                        list(merged_trace[previous_iteration].parameters.keys())
-                        != list(iteration.parameters.keys())
-                    ):
+                    trace_itr = merged_trace[previous_iteration]
+
+                    if trace_itr.parameters is None:
                         raise ValueError(
-                            'Cannot merge traces because the parameters are not equal: {} vs {}'.
-                            format(
-                                list(merged_trace[previous_iteration].parameters.keys()),
-                                list(iteration.parameters.keys()),
-                            )
+                            f"Trace iteration parameters cannot be None "
+                            f"for iteration {previous_iteration}"
+                        )
+                    trace_itr_keys = trace_itr.parameters.keys()
+
+                    if list(param_keys) != list(trace_itr_keys):
+                        raise ValueError(
+                            "Cannot merge traces because the parameters are not equal: "
+                            f"{list(trace_itr.parameters.keys())} vs "
+                            f"{list(iteration.parameters.keys())}",
                         )
 
                 if key in merged_trace:
                     raise ValueError(
-                        "Cannot merge traces because key '{}' was encountered twice".format(key)
+                        f"Cannot merge traces because key '{key}' was encountered twice",
                     )
 
                 merged_trace[key] = iteration
@@ -380,105 +525,11 @@ def merge_traces(cls, traces: List['OpenMLRunTrace']) -> 'OpenMLRunTrace':
 
         return cls(None, merged_trace)
 
-    def __repr__(self):
-        return '[Run id: %d, %d trace iterations]'.format(
-            -1 if self.run_id is None else self.run_id,
-            len(self.trace_iterations),
+    def __repr__(self) -> str:
+        return (
+            f"[Run id: {-1 if self.run_id is None else self.run_id}, "
+            f"{len(self.trace_iterations)} trace iterations]"
         )
 
-    def __iter__(self):
-        for val in self.trace_iterations.values():
-            yield val
-
-
-class OpenMLTraceIteration(object):
-    """OpenML Trace Iteration: parsed output from Run Trace call
-
-    Parameters
-    ----------
-    repeat : int
-        repeat number (in case of no repeats: 0)
-
-    fold : int
-        fold number (in case of no folds: 0)
-
-    iteration : int
-        iteration number of optimization procedure
-
-    setup_string : str
-        json string representing the parameters
-
-    evaluation : double
-        The evaluation that was awarded to this trace iteration.
-        Measure is defined by the task
-
-    selected : bool
-        Whether this was the best of all iterations, and hence
-        selected for making predictions. Per fold/repeat there
-        should be only one iteration selected
-
-    parameters : OrderedDict
-    """
-
-    def __init__(
-        self,
-        repeat,
-        fold,
-        iteration,
-        setup_string,
-        evaluation,
-        selected,
-        parameters=None,
-    ):
-
-        if not isinstance(selected, bool):
-            raise TypeError(type(selected))
-        if setup_string and parameters:
-            raise ValueError(
-                'Can only be instantiated with either '
-                'setup_string or parameters argument.'
-            )
-        elif not setup_string and not parameters:
-            raise ValueError(
-                'Either setup_string or parameters needs to be passed as '
-                'argument.'
-            )
-        if parameters is not None and not isinstance(parameters, OrderedDict):
-            raise TypeError(
-                'argument parameters is not an instance of OrderedDict, but %s'
-                % str(type(parameters))
-            )
-
-        self.repeat = repeat
-        self.fold = fold
-        self.iteration = iteration
-        self.setup_string = setup_string
-        self.evaluation = evaluation
-        self.selected = selected
-        self.parameters = parameters
-
-    def get_parameters(self):
-        result = {}
-        # parameters have prefix 'parameter_'
-
-        if self.setup_string:
-            for param in self.setup_string:
-                key = param[len(PREFIX):]
-                value = self.setup_string[param]
-                result[key] = json.loads(value)
-        else:
-            for param, value in self.parameters.items():
-                result[param[len(PREFIX):]] = value
-        return result
-
-    def __repr__(self):
-        """
-        tmp string representation, will be changed in the near future
-        """
-        return '[(%d,%d,%d): %f (%r)]' % (
-            self.repeat,
-            self.fold,
-            self.iteration,
-            self.evaluation,
-            self.selected,
-        )
+    def __iter__(self) -> Iterator[OpenMLTraceIteration]:
+        yield from self.trace_iterations.values()
diff --git a/openml/setups/__init__.py b/openml/setups/__init__.py
index a8b4a8863..fa4072059 100644
--- a/openml/setups/__init__.py
+++ b/openml/setups/__init__.py
@@ -1,5 +1,13 @@
-from .setup import OpenMLSetup, OpenMLParameter
-from .functions import get_setup, list_setups, setup_exists, initialize_model
+# License: BSD 3-Clause
 
-__all__ = ['OpenMLSetup', 'OpenMLParameter', 'get_setup', 'list_setups',
-           'setup_exists', 'initialize_model']
+from .functions import get_setup, initialize_model, list_setups, setup_exists
+from .setup import OpenMLParameter, OpenMLSetup
+
+__all__ = [
+    "OpenMLParameter",
+    "OpenMLSetup",
+    "get_setup",
+    "initialize_model",
+    "list_setups",
+    "setup_exists",
+]
diff --git a/openml/setups/functions.py b/openml/setups/functions.py
index 97c001b24..a24d3a456 100644
--- a/openml/setups/functions.py
+++ b/openml/setups/functions.py
@@ -1,26 +1,31 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
 from collections import OrderedDict
-import io
-import os
-from typing import Any, Union, List, Dict, Optional
+from collections.abc import Iterable
+from functools import partial
+from itertools import chain
+from pathlib import Path
+from typing import Any, Literal
 
-import xmltodict
 import pandas as pd
+import xmltodict
 
 import openml
-from .. import config
-from .setup import OpenMLSetup, OpenMLParameter
-from openml.flows import flow_exists
 import openml.exceptions
 import openml.utils
+from openml.flows import OpenMLFlow, flow_exists
 
+from .setup import OpenMLParameter, OpenMLSetup
 
-def setup_exists(flow) -> int:
+
+def setup_exists(flow: OpenMLFlow) -> int:
     """
     Checks whether a hyperparameter configuration already exists on the server.
 
     Parameters
     ----------
-    flow : flow
+    flow : OpenMLFlow
         The openml flow object. Should have flow id present for the main flow
         and all subflows (i.e., it should be downloaded from the server by
         means of flow.get, and not instantiated locally)
@@ -33,48 +38,66 @@ def setup_exists(flow) -> int:
     # sadly, this api call relies on a run object
     openml.flows.functions._check_flow_for_server_id(flow)
     if flow.model is None:
-        raise ValueError('Flow should have model field set with the actual model.')
+        raise ValueError("Flow should have model field set with the actual model.")
     if flow.extension is None:
-        raise ValueError('Flow should have model field set with the correct extension.')
+        raise ValueError("Flow should have model field set with the correct extension.")
 
     # checks whether the flow exists on the server and flow ids align
     exists = flow_exists(flow.name, flow.external_version)
     if exists != flow.flow_id:
-        raise ValueError('This should not happen!')
+        raise ValueError(
+            f"Local flow id ({flow.id}) differs from server id ({exists}). "
+            "If this issue persists, please contact the developers.",
+        )
 
     openml_param_settings = flow.extension.obtain_parameter_values(flow)
-    description = xmltodict.unparse(_to_dict(flow.flow_id,
-                                             openml_param_settings),
-                                    pretty=True)
-    file_elements = {'description': ('description.arff', description)}
-    result = openml._api_calls._perform_api_call('/setup/exists/',
-                                                 'post',
-                                                 file_elements=file_elements)
+    description = xmltodict.unparse(_to_dict(flow.flow_id, openml_param_settings), pretty=True)
+    file_elements = {
+        "description": ("description.arff", description),
+    }  # type: openml._api_calls.FILE_ELEMENTS_TYPE
+    result = openml._api_calls._perform_api_call(
+        "/setup/exists/",
+        "post",
+        file_elements=file_elements,
+    )
     result_dict = xmltodict.parse(result)
-    setup_id = int(result_dict['oml:setup_exists']['oml:id'])
-    if setup_id > 0:
-        return setup_id
-    else:
-        return False
+    setup_id = int(result_dict["oml:setup_exists"]["oml:id"])
+    return setup_id if setup_id > 0 else False
+
 
+def _get_cached_setup(setup_id: int) -> OpenMLSetup:
+    """Load a setup from the cache.
 
-def _get_cached_setup(setup_id):
-    """Load a run from the cache."""
-    cache_dir = config.get_cache_directory()
-    setup_cache_dir = os.path.join(cache_dir, "setups", str(setup_id))
+    Parameters
+    ----------
+    setup_id : int
+        ID of the setup to be loaded.
+
+    Returns
+    -------
+    OpenMLSetup
+        The loaded setup object.
+
+    Raises
+    ------
+    OpenMLCacheException
+        If the setup file for the given setup ID is not cached.
+    """
+    cache_dir = Path(openml.config.get_cache_directory())
+    setup_cache_dir = cache_dir / "setups" / str(setup_id)
     try:
-        setup_file = os.path.join(setup_cache_dir, "description.xml")
-        with io.open(setup_file, encoding='utf8') as fh:
+        setup_file = setup_cache_dir / "description.xml"
+        with setup_file.open(encoding="utf8") as fh:
             setup_xml = xmltodict.parse(fh.read())
-            setup = _create_setup_from_xml(setup_xml, output_format='object')
-        return setup
+            return _create_setup_from_xml(setup_xml)
 
-    except (OSError, IOError):
+    except OSError as e:
         raise openml.exceptions.OpenMLCacheException(
-            "Setup file for setup id %d not cached" % setup_id)
+            f"Setup file for setup id {setup_id} not cached",
+        ) from e
 
 
-def get_setup(setup_id):
+def get_setup(setup_id: int) -> OpenMLSetup:
     """
      Downloads the setup (configuration) description from OpenML
      and returns a structured object
@@ -86,36 +109,33 @@ def get_setup(setup_id):
 
     Returns
     -------
-    dict or OpenMLSetup(an initialized openml setup object)
+    OpenMLSetup (an initialized openml setup object)
     """
-    setup_dir = os.path.join(config.get_cache_directory(),
-                             "setups",
-                             str(setup_id))
-    setup_file = os.path.join(setup_dir, "description.xml")
+    setup_dir = Path(openml.config.get_cache_directory()) / "setups" / str(setup_id)
+    setup_dir.mkdir(exist_ok=True, parents=True)
 
-    if not os.path.exists(setup_dir):
-        os.makedirs(setup_dir)
+    setup_file = setup_dir / "description.xml"
 
     try:
         return _get_cached_setup(setup_id)
-    except (openml.exceptions.OpenMLCacheException):
-        url_suffix = '/setup/%d' % setup_id
-        setup_xml = openml._api_calls._perform_api_call(url_suffix, 'get')
-        with io.open(setup_file, "w", encoding='utf8') as fh:
+    except openml.exceptions.OpenMLCacheException:
+        url_suffix = f"/setup/{setup_id}"
+        setup_xml = openml._api_calls._perform_api_call(url_suffix, "get")
+        with setup_file.open("w", encoding="utf8") as fh:
             fh.write(setup_xml)
 
     result_dict = xmltodict.parse(setup_xml)
-    return _create_setup_from_xml(result_dict, output_format='object')
+    return _create_setup_from_xml(result_dict)
 
 
-def list_setups(
-    offset: Optional[int] = None,
-    size: Optional[int] = None,
-    flow: Optional[int] = None,
-    tag: Optional[str] = None,
-    setup: Optional[List] = None,
-    output_format: str = 'object'
-) -> Union[Dict, pd.DataFrame]:
+def list_setups(  # noqa: PLR0913
+    offset: int | None = None,
+    size: int | None = None,
+    flow: int | None = None,
+    tag: str | None = None,
+    setup: Iterable[int] | None = None,
+    output_format: Literal["object", "dataframe"] = "object",
+) -> dict[int, OpenMLSetup] | pd.DataFrame:
     """
     List all setups matching all of the given filters.
 
@@ -125,105 +145,108 @@ def list_setups(
     size : int, optional
     flow : int, optional
     tag : str, optional
-    setup : list(int), optional
+    setup : Iterable[int], optional
     output_format: str, optional (default='object')
         The parameter decides the format of the output.
-        - If 'object' the output is a dict of OpenMLSetup objects
-        - If 'dict' the output is a dict of dict
         - If 'dataframe' the output is a pandas DataFrame
+        - If 'object' the output is a dictionary of OpenMLSetup objects
 
     Returns
     -------
     dict or dataframe
     """
-    if output_format not in ['dataframe', 'dict', 'object']:
-        raise ValueError("Invalid output format selected. "
-                         "Only 'dict', 'object', or 'dataframe' applicable.")
-
-    batch_size = 1000  # batch size for setups is lower
-    return openml.utils._list_all(output_format=output_format,
-                                  listing_call=_list_setups,
-                                  offset=offset,
-                                  size=size,
-                                  flow=flow,
-                                  tag=tag,
-                                  setup=setup,
-                                  batch_size=batch_size)
-
-
-def _list_setups(setup=None, output_format='object', **kwargs):
-    """
-    Perform API call `/setup/list/{filters}`
+    if output_format not in ["dataframe", "object"]:
+        raise ValueError(
+            "Invalid output format selected. Only 'object', or 'dataframe' applicable.",
+        )
+
+    listing_call = partial(_list_setups, flow=flow, tag=tag, setup=setup)
+    batches = openml.utils._list_all(
+        listing_call,
+        batch_size=1_000,  # batch size for setups is lower
+        offset=offset,
+        limit=size,
+    )
+    flattened = list(chain.from_iterable(batches))
+    if output_format == "object":
+        return {setup.setup_id: setup for setup in flattened}
+
+    records = [setup._to_dict() for setup in flattened]
+    return pd.DataFrame.from_records(records, index="setup_id")
+
+
+def _list_setups(
+    limit: int,
+    offset: int,
+    *,
+    setup: Iterable[int] | None = None,
+    flow: int | None = None,
+    tag: str | None = None,
+) -> list[OpenMLSetup]:
+    """Perform API call `/setup/list/{filters}`
 
     Parameters
     ----------
     The setup argument that is a list is separated from the single value
     filters which are put into the kwargs.
 
+    limit : int
+    offset : int
     setup : list(int), optional
-
-    output_format: str, optional (default='dict')
-        The parameter decides the format of the output.
-        - If 'dict' the output is a dict of dict
-        - If 'dataframe' the output is a pandas DataFrame
-
-    kwargs: dict, optional
-        Legal filter operators: flow, setup, limit, offset, tag.
+    flow : int, optional
+    tag : str, optional
 
     Returns
     -------
-    dict or dataframe
-        """
-
+    The setups that match the filters, as a list of OpenMLSetup objects.
+    """
     api_call = "setup/list"
+    if limit is not None:
+        api_call += f"/limit/{limit}"
+    if offset is not None:
+        api_call += f"/offset/{offset}"
     if setup is not None:
-        api_call += "/setup/%s" % ','.join([str(int(i)) for i in setup])
-    if kwargs is not None:
-        for operator, value in kwargs.items():
-            api_call += "/%s/%s" % (operator, value)
+        api_call += f"/setup/{','.join([str(int(i)) for i in setup])}"
+    if flow is not None:
+        api_call += f"/flow/{flow}"
+    if tag is not None:
+        api_call += f"/tag/{tag}"
 
-    return __list_setups(api_call=api_call, output_format=output_format)
+    return __list_setups(api_call=api_call)
 
 
-def __list_setups(api_call, output_format='object'):
+def __list_setups(api_call: str) -> list[OpenMLSetup]:
     """Helper function to parse API calls which are lists of setups"""
-    xml_string = openml._api_calls._perform_api_call(api_call, 'get')
-    setups_dict = xmltodict.parse(xml_string, force_list=('oml:setup',))
-    openml_uri = 'http://openml.org/openml'
+    xml_string = openml._api_calls._perform_api_call(api_call, "get")
+    setups_dict = xmltodict.parse(xml_string, force_list=("oml:setup",))
+    openml_uri = "http://openml.org/openml"
     # Minimalistic check if the XML is useful
-    if 'oml:setups' not in setups_dict:
-        raise ValueError('Error in return XML, does not contain "oml:setups":'
-                         ' %s' % str(setups_dict))
-    elif '@xmlns:oml' not in setups_dict['oml:setups']:
-        raise ValueError('Error in return XML, does not contain '
-                         '"oml:setups"/@xmlns:oml: %s'
-                         % str(setups_dict))
-    elif setups_dict['oml:setups']['@xmlns:oml'] != openml_uri:
-        raise ValueError('Error in return XML, value of  '
-                         '"oml:seyups"/@xmlns:oml is not '
-                         '"%s": %s'
-                         % (openml_uri, str(setups_dict)))
-
-    assert type(setups_dict['oml:setups']['oml:setup']) == list, \
-        type(setups_dict['oml:setups'])
-
-    setups = dict()
-    for setup_ in setups_dict['oml:setups']['oml:setup']:
-        # making it a dict to give it the right format
-        current = _create_setup_from_xml({'oml:setup_parameters': setup_},
-                                         output_format=output_format)
-        if output_format == 'object':
-            setups[current.setup_id] = current
-        else:
-            setups[current['setup_id']] = current
-
-    if output_format == 'dataframe':
-        setups = pd.DataFrame.from_dict(setups, orient='index')
-
-    return setups
-
-
-def initialize_model(setup_id: int) -> Any:
+    if "oml:setups" not in setups_dict:
+        raise ValueError(
+            f'Error in return XML, does not contain "oml:setups": {setups_dict!s}',
+        )
+
+    if "@xmlns:oml" not in setups_dict["oml:setups"]:
+        raise ValueError(
+            f'Error in return XML, does not contain "oml:setups"/@xmlns:oml: {setups_dict!s}',
+        )
+
+    if setups_dict["oml:setups"]["@xmlns:oml"] != openml_uri:
+        raise ValueError(
+            "Error in return XML, value of  "
+            '"oml:seyups"/@xmlns:oml is not '
+            f'"{openml_uri}": {setups_dict!s}',
+        )
+
+    assert isinstance(setups_dict["oml:setups"]["oml:setup"], list), type(setups_dict["oml:setups"])
+
+    return [
+        _create_setup_from_xml({"oml:setup_parameters": setup_})
+        for setup_ in setups_dict["oml:setups"]["oml:setup"]
+    ]
+
+
+def initialize_model(setup_id: int, *, strict_version: bool = True) -> Any:
     """
     Initialized a model based on a setup_id (i.e., using the exact
     same parameter settings)
@@ -232,6 +255,8 @@ def initialize_model(setup_id: int) -> Any:
     ----------
     setup_id : int
         The Openml setup_id
+    strict_version: bool (default=True)
+        See `flow_to_model` strict_version.
 
     Returns
     -------
@@ -243,79 +268,79 @@ def initialize_model(setup_id: int) -> Any:
     # instead of using scikit-learns or any other library's "set_params" function, we override the
     # OpenMLFlow objects default parameter value so we can utilize the
     # Extension.flow_to_model() function to reinitialize the flow with the set defaults.
-    for hyperparameter in setup.parameters.values():
-        structure = flow.get_structure('flow_id')
-        if len(structure[hyperparameter.flow_id]) > 0:
-            subflow = flow.get_subflow(structure[hyperparameter.flow_id])
-        else:
-            subflow = flow
-        subflow.parameters[hyperparameter.parameter_name] = \
-            hyperparameter.value
+    if setup.parameters is not None:
+        for hyperparameter in setup.parameters.values():
+            structure = flow.get_structure("flow_id")
+            if len(structure[hyperparameter.flow_id]) > 0:
+                subflow = flow.get_subflow(structure[hyperparameter.flow_id])
+            else:
+                subflow = flow
+            subflow.parameters[hyperparameter.parameter_name] = hyperparameter.value
+
+    return flow.extension.flow_to_model(flow, strict_version=strict_version)
 
-    model = flow.extension.flow_to_model(flow)
-    return model
 
+def _to_dict(flow_id: int, openml_parameter_settings: list[dict[str, Any]]) -> OrderedDict:
+    """Convert a flow ID and a list of OpenML parameter settings to
+    a dictionary representation that can be serialized to XML.
 
-def _to_dict(flow_id, openml_parameter_settings):
+    Parameters
+    ----------
+    flow_id : int
+        ID of the flow.
+    openml_parameter_settings : list[dict[str, Any]]
+        A list of OpenML parameter settings.
+
+    Returns
+    -------
+    OrderedDict
+        A dictionary representation of the flow ID and parameter settings.
+    """
     # for convenience, this function (ab)uses the run object.
-    xml = OrderedDict()
-    xml['oml:run'] = OrderedDict()
-    xml['oml:run']['@xmlns:oml'] = 'http://openml.org/openml'
-    xml['oml:run']['oml:flow_id'] = flow_id
-    xml['oml:run']['oml:parameter_setting'] = openml_parameter_settings
+    xml: OrderedDict = OrderedDict()
+    xml["oml:run"] = OrderedDict()
+    xml["oml:run"]["@xmlns:oml"] = "http://openml.org/openml"
+    xml["oml:run"]["oml:flow_id"] = flow_id
+    xml["oml:run"]["oml:parameter_setting"] = openml_parameter_settings
 
     return xml
 
 
-def _create_setup_from_xml(result_dict, output_format='object'):
-    """
-    Turns an API xml result into a OpenMLSetup object (or dict)
-    """
-    setup_id = int(result_dict['oml:setup_parameters']['oml:setup_id'])
-    flow_id = int(result_dict['oml:setup_parameters']['oml:flow_id'])
-    parameters = {}
-    if 'oml:parameter' not in result_dict['oml:setup_parameters']:
-        parameters = None
+def _create_setup_from_xml(result_dict: dict) -> OpenMLSetup:
+    """Turns an API xml result into an OpenMLSetup object."""
+    setup_id = int(result_dict["oml:setup_parameters"]["oml:setup_id"])
+    flow_id = int(result_dict["oml:setup_parameters"]["oml:flow_id"])
+
+    if "oml:parameter" not in result_dict["oml:setup_parameters"]:
+        return OpenMLSetup(setup_id, flow_id, parameters=None)
+
+    xml_parameters = result_dict["oml:setup_parameters"]["oml:parameter"]
+    if isinstance(xml_parameters, dict):
+        parameters = {
+            int(xml_parameters["oml:id"]): _create_setup_parameter_from_xml(xml_parameters),
+        }
+    elif isinstance(xml_parameters, list):
+        parameters = {
+            int(xml_parameter["oml:id"]): _create_setup_parameter_from_xml(xml_parameter)
+            for xml_parameter in xml_parameters
+        }
     else:
-        # basically all others
-        xml_parameters = result_dict['oml:setup_parameters']['oml:parameter']
-        if isinstance(xml_parameters, dict):
-            id = int(xml_parameters['oml:id'])
-            parameters[id] = _create_setup_parameter_from_xml(result_dict=xml_parameters,
-                                                              output_format=output_format)
-        elif isinstance(xml_parameters, list):
-            for xml_parameter in xml_parameters:
-                id = int(xml_parameter['oml:id'])
-                parameters[id] = \
-                    _create_setup_parameter_from_xml(result_dict=xml_parameter,
-                                                     output_format=output_format)
-        else:
-            raise ValueError('Expected None, list or dict, received '
-                             'something else: %s' % str(type(xml_parameters)))
-
-    if output_format in ['dataframe', 'dict']:
-        return_dict = {'setup_id': setup_id, 'flow_id': flow_id}
-        return_dict['parameters'] = parameters
-        return(return_dict)
+        raise ValueError(
+            f"Expected None, list or dict, received something else: {type(xml_parameters)!s}",
+        )
+
     return OpenMLSetup(setup_id, flow_id, parameters)
 
 
-def _create_setup_parameter_from_xml(result_dict, output_format='object'):
-    if output_format == 'object':
-        return OpenMLParameter(input_id=int(result_dict['oml:id']),
-                               flow_id=int(result_dict['oml:flow_id']),
-                               flow_name=result_dict['oml:flow_name'],
-                               full_name=result_dict['oml:full_name'],
-                               parameter_name=result_dict['oml:parameter_name'],
-                               data_type=result_dict['oml:data_type'],
-                               default_value=result_dict['oml:default_value'],
-                               value=result_dict['oml:value'])
-    else:
-        return({'input_id': int(result_dict['oml:id']),
-                'flow_id': int(result_dict['oml:flow_id']),
-                'flow_name': result_dict['oml:flow_name'],
-                'full_name': result_dict['oml:full_name'],
-                'parameter_name': result_dict['oml:parameter_name'],
-                'data_type': result_dict['oml:data_type'],
-                'default_value': result_dict['oml:default_value'],
-                'value': result_dict['oml:value']})
+def _create_setup_parameter_from_xml(result_dict: dict[str, str]) -> OpenMLParameter:
+    """Create an OpenMLParameter object from an API xml result."""
+    return OpenMLParameter(
+        input_id=int(result_dict["oml:id"]),
+        flow_id=int(result_dict["oml:flow_id"]),
+        flow_name=result_dict["oml:flow_name"],
+        full_name=result_dict["oml:full_name"],
+        parameter_name=result_dict["oml:parameter_name"],
+        data_type=result_dict["oml:data_type"],
+        default_value=result_dict["oml:default_value"],
+        value=result_dict["oml:value"],
+    )
diff --git a/openml/setups/setup.py b/openml/setups/setup.py
index aee1aa0bf..0c3a3cb6b 100644
--- a/openml/setups/setup.py
+++ b/openml/setups/setup.py
@@ -1,7 +1,14 @@
-import openml.config
+# License: BSD 3-Clause
+from __future__ import annotations
 
+from dataclasses import asdict, dataclass
+from typing import Any
 
-class OpenMLSetup(object):
+import openml.flows
+
+
+@dataclass
+class OpenMLSetup:
     """Setup object (a.k.a. Configuration).
 
     Parameters
@@ -14,40 +21,54 @@ class OpenMLSetup(object):
         The setting of the parameters
     """
 
-    def __init__(self, setup_id, flow_id, parameters):
-        if not isinstance(setup_id, int):
-            raise ValueError('setup id should be int')
-        if not isinstance(flow_id, int):
-            raise ValueError('flow id should be int')
-        if parameters is not None:
-            if not isinstance(parameters, dict):
-                raise ValueError('parameters should be dict')
+    setup_id: int
+    flow_id: int
+    parameters: dict[int, Any] | None
+
+    def __post_init__(self) -> None:
+        if not isinstance(self.setup_id, int):
+            raise ValueError("setup id should be int")
+
+        if not isinstance(self.flow_id, int):
+            raise ValueError("flow id should be int")
 
-        self.setup_id = setup_id
-        self.flow_id = flow_id
-        self.parameters = parameters
+        if self.parameters is not None and not isinstance(self.parameters, dict):
+            raise ValueError("parameters should be dict")
 
-    def __repr__(self):
+    def _to_dict(self) -> dict[str, Any]:
+        return {
+            "setup_id": self.setup_id,
+            "flow_id": self.flow_id,
+            "parameters": {p.id: p._to_dict() for p in self.parameters.values()}
+            if self.parameters is not None
+            else None,
+        }
+
+    def __repr__(self) -> str:
         header = "OpenML Setup"
-        header = '{}\n{}\n'.format(header, '=' * len(header))
+        header = f"{header}\n{'=' * len(header)}\n"
 
-        base_url = "{}".format(openml.config.server[:-len('api/v1/xml')])
-        fields = {"Setup ID": self.setup_id,
-                  "Flow ID": self.flow_id,
-                  "Flow URL": "{}f/{}".format(base_url, self.flow_id),
-                  "# of Parameters": len(self.parameters)}
+        fields = {
+            "Setup ID": self.setup_id,
+            "Flow ID": self.flow_id,
+            "Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id),
+            "# of Parameters": (
+                len(self.parameters) if self.parameters is not None else float("nan")
+            ),
+        }
 
         # determines the order in which the information will be printed
         order = ["Setup ID", "Flow ID", "Flow URL", "# of Parameters"]
-        fields = [(key, fields[key]) for key in order if key in fields]
+        _fields = [(key, fields[key]) for key in order if key in fields]
 
-        longest_field_name_length = max(len(name) for name, value in fields)
-        field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length)
-        body = '\n'.join(field_line_format.format(name, value) for name, value in fields)
+        longest_field_name_length = max(len(name) for name, _ in _fields)
+        field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}"
+        body = "\n".join(field_line_format.format(name, value) for name, value in _fields)
         return header + body
 
 
-class OpenMLParameter(object):
+@dataclass
+class OpenMLParameter:
     """Parameter object (used in setup).
 
     Parameters
@@ -71,44 +92,62 @@ class OpenMLParameter(object):
     value : str
         If the parameter was set, the value that it was set to.
     """
-    def __init__(self, input_id, flow_id, flow_name, full_name, parameter_name,
-                 data_type, default_value, value):
-        self.id = input_id
-        self.flow_id = flow_id
-        self.flow_name = flow_name
-        self.full_name = full_name
-        self.parameter_name = parameter_name
-        self.data_type = data_type
-        self.default_value = default_value
-        self.value = value
-
-    def __repr__(self):
+
+    input_id: int
+    flow_id: int
+    flow_name: str
+    full_name: str
+    parameter_name: str
+    data_type: str
+    default_value: str
+    value: str
+
+    def __post_init__(self) -> None:
+        # Map input_id to id for backward compatibility
+        self.id = self.input_id
+
+    def _to_dict(self) -> dict[str, Any]:
+        result = asdict(self)
+        # Replaces input_id with id for backward compatibility
+        result["id"] = result.pop("input_id")
+        return result
+
+    def __repr__(self) -> str:
         header = "OpenML Parameter"
-        header = '{}\n{}\n'.format(header, '=' * len(header))
-
-        base_url = "{}".format(openml.config.server[:-len('api/v1/xml')])
-        fields = {"ID": self.id,
-                  "Flow ID": self.flow_id,
-                  # "Flow Name": self.flow_name,
-                  "Flow Name": self.full_name,
-                  "Flow URL": "{}f/{}".format(base_url, self.flow_id),
-                  "Parameter Name": self.parameter_name}
+        header = f"{header}\n{'=' * len(header)}\n"
+
+        fields = {
+            "ID": self.id,
+            "Flow ID": self.flow_id,
+            # "Flow Name": self.flow_name,
+            "Flow Name": self.full_name,
+            "Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id),
+            "Parameter Name": self.parameter_name,
+        }
         # indented prints for parameter attributes
         # indention = 2 spaces + 1 | + 2 underscores
-        indent = "{}|{}".format(" " * 2, "_" * 2)
-        parameter_data_type = "{}Data Type".format(indent)
+        indent = f"{' ' * 2}|{'_' * 2}"
+        parameter_data_type = f"{indent}Data Type"
         fields[parameter_data_type] = self.data_type
-        parameter_default = "{}Default".format(indent)
+        parameter_default = f"{indent}Default"
         fields[parameter_default] = self.default_value
-        parameter_value = "{}Value".format(indent)
+        parameter_value = f"{indent}Value"
         fields[parameter_value] = self.value
 
         # determines the order in which the information will be printed
-        order = ["ID", "Flow ID", "Flow Name", "Flow URL", "Parameter Name",
-                 parameter_data_type, parameter_default, parameter_value]
-        fields = [(key, fields[key]) for key in order if key in fields]
-
-        longest_field_name_length = max(len(name) for name, value in fields)
-        field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length)
-        body = '\n'.join(field_line_format.format(name, value) for name, value in fields)
+        order = [
+            "ID",
+            "Flow ID",
+            "Flow Name",
+            "Flow URL",
+            "Parameter Name",
+            parameter_data_type,
+            parameter_default,
+            parameter_value,
+        ]
+        _fields = [(key, fields[key]) for key in order if key in fields]
+
+        longest_field_name_length = max(len(name) for name, _ in _fields)
+        field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}"
+        body = "\n".join(field_line_format.format(name, value) for name, value in _fields)
         return header + body
diff --git a/openml/study/__init__.py b/openml/study/__init__.py
index 02b37d514..37a6d376a 100644
--- a/openml/study/__init__.py
+++ b/openml/study/__init__.py
@@ -1,37 +1,38 @@
-from .study import OpenMLStudy, OpenMLBenchmarkSuite
+# License: BSD 3-Clause
+
 from .functions import (
-    get_study,
-    get_suite,
-    create_study,
-    create_benchmark_suite,
-    update_study_status,
-    update_suite_status,
     attach_to_study,
     attach_to_suite,
-    detach_from_study,
-    detach_from_suite,
+    create_benchmark_suite,
+    create_study,
     delete_study,
     delete_suite,
+    detach_from_study,
+    detach_from_suite,
+    get_study,
+    get_suite,
     list_studies,
     list_suites,
+    update_study_status,
+    update_suite_status,
 )
-
+from .study import OpenMLBenchmarkSuite, OpenMLStudy
 
 __all__ = [
-    'OpenMLStudy',
-    'OpenMLBenchmarkSuite',
-    'attach_to_study',
-    'attach_to_suite',
-    'create_benchmark_suite',
-    'create_study',
-    'delete_study',
-    'delete_suite',
-    'detach_from_study',
-    'detach_from_suite',
-    'get_study',
-    'get_suite',
-    'list_studies',
-    'list_suites',
-    'update_suite_status',
-    'update_study_status',
+    "OpenMLBenchmarkSuite",
+    "OpenMLStudy",
+    "attach_to_study",
+    "attach_to_suite",
+    "create_benchmark_suite",
+    "create_study",
+    "delete_study",
+    "delete_suite",
+    "detach_from_study",
+    "detach_from_suite",
+    "get_study",
+    "get_suite",
+    "list_studies",
+    "list_suites",
+    "update_study_status",
+    "update_suite_status",
 ]
diff --git a/openml/study/functions.py b/openml/study/functions.py
index ccd523016..7268ea97c 100644
--- a/openml/study/functions.py
+++ b/openml/study/functions.py
@@ -1,16 +1,22 @@
-from typing import cast, Dict, List, Optional, Union
+# License: BSD 3-Clause
+from __future__ import annotations
+
 import warnings
+from functools import partial
+from typing import TYPE_CHECKING, Any
 
-import dateutil.parser
-import xmltodict
 import pandas as pd
+import xmltodict
 
-from openml.study import OpenMLStudy, OpenMLBenchmarkSuite
-from openml.study.study import BaseStudy
 import openml._api_calls
+import openml.utils
+from openml.study.study import OpenMLBenchmarkSuite, OpenMLStudy
 
+if TYPE_CHECKING:
+    from openml.study.study import BaseStudy
 
-def get_suite(suite_id: Union[int, str]) -> OpenMLBenchmarkSuite:
+
+def get_suite(suite_id: int | str) -> OpenMLBenchmarkSuite:
     """
     Retrieves all relevant information of an OpenML benchmarking suite from the server.
 
@@ -24,14 +30,16 @@ def get_suite(suite_id: Union[int, str]) -> OpenMLBenchmarkSuite:
     OpenMLSuite
         The OpenML suite object
     """
-    suite = cast(OpenMLBenchmarkSuite, _get_study(suite_id, entity_type='task'))
-    return suite
+    study = _get_study(suite_id, entity_type="task")
+    assert isinstance(study, OpenMLBenchmarkSuite)
+
+    return study
 
 
 def get_study(
-    study_id: Union[int, str],
-    arg_for_backwards_compat: Optional[str] = None,
-) -> OpenMLStudy:  # noqa F401
+    study_id: int | str,
+    arg_for_backwards_compat: str | None = None,  # noqa: ARG001
+) -> OpenMLStudy:  # F401
     """
     Retrieves all relevant information of an OpenML study from the server.
 
@@ -51,86 +59,86 @@ def get_study(
     OpenMLStudy
         The OpenML study object
     """
-    if study_id == 'OpenML100':
+    if study_id == "OpenML100":
         message = (
             "It looks like you are running code from the OpenML100 paper. It still works, but lots "
             "of things have changed since then. Please use `get_suite('OpenML100')` instead."
         )
-        warnings.warn(message, DeprecationWarning)
-        openml.config.logger.warn(message)
-        study = _get_study(study_id, entity_type='task')
-        return cast(OpenMLBenchmarkSuite, study)  # type: ignore
-    else:
-        study = cast(OpenMLStudy, _get_study(study_id, entity_type='run'))
-        return study
+        warnings.warn(message, DeprecationWarning, stacklevel=2)
+        openml.config.logger.warning(message)
+        study = _get_study(study_id, entity_type="task")
+        assert isinstance(study, OpenMLBenchmarkSuite)
+
+        return study  # type: ignore
 
+    study = _get_study(study_id, entity_type="run")
+    assert isinstance(study, OpenMLStudy)
+    return study
 
-def _get_study(id_: Union[int, str], entity_type) -> BaseStudy:
-    call_suffix = "study/{}".format(str(id_))
-    xml_string = openml._api_calls._perform_api_call(call_suffix, 'get')
+
+def _get_study(id_: int | str, entity_type: str) -> BaseStudy:
+    xml_string = openml._api_calls._perform_api_call(f"study/{id_}", "get")
     force_list_tags = (
-        'oml:data_id', 'oml:flow_id', 'oml:task_id', 'oml:setup_id',
-        'oml:run_id',
-        'oml:tag'  # legacy.
+        "oml:data_id",
+        "oml:flow_id",
+        "oml:task_id",
+        "oml:setup_id",
+        "oml:run_id",
+        "oml:tag",  # legacy.
     )
-    result_dict = xmltodict.parse(xml_string, force_list=force_list_tags)['oml:study']
-    study_id = int(result_dict['oml:id'])
-    alias = result_dict['oml:alias'] if 'oml:alias' in result_dict else None
-    main_entity_type = result_dict['oml:main_entity_type']
+    result_dict = xmltodict.parse(xml_string, force_list=force_list_tags)["oml:study"]
+    study_id = int(result_dict["oml:id"])
+    alias = result_dict.get("oml:alias", None)
+    main_entity_type = result_dict["oml:main_entity_type"]
+
     if entity_type != main_entity_type:
         raise ValueError(
-            "Unexpected entity type '{}' reported by the server, expected '{}'".format(
-                main_entity_type, entity_type,
-            )
+            f"Unexpected entity type '{main_entity_type}' reported by the server"
+            f", expected '{entity_type}'"
         )
-    benchmark_suite = result_dict['oml:benchmark_suite'] \
-        if 'oml:benchmark_suite' in result_dict else None
-    name = result_dict['oml:name']
-    description = result_dict['oml:description']
-    status = result_dict['oml:status']
-    creation_date = result_dict['oml:creation_date']
-    creation_date_as_date = dateutil.parser.parse(creation_date)
-    creator = result_dict['oml:creator']
+
+    benchmark_suite = result_dict.get("oml:benchmark_suite", None)
+    name = result_dict["oml:name"]
+    description = result_dict["oml:description"]
+    status = result_dict["oml:status"]
+    creation_date = result_dict["oml:creation_date"]
+    creator = result_dict["oml:creator"]
 
     # tags is legacy. remove once no longer needed.
     tags = []
-    if 'oml:tag' in result_dict:
-        for tag in result_dict['oml:tag']:
-            current_tag = {'name': tag['oml:name'],
-                           'write_access': tag['oml:write_access']}
-            if 'oml:window_start' in tag:
-                current_tag['window_start'] = tag['oml:window_start']
+    if "oml:tag" in result_dict:
+        for tag in result_dict["oml:tag"]:
+            current_tag = {"name": tag["oml:name"], "write_access": tag["oml:write_access"]}
+            if "oml:window_start" in tag:
+                current_tag["window_start"] = tag["oml:window_start"]
             tags.append(current_tag)
 
-    if 'oml:data' in result_dict:
-        datasets = [int(x) for x in result_dict['oml:data']['oml:data_id']]
-    else:
-        raise ValueError('No datasets attached to study {}!'.format(id_))
-    if 'oml:tasks' in result_dict:
-        tasks = [int(x) for x in result_dict['oml:tasks']['oml:task_id']]
-    else:
-        raise ValueError('No tasks attached to study {}!'.format(id_))
-
-    if main_entity_type in ['runs', 'run']:
-
-        if 'oml:flows' in result_dict:
-            flows = [int(x) for x in result_dict['oml:flows']['oml:flow_id']]
-        else:
-            raise ValueError('No flows attached to study {}!'.format(id_))
-        if 'oml:setups' in result_dict:
-            setups = [int(x) for x in result_dict['oml:setups']['oml:setup_id']]
-        else:
-            raise ValueError('No setups attached to study!'.format(id_))
-        if 'oml:runs' in result_dict:
-            runs = [
-                int(x) for x in result_dict['oml:runs']['oml:run_id']
-            ]  # type: Optional[List[int]]
-        else:
-            if creation_date_as_date < dateutil.parser.parse('2019-01-01'):
-                # Legacy studies did not require runs
-                runs = None
-            else:
-                raise ValueError('No runs attached to study!'.format(id_))
+    def get_nested_ids_from_result_dict(key: str, subkey: str) -> list[int] | None:
+        """Extracts a list of nested IDs from a result dictionary.
+
+        Parameters
+        ----------
+        key : str
+            Top-level key of the result dictionary that holds the nested IDs.
+        subkey : str
+            Key under ``key`` whose value is the list of nested OpenML IDs.
+
+        Returns
+        -------
+        list[int] | None
+            A list of nested OpenML IDs, or None if the key is not present in the dictionary.
+        """
+        if result_dict.get(key) is not None:
+            return [int(oml_id) for oml_id in result_dict[key][subkey]]
+        return None
+
+    datasets = get_nested_ids_from_result_dict("oml:data", "oml:data_id")
+    tasks = get_nested_ids_from_result_dict("oml:tasks", "oml:task_id")
+
+    if main_entity_type in ["runs", "run"]:
+        flows = get_nested_ids_from_result_dict("oml:flows", "oml:flow_id")
+        setups = get_nested_ids_from_result_dict("oml:setups", "oml:setup_id")
+        runs = get_nested_ids_from_result_dict("oml:runs", "oml:run_id")
 
         study = OpenMLStudy(
             study_id=study_id,
@@ -149,8 +157,7 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy:
             runs=runs,
         )  # type: BaseStudy
 
-    elif main_entity_type in ['tasks', 'task']:
-
+    elif main_entity_type in ["tasks", "task"]:
         study = OpenMLBenchmarkSuite(
             suite_id=study_id,
             alias=alias,
@@ -161,11 +168,11 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy:
             creator=creator,
             tags=tags,
             data=datasets,
-            tasks=tasks
+            tasks=tasks,
         )
 
     else:
-        raise ValueError('Unknown entity type {}'.format(main_entity_type))
+        raise ValueError(f"Unknown entity type {main_entity_type}")
 
     return study
 
@@ -173,9 +180,9 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy:
 def create_study(
     name: str,
     description: str,
-    run_ids: List[int],
-    alias: Optional[str],
-    benchmark_suite: Optional[int],
+    run_ids: list[int] | None = None,
+    alias: str | None = None,
+    benchmark_suite: int | None = None,
 ) -> OpenMLStudy:
     """
     Creates an OpenML study (collection of data, tasks, flows, setups and run),
@@ -184,16 +191,19 @@ def create_study(
 
     Parameters
     ----------
-    alias : str (optional)
-        a string ID, unique on server (url-friendly)
     benchmark_suite : int (optional)
         the benchmark suite (another study) upon which this study is ran.
     name : str
         the name of the study (meta-info)
     description : str
         brief description (meta-info)
-    run_ids : list
-        a list of run ids associated with this study
+    run_ids : list, optional
+        a list of run ids associated with this study,
+        these can also be added later with ``attach_to_study``.
+    alias : str (optional)
+        a string ID, unique on server (url-friendly)
+    benchmark_suite: int (optional)
+        the ID of the suite for which this study contains run results
 
     Returns
     -------
@@ -213,7 +223,7 @@ def create_study(
         data=None,
         tasks=None,
         flows=None,
-        runs=run_ids,
+        runs=run_ids if run_ids != [] else None,
         setups=None,
     )
 
@@ -221,8 +231,8 @@ def create_study(
 def create_benchmark_suite(
     name: str,
     description: str,
-    task_ids: List[int],
-    alias: Optional[str],
+    task_ids: list[int],
+    alias: str | None = None,
 ) -> OpenMLBenchmarkSuite:
     """
     Creates an OpenML benchmark suite (collection of entity types, where
@@ -230,14 +240,15 @@ def create_benchmark_suite(
 
     Parameters
     ----------
-    alias : str (optional)
-        a string ID, unique on server (url-friendly)
     name : str
         the name of the study (meta-info)
     description : str
         brief description (meta-info)
     task_ids : list
         a list of task ids associated with this study
+        more can be added later with ``attach_to_suite``.
+    alias : str (optional)
+        a string ID, unique on server (url-friendly)
 
     Returns
     -------
@@ -283,20 +294,17 @@ def update_study_status(study_id: int, status: str) -> None:
     status : str,
         'active' or 'deactivated'
     """
-    legal_status = {'active', 'deactivated'}
+    legal_status = {"active", "deactivated"}
     if status not in legal_status:
-        raise ValueError('Illegal status value. '
-                         'Legal values: %s' % legal_status)
-    data = {'study_id': study_id, 'status': status}
-    result_xml = openml._api_calls._perform_api_call("study/status/update",
-                                                     'post',
-                                                     data=data)
+        raise ValueError(f"Illegal status value. Legal values: {legal_status}")
+    data = {"study_id": study_id, "status": status}  # type: openml._api_calls.DATA_TYPE
+    result_xml = openml._api_calls._perform_api_call("study/status/update", "post", data=data)
     result = xmltodict.parse(result_xml)
-    server_study_id = result['oml:study_status_update']['oml:id']
-    server_status = result['oml:study_status_update']['oml:status']
+    server_study_id = result["oml:study_status_update"]["oml:id"]
+    server_status = result["oml:study_status_update"]["oml:status"]
     if status != server_status or int(study_id) != int(server_study_id):
         # This should never happen
-        raise ValueError('Study id/status does not collide')
+        raise ValueError("Study id/status does not collide")
 
 
 def delete_suite(suite_id: int) -> bool:
@@ -328,10 +336,10 @@ def delete_study(study_id: int) -> bool:
     bool
         True iff the deletion was successful. False otherwise
     """
-    return openml.utils._delete_entity('study', study_id)
+    return openml.utils._delete_entity("study", study_id)
 
 
-def attach_to_suite(suite_id: int, task_ids: List[int]) -> int:
+def attach_to_suite(suite_id: int, task_ids: list[int]) -> int:
     """Attaches a set of tasks to a benchmarking suite.
 
     Parameters
@@ -350,7 +358,7 @@ def attach_to_suite(suite_id: int, task_ids: List[int]) -> int:
     return attach_to_study(suite_id, task_ids)
 
 
-def attach_to_study(study_id: int, run_ids: List[int]) -> int:
+def attach_to_study(study_id: int, run_ids: list[int]) -> int:
     """Attaches a set of runs to a study.
 
     Parameters
@@ -366,16 +374,17 @@ def attach_to_study(study_id: int, run_ids: List[int]) -> int:
     int
         new size of the study (in terms of explicitly linked entities)
     """
-
     # Interestingly, there's no need to tell the server about the entity type, it knows by itself
-    uri = 'study/%d/attach' % study_id
-    post_variables = {'ids': ','.join(str(x) for x in run_ids)}
-    result_xml = openml._api_calls._perform_api_call(uri, 'post', post_variables)
-    result = xmltodict.parse(result_xml)['oml:study_attach']
-    return int(result['oml:linked_entities'])
+    result_xml = openml._api_calls._perform_api_call(
+        call=f"study/{study_id}/attach",
+        request_method="post",
+        data={"ids": ",".join(str(x) for x in run_ids)},
+    )
+    result = xmltodict.parse(result_xml)["oml:study_attach"]
+    return int(result["oml:linked_entities"])
 
 
-def detach_from_suite(suite_id: int, task_ids: List[int]) -> int:
+def detach_from_suite(suite_id: int, task_ids: list[int]) -> int:
     """Detaches a set of task ids from a suite.
 
     Parameters
@@ -384,16 +393,17 @@ def detach_from_suite(suite_id: int, task_ids: List[int]) -> int:
         OpenML id of the study
 
     task_ids : list (int)
-        List of entities to link to the collection
+        List of entities to unlink from the collection
 
     Returns
     -------
     int
-        new size of the study (in terms of explicitly linked entities)"""
+        new size of the study (in terms of explicitly linked entities)
+    """
     return detach_from_study(suite_id, task_ids)
 
 
-def detach_from_study(study_id: int, run_ids: List[int]) -> int:
+def detach_from_study(study_id: int, run_ids: list[int]) -> int:
     """Detaches a set of run ids from a study.
 
     Parameters
@@ -402,29 +412,31 @@ def detach_from_study(study_id: int, run_ids: List[int]) -> int:
         OpenML id of the study
 
     run_ids : list (int)
-        List of entities to link to the collection
+        List of entities to unlink from the collection
 
     Returns
     -------
     int
         new size of the study (in terms of explicitly linked entities)
     """
-
     # Interestingly, there's no need to tell the server about the entity type, it knows by itself
-    uri = 'study/%d/detach' % study_id
-    post_variables = {'ids': ','.join(str(x) for x in run_ids)}
-    result_xml = openml._api_calls._perform_api_call(uri, 'post', post_variables)
-    result = xmltodict.parse(result_xml)['oml:study_detach']
-    return int(result['oml:linked_entities'])
+    uri = f"study/{study_id}/detach"
+    post_variables = {"ids": ",".join(str(x) for x in run_ids)}  # type: openml._api_calls.DATA_TYPE
+    result_xml = openml._api_calls._perform_api_call(
+        call=uri,
+        request_method="post",
+        data=post_variables,
+    )
+    result = xmltodict.parse(result_xml)["oml:study_detach"]
+    return int(result["oml:linked_entities"])
 
 
 def list_suites(
-    offset: Optional[int] = None,
-    size: Optional[int] = None,
-    status: Optional[str] = None,
-    uploader: Optional[List[int]] = None,
-    output_format: str = 'dict'
-) -> Union[Dict, pd.DataFrame]:
+    offset: int | None = None,
+    size: int | None = None,
+    status: str | None = None,
+    uploader: list[int] | None = None,
+) -> pd.DataFrame:
     """
     Return a list of all suites which are on OpenML.
 
@@ -439,55 +451,39 @@ def list_suites(
         suites are returned.
     uploader : list (int), optional
         Result filter. Will only return suites created by these users.
-    output_format: str, optional (default='dict')
-        The parameter decides the format of the output.
-        - If 'dict' the output is a dict of dict
-        - If 'dataframe' the output is a pandas DataFrame
 
     Returns
     -------
-    datasets : dict of dicts, or dataframe
-        - If output_format='dict'
-            Every suite is represented by a dictionary containing the following information:
-            - id
-            - alias (optional)
-            - name
-            - main_entity_type
-            - status
-            - creator
-            - creation_date
-
-        - If output_format='dataframe'
-            Every row is represented by a dictionary containing the following information:
-            - id
-            - alias (optional)
-            - name
-            - main_entity_type
-            - status
-            - creator
-            - creation_date
+    datasets : dataframe
+        Every row represents one suite and contains the following information:
+        - id
+        - alias (optional)
+        - name
+        - main_entity_type
+        - status
+        - creator
+        - creation_date
     """
-    if output_format not in ['dataframe', 'dict']:
-        raise ValueError("Invalid output format selected. "
-                         "Only 'dict' or 'dataframe' applicable.")
+    listing_call = partial(
+        _list_studies,
+        main_entity_type="task",
+        status=status,
+        uploader=uploader,
+    )
+    batches = openml.utils._list_all(listing_call, limit=size, offset=offset)
+    if len(batches) == 0:
+        return pd.DataFrame()
 
-    return openml.utils._list_all(output_format=output_format,
-                                  listing_call=_list_studies,
-                                  offset=offset,
-                                  size=size,
-                                  main_entity_type='task',
-                                  status=status,
-                                  uploader=uploader,)
+    return pd.concat(batches)
 
 
 def list_studies(
-    offset: Optional[int] = None,
-    size: Optional[int] = None,
-    status: Optional[str] = None,
-    uploader: Optional[List[str]] = None,
-    benchmark_suite: Optional[int] = None,
-    output_format: str = 'dict'
-) -> Union[Dict, pd.DataFrame]:
+    offset: int | None = None,
+    size: int | None = None,
+    status: str | None = None,
+    uploader: list[str] | None = None,
+    benchmark_suite: int | None = None,
+) -> pd.DataFrame:
     """
     Return a list of all studies which are on OpenML.
 
@@ -503,110 +499,109 @@ def list_studies(
     uploader : list (int), optional
         Result filter. Will only return studies created by these users.
     benchmark_suite : int, optional
-    output_format: str, optional (default='dict')
-        The parameter decides the format of the output.
-        - If 'dict' the output is a dict of dict
-        - If 'dataframe' the output is a pandas DataFrame
 
     Returns
     -------
-    datasets : dict of dicts, or dataframe
-        - If output_format='dict'
-            Every dataset is represented by a dictionary containing
-            the following information:
-            - id
-            - alias (optional)
-            - name
-            - benchmark_suite (optional)
-            - status
-            - creator
-            - creation_date
-            If qualities are calculated for the dataset, some of
-            these are also returned.
-
-        - If output_format='dataframe'
-            Every dataset is represented by a dictionary containing
-            the following information:
-            - id
-            - alias (optional)
-            - name
-            - benchmark_suite (optional)
-            - status
-            - creator
-            - creation_date
-            If qualities are calculated for the dataset, some of
-            these are also returned.
+    datasets : dataframe
+        Every study is represented by a row containing
+        the following information:
+        - id
+        - alias (optional)
+        - name
+        - benchmark_suite (optional)
+        - status
+        - creator
+        - creation_date
+        If qualities are calculated for the dataset, some of
+        these are also returned.
     """
-    if output_format not in ['dataframe', 'dict']:
-        raise ValueError("Invalid output format selected. "
-                         "Only 'dict' or 'dataframe' applicable.")
+    listing_call = partial(
+        _list_studies,
+        main_entity_type="run",
+        status=status,
+        uploader=uploader,
+        benchmark_suite=benchmark_suite,
+    )
+    batches = openml.utils._list_all(listing_call, offset=offset, limit=size)
+    if len(batches) == 0:
+        return pd.DataFrame()
 
-    return openml.utils._list_all(output_format=output_format,
-                                  listing_call=_list_studies,
-                                  offset=offset,
-                                  size=size,
-                                  main_entity_type='run',
-                                  status=status,
-                                  uploader=uploader,
-                                  benchmark_suite=benchmark_suite)
+    return pd.concat(batches)
 
 
-def _list_studies(output_format='dict', **kwargs) -> Union[Dict, pd.DataFrame]:
-    """
-    Perform api call to return a list of studies.
+def _list_studies(limit: int, offset: int, **kwargs: Any) -> pd.DataFrame:
+    """Perform api call to return a list of studies.
 
     Parameters
     ----------
-    output_format: str, optional (default='dict')
-        The parameter decides the format of the output.
-        - If 'dict' the output is a dict of dict
-        - If 'dataframe' the output is a pandas DataFrame
+    limit: int
+        The maximum number of studies to return.
+    offset: int
+        The number of studies to skip, starting from the first.
     kwargs : dict, optional
         Legal filter operators (keys in the dict):
-        status, limit, offset, main_entity_type, uploader
+        status, main_entity_type, uploader, benchmark_suite
 
     Returns
     -------
-    studies : dict of dicts
+    studies : dataframe
     """
     api_call = "study/list"
+    if limit is not None:
+        api_call += f"/limit/{limit}"
+    if offset is not None:
+        api_call += f"/offset/{offset}"
     if kwargs is not None:
         for operator, value in kwargs.items():
-            api_call += "/%s/%s" % (operator, value)
-    return __list_studies(api_call=api_call, output_format=output_format)
+            if value is not None:
+                api_call += f"/{operator}/{value}"
+    return __list_studies(api_call=api_call)
 
 
-def __list_studies(api_call, output_format='object') -> Union[Dict, pd.DataFrame]:
-    xml_string = openml._api_calls._perform_api_call(api_call, 'get')
-    study_dict = xmltodict.parse(xml_string, force_list=('oml:study',))
+def __list_studies(api_call: str) -> pd.DataFrame:
+    """Retrieves the list of OpenML studies and
+    returns it as a Pandas DataFrame.
+
+    Parameters
+    ----------
+    api_call : str
+        The API call for retrieving the list of OpenML studies.
+
+    Returns
+    -------
+    pd.DataFrame
+        A Pandas DataFrame of OpenML studies
+    """
+    xml_string = openml._api_calls._perform_api_call(api_call, "get")
+    study_dict = xmltodict.parse(xml_string, force_list=("oml:study",))
 
     # Minimalistic check if the XML is useful
-    assert type(study_dict['oml:study_list']['oml:study']) == list, \
-        type(study_dict['oml:study_list'])
-    assert study_dict['oml:study_list']['@xmlns:oml'] == \
-        'http://openml.org/openml', study_dict['oml:study_list']['@xmlns:oml']
+    assert isinstance(study_dict["oml:study_list"]["oml:study"], list), type(
+        study_dict["oml:study_list"],
+    )
+    assert study_dict["oml:study_list"]["@xmlns:oml"] == "http://openml.org/openml", study_dict[
+        "oml:study_list"
+    ]["@xmlns:oml"]
 
-    studies = dict()
-    for study_ in study_dict['oml:study_list']['oml:study']:
+    studies = {}
+    for study_ in study_dict["oml:study_list"]["oml:study"]:
         # maps from xml name to a tuple of (dict name, casting fn)
         expected_fields = {
-            'oml:id': ('id', int),
-            'oml:alias': ('alias', str),
-            'oml:main_entity_type': ('main_entity_type', str),
-            'oml:benchmark_suite': ('benchmark_suite', int),
-            'oml:name': ('name', str),
-            'oml:status': ('status', str),
-            'oml:creation_date': ('creation_date', str),
-            'oml:creator': ('creator', int),
+            "oml:id": ("id", int),
+            "oml:alias": ("alias", str),
+            "oml:main_entity_type": ("main_entity_type", str),
+            "oml:benchmark_suite": ("benchmark_suite", int),
+            "oml:name": ("name", str),
+            "oml:status": ("status", str),
+            "oml:creation_date": ("creation_date", str),
+            "oml:creator": ("creator", int),
         }
-        study_id = int(study_['oml:id'])
-        current_study = dict()
+        study_id = int(study_["oml:id"])
+        current_study = {}
         for oml_field_name, (real_field_name, cast_fn) in expected_fields.items():
             if oml_field_name in study_:
                 current_study[real_field_name] = cast_fn(study_[oml_field_name])
-        current_study['id'] = int(current_study['id'])
+        current_study["id"] = int(current_study["id"])
         studies[study_id] = current_study
 
-    if output_format == 'dataframe':
-        studies = pd.DataFrame.from_dict(studies, orient='index')
-    return studies
+    return pd.DataFrame.from_dict(studies, orient="index")
diff --git a/openml/study/study.py b/openml/study/study.py
index 8657749da..803c6455b 100644
--- a/openml/study/study.py
+++ b/openml/study/study.py
@@ -1,12 +1,15 @@
-import collections
-from typing import Dict, List, Optional
+# License: BSD 3-Clause
+# TODO(eddiebergman): Begging for dataclasses to shorten this all
+from __future__ import annotations
 
-import xmltodict
+from collections.abc import Sequence
+from typing import Any
 
 import openml
+from openml.base import OpenMLBase
 
 
-class BaseStudy(object):
+class BaseStudy(OpenMLBase):
     """
     An OpenMLStudy represents the OpenML concept of a study. It contains
     the following information: name, id, description, creation date,
@@ -53,26 +56,26 @@ class BaseStudy(object):
     setups : list
         a list of setup ids associated with this study
     """
-    def __init__(
+
+    def __init__(  # noqa: PLR0913
         self,
-        study_id: Optional[int],
-        alias: Optional[str],
+        study_id: int | None,
+        alias: str | None,
         main_entity_type: str,
-        benchmark_suite: Optional[int],
+        benchmark_suite: int | None,
         name: str,
         description: str,
-        status: Optional[str],
-        creation_date: Optional[str],
-        creator: Optional[int],
-        tags: Optional[List[Dict]],
-        data: Optional[List[int]],
-        tasks: Optional[List[int]],
-        flows: Optional[List[int]],
-        runs: Optional[List[int]],
-        setups: Optional[List[int]],
+        status: str | None,
+        creation_date: str | None,
+        creator: int | None,
+        tags: list[dict] | None,
+        data: list[int] | None,
+        tasks: list[int] | None,
+        flows: list[int] | None,
+        runs: list[int] | None,
+        setups: list[int] | None,
     ):
-
-        self.id = study_id
+        self.study_id = study_id
         self.alias = alias
         self.main_entity_type = main_entity_type
         self.benchmark_suite = benchmark_suite
@@ -87,21 +90,30 @@ def __init__(
         self.flows = flows
         self.setups = setups
         self.runs = runs
-        pass
-
-    def __repr__(self):
-        # header is provided by the sub classes
-        base_url = "{}".format(openml.config.server[:-len('api/v1/xml')])
-        fields = {"Name": self.name,
-                  "Status": self.status,
-                  "Main Entity Type": self.main_entity_type}
-        if self.id is not None:
-            fields["ID"] = self.id
-            fields["Study URL"] = "{}s/{}".format(base_url, self.id)
+
+    @classmethod
+    def _entity_letter(cls) -> str:
+        return "s"
+
+    @property
+    def id(self) -> int | None:
+        """Return the id of the study."""
+        return self.study_id
+
+    def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]:
+        """Collect all information to display in the __repr__ body."""
+        fields: dict[str, Any] = {
+            "Name": self.name,
+            "Status": self.status,
+            "Main Entity Type": self.main_entity_type,
+        }
+        if self.study_id is not None:
+            fields["ID"] = self.study_id
+            fields["Study URL"] = self.openml_url
         if self.creator is not None:
-            fields["Creator"] = "{}u/{}".format(base_url, self.creator)
+            fields["Creator"] = f"{openml.config.get_server_base_url()}/u/{self.creator}"
         if self.creation_date is not None:
-            fields["Upload Time"] = self.creation_date.replace('T', ' ')
+            fields["Upload Time"] = self.creation_date.replace("T", " ")
         if self.data is not None:
             fields["# of Data"] = len(self.data)
         if self.tasks is not None:
@@ -112,80 +124,75 @@ def __repr__(self):
             fields["# of Runs"] = len(self.runs)
 
         # determines the order in which the information will be printed
-        order = ["ID", "Name", "Status", "Main Entity Type", "Study URL",
-                 "# of Data", "# of Tasks", "# of Flows", "# of Runs",
-                 "Creator", "Upload Time"]
-        fields = [(key, fields[key]) for key in order if key in fields]
-
-        longest_field_name_length = max(len(name) for name, value in fields)
-        field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length)
-        body = '\n'.join(field_line_format.format(name, value) for name, value in fields)
-        return body
-
-    def publish(self) -> int:
-        """
-        Publish the study on the OpenML server.
-
-        Returns
-        -------
-        study_id: int
-            Id of the study uploaded to the server.
-        """
-        file_elements = {
-            'description': self._to_xml()
-        }
-        return_value = openml._api_calls._perform_api_call(
-            "study/",
-            'post',
-            file_elements=file_elements,
-        )
-        study_res = xmltodict.parse(return_value)
-        self.study_id = int(study_res['oml:study_upload']['oml:id'])
-        return self.study_id
-
-    def _to_xml(self) -> str:
-        """Serialize object to xml for upload
-
-        Returns
-        -------
-        xml_study : str
-            XML description of the data.
-        """
+        order = [
+            "ID",
+            "Name",
+            "Status",
+            "Main Entity Type",
+            "Study URL",
+            "# of Data",
+            "# of Tasks",
+            "# of Flows",
+            "# of Runs",
+            "Creator",
+            "Upload Time",
+        ]
+        return [(key, fields[key]) for key in order if key in fields]
+
+    def _parse_publish_response(self, xml_response: dict) -> None:
+        """Parse the id from the xml_response and assign it to self."""
+        self.study_id = int(xml_response["oml:study_upload"]["oml:id"])
+
+    def _to_dict(self) -> dict[str, dict]:
+        """Creates a dictionary representation of self."""
         # some can not be uploaded, e.g., id, creator, creation_date
-        simple_props = ['alias', 'main_entity_type', 'name', 'description']
-        # maps from attribute name (which is used as outer tag name) to immer
-        # tag name (e.g., self.tasks -> <oml:tasks><oml:task_id>1987
-        # </oml:task_id></oml:tasks>)
-        complex_props = {
-            'tasks': 'task_id',
-            'runs': 'run_id',
-        }
-
-        study_container = collections.OrderedDict()  # type: 'collections.OrderedDict'
-        namespace_list = [('@xmlns:oml', 'http://openml.org/openml')]
-        study_dict = collections.OrderedDict(namespace_list)  # type: 'collections.OrderedDict'
-        study_container['oml:study'] = study_dict
+        simple_props = ["alias", "main_entity_type", "name", "description"]
 
+        # TODO(eddiebergman): Begging for a walrus if we can drop 3.7
+        simple_prop_values = {}
         for prop_name in simple_props:
             content = getattr(self, prop_name, None)
             if content is not None:
-                study_dict["oml:" + prop_name] = content
+                simple_prop_values["oml:" + prop_name] = content
+
+        # maps from attribute name (which is used as outer tag name) to inner
+        # tag name e.g., self.tasks -> <oml:tasks><oml:task_id>1987</oml:task_id></oml:tasks>
+        complex_props = {"tasks": "task_id", "runs": "run_id"}
+
+        # TODO(eddiebergman): Begging for a walrus if we can drop 3.7
+        complex_prop_values = {}
         for prop_name, inner_name in complex_props.items():
             content = getattr(self, prop_name, None)
             if content is not None:
-                sub_dict = {
-                    'oml:' + inner_name: content
-                }
-                study_dict["oml:" + prop_name] = sub_dict
-
-        xml_string = xmltodict.unparse(
-            input_dict=study_container,
-            pretty=True,
+                complex_prop_values["oml:" + prop_name] = {"oml:" + inner_name: content}
+
+        return {
+            "oml:study": {
+                "@xmlns:oml": "http://openml.org/openml",
+                **simple_prop_values,
+                **complex_prop_values,
+            }
+        }
+
+    def push_tag(self, tag: str) -> None:
+        """Add a tag to the study (not supported; always raises NotImplementedError)."""
+        raise NotImplementedError(
+            "Tag management for studies is not yet supported. "
+            "The OpenML Python SDK does not currently provide functionality "
+            "for adding tags to studies. "
+            "For updates on this feature, please refer to the GitHub issues at: "
+            "https://github.com/openml/openml-python/issues"
+        )
+
+    def remove_tag(self, tag: str) -> None:
+        """Remove a tag from the study (not supported; always raises NotImplementedError)."""
+        raise NotImplementedError(
+            "Tag management for studies is not yet supported. "
+            "The OpenML Python SDK does not currently provide functionality "
+            "for removing tags from studies. "
+            "For updates on this feature, please refer to the GitHub issues at: "
+            "https://github.com/openml/openml-python/issues"
         )
-        # A flow may not be uploaded with the xml encoding specification:
-        # <?xml version="1.0" encoding="utf-8"?>
-        xml_string = xml_string.split('\n', 1)[-1]
-        return xml_string
 
 
 class OpenMLStudy(BaseStudy):
@@ -198,8 +205,6 @@ class OpenMLStudy(BaseStudy):
     According to this list of run ids, the study object receives a list of
     OpenML object ids (datasets, flows, tasks and setups).
 
-    Inherits from :class:`openml.BaseStudy`
-
     Parameters
     ----------
     study_id : int
@@ -233,27 +238,28 @@ class OpenMLStudy(BaseStudy):
     setups : list
         a list of setup ids associated with this study
     """
-    def __init__(
+
+    def __init__(  # noqa: PLR0913
         self,
-        study_id: Optional[int],
-        alias: Optional[str],
-        benchmark_suite: Optional[int],
+        study_id: int | None,
+        alias: str | None,
+        benchmark_suite: int | None,
         name: str,
         description: str,
-        status: Optional[str],
-        creation_date: Optional[str],
-        creator: Optional[int],
-        tags: Optional[List[Dict]],
-        data: Optional[List[int]],
-        tasks: Optional[List[int]],
-        flows: Optional[List[int]],
-        runs: Optional[List[int]],
-        setups: Optional[List[int]],
+        status: str | None,
+        creation_date: str | None,
+        creator: int | None,
+        tags: list[dict] | None,
+        data: list[int] | None,
+        tasks: list[int] | None,
+        flows: list[int] | None,
+        runs: list[int] | None,
+        setups: list[int] | None,
     ):
         super().__init__(
             study_id=study_id,
             alias=alias,
-            main_entity_type='run',
+            main_entity_type="run",
             benchmark_suite=benchmark_suite,
             name=name,
             description=description,
@@ -268,12 +274,6 @@ def __init__(
             setups=setups,
         )
 
-    def __repr__(self):
-        header = "OpenML Study"
-        header = '{}\n{}\n'.format(header, '=' * len(header))
-        body = super(OpenMLStudy, self).__repr__()
-        return header + body
-
 
 class OpenMLBenchmarkSuite(BaseStudy):
     """
@@ -285,8 +285,6 @@ class OpenMLBenchmarkSuite(BaseStudy):
     According to this list of task ids, the suite object receives a list of
     OpenML object ids (datasets).
 
-    Inherits from :class:`openml.BaseStudy`
-
     Parameters
     ----------
     suite_id : int
@@ -315,23 +313,23 @@ class OpenMLBenchmarkSuite(BaseStudy):
         a list of task ids associated with this study
     """
 
-    def __init__(
+    def __init__(  # noqa: PLR0913
         self,
-        suite_id: Optional[int],
-        alias: Optional[str],
+        suite_id: int | None,
+        alias: str | None,
         name: str,
         description: str,
-        status: Optional[str],
-        creation_date: Optional[str],
-        creator: Optional[int],
-        tags: Optional[List[Dict]],
-        data: Optional[List[int]],
-        tasks: List[int],
+        status: str | None,
+        creation_date: str | None,
+        creator: int | None,
+        tags: list[dict] | None,
+        data: list[int] | None,
+        tasks: list[int] | None,
     ):
         super().__init__(
             study_id=suite_id,
             alias=alias,
-            main_entity_type='task',
+            main_entity_type="task",
             benchmark_suite=None,
             name=name,
             description=description,
@@ -345,9 +343,3 @@ def __init__(
             runs=None,
             setups=None,
         )
-
-    def __repr__(self):
-        header = "OpenML Benchmark Suite"
-        header = '{}\n{}\n'.format(header, '=' * len(header))
-        body = super(OpenMLBenchmarkSuite, self).__repr__()
-        return header + body
diff --git a/openml/tasks/__init__.py b/openml/tasks/__init__.py
index f21cac871..34c994e3a 100644
--- a/openml/tasks/__init__.py
+++ b/openml/tasks/__init__.py
@@ -1,31 +1,35 @@
-from .task import (
-    OpenMLTask,
-    OpenMLSupervisedTask,
-    OpenMLClassificationTask,
-    OpenMLRegressionTask,
-    OpenMLClusteringTask,
-    OpenMLLearningCurveTask,
-    TaskTypeEnum,
-)
-from .split import OpenMLSplit
+# License: BSD 3-Clause
+
 from .functions import (
     create_task,
+    delete_task,
     get_task,
     get_tasks,
     list_tasks,
 )
+from .split import OpenMLSplit
+from .task import (
+    OpenMLClassificationTask,
+    OpenMLClusteringTask,
+    OpenMLLearningCurveTask,
+    OpenMLRegressionTask,
+    OpenMLSupervisedTask,
+    OpenMLTask,
+    TaskType,
+)
 
 __all__ = [
-    'OpenMLTask',
-    'OpenMLSupervisedTask',
-    'OpenMLClusteringTask',
-    'OpenMLRegressionTask',
-    'OpenMLClassificationTask',
-    'OpenMLLearningCurveTask',
-    'create_task',
-    'get_task',
-    'get_tasks',
-    'list_tasks',
-    'OpenMLSplit',
-    'TaskTypeEnum'
+    "OpenMLClassificationTask",
+    "OpenMLClusteringTask",
+    "OpenMLLearningCurveTask",
+    "OpenMLRegressionTask",
+    "OpenMLSplit",
+    "OpenMLSupervisedTask",
+    "OpenMLTask",
+    "TaskType",
+    "create_task",
+    "delete_task",
+    "get_task",
+    "get_tasks",
+    "list_tasks",
 ]
diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py
index 4bb93b007..3fbc7adee 100644
--- a/openml/tasks/functions.py
+++ b/openml/tasks/functions.py
@@ -1,54 +1,50 @@
-from collections import OrderedDict
-import io
-import re
+# License: BSD 3-Clause
+from __future__ import annotations
+
 import os
-from typing import Union, Dict, Optional
+import re
+import warnings
+from functools import partial
+from typing import Any
 
 import pandas as pd
 import xmltodict
 
-from ..exceptions import OpenMLCacheException
-from ..datasets import get_dataset
+import openml._api_calls
+import openml.utils
+from openml.datasets import get_dataset
+from openml.exceptions import OpenMLCacheException
+
 from .task import (
     OpenMLClassificationTask,
     OpenMLClusteringTask,
     OpenMLLearningCurveTask,
-    TaskTypeEnum,
     OpenMLRegressionTask,
     OpenMLSupervisedTask,
-    OpenMLTask
+    OpenMLTask,
+    TaskType,
 )
-import openml.utils
-import openml._api_calls
 
+TASKS_CACHE_DIR_NAME = "tasks"
 
-TASKS_CACHE_DIR_NAME = 'tasks'
 
-
-def _get_cached_tasks():
+def _get_cached_tasks() -> dict[int, OpenMLTask]:
     """Return a dict of all the tasks which are cached locally.
+
     Returns
     -------
     tasks : OrderedDict
         A dict of all the cached tasks. Each task is an instance of
         OpenMLTask.
     """
-    tasks = OrderedDict()
-
     task_cache_dir = openml.utils._create_cache_directory(TASKS_CACHE_DIR_NAME)
-    directory_content = os.listdir(task_cache_dir)
+    directory_content = os.listdir(task_cache_dir)  # noqa: PTH208
     directory_content.sort()
+
     # Find all dataset ids for which we have downloaded the dataset
     # description
-
-    for filename in directory_content:
-        if not re.match(r"[0-9]*", filename):
-            continue
-
-        tid = int(filename)
-        tasks[tid] = _get_cached_task(tid)
-
-    return tasks
+    tids = (int(did) for did in directory_content if re.fullmatch(r"[0-9]+", did))
+    return {tid: _get_cached_task(tid) for tid in tids}
 
 
 def _get_cached_task(tid: int) -> OpenMLTask:
@@ -63,24 +59,20 @@ def _get_cached_task(tid: int) -> OpenMLTask:
     -------
     OpenMLTask
     """
-    tid_cache_dir = openml.utils._create_cache_directory_for_id(
-        TASKS_CACHE_DIR_NAME,
-        tid
-    )
+    tid_cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, tid)
 
+    task_xml_path = tid_cache_dir / "task.xml"
     try:
-        with io.open(os.path.join(tid_cache_dir, "task.xml"), encoding='utf8')\
-                as fh:
+        with task_xml_path.open(encoding="utf8") as fh:
             return _create_task_from_xml(fh.read())
-    except (OSError, IOError):
-        openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME,
-                                              tid_cache_dir)
-        raise OpenMLCacheException("Task file for tid %d not "
-                                   "cached" % tid)
+    except OSError as e:
+        openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, tid_cache_dir)
+        raise OpenMLCacheException(f"Task file for tid {tid} not cached") from e
 
 
-def _get_estimation_procedure_list():
+def _get_estimation_procedure_list() -> list[dict[str, Any]]:
     """Return a list of all estimation procedures which are on OpenML.
+
     Returns
     -------
     procedures : list
@@ -89,129 +81,137 @@ def _get_estimation_procedure_list():
         name, type, repeats, folds, stratified.
     """
     url_suffix = "estimationprocedure/list"
-    xml_string = openml._api_calls._perform_api_call(url_suffix,
-                                                     'get')
+    xml_string = openml._api_calls._perform_api_call(url_suffix, "get")
 
     procs_dict = xmltodict.parse(xml_string)
     # Minimalistic check if the XML is useful
-    if 'oml:estimationprocedures' not in procs_dict:
-        raise ValueError('Error in return XML, does not contain tag '
-                         'oml:estimationprocedures.')
-    elif '@xmlns:oml' not in procs_dict['oml:estimationprocedures']:
-        raise ValueError('Error in return XML, does not contain tag '
-                         '@xmlns:oml as a child of oml:estimationprocedures.')
-    elif procs_dict['oml:estimationprocedures']['@xmlns:oml'] != \
-            'http://openml.org/openml':
-        raise ValueError('Error in return XML, value of '
-                         'oml:estimationprocedures/@xmlns:oml is not '
-                         'http://openml.org/openml, but %s' %
-                         str(procs_dict['oml:estimationprocedures'][
-                             '@xmlns:oml']))
-
-    procs = []
-    for proc_ in procs_dict['oml:estimationprocedures'][
-            'oml:estimationprocedure']:
-        procs.append(
-            {
-                'id': int(proc_['oml:id']),
-                'task_type_id': int(proc_['oml:ttid']),
-                'name': proc_['oml:name'],
-                'type': proc_['oml:type'],
-            }
+    if "oml:estimationprocedures" not in procs_dict:
+        raise ValueError("Error in return XML, does not contain tag oml:estimationprocedures.")
+
+    if "@xmlns:oml" not in procs_dict["oml:estimationprocedures"]:
+        raise ValueError(
+            "Error in return XML, does not contain tag "
+            "@xmlns:oml as a child of oml:estimationprocedures.",
         )
 
+    if procs_dict["oml:estimationprocedures"]["@xmlns:oml"] != "http://openml.org/openml":
+        raise ValueError(
+            "Error in return XML, value of "
+            "oml:estimationprocedures/@xmlns:oml is not "
+            "http://openml.org/openml, but {}".format(
+                str(procs_dict["oml:estimationprocedures"]["@xmlns:oml"])
+            ),
+        )
+
+    procs: list[dict[str, Any]] = []
+    for proc_ in procs_dict["oml:estimationprocedures"]["oml:estimationprocedure"]:
+        task_type_int = int(proc_["oml:ttid"])
+        try:
+            task_type_id = TaskType(task_type_int)
+            procs.append(
+                {
+                    "id": int(proc_["oml:id"]),
+                    "task_type_id": task_type_id,
+                    "name": proc_["oml:name"],
+                    "type": proc_["oml:type"],
+                },
+            )
+        except ValueError as e:
+            warnings.warn(
+                f"Could not create task type id for {task_type_int} due to error {e}",
+                RuntimeWarning,
+                stacklevel=2,
+            )
+
     return procs
 
 
-def list_tasks(
-    task_type_id: Optional[int] = None,
-    offset: Optional[int] = None,
-    size: Optional[int] = None,
-    tag: Optional[str] = None,
-    output_format: str = 'dict',
-    **kwargs
-) -> Union[Dict, pd.DataFrame]:
+def list_tasks(  # noqa: PLR0913
+    task_type: TaskType | None = None,
+    offset: int | None = None,
+    size: int | None = None,
+    tag: str | None = None,
+    data_tag: str | None = None,
+    status: str | None = None,
+    data_name: str | None = None,
+    data_id: int | None = None,
+    number_instances: int | None = None,
+    number_features: int | None = None,
+    number_classes: int | None = None,
+    number_missing_values: int | None = None,
+) -> pd.DataFrame:
     """
-    Return a number of tasks having the given tag and task_type_id
+    Return a number of tasks having the given tag and task_type
 
     Parameters
     ----------
-    Filter task_type_id is separated from the other filters because
-    it is used as task_type_id in the task description, but it is named
+    Filter task_type is separated from the other filters because
+    it is used as task_type in the task description, but it is named
     type when used as a filter in list tasks call.
-    task_type_id : int, optional
-        ID of the task type as detailed `here <https://www.openml.org/search?type=task_type>`_.
-        - Supervised classification: 1
-        - Supervised regression: 2
-        - Learning curve: 3
-        - Supervised data stream classification: 4
-        - Clustering: 5
-        - Machine Learning Challenge: 6
-        - Survival Analysis: 7
-        - Subgroup Discovery: 8
     offset : int, optional
         the number of tasks to skip, starting from the first
+    task_type : TaskType, optional
+        Refers to the type of task.
     size : int, optional
         the maximum number of tasks to show
     tag : str, optional
         the tag to include
-    output_format: str, optional (default='dict')
-        The parameter decides the format of the output.
-        - If 'dict' the output is a dict of dict
-        - If 'dataframe' the output is a pandas DataFrame
-    kwargs: dict, optional
-        Legal filter operators: data_tag, status, data_id, data_name,
-        number_instances, number_features,
-        number_classes, number_missing_values.
+    data_tag : str, optional
+        the tag of the dataset
+    data_id : int, optional
+    status : str, optional
+    data_name : str, optional
+    number_instances : int, optional
+    number_features : int, optional
+    number_classes : int, optional
+    number_missing_values : int, optional
 
     Returns
     -------
-    dict
-        All tasks having the given task_type_id and the give tag. Every task is
-        represented by a dictionary containing the following information:
-        task id, dataset id, task_type and status. If qualities are calculated
-        for the associated dataset, some of these are also returned.
     dataframe
-        All tasks having the given task_type_id and the give tag. Every task is
+        All tasks having the given task_type and the given tag. Every task is
         represented by a row in the data frame containing the following information
         as columns: task id, dataset id, task_type and status. If qualities are
         calculated for the associated dataset, some of these are also returned.
     """
-    if output_format not in ['dataframe', 'dict']:
-        raise ValueError("Invalid output format selected. "
-                         "Only 'dict' or 'dataframe' applicable.")
-    return openml.utils._list_all(output_format=output_format,
-                                  listing_call=_list_tasks,
-                                  task_type_id=task_type_id,
-                                  offset=offset,
-                                  size=size,
-                                  tag=tag,
-                                  **kwargs)
-
-
-def _list_tasks(task_type_id=None, output_format='dict', **kwargs):
+    listing_call = partial(
+        _list_tasks,
+        task_type=task_type,
+        tag=tag,
+        data_tag=data_tag,
+        status=status,
+        data_id=data_id,
+        data_name=data_name,
+        number_instances=number_instances,
+        number_features=number_features,
+        number_classes=number_classes,
+        number_missing_values=number_missing_values,
+    )
+    batches = openml.utils._list_all(listing_call, offset=offset, limit=size)
+    if len(batches) == 0:
+        return pd.DataFrame()
+
+    return pd.concat(batches)
+
+
+def _list_tasks(
+    limit: int,
+    offset: int,
+    task_type: TaskType | int | None = None,
+    **kwargs: Any,
+) -> pd.DataFrame:
     """
     Perform the api call to return a number of tasks having the given filters.
+
     Parameters
     ----------
-    Filter task_type_id is separated from the other filters because
-    it is used as task_type_id in the task description, but it is named
+    Filter task_type is separated from the other filters because
+    it is used as task_type in the task description, but it is named
     type when used as a filter in list tasks call.
-    task_type_id : int, optional
-        ID of the task type as detailed
-        `here <https://www.openml.org/search?type=task_type>`_.
-        - Supervised classification: 1
-        - Supervised regression: 2
-        - Learning curve: 3
-        - Supervised data stream classification: 4
-        - Clustering: 5
-        - Machine Learning Challenge: 6
-        - Survival Analysis: 7
-        - Subgroup Discovery: 8
-    output_format: str, optional (default='dict')
-        The parameter decides the format of the output.
-        - If 'dict' the output is a dict of dict
-        - If 'dataframe' the output is a pandas DataFrame
+    limit: int
+    offset: int
+    task_type : TaskType, optional
+        Refers to the type of task.
     kwargs: dict, optional
         Legal filter operators: tag, task_id (list), data_tag, status, limit,
         offset, data_id, data_name, number_instances, number_features,
@@ -219,188 +219,243 @@ def _list_tasks(task_type_id=None, output_format='dict', **kwargs):
 
     Returns
     -------
-    dict or dataframe
+    dataframe
     """
     api_call = "task/list"
-    if task_type_id is not None:
-        api_call += "/type/%d" % int(task_type_id)
+    if limit is not None:
+        api_call += f"/limit/{limit}"
+    if offset is not None:
+        api_call += f"/offset/{offset}"
+    if task_type is not None:
+        tvalue = task_type.value if isinstance(task_type, TaskType) else task_type
+        api_call += f"/type/{tvalue}"
     if kwargs is not None:
         for operator, value in kwargs.items():
-            if operator == 'task_id':
-                value = ','.join([str(int(i)) for i in value])
-            api_call += "/%s/%s" % (operator, value)
-    return __list_tasks(api_call=api_call, output_format=output_format)
+            if value is not None:
+                if operator == "task_id":
+                    value = ",".join([str(int(i)) for i in value])  # noqa: PLW2901
+                api_call += f"/{operator}/{value}"
+
+    return __list_tasks(api_call=api_call)
+
 
+def __list_tasks(api_call: str) -> pd.DataFrame:  # noqa: C901, PLR0912
+    """Returns a Pandas DataFrame with information about OpenML tasks.
+
+    Parameters
+    ----------
+    api_call : str
+        The API call specifying which tasks to return.
 
-def __list_tasks(api_call, output_format='dict'):
-    xml_string = openml._api_calls._perform_api_call(api_call, 'get')
-    tasks_dict = xmltodict.parse(xml_string, force_list=('oml:task',
-                                                         'oml:input'))
+    Returns
+    -------
+        A Pandas DataFrame with information about OpenML tasks.
+
+    Raises
+    ------
+    ValueError
+        If the XML returned by the OpenML API does not contain 'oml:tasks', '@xmlns:oml',
+        or has an incorrect value for '@xmlns:oml'.
+    KeyError
+        If an invalid key is found in the XML for a task.
+    """
+    xml_string = openml._api_calls._perform_api_call(api_call, "get")
+    tasks_dict = xmltodict.parse(xml_string, force_list=("oml:task", "oml:input"))
     # Minimalistic check if the XML is useful
-    if 'oml:tasks' not in tasks_dict:
-        raise ValueError('Error in return XML, does not contain "oml:runs": %s'
-                         % str(tasks_dict))
-    elif '@xmlns:oml' not in tasks_dict['oml:tasks']:
-        raise ValueError('Error in return XML, does not contain '
-                         '"oml:runs"/@xmlns:oml: %s'
-                         % str(tasks_dict))
-    elif tasks_dict['oml:tasks']['@xmlns:oml'] != 'http://openml.org/openml':
-        raise ValueError('Error in return XML, value of  '
-                         '"oml:runs"/@xmlns:oml is not '
-                         '"http://openml.org/openml": %s'
-                         % str(tasks_dict))
-
-    assert type(tasks_dict['oml:tasks']['oml:task']) == list, \
-        type(tasks_dict['oml:tasks'])
-
-    tasks = dict()
+    if "oml:tasks" not in tasks_dict:
+        raise ValueError(f'Error in return XML, does not contain "oml:tasks": {tasks_dict}')
+
+    if "@xmlns:oml" not in tasks_dict["oml:tasks"]:
+        raise ValueError(
+            f'Error in return XML, does not contain "oml:tasks"/@xmlns:oml: {tasks_dict}'
+        )
+
+    if tasks_dict["oml:tasks"]["@xmlns:oml"] != "http://openml.org/openml":
+        raise ValueError(
+            "Error in return XML, value of "
+            '"oml:tasks"/@xmlns:oml is not '
+            f'"http://openml.org/openml": {tasks_dict!s}',
+        )
+
+    assert isinstance(tasks_dict["oml:tasks"]["oml:task"], list), type(tasks_dict["oml:tasks"])
+
+    tasks = {}
     procs = _get_estimation_procedure_list()
-    proc_dict = dict((x['id'], x) for x in procs)
+    proc_dict = {x["id"]: x for x in procs}
 
-    for task_ in tasks_dict['oml:tasks']['oml:task']:
+    for task_ in tasks_dict["oml:tasks"]["oml:task"]:
         tid = None
         try:
-            tid = int(task_['oml:task_id'])
-            task = {'tid': tid,
-                    'ttid': int(task_['oml:task_type_id']),
-                    'did': int(task_['oml:did']),
-                    'name': task_['oml:name'],
-                    'task_type': task_['oml:task_type'],
-                    'status': task_['oml:status']}
+            tid = int(task_["oml:task_id"])
+            task_type_int = int(task_["oml:task_type_id"])
+            try:
+                task_type_id = TaskType(task_type_int)
+            except ValueError as e:
+                warnings.warn(
+                    f"Could not create task type id for {task_type_int} due to error {e}",
+                    RuntimeWarning,
+                    stacklevel=2,
+                )
+                continue
+
+            task = {
+                "tid": tid,
+                "ttid": task_type_id,
+                "did": int(task_["oml:did"]),
+                "name": task_["oml:name"],
+                "task_type": task_["oml:task_type"],
+                "status": task_["oml:status"],
+            }
 
             # Other task inputs
-            for input in task_.get('oml:input', list()):
-                if input['@name'] == 'estimation_procedure':
-                    task[input['@name']] = \
-                        proc_dict[int(input['#text'])]['name']
+            for _input in task_.get("oml:input", []):
+                if _input["@name"] == "estimation_procedure":
+                    task[_input["@name"]] = proc_dict[int(_input["#text"])]["name"]
                 else:
-                    value = input.get('#text')
-                    task[input['@name']] = value
+                    value = _input.get("#text")
+                    task[_input["@name"]] = value
 
             # The number of qualities can range from 0 to infinity
-            for quality in task_.get('oml:quality', list()):
-                if '#text' not in quality:
+            for quality in task_.get("oml:quality", []):
+                if "#text" not in quality:
                     quality_value = 0.0
                 else:
-                    quality['#text'] = float(quality['#text'])
-                    if abs(int(quality['#text']) - quality['#text']) \
-                            < 0.0000001:
-                        quality['#text'] = int(quality['#text'])
-                    quality_value = quality['#text']
-                task[quality['@name']] = quality_value
+                    quality["#text"] = float(quality["#text"])
+                    if abs(int(quality["#text"]) - quality["#text"]) < 0.0000001:
+                        quality["#text"] = int(quality["#text"])
+                    quality_value = quality["#text"]
+                task[quality["@name"]] = quality_value
             tasks[tid] = task
         except KeyError as e:
             if tid is not None:
-                raise KeyError(
-                    "Invalid xml for task %d: %s\nFrom %s" % (
-                        tid, e, task_
-                    )
+                warnings.warn(
+                    f"Invalid xml for task {tid}: {e}\nFrom {task_}",
+                    RuntimeWarning,
+                    stacklevel=2,
                 )
             else:
-                raise KeyError('Could not find key %s in %s!' % (e, task_))
-
-    if output_format == 'dataframe':
-        tasks = pd.DataFrame.from_dict(tasks, orient='index')
+                warnings.warn(f"Could not find key {e} in {task_}!", RuntimeWarning, stacklevel=2)
 
-    return tasks
+    return pd.DataFrame.from_dict(tasks, orient="index")
 
 
-def get_tasks(task_ids, download_data=True):
+def get_tasks(
+    task_ids: list[int],
+    download_data: bool | None = None,
+    download_qualities: bool | None = None,
+) -> list[OpenMLTask]:
     """Download tasks.
 
     This function iterates :meth:`openml.tasks.get_task`.
 
     Parameters
     ----------
-    task_ids : iterable
-        Integers/Strings representing task ids.
-    download_data : bool
+    task_ids : List[int]
+        A list of task ids to download.
+    download_data : bool (default = True)
         Option to trigger download of data along with the meta data.
+    download_qualities : bool (default=True)
+        Option to download 'qualities' meta-data in addition to the minimal dataset description.
 
     Returns
     -------
     list
     """
+    if download_data is None:
+        warnings.warn(
+            "`download_data` will default to False starting in 0.16. "
+            "Please set `download_data` explicitly to suppress this warning.",
+            stacklevel=1,
+        )
+        download_data = True
+
+    if download_qualities is None:
+        warnings.warn(
+            "`download_qualities` will default to False starting in 0.16. "
+            "Please set `download_qualities` explicitly to suppress this warning.",
+            stacklevel=1,
+        )
+        download_qualities = True
+
     tasks = []
     for task_id in task_ids:
-        tasks.append(get_task(task_id, download_data))
+        tasks.append(
+            get_task(task_id, download_data=download_data, download_qualities=download_qualities)
+        )
     return tasks
 
 
 @openml.utils.thread_safe_if_oslo_installed
-def get_task(task_id: int, download_data: bool = True) -> OpenMLTask:
+def get_task(
+    task_id: int,
+    download_splits: bool = False,  # noqa: FBT002
+    **get_dataset_kwargs: Any,
+) -> OpenMLTask:
     """Download OpenML task for a given task ID.
 
-    Downloads the task representation, while the data splits can be
-    downloaded optionally based on the additional parameter. Else,
-    splits will either way be downloaded when the task is being used.
+    Downloads the task representation.
+
+    Use the `download_splits` parameter to control whether the splits are downloaded.
+    Moreover, you may pass additional keyword arguments, which are forwarded to
+    :meth:`openml.datasets.get_dataset`.
 
     Parameters
     ----------
-    task_id : int or str
-        The OpenML task id.
-    download_data : bool
-        Option to trigger download of data along with the meta data.
+    task_id : int
+        The OpenML task id of the task to download.
+    download_splits: bool (default=False)
+        Whether to download the splits as well.
+    get_dataset_kwargs :
+        Keyword arguments that are forwarded to :meth:`openml.datasets.get_dataset`.
 
     Returns
     -------
-    task
+    task: OpenMLTask
     """
-    try:
-        task_id = int(task_id)
-    except (ValueError, TypeError):
-        raise ValueError("Dataset ID is neither an Integer nor can be "
-                         "cast to an Integer.")
+    if not isinstance(task_id, int):
+        raise TypeError(f"Task id should be integer, is {type(task_id)}")
 
-    tid_cache_dir = openml.utils._create_cache_directory_for_id(
-        TASKS_CACHE_DIR_NAME, task_id,
+    task_cache_directory = openml.utils._create_cache_directory_for_id(
+        TASKS_CACHE_DIR_NAME, task_id
     )
-
+    task_cache_directory_existed = task_cache_directory.exists()
     try:
         task = _get_task_description(task_id)
-        dataset = get_dataset(task.dataset_id, download_data)
-        # List of class labels availaible in dataset description
+        dataset = get_dataset(task.dataset_id, **get_dataset_kwargs)
+        # List of class labels available in dataset description
         # Including class labels as part of task meta data handles
         #   the case where data download was initially disabled
         if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
-            task.class_labels = \
-                dataset.retrieve_class_labels(task.target_name)
+            assert task.target_name is not None, (
+                "Supervised tasks must define a target feature before retrieving class labels."
+            )
+            task.class_labels = dataset.retrieve_class_labels(task.target_name)
         # Clustering tasks do not have class labels
         # and do not offer download_split
-        if download_data:
-            if isinstance(task, OpenMLSupervisedTask):
-                task.download_split()
+        if download_splits and isinstance(task, OpenMLSupervisedTask):
+            task.download_split()
     except Exception as e:
-        openml.utils._remove_cache_dir_for_id(
-            TASKS_CACHE_DIR_NAME,
-            tid_cache_dir,
-        )
+        if not task_cache_directory_existed:
+            openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, task_cache_directory)
         raise e
 
     return task
 
 
-def _get_task_description(task_id):
-
+def _get_task_description(task_id: int) -> OpenMLTask:
     try:
         return _get_cached_task(task_id)
     except OpenMLCacheException:
-        xml_file = os.path.join(
-            openml.utils._create_cache_directory_for_id(
-                TASKS_CACHE_DIR_NAME,
-                task_id,
-            ),
-            "task.xml",
-        )
-        task_xml = openml._api_calls._perform_api_call("task/%d" % task_id,
-                                                       'get')
+        _cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id)
+        xml_file = _cache_dir / "task.xml"
+        task_xml = openml._api_calls._perform_api_call(f"task/{task_id}", "get")
 
-        with io.open(xml_file, "w", encoding='utf8') as fh:
+        with xml_file.open("w", encoding="utf8") as fh:
             fh.write(task_xml)
         return _create_task_from_xml(task_xml)
 
 
-def _create_task_from_xml(xml):
+def _create_task_from_xml(xml: str) -> OpenMLTask:
     """Create a task given a xml string.
 
     Parameters
@@ -413,8 +468,8 @@ def _create_task_from_xml(xml):
     OpenMLTask
     """
     dic = xmltodict.parse(xml)["oml:task"]
-    estimation_parameters = dict()
-    inputs = dict()
+    estimation_parameters = {}
+    inputs = {}
     # Due to the unordered structure we obtain, we first have to extract
     # the possible keys of oml:input; dic["oml:input"] is a list of
     # OrderedDicts
@@ -430,64 +485,73 @@ def _create_task_from_xml(xml):
         inputs[name] = dic["oml:input"]
 
     evaluation_measures = None
-    if 'evaluation_measures' in inputs:
-        evaluation_measures = inputs["evaluation_measures"][
-            "oml:evaluation_measures"]["oml:evaluation_measure"]
+    if "evaluation_measures" in inputs:
+        evaluation_measures = inputs["evaluation_measures"]["oml:evaluation_measures"][
+            "oml:evaluation_measure"
+        ]
 
-    task_type_id = int(dic["oml:task_type_id"])
+    task_type = TaskType(int(dic["oml:task_type_id"]))
     common_kwargs = {
-        'task_id': dic["oml:task_id"],
-        'task_type': dic["oml:task_type"],
-        'task_type_id': dic["oml:task_type_id"],
-        'data_set_id': inputs["source_data"][
-            "oml:data_set"]["oml:data_set_id"],
-        'evaluation_measure': evaluation_measures,
+        "task_id": dic["oml:task_id"],
+        "task_type": dic["oml:task_type"],
+        "task_type_id": task_type,
+        "data_set_id": inputs["source_data"]["oml:data_set"]["oml:data_set_id"],
+        "evaluation_measure": evaluation_measures,
     }
-    if task_type_id in (
-        TaskTypeEnum.SUPERVISED_CLASSIFICATION,
-        TaskTypeEnum.SUPERVISED_REGRESSION,
-        TaskTypeEnum.LEARNING_CURVE
+    # TODO: add OpenMLClusteringTask?
+    if task_type in (
+        TaskType.SUPERVISED_CLASSIFICATION,
+        TaskType.SUPERVISED_REGRESSION,
+        TaskType.LEARNING_CURVE,
     ):
         # Convert some more parameters
-        for parameter in \
-                inputs["estimation_procedure"]["oml:estimation_procedure"][
-                    "oml:parameter"]:
+        for parameter in inputs["estimation_procedure"]["oml:estimation_procedure"][
+            "oml:parameter"
+        ]:
             name = parameter["@name"]
             text = parameter.get("#text", "")
             estimation_parameters[name] = text
 
-        common_kwargs['estimation_procedure_type'] = inputs[
-            "estimation_procedure"][
-            "oml:estimation_procedure"]["oml:type"]
-        common_kwargs['estimation_parameters'] = estimation_parameters
-        common_kwargs['target_name'] = inputs[
-            "source_data"]["oml:data_set"]["oml:target_feature"]
-        common_kwargs['data_splits_url'] = inputs["estimation_procedure"][
-            "oml:estimation_procedure"]["oml:data_splits_url"]
+        common_kwargs["estimation_procedure_type"] = inputs["estimation_procedure"][
+            "oml:estimation_procedure"
+        ]["oml:type"]
+        common_kwargs["estimation_procedure_id"] = int(
+            inputs["estimation_procedure"]["oml:estimation_procedure"]["oml:id"]
+        )
+
+        common_kwargs["estimation_parameters"] = estimation_parameters
+        common_kwargs["target_name"] = inputs["source_data"]["oml:data_set"]["oml:target_feature"]
+        common_kwargs["data_splits_url"] = inputs["estimation_procedure"][
+            "oml:estimation_procedure"
+        ]["oml:data_splits_url"]
 
     cls = {
-        TaskTypeEnum.SUPERVISED_CLASSIFICATION: OpenMLClassificationTask,
-        TaskTypeEnum.SUPERVISED_REGRESSION: OpenMLRegressionTask,
-        TaskTypeEnum.CLUSTERING: OpenMLClusteringTask,
-        TaskTypeEnum.LEARNING_CURVE: OpenMLLearningCurveTask,
-    }.get(task_type_id)
+        TaskType.SUPERVISED_CLASSIFICATION: OpenMLClassificationTask,
+        TaskType.SUPERVISED_REGRESSION: OpenMLRegressionTask,
+        TaskType.CLUSTERING: OpenMLClusteringTask,
+        TaskType.LEARNING_CURVE: OpenMLLearningCurveTask,
+    }.get(task_type)
     if cls is None:
-        raise NotImplementedError('Task type %s not supported.' %
-                                  common_kwargs['task_type'])
-    return cls(**common_kwargs)
+        raise NotImplementedError(
+            f"Task type '{common_kwargs['task_type']}' is not supported. "
+            f"Supported task types: SUPERVISED_CLASSIFICATION,"
+            f"SUPERVISED_REGRESSION, CLUSTERING, LEARNING_CURVE."
+            f"Please check the OpenML documentation for available task types."
+        )
+    return cls(**common_kwargs)  # type: ignore
 
 
+# TODO(eddiebergman): overload on `task_type`
 def create_task(
-        task_type_id: int,
-        dataset_id: int,
-        estimation_procedure_id: int,
-        target_name: Optional[str] = None,
-        evaluation_measure: Optional[str] = None,
-        **kwargs
-) -> Union[
-    OpenMLClassificationTask, OpenMLRegressionTask,
-    OpenMLLearningCurveTask, OpenMLClusteringTask
-]:
+    task_type: TaskType,
+    dataset_id: int,
+    estimation_procedure_id: int,
+    target_name: str | None = None,
+    evaluation_measure: str | None = None,
+    **kwargs: Any,
+) -> (
+    OpenMLClassificationTask | OpenMLRegressionTask | OpenMLLearningCurveTask | OpenMLClusteringTask
+):
     """Create a task based on different given attributes.
 
     Builds a task object with the function arguments as
@@ -500,7 +564,7 @@ def create_task(
 
     Parameters
     ----------
-    task_type_id : int
+    task_type : TaskType
         Id of the task type.
     dataset_id : int
         The id of the dataset for the task.
@@ -520,24 +584,49 @@ def create_task(
     OpenMLClassificationTask, OpenMLRegressionTask,
     OpenMLLearningCurveTask, OpenMLClusteringTask
     """
-    task_cls = {
-        TaskTypeEnum.SUPERVISED_CLASSIFICATION: OpenMLClassificationTask,
-        TaskTypeEnum.SUPERVISED_REGRESSION: OpenMLRegressionTask,
-        TaskTypeEnum.CLUSTERING: OpenMLClusteringTask,
-        TaskTypeEnum.LEARNING_CURVE: OpenMLLearningCurveTask,
-    }.get(task_type_id)
-
-    if task_cls is None:
-        raise NotImplementedError(
-            'Task type {0:d} not supported.'.format(task_type_id)
-        )
+    if task_type == TaskType.CLUSTERING:
+        task_cls = OpenMLClusteringTask
+    elif task_type == TaskType.LEARNING_CURVE:
+        task_cls = OpenMLLearningCurveTask  # type: ignore
+    elif task_type == TaskType.SUPERVISED_CLASSIFICATION:
+        task_cls = OpenMLClassificationTask  # type: ignore
+    elif task_type == TaskType.SUPERVISED_REGRESSION:
+        task_cls = OpenMLRegressionTask  # type: ignore
     else:
-        return task_cls(
-            task_type_id=task_type_id,
-            task_type=None,
-            data_set_id=dataset_id,
-            target_name=target_name,
-            estimation_procedure_id=estimation_procedure_id,
-            evaluation_measure=evaluation_measure,
-            **kwargs
+        raise NotImplementedError(
+            f"Task type ID {task_type:d} is not supported. "
+            f"Supported task type IDs: {TaskType.SUPERVISED_CLASSIFICATION.value},"
+            f"{TaskType.SUPERVISED_REGRESSION.value}, "
+            f"{TaskType.CLUSTERING.value}, {TaskType.LEARNING_CURVE.value}. "
+            f"Please refer to the TaskType enum for valid task type identifiers."
         )
+
+    return task_cls(
+        task_id=None,
+        task_type_id=task_type,
+        task_type="None",  # TODO: refactor to get task type string from ID.
+        data_set_id=dataset_id,
+        target_name=target_name,  # type: ignore
+        estimation_procedure_id=estimation_procedure_id,
+        evaluation_measure=evaluation_measure,
+        **kwargs,
+    )
+
+
+def delete_task(task_id: int) -> bool:
+    """Delete task with id `task_id` from the OpenML server.
+
+    You can only delete tasks which you created and have
+    no runs associated with them.
+
+    Parameters
+    ----------
+    task_id : int
+        OpenML id of the task
+
+    Returns
+    -------
+    bool
+        True if the deletion was successful. False otherwise.
+    """
+    return openml.utils._delete_entity("task", task_id)
diff --git a/openml/tasks/split.py b/openml/tasks/split.py
index 3815f4257..464e41b2a 100644
--- a/openml/tasks/split.py
+++ b/openml/tasks/split.py
@@ -1,106 +1,127 @@
-from collections import namedtuple, OrderedDict
-import os
+# License: BSD 3-Clause
+from __future__ import annotations
+
 import pickle
+from collections import OrderedDict
+from pathlib import Path
+from typing import Any
+from typing_extensions import NamedTuple
 
+import arff  # type: ignore
 import numpy as np
-import arff
 
 
-Split = namedtuple("Split", ["train", "test"])
+class Split(NamedTuple):
+    """A single split of a dataset."""
+
+    train: np.ndarray
+    test: np.ndarray
 
 
-class OpenMLSplit(object):
+class OpenMLSplit:  # noqa: PLW1641
     """OpenML Split object.
 
-       Parameters
-       ----------
-       name : int or str
-       description : str
-       split : dict
+    This class manages train-test splits for a dataset across multiple
+    repetitions, folds, and samples.
+
+    Parameters
+    ----------
+    name : int or str
+        The name or ID of the split.
+    description : str
+        A description of the split.
+    split : dict
+        A dictionary containing the splits organized by repetition, fold,
+        and sample.
     """
 
-    def __init__(self, name, description, split):
+    def __init__(
+        self,
+        name: int | str,
+        description: str,
+        split: dict[int, dict[int, dict[int, tuple[np.ndarray, np.ndarray]]]],
+    ):
         self.description = description
         self.name = name
-        self.split = dict()
+        self.split: dict[int, dict[int, dict[int, tuple[np.ndarray, np.ndarray]]]] = {}
 
         # Add splits according to repetition
         for repetition in split:
-            repetition = int(repetition)
-            self.split[repetition] = OrderedDict()
-            for fold in split[repetition]:
-                self.split[repetition][fold] = OrderedDict()
-                for sample in split[repetition][fold]:
-                    self.split[repetition][fold][sample] = split[
-                        repetition][fold][sample]
+            _rep = int(repetition)
+            self.split[_rep] = OrderedDict()
+            for fold in split[_rep]:
+                self.split[_rep][fold] = OrderedDict()
+                for sample in split[_rep][fold]:
+                    self.split[_rep][fold][sample] = split[_rep][fold][sample]
 
         self.repeats = len(self.split)
-        if any([len(self.split[0]) != len(self.split[i])
-                for i in range(self.repeats)]):
-            raise ValueError('')
+
+        # TODO(eddiebergman): Better error message
+        if any(len(self.split[0]) != len(self.split[i]) for i in range(self.repeats)):
+            raise ValueError("")
+
         self.folds = len(self.split[0])
         self.samples = len(self.split[0][0])
 
-    def __eq__(self, other):
-        if (type(self) != type(other)
-                or self.name != other.name
-                or self.description != other.description
-                or self.split.keys() != other.split.keys()):
-            return False
-
-        if any(self.split[repetition].keys() != other.split[repetition].keys()
-                for repetition in self.split):
+    def __eq__(self, other: Any) -> bool:
+        if (
+            (not isinstance(self, type(other)))
+            or self.name != other.name
+            or self.description != other.description
+            or self.split.keys() != other.split.keys()
+            or any(
+                self.split[repetition].keys() != other.split[repetition].keys()
+                for repetition in self.split
+            )
+        ):
             return False
 
-        samples = [(repetition, fold, sample)
-                   for repetition in self.split
-                   for fold in self.split[repetition]
-                   for sample in self.split[repetition][fold]]
+        samples = [
+            (repetition, fold, sample)
+            for repetition in self.split
+            for fold in self.split[repetition]
+            for sample in self.split[repetition][fold]
+        ]
 
         for repetition, fold, sample in samples:
             self_train, self_test = self.split[repetition][fold][sample]
             other_train, other_test = other.split[repetition][fold][sample]
-            if not (np.all(self_train == other_train)
-                    and np.all(self_test == other_test)):
+            if not (np.all(self_train == other_train) and np.all(self_test == other_test)):
                 return False
         return True
 
     @classmethod
-    def _from_arff_file(cls, filename: str) -> 'OpenMLSplit':
-
+    def _from_arff_file(cls, filename: Path) -> OpenMLSplit:  # noqa: C901, PLR0912
         repetitions = None
+        name = None
 
-        pkl_filename = filename.replace(".arff", ".pkl.py3")
+        pkl_filename = filename.with_suffix(".pkl.py3")
 
-        if os.path.exists(pkl_filename):
-            with open(pkl_filename, "rb") as fh:
-                _ = pickle.load(fh)
-            repetitions = _["repetitions"]
-            name = _["name"]
+        if pkl_filename.exists():
+            with pkl_filename.open("rb") as fh:
+                # TODO(eddiebergman): Would be good to figure out what _split is and assert it is
+                _split = pickle.load(fh)  # noqa: S301
+            repetitions = _split["repetitions"]
+            name = _split["name"]
 
         # Cache miss
         if repetitions is None:
             # Faster than liac-arff and sufficient in this situation!
-            if not os.path.exists(filename):
-                raise FileNotFoundError(
-                    'Split arff %s does not exist!' % filename
-                )
-            file_data = arff.load(open(filename), return_type=arff.DENSE_GEN)
-            splits = file_data['data']
-            name = file_data['relation']
-            attrnames = [attr[0] for attr in file_data['attributes']]
+            if not filename.exists():
+                raise FileNotFoundError(f"Split arff {filename} does not exist!")
+
+            file_data = arff.load(filename.open("r"), return_type=arff.DENSE_GEN)
+            splits = file_data["data"]
+            name = file_data["relation"]
+            attrnames = [attr[0] for attr in file_data["attributes"]]
 
             repetitions = OrderedDict()
 
-            type_idx = attrnames.index('type')
-            rowid_idx = attrnames.index('rowid')
-            repeat_idx = attrnames.index('repeat')
-            fold_idx = attrnames.index('fold')
-            sample_idx = (
-                attrnames.index('sample')
-                if 'sample' in attrnames
-                else None
-            )
+            type_idx = attrnames.index("type")
+            rowid_idx = attrnames.index("rowid")
+            repeat_idx = attrnames.index("repeat")
+            fold_idx = attrnames.index("fold")
+            sample_idx = attrnames.index("sample") if "sample" in attrnames else None
 
             for line in splits:
                 # A line looks like type, rowid, repeat, fold
@@ -119,9 +140,9 @@ def _from_arff_file(cls, filename: str) -> 'OpenMLSplit':
                 split = repetitions[repetition][fold][sample]
 
                 type_ = line[type_idx]
-                if type_ == 'TRAIN':
+                if type_ == "TRAIN":
                     split[0].append(line[rowid_idx])
-                elif type_ == 'TEST':
+                elif type_ == "TEST":
                     split[1].append(line[rowid_idx])
                 else:
                     raise ValueError(type_)
@@ -130,25 +151,42 @@ def _from_arff_file(cls, filename: str) -> 'OpenMLSplit':
                 for fold in repetitions[repetition]:
                     for sample in repetitions[repetition][fold]:
                         repetitions[repetition][fold][sample] = Split(
-                            np.array(repetitions[repetition][fold][sample][0],
-                                     dtype=np.int32),
-                            np.array(repetitions[repetition][fold][sample][1],
-                                     dtype=np.int32))
-
-            with open(pkl_filename, "wb") as fh:
-                pickle.dump({"name": name, "repetitions": repetitions}, fh,
-                            protocol=2)
-
-        return cls(name, '', repetitions)
-
-    def from_dataset(self, X, Y, folds, repeats):
-        raise NotImplementedError()
-
-    def get(self, repeat=0, fold=0, sample=0):
+                            np.array(repetitions[repetition][fold][sample][0], dtype=np.int32),
+                            np.array(repetitions[repetition][fold][sample][1], dtype=np.int32),
+                        )
+
+            with pkl_filename.open("wb") as fh:
+                pickle.dump({"name": name, "repetitions": repetitions}, fh, protocol=2)
+
+        assert name is not None
+        return cls(name, "", repetitions)
+
+    def get(self, repeat: int = 0, fold: int = 0, sample: int = 0) -> tuple[np.ndarray, np.ndarray]:
+        """Return the specified data split from this OpenMLSplit object.
+
+        Parameters
+        ----------
+        repeat : int
+            Index of the repeat to retrieve.
+        fold : int
+            Index of the fold to retrieve.
+        sample : int
+            Index of the sample to retrieve.
+
+        Returns
+        -------
+        tuple[numpy.ndarray, numpy.ndarray]
+            The train and test parts of the split for the specified repeat, fold, and sample.
+
+        Raises
+        ------
+        ValueError
+            If the specified repeat, fold, or sample is not known.
+        """
         if repeat not in self.split:
-            raise ValueError("Repeat %s not known" % str(repeat))
+            raise ValueError(f"Repeat {repeat!s} not known")
         if fold not in self.split[repeat]:
-            raise ValueError("Fold %s not known" % str(fold))
+            raise ValueError(f"Fold {fold!s} not known")
         if sample not in self.split[repeat][fold]:
-            raise ValueError("Sample %s not known" % str(sample))
+            raise ValueError(f"Sample {sample!s} not known")
         return self.split[repeat][fold][sample]
diff --git a/openml/tasks/task.py b/openml/tasks/task.py
index 83af79373..ab3cb3da4 100644
--- a/openml/tasks/task.py
+++ b/openml/tasks/task.py
@@ -1,271 +1,290 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
+import logging
+import warnings
 from abc import ABC
-from collections import OrderedDict
-import io
-import os
-from typing import Union, Tuple, Dict, List, Optional
-from warnings import warn
+from collections.abc import Sequence
+from enum import Enum
+from typing import TYPE_CHECKING, Any, ClassVar
+from typing_extensions import TypedDict
 
-import numpy as np
-import pandas as pd
-import scipy.sparse
-import xmltodict
+import arff
 
 import openml._api_calls
-from .. import datasets
+from openml import datasets
+from openml.base import OpenMLBase
+from openml.utils import _create_cache_directory_for_id
+
 from .split import OpenMLSplit
-from ..utils import _create_cache_directory_for_id, _tag_entity
 
+if TYPE_CHECKING:
+    import numpy as np
+    import pandas as pd
+
+
+logger = logging.getLogger(__name__)
+
+
+# TODO(eddiebergman): Should use `auto()` but might be too late if these numbers are used
+# and stored on server.
+class TaskType(Enum):
+    """Possible task types as defined in OpenML."""
 
-class OpenMLTask(ABC):
+    SUPERVISED_CLASSIFICATION = 1
+    SUPERVISED_REGRESSION = 2
+    LEARNING_CURVE = 3
+    SUPERVISED_DATASTREAM_CLASSIFICATION = 4
+    CLUSTERING = 5
+    MACHINE_LEARNING_CHALLENGE = 6
+    SURVIVAL_ANALYSIS = 7
+    SUBGROUP_DISCOVERY = 8
+    MULTITASK_REGRESSION = 9
+
+
+class _EstimationProcedure(TypedDict):
+    type: str | None
+    parameters: dict[str, str] | None
+    data_splits_url: str | None
+
+
+class OpenMLTask(OpenMLBase):
     """OpenML Task object.
 
-       Parameters
-       ----------
-       task_type_id : int
-           Refers to the type of task.
-       task_type : str
-           Refers to the task.
-       data_set_id: int
-           Refers to the data.
-       estimation_procedure_id: int
-           Refers to the type of estimates used.
+    Parameters
+    ----------
+    task_id: Union[int, None]
+        Refers to the unique identifier of OpenML task.
+    task_type_id: TaskType
+        Refers to the type of OpenML task.
+    task_type: str
+        Refers to the OpenML task.
+    data_set_id: int
+        Refers to the data.
+    estimation_procedure_id: int
+        Refers to the type of estimates used.
+    estimation_procedure_type: str, default=None
+        Refers to the type of estimation procedure used for the OpenML task.
+    estimation_parameters: dict[str, str], default=None
+        Estimation parameters used for the OpenML task.
+    evaluation_measure: str, default=None
+        Refers to the evaluation measure.
+    data_splits_url: str, default=None
+        Refers to the URL of the data splits used for the OpenML task.
     """
-    def __init__(
-            self,
-            task_id: Optional[int],
-            task_type_id: int,
-            task_type: str,
-            data_set_id: int,
-            estimation_procedure_id: int = 1,
-            estimation_procedure_type: Optional[str] = None,
-            estimation_parameters: Optional[Dict[str, str]] = None,
-            evaluation_measure: Optional[str] = None,
-            data_splits_url: Optional[str] = None,
-    ):
 
+    DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1
+
+    def __init__(  # noqa: PLR0913
+        self,
+        task_id: int | None,
+        task_type_id: TaskType,
+        task_type: str,
+        data_set_id: int,
+        estimation_procedure_id: int | None = None,
+        estimation_procedure_type: str | None = None,
+        estimation_parameters: dict[str, str] | None = None,
+        evaluation_measure: str | None = None,
+        data_splits_url: str | None = None,
+        target_name: str | None = None,
+    ):
         self.task_id = int(task_id) if task_id is not None else None
-        self.task_type_id = int(task_type_id)
+        self.task_type_id = task_type_id
         self.task_type = task_type
         self.dataset_id = int(data_set_id)
+        self.target_name = target_name
+        resolved_estimation_procedure_id = self._resolve_estimation_procedure_id(
+            estimation_procedure_id,
+        )
         self.evaluation_measure = evaluation_measure
-        self.estimation_procedure = dict()  # type: Dict[str, Optional[Union[str, Dict]]] # noqa E501
-        self.estimation_procedure["type"] = estimation_procedure_type
-        self.estimation_procedure["parameters"] = estimation_parameters
-        self.estimation_procedure["data_splits_url"] = data_splits_url
-        self.estimation_procedure_id = estimation_procedure_id
-        self.split = None  # type: Optional[OpenMLSplit]
-
-    def __repr__(self):
-        header = "OpenML Task"
-        header = '{}\n{}\n'.format(header, '=' * len(header))
-
-        base_url = "{}".format(openml.config.server[:-len('api/v1/xml')])
-        fields = {"Task Type": self.task_type}
+        self.estimation_procedure: _EstimationProcedure = {
+            "type": estimation_procedure_type,
+            "parameters": estimation_parameters,
+            "data_splits_url": data_splits_url,
+        }
+        self.estimation_procedure_id = resolved_estimation_procedure_id
+        self.split: OpenMLSplit | None = None
+
+    def _resolve_estimation_procedure_id(self, estimation_procedure_id: int | None) -> int:
+        return (
+            estimation_procedure_id
+            if estimation_procedure_id is not None
+            else self.DEFAULT_ESTIMATION_PROCEDURE_ID
+        )
+
+    @classmethod
+    def _entity_letter(cls) -> str:
+        return "t"
+
+    @property
+    def id(self) -> int | None:
+        """Return the OpenML ID of this task."""
+        return self.task_id
+
+    def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]:
+        """Collect all information to display in the __repr__ body."""
+        base_server_url = openml.config.get_server_base_url()
+        fields: dict[str, Any] = {
+            "Task Type Description": f"{base_server_url}/tt/{self.task_type_id}"
+        }
         if self.task_id is not None:
             fields["Task ID"] = self.task_id
-            fields["Task URL"] = "{}t/{}".format(base_url, self.task_id)
+            fields["Task URL"] = self.openml_url
         if self.evaluation_measure is not None:
             fields["Evaluation Measure"] = self.evaluation_measure
         if self.estimation_procedure is not None:
-            fields["Estimation Procedure"] = self.estimation_procedure['type']
-        if self.target_name is not None:
-            fields["Target Feature"] = self.target_name
-            if hasattr(self, 'class_labels'):
-                fields["# of Classes"] = len(self.class_labels)
-            if hasattr(self, 'cost_matrix'):
-                fields["Cost Matrix"] = "Available"
+            fields["Estimation Procedure"] = self.estimation_procedure["type"]
 
-        # determines the order in which the information will be printed
-        order = ["Task Type", "Task ID", "Task URL", "Estimation Procedure", "Evaluation Measure",
-                 "Target Feature", "# of Classes", "Cost Matrix"]
-        fields = [(key, fields[key]) for key in order if key in fields]
+        # TODO(eddiebergman): Subclasses could advertise/provide this, instead of having to
+        # have the base class know about its subclasses.
+        target_name = getattr(self, "target_name", None)
+        if target_name is not None:
+            fields["Target Feature"] = target_name
 
-        longest_field_name_length = max(len(name) for name, value in fields)
-        field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length)
-        body = '\n'.join(field_line_format.format(name, value) for name, value in fields)
-        return header + body
+            class_labels = getattr(self, "class_labels", None)
+            if class_labels is not None:
+                fields["# of Classes"] = len(class_labels)
 
-    def get_dataset(self) -> datasets.OpenMLDataset:
-        """Download dataset associated with task"""
-        return datasets.get_dataset(self.dataset_id)
+            cost_matrix = getattr(self, "cost_matrix", None)
+            if cost_matrix is not None:
+                fields["Cost Matrix"] = "Available"
 
-    def get_train_test_split_indices(
-            self,
-            fold: int = 0,
-            repeat: int = 0,
-            sample: int = 0,
-    ) -> Tuple[np.ndarray, np.ndarray]:
+        # determines the order in which the information will be printed
+        order = [
+            "Task Type Description",
+            "Task ID",
+            "Task URL",
+            "Estimation Procedure",
+            "Evaluation Measure",
+            "Target Feature",
+            "# of Classes",
+            "Cost Matrix",
+        ]
+        return [(key, fields[key]) for key in order if key in fields]
+
+    def get_dataset(self, **kwargs: Any) -> datasets.OpenMLDataset:
+        """Download dataset associated with task.
+
+        Accepts the same keyword arguments as the `openml.datasets.get_dataset`.
+        """
+        return datasets.get_dataset(self.dataset_id, **kwargs)
 
+    def get_train_test_split_indices(
+        self,
+        fold: int = 0,
+        repeat: int = 0,
+        sample: int = 0,
+    ) -> tuple[np.ndarray, np.ndarray]:
+        """Get the indices of the train and test splits for a given task."""
         # Replace with retrieve from cache
         if self.split is None:
             self.split = self.download_split()
 
-        train_indices, test_indices = self.split.get(
-            repeat=repeat,
-            fold=fold,
-            sample=sample,
-        )
-        return train_indices, test_indices
-
-    def _download_split(self, cache_file: str):
-        try:
-            with io.open(cache_file, encoding='utf8'):
-                pass
-        except (OSError, IOError):
-            split_url = self.estimation_procedure["data_splits_url"]
-            split_arff = openml._api_calls._read_url(split_url,
-                                                     request_method='get')
-
-            with io.open(cache_file, "w", encoding='utf8') as fh:
-                fh.write(split_arff)
-            del split_arff
+        return self.split.get(repeat=repeat, fold=fold, sample=sample)
 
     def download_split(self) -> OpenMLSplit:
-        """Download the OpenML split for a given task.
-        """
-        cached_split_file = os.path.join(
-            _create_cache_directory_for_id('tasks', self.task_id),
-            "datasplits.arff",
-        )
+        """Download the OpenML split for a given task."""
+        # TODO(eddiebergman): Can this ever be `None`?
+        assert self.task_id is not None
+        cache_dir = _create_cache_directory_for_id("tasks", self.task_id)
+        cached_split_file = cache_dir / "datasplits.arff"
 
         try:
             split = OpenMLSplit._from_arff_file(cached_split_file)
-        except (OSError, IOError):
+            logger.debug("Loaded file from cache: %s", str(cached_split_file))
+        except (OSError, arff.BadDataFormat):
+            logger.info("Failed to load file from cache: %s", str(cached_split_file))
+            if cached_split_file.exists():
+                logger.debug("Cleaning up old file")
+                cached_split_file.unlink()
             # Next, download and cache the associated split file
-            self._download_split(cached_split_file)
+            split_url = self.estimation_procedure["data_splits_url"]
+            openml._api_calls._download_text_file(
+                source=str(split_url),
+                output_path=str(cached_split_file),
+            )
+            if cached_split_file.exists():
+                logger.info("New file created of size %d", cached_split_file.stat().st_size)
+            else:
+                logger.info("Failed to create new file")
+
             split = OpenMLSplit._from_arff_file(cached_split_file)
 
         return split
 
-    def get_split_dimensions(self) -> Tuple[int, int, int]:
-
+    def get_split_dimensions(self) -> tuple[int, int, int]:
+        """Get the (repeats, folds, samples) of the split for a given task."""
         if self.split is None:
             self.split = self.download_split()
 
         return self.split.repeats, self.split.folds, self.split.samples
 
-    def push_tag(self, tag: str):
-        """Annotates this task with a tag on the server.
-
-        Parameters
-        ----------
-        tag : str
-            Tag to attach to the task.
-        """
-        _tag_entity('task', self.task_id, tag)
-
-    def remove_tag(self, tag: str):
-        """Removes a tag from this task on the server.
-
-        Parameters
-        ----------
-        tag : str
-            Tag to attach to the task.
-        """
-        _tag_entity('task', self.task_id, tag, untag=True)
-
-    def _to_dict(self) -> 'OrderedDict[str, OrderedDict]':
-
-        task_container = OrderedDict()  # type: OrderedDict[str, OrderedDict]
-        task_dict = OrderedDict([
-            ('@xmlns:oml', 'http://openml.org/openml')
-        ])  # type: OrderedDict[str, Union[List, str, int]]
-
-        task_container['oml:task_inputs'] = task_dict
-        task_dict['oml:task_type_id'] = self.task_type_id
-
-        # having task_inputs and adding a type annotation
-        # solves wrong warnings
-        task_inputs = [
-            OrderedDict([
-                ('@name', 'source_data'),
-                ('#text', str(self.dataset_id))
-            ]),
-            OrderedDict([
-                ('@name', 'estimation_procedure'),
-                ('#text', str(self.estimation_procedure_id))
-            ])
-        ]  # type: List[OrderedDict]
-
+    # TODO(eddiebergman): Really need some better typing on all this
+    def _to_dict(self) -> dict[str, dict[str, int | str | list[dict[str, Any]]]]:
+        """Creates a dictionary representation of self in a string format (for XML parsing)."""
+        oml_input = [
+            {"@name": "source_data", "#text": str(self.dataset_id)},
+            {"@name": "estimation_procedure", "#text": str(self.estimation_procedure_id)},
+        ]
         if self.evaluation_measure is not None:
-            task_inputs.append(
-                OrderedDict([
-                    ('@name', 'evaluation_measures'),
-                    ('#text', self.evaluation_measure)
-                ])
-            )
-
-        task_dict['oml:input'] = task_inputs
-
-        return task_container
-
-    def _to_xml(self) -> str:
-        """Generate xml representation of self for upload to server.
+            oml_input.append({"@name": "evaluation_measures", "#text": self.evaluation_measure})
 
-        Returns
-        -------
-        str
-            Task represented as XML string.
-        """
-        task_dict = self._to_dict()
-        task_xml = xmltodict.unparse(task_dict, pretty=True)
-
-        # A task may not be uploaded with the xml encoding specification:
-        # <?xml version="1.0" encoding="utf-8"?>
-        task_xml = task_xml.split('\n', 1)[-1]
-
-        return task_xml
-
-    def publish(self) -> int:
-        """Publish task to OpenML server.
-
-        Returns
-        -------
-        task_id: int
-            Returns the id of the uploaded task
-            if successful.
-
-        """
-
-        xml_description = self._to_xml()
-
-        file_elements = {'description': xml_description}
-
-        return_value = openml._api_calls._perform_api_call(
-            "task/",
-            'post',
-            file_elements=file_elements,
-        )
+        return {
+            "oml:task_inputs": {
+                "@xmlns:oml": "http://openml.org/openml",
+                "oml:task_type_id": self.task_type_id.value,  # This is an int from the enum?
+                "oml:input": oml_input,
+            }
+        }
 
-        task_id = int(xmltodict.parse(return_value)['oml:upload_task']['oml:id'])
-
-        return task_id
+    def _parse_publish_response(self, xml_response: dict) -> None:
+        """Parse the id from the xml_response and assign it to self."""
+        self.task_id = int(xml_response["oml:upload_task"]["oml:id"])
 
 
 class OpenMLSupervisedTask(OpenMLTask, ABC):
     """OpenML Supervised Classification object.
 
-       Inherited from :class:`openml.OpenMLTask`
-
-       Parameters
-       ----------
-       target_name : str
-           Name of the target feature (the class variable).
+    Parameters
+    ----------
+    task_type_id : TaskType
+        ID of the task type.
+    task_type : str
+        Name of the task type.
+    data_set_id : int
+        ID of the OpenML dataset associated with the task.
+    target_name : str
+        Name of the target feature (the class variable).
+    estimation_procedure_id : int, default=None
+        ID of the estimation procedure for the task.
+    estimation_procedure_type : str, default=None
+        Type of the estimation procedure for the task.
+    estimation_parameters : dict, default=None
+        Estimation parameters for the task.
+    evaluation_measure : str, default=None
+        Name of the evaluation measure for the task.
+    data_splits_url : str, default=None
+        URL of the data splits for the task.
+    task_id: Union[int, None]
+        Refers to the unique identifier of task.
     """
-    def __init__(
-            self,
-            task_type_id: int,
-            task_type: str,
-            data_set_id: int,
-            target_name: str,
-            estimation_procedure_id: int = 1,
-            estimation_procedure_type: Optional[str] = None,
-            estimation_parameters: Optional[Dict[str, str]] = None,
-            evaluation_measure: Optional[str] = None,
-            data_splits_url: Optional[str] = None,
-            task_id: Optional[int] = None,
+
+    DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1
+
+    def __init__(  # noqa: PLR0913
+        self,
+        task_type_id: TaskType,
+        task_type: str,
+        data_set_id: int,
+        target_name: str,
+        estimation_procedure_id: int | None = None,
+        estimation_procedure_type: str | None = None,
+        estimation_parameters: dict[str, str] | None = None,
+        evaluation_measure: str | None = None,
+        data_splits_url: str | None = None,
+        task_id: int | None = None,
     ):
-        super(OpenMLSupervisedTask, self).__init__(
+        super().__init__(
             task_id=task_id,
             task_type_id=task_type_id,
             task_type=task_type,
@@ -275,216 +294,201 @@ def __init__(
             estimation_parameters=estimation_parameters,
             evaluation_measure=evaluation_measure,
             data_splits_url=data_splits_url,
+            target_name=target_name,
         )
 
-        self.target_name = target_name
-
-    def get_X_and_y(
-        self,
-        dataset_format: str = 'array',
-    ) -> Tuple[
-        Union[np.ndarray, pd.DataFrame, scipy.sparse.spmatrix],
-        Union[np.ndarray, pd.Series]
-    ]:
+    def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]:
         """Get data associated with the current task.
 
-        Parameters
-        ----------
-        dataset_format : str
-            Data structure of the returned data. See :meth:`openml.datasets.OpenMLDataset.get_data`
-            for possible options.
-
         Returns
         -------
         tuple - X and y
 
         """
         dataset = self.get_dataset()
-        if self.task_type_id not in (1, 2, 3):
-            raise NotImplementedError(self.task_type)
-        X, y, _, _ = dataset.get_data(
-            dataset_format=dataset_format, target=self.target_name,
-        )
+        if self.task_type_id not in (
+            TaskType.SUPERVISED_CLASSIFICATION,
+            TaskType.SUPERVISED_REGRESSION,
+            TaskType.LEARNING_CURVE,
+        ):
+            raise NotImplementedError(
+                f"Task type '{self.task_type}' is not implemented for get_X_and_y(). "
+                "Supported types: SUPERVISED_CLASSIFICATION, SUPERVISED_REGRESSION, "
+                "LEARNING_CURVE. "
+                f"Task ID: {getattr(self, 'task_id', 'unknown')}."
+            )
+
+        X, y, _, _ = dataset.get_data(target=self.target_name)
         return X, y
 
-    def _to_dict(self) -> 'OrderedDict[str, OrderedDict]':
-
-        task_container = super(OpenMLSupervisedTask, self)._to_dict()
-        task_dict = task_container['oml:task_inputs']
-
-        task_dict['oml:input'].append(
-            OrderedDict([
-                ('@name', 'target_feature'),
-                ('#text', self.target_name)
-            ])
-        )
+    def _to_dict(self) -> dict[str, dict]:
+        task_container = super()._to_dict()
+        oml_input = task_container["oml:task_inputs"]["oml:input"]  # type: ignore
+        assert isinstance(oml_input, list)
 
+        oml_input.append({"@name": "target_feature", "#text": self.target_name})
         return task_container
 
     @property
-    def estimation_parameters(self):
-
-        warn(
+    def estimation_parameters(self) -> dict[str, str] | None:
+        """Return the estimation parameters for the task."""
+        warnings.warn(
             "The estimation_parameters attribute will be "
             "deprecated in the future, please use "
             "estimation_procedure['parameters'] instead",
-            PendingDeprecationWarning
+            PendingDeprecationWarning,
+            stacklevel=2,
         )
         return self.estimation_procedure["parameters"]
 
     @estimation_parameters.setter
-    def estimation_parameters(self, est_parameters):
-
+    def estimation_parameters(self, est_parameters: dict[str, str] | None) -> None:
         self.estimation_procedure["parameters"] = est_parameters
 
 
 class OpenMLClassificationTask(OpenMLSupervisedTask):
     """OpenML Classification object.
 
-       Inherited from :class:`openml.OpenMLSupervisedTask`
-
-       Parameters
-       ----------
-       class_labels : List of str (optional)
-       cost_matrix: array (optional)
+    Parameters
+    ----------
+    task_id : Union[int, None]
+        ID of the Classification task (if it already exists on OpenML).
+    task_type_id : TaskType
+        ID of the Classification task type.
+    task_type : str
+        Name of the Classification task type.
+    data_set_id : int
+        ID of the OpenML dataset associated with the Classification task.
+    target_name : str
+        Name of the target variable.
+    estimation_procedure_id : int, default=1
+        ID of the estimation procedure for the Classification task.
+    estimation_procedure_type : str, default=None
+        Type of the estimation procedure.
+    estimation_parameters : dict, default=None
+        Estimation parameters for the Classification task.
+    evaluation_measure : str, default=None
+        Name of the evaluation measure.
+    data_splits_url : str, default=None
+        URL of the data splits for the Classification task.
+    class_labels : List of str, default=None
+        A list of class labels (for classification tasks).
+    cost_matrix : array, default=None
+        A cost matrix (for classification tasks).
     """
-    def __init__(
-            self,
-            task_type_id: int,
-            task_type: str,
-            data_set_id: int,
-            target_name: str,
-            estimation_procedure_id: int = 1,
-            estimation_procedure_type: Optional[str] = None,
-            estimation_parameters: Optional[Dict[str, str]] = None,
-            evaluation_measure: Optional[str] = None,
-            data_splits_url: Optional[str] = None,
-            task_id: Optional[int] = None,
-            class_labels: Optional[List[str]] = None,
-            cost_matrix: Optional[np.ndarray] = None,
-    ):
 
-        super(OpenMLClassificationTask, self).__init__(
-            task_id=task_id,
+    DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1
+
+    def __init__(  # noqa: PLR0913
+        self,
+        task_type_id: TaskType,
+        task_type: str,
+        data_set_id: int,
+        target_name: str,
+        estimation_procedure_id: int | None = None,
+        estimation_procedure_type: str | None = None,
+        estimation_parameters: dict[str, str] | None = None,
+        evaluation_measure: str | None = None,
+        data_splits_url: str | None = None,
+        task_id: int | None = None,
+        class_labels: list[str] | None = None,
+        cost_matrix: np.ndarray | None = None,
+    ):
+        super().__init__(
             task_type_id=task_type_id,
             task_type=task_type,
             data_set_id=data_set_id,
+            target_name=target_name,
             estimation_procedure_id=estimation_procedure_id,
             estimation_procedure_type=estimation_procedure_type,
             estimation_parameters=estimation_parameters,
             evaluation_measure=evaluation_measure,
-            target_name=target_name,
             data_splits_url=data_splits_url,
+            task_id=task_id,
         )
         self.class_labels = class_labels
         self.cost_matrix = cost_matrix
-
         if cost_matrix is not None:
-            raise NotImplementedError("Costmatrix")
+            raise NotImplementedError("Costmatrix functionality is not yet implemented.")
 
 
 class OpenMLRegressionTask(OpenMLSupervisedTask):
     """OpenML Regression object.
 
-       Inherited from :class:`openml.OpenMLSupervisedTask`
+    Parameters
+    ----------
+    task_id : Union[int, None]
+        ID of the OpenML Regression task.
+    task_type_id : TaskType
+        Task type ID of the OpenML Regression task.
+    task_type : str
+        Task type of the OpenML Regression task.
+    data_set_id : int
+        ID of the OpenML dataset.
+    target_name : str
+        Name of the target feature used in the Regression task.
+    estimation_procedure_id : int, default=7
+        ID of the OpenML estimation procedure.
+    estimation_procedure_type : str, default=None
+        Type of the OpenML estimation procedure.
+    estimation_parameters : dict, default=None
+        Parameters used by the OpenML estimation procedure.
+    data_splits_url : str, default=None
+        URL of the OpenML data splits for the Regression task.
+    evaluation_measure : str, default=None
+        Evaluation measure used in the Regression task.
     """
-    def __init__(
-            self,
-            task_type_id: int,
-            task_type: str,
-            data_set_id: int,
-            target_name: str,
-            estimation_procedure_id: int = 7,
-            estimation_procedure_type: Optional[str] = None,
-            estimation_parameters: Optional[Dict[str, str]] = None,
-            data_splits_url: Optional[str] = None,
-            task_id: Optional[int] = None,
-            evaluation_measure: Optional[str] = None,
-    ):
-        super(OpenMLRegressionTask, self).__init__(
-            task_id=task_id,
-            task_type_id=task_type_id,
-            task_type=task_type,
-            data_set_id=data_set_id,
-            estimation_procedure_id=estimation_procedure_id,
-            estimation_procedure_type=estimation_procedure_type,
-            estimation_parameters=estimation_parameters,
-            evaluation_measure=evaluation_measure,
-            target_name=target_name,
-            data_splits_url=data_splits_url,
-        )
+
+    DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 7
 
 
 class OpenMLClusteringTask(OpenMLTask):
     """OpenML Clustering object.
 
-       Inherited from :class:`openml.OpenMLTask`
-
-       Parameters
-       ----------
-       target_name : str (optional)
-           Name of the target feature (class) that is not part of the
-           feature set for the clustering task.
+    Parameters
+    ----------
+    task_id : Union[int, None]
+        ID of the OpenML clustering task.
+    task_type_id : TaskType
+        Task type ID of the OpenML clustering task.
+    task_type : str
+        Task type of the OpenML clustering task.
+    data_set_id : int
+        ID of the OpenML dataset used in the clustering task.
+    estimation_procedure_id : int, default=17
+        ID of the OpenML estimation procedure.
+    estimation_procedure_type : str, default=None
+        Type of the OpenML estimation procedure used in the clustering task.
+    estimation_parameters : dict, default=None
+        Parameters used by the OpenML estimation procedure.
+    data_splits_url : str, default=None
+        URL of the OpenML data splits for the clustering task.
+    evaluation_measure : str, default=None
+        Evaluation measure used in the clustering task.
+    target_name : str, default=None
+        Name of the target feature (class) that is not part of the
+        feature set for the clustering task.
     """
-    def __init__(
-            self,
-            task_type_id: int,
-            task_type: str,
-            data_set_id: int,
-            estimation_procedure_id: int = 17,
-            task_id: Optional[int] = None,
-            estimation_procedure_type: Optional[str] = None,
-            estimation_parameters: Optional[Dict[str, str]] = None,
-            data_splits_url: Optional[str] = None,
-            evaluation_measure: Optional[str] = None,
-            target_name: Optional[str] = None,
-    ):
-        super(OpenMLClusteringTask, self).__init__(
-            task_id=task_id,
-            task_type_id=task_type_id,
-            task_type=task_type,
-            data_set_id=data_set_id,
-            evaluation_measure=evaluation_measure,
-            estimation_procedure_id=estimation_procedure_id,
-            estimation_procedure_type=estimation_procedure_type,
-            estimation_parameters=estimation_parameters,
-            data_splits_url=data_splits_url,
-        )
 
-        self.target_name = target_name
+    DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 17
 
-    def get_X(
-        self,
-        dataset_format: str = 'array',
-    ) -> Union[np.ndarray, pd.DataFrame, scipy.sparse.spmatrix]:
+    def get_X(self) -> pd.DataFrame:
         """Get data associated with the current task.
 
-        Parameters
-        ----------
-        dataset_format : str
-            Data structure of the returned data. See :meth:`openml.datasets.OpenMLDataset.get_data`
-            for possible options.
-
         Returns
         -------
-        tuple - X and y
-
+        The X data as a dataframe
         """
         dataset = self.get_dataset()
-        data, *_ = dataset.get_data(
-            dataset_format=dataset_format, target=None,
-        )
+        data, *_ = dataset.get_data(target=None)
         return data
 
-    def _to_dict(self) -> 'OrderedDict[str, OrderedDict]':
-
-        task_container = super(OpenMLClusteringTask, self)._to_dict()
-
+    def _to_dict(self) -> dict[str, dict[str, int | str | list[dict[str, Any]]]]:
         # Right now, it is not supported as a feature.
         # Uncomment if it is supported on the server
         # in the future.
         # https://github.com/openml/OpenML/issues/925
-        '''
+        """
         task_dict = task_container['oml:task_inputs']
         if self.target_name is not None:
             task_dict['oml:input'].append(
@@ -493,52 +497,39 @@ def _to_dict(self) -> 'OrderedDict[str, OrderedDict]':
                     ('#text', self.target_name)
                 ])
             )
-        '''
-        return task_container
+        """
+        return super()._to_dict()
 
 
 class OpenMLLearningCurveTask(OpenMLClassificationTask):
     """OpenML Learning Curve object.
 
-       Inherited from :class:`openml.OpenMLClassificationTask`
+    Parameters
+    ----------
+    task_id : Union[int, None]
+        ID of the Learning Curve task.
+    task_type_id : TaskType
+        ID of the Learning Curve task type.
+    task_type : str
+        Name of the Learning Curve task type.
+    data_set_id : int
+        ID of the dataset that this task is associated with.
+    target_name : str
+        Name of the target feature in the dataset.
+    estimation_procedure_id : int, default=13
+        ID of the estimation procedure to use for evaluating models.
+    estimation_procedure_type : str, default=None
+        Type of the estimation procedure.
+    estimation_parameters : dict, default=None
+        Additional parameters for the estimation procedure.
+    data_splits_url : str, default=None
+        URL of the file containing the data splits for Learning Curve task.
+    evaluation_measure : str, default=None
+        Name of the evaluation measure to use for evaluating models.
+    class_labels : list of str, default=None
+        Class labels for Learning Curve tasks.
+    cost_matrix : numpy array, default=None
+        Cost matrix for Learning Curve tasks.
     """
-    def __init__(
-            self,
-            task_type_id: int,
-            task_type: str,
-            data_set_id: int,
-            target_name: str,
-            estimation_procedure_id: int = 13,
-            estimation_procedure_type: Optional[str] = None,
-            estimation_parameters: Optional[Dict[str, str]] = None,
-            data_splits_url: Optional[str] = None,
-            task_id: Optional[int] = None,
-            evaluation_measure: Optional[str] = None,
-            class_labels: Optional[List[str]] = None,
-            cost_matrix: Optional[np.ndarray] = None,
-    ):
-        super(OpenMLLearningCurveTask, self).__init__(
-            task_id=task_id,
-            task_type_id=task_type_id,
-            task_type=task_type,
-            data_set_id=data_set_id,
-            estimation_procedure_id=estimation_procedure_id,
-            estimation_procedure_type=estimation_procedure_type,
-            estimation_parameters=estimation_parameters,
-            evaluation_measure=evaluation_measure,
-            target_name=target_name,
-            data_splits_url=data_splits_url,
-            class_labels=class_labels,
-            cost_matrix=cost_matrix,
-        )
-
 
-class TaskTypeEnum(object):
-    SUPERVISED_CLASSIFICATION = 1
-    SUPERVISED_REGRESSION = 2
-    LEARNING_CURVE = 3
-    SUPERVISED_DATASTREAM_CLASSIFICATION = 4
-    CLUSTERING = 5
-    MACHINE_LEARNING_CHALLENGE = 6
-    SURVIVAL_ANALYSIS = 7
-    SUBGROUP_DISCOVERY = 8
+    DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 13
diff --git a/openml/testing.py b/openml/testing.py
index 4841ca4b6..9f694f9bf 100644
--- a/openml/testing.py
+++ b/openml/testing.py
@@ -1,24 +1,31 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
 import hashlib
 import inspect
+import logging
 import os
+import pathlib
 import shutil
-import sys
 import time
-from typing import Dict
 import unittest
-import warnings
+from pathlib import Path
+from typing import ClassVar
 
-# Currently, importing oslo raises a lot of warning that it will stop working
-# under python3.8; remove this once they disappear
-with warnings.catch_warnings():
-    warnings.simplefilter("ignore")
-    from oslo_concurrency import lockutils
+import requests
 
 import openml
-from openml.tasks import TaskTypeEnum
+from openml.exceptions import OpenMLServerException
+from openml.tasks import TaskType
 
-import pytest
-import logging
+
+def _check_dataset(dataset: dict) -> None:
+    assert isinstance(dataset, dict)
+    assert len(dataset) >= 2
+    assert "did" in dataset
+    assert isinstance(dataset["did"], int)
+    assert "status" in dataset
+    assert dataset["status"] in ["in_preparation", "active", "deactivated"]
 
 
 class TestBase(unittest.TestCase):
@@ -29,20 +36,26 @@ class TestBase(unittest.TestCase):
     Currently hard-codes a read-write key.
     Hopefully soon allows using a test server, not the production server.
     """
-    publish_tracker = {'run': [], 'data': [], 'flow': [], 'task': [],
-                       'study': [], 'user': []}  # type: dict
-    test_server = "https://test.openml.org/api/v1/xml"
-    # amueller's read/write key that he will throw away later
-    apikey = "610344db6388d9ba34f6db45a3cf71de"
-
-    # creating logger for unit test file deletion status
-    logger = logging.getLogger("unit_tests")
-    logger.setLevel(logging.INFO)
-    fh = logging.FileHandler('TestBase.log')
-    fh.setLevel(logging.INFO)
-    logger.addHandler(fh)
-
-    def setUp(self, n_levels: int = 1):
+
+    # TODO: This could be made more explicit with a TypedDict instead of list[str | int]
+    publish_tracker: ClassVar[dict[str, list[str | int]]] = {
+        "run": [],
+        "data": [],
+        "flow": [],
+        "task": [],
+        "study": [],
+        "user": [],
+    }
+    flow_name_tracker: ClassVar[list[str]] = []
+    test_server = f"{openml.config.TEST_SERVER_URL}/api/v1/xml"
+    admin_key = os.environ.get(openml.config.OPENML_TEST_SERVER_ADMIN_KEY_ENV_VAR)
+    user_key = openml.config._TEST_SERVER_NORMAL_USER_KEY
+
+    # creating logger for tracking files uploaded to test server
+    logger = logging.getLogger("unit_tests_published_entities")
+    logger.setLevel(logging.DEBUG)
+
+    def setUp(self, n_levels: int = 1, tmpdir_suffix: str = "") -> None:
         """Setup variables and temporary directories.
 
         In particular, this methods:
@@ -59,67 +72,72 @@ def setUp(self, n_levels: int = 1):
             Number of nested directories the test is in. Necessary to resolve the path to the
             ``files`` directory, which is located directly under the ``tests`` directory.
         """
-
         # This cache directory is checked in to git to simulate a populated
         # cache
         self.maxDiff = None
-        self.static_cache_dir = None
-        abspath_this_file = os.path.abspath(inspect.getfile(self.__class__))
-        static_cache_dir = os.path.dirname(abspath_this_file)
+        abspath_this_file = Path(inspect.getfile(self.__class__)).absolute()
+        static_cache_dir = abspath_this_file.parent
         for _ in range(n_levels):
-            static_cache_dir = os.path.abspath(os.path.join(static_cache_dir, '..'))
-        content = os.listdir(static_cache_dir)
-        if 'files' in content:
-            self.static_cache_dir = os.path.join(static_cache_dir, 'files')
+            static_cache_dir = static_cache_dir.parent.absolute()
 
-        if self.static_cache_dir is None:
+        content = os.listdir(static_cache_dir)  # noqa: PTH208
+        if "files" in content:
+            static_cache_dir = static_cache_dir / "files"
+        else:
             raise ValueError(
-                'Cannot find test cache dir, expected it to be {}!'.format(static_cache_dir)
+                f"Cannot find test cache dir, expected it to be {static_cache_dir}!",
             )
 
-        self.cwd = os.getcwd()
-        workdir = os.path.dirname(os.path.abspath(__file__))
-        tmp_dir_name = self.id()
-        self.workdir = os.path.join(workdir, tmp_dir_name)
+        self.static_cache_dir = static_cache_dir
+        self.cwd = Path.cwd()
+        workdir = Path(__file__).parent.absolute()
+        tmp_dir_name = self.id() + tmpdir_suffix
+        self.workdir = workdir / tmp_dir_name
         shutil.rmtree(self.workdir, ignore_errors=True)
 
-        os.mkdir(self.workdir)
+        self.workdir.mkdir(exist_ok=True)
         os.chdir(self.workdir)
 
         self.cached = True
-        openml.config.apikey = TestBase.apikey
-        self.production_server = "https://openml.org/api/v1/xml"
-        openml.config.server = TestBase.test_server
-        openml.config.avoid_duplicate_runs = False
-        openml.config.cache_directory = self.workdir
-
-        # If we're on travis, we save the api key in the config file to allow
-        # the notebook tests to read them.
-        if os.environ.get('TRAVIS') or os.environ.get('APPVEYOR'):
-            with lockutils.external_lock('config', lock_path=self.workdir):
-                with open(openml.config.config_file, 'w') as fh:
-                    fh.write('apikey = %s' % openml.config.apikey)
+        openml.config.apikey = TestBase.user_key
+        self.production_server = "https://www.openml.org/api/v1/xml"
+        openml.config.set_root_cache_directory(str(self.workdir))
 
         # Increase the number of retries to avoid spurious server failures
+        self.retry_policy = openml.config.retry_policy
         self.connection_n_retries = openml.config.connection_n_retries
-        openml.config.connection_n_retries = 10
+        openml.config.set_retry_policy("robot", n_retries=20)
+
+    def use_production_server(self) -> None:
+        """
+        Use the production server for the OpenML API calls.
 
-    def tearDown(self):
+        Please use this sparingly - it is better to use the test server.
+        """
+        openml.config.server = self.production_server
+        openml.config.apikey = ""
+
+    def tearDown(self) -> None:
+        """Tear down the test"""
         os.chdir(self.cwd)
         try:
             shutil.rmtree(self.workdir)
-        except PermissionError:
-            if os.name == 'nt':
+        except PermissionError as e:
+            if os.name != "nt":
                 # one of the files may still be used by another process
-                pass
-            else:
-                raise
-        openml.config.server = self.production_server
+                raise e
+
         openml.config.connection_n_retries = self.connection_n_retries
+        openml.config.retry_policy = self.retry_policy
 
     @classmethod
-    def _mark_entity_for_removal(self, entity_type, entity_id):
-        """ Static record of entities uploaded to test server
+    def _mark_entity_for_removal(
+        cls,
+        entity_type: str,
+        entity_id: int,
+        entity_name: str | None = None,
+    ) -> None:
+        """Static record of entities uploaded to test server
 
         Dictionary of lists where the keys are 'entity_type'.
         Each such dictionary is a list of integer IDs.
@@ -130,10 +148,13 @@ def _mark_entity_for_removal(self, entity_type, entity_id):
             TestBase.publish_tracker[entity_type] = [entity_id]
         else:
             TestBase.publish_tracker[entity_type].append(entity_id)
+        if isinstance(entity_type, openml.flows.OpenMLFlow):
+            assert entity_name is not None
+            cls.flow_name_tracker.append(entity_name)
 
     @classmethod
-    def _delete_entity_from_tracker(self, entity_type, entity):
-        """ Deletes entity records from the static file_tracker
+    def _delete_entity_from_tracker(cls, entity_type: str, entity: int) -> None:
+        """Deletes entity records from the static publish_tracker
 
         Given an entity type and corresponding ID, deletes all entries, including
         duplicate entries of the ID for the entity type.
@@ -141,135 +162,74 @@ def _delete_entity_from_tracker(self, entity_type, entity):
         if entity_type in TestBase.publish_tracker:
             # removes duplicate entries
             TestBase.publish_tracker[entity_type] = list(set(TestBase.publish_tracker[entity_type]))
-            if entity_type == 'flow':
-                delete_index = [i for i, (id_, _) in
-                                enumerate(TestBase.publish_tracker[entity_type])
-                                if id_ == entity][0]
+            if entity_type == "flow":
+                delete_index = next(
+                    i
+                    for i, (id_, _) in enumerate(
+                        zip(
+                            TestBase.publish_tracker[entity_type],
+                            TestBase.flow_name_tracker,
+                            strict=False,
+                        ),
+                    )
+                    if id_ == entity
+                )
             else:
-                delete_index = [i for i, id_ in
-                                enumerate(TestBase.publish_tracker[entity_type])
-                                if id_ == entity][0]
+                delete_index = next(
+                    i
+                    for i, id_ in enumerate(TestBase.publish_tracker[entity_type])
+                    if id_ == entity
+                )
             TestBase.publish_tracker[entity_type].pop(delete_index)
 
-    @pytest.fixture(scope="session", autouse=True)
-    def _cleanup_fixture(self):
-        """Cleans up files generated by unit tests
-
-        This function is called at the beginning of the invocation of
-        TestBase (defined below), by each of class that inherits TestBase.
-        The 'yield' creates a checkpoint and breaks away to continue running
-        the unit tests of the sub class. When all the tests end, execution
-        resumes from the checkpoint.
-        """
-
-        abspath_this_file = os.path.abspath(inspect.getfile(self.__class__))
-        static_cache_dir = os.path.dirname(abspath_this_file)
-        # Could be a risky while condition, however, going up a directory
-        # n-times will eventually end at main directory
-        while True:
-            if 'openml' in os.listdir(static_cache_dir):
-                break
-            else:
-                static_cache_dir = os.path.join(static_cache_dir, '../')
-        directory = os.path.join(static_cache_dir, 'tests/files/')
-        files = os.walk(directory)
-        old_file_list = []
-        for root, _, filenames in files:
-            for filename in filenames:
-                old_file_list.append(os.path.join(root, filename))
-        # context switches to other remaining tests
-        # pauses the code execution here till all tests in the 'session' is over
-        yield
-        # resumes from here after all collected tests are completed
-
-        #
-        # Local file deletion
-        #
-        files = os.walk(directory)
-        new_file_list = []
-        for root, _, filenames in files:
-            for filename in filenames:
-                new_file_list.append(os.path.join(root, filename))
-        # filtering the files generated during this run
-        new_file_list = list(set(new_file_list) - set(old_file_list))
-        for file in new_file_list:
-            os.remove(file)
-
-        #
-        # Test server deletion
-        #
-        openml.config.server = TestBase.test_server
-        openml.config.apikey = TestBase.apikey
-
-        # legal_entities defined in openml.utils._delete_entity - {'user'}
-        entity_types = {'run', 'data', 'flow', 'task', 'study'}
-        # 'run' needs to be first entity to allow other dependent entities to be deleted
-        # cloning file tracker to allow deletion of entries of deleted files
-        tracker = TestBase.publish_tracker.copy()
-
-        # reordering to delete sub flows at the end of flows
-        # sub-flows have shorter names, hence, sorting by descending order of flow name length
-        if 'flow' in entity_types:
-            flow_deletion_order = [entity_id for entity_id, _ in
-                                   sorted(tracker['flow'], key=lambda x: len(x[1]), reverse=True)]
-            tracker['flow'] = flow_deletion_order
-
-        # deleting all collected entities published to test server
-        for entity_type in entity_types:
-            for i, entity in enumerate(tracker[entity_type]):
-                try:
-                    openml.utils._delete_entity(entity_type, entity)
-                    TestBase.logger.info("Deleted ({}, {})".format(entity_type, entity))
-                    # deleting actual entry from tracker
-                    TestBase._delete_entity_from_tracker(entity_type, entity)
-                except Exception as e:
-                    TestBase.logger.warning("Cannot delete ({},{}): {}".format(
-                        entity_type, entity, e))
-        TestBase.logger.info("End of cleanup_fixture from {}".format(self.__class__))
-
-    def _get_sentinel(self, sentinel=None):
+    def _get_sentinel(self, sentinel: str | None = None) -> str:
         if sentinel is None:
             # Create a unique prefix for the flow. Necessary because the flow
             # is identified by its name and external version online. Having a
             # unique name allows us to publish the same flow in each test run.
-            md5 = hashlib.md5()
-            md5.update(str(time.time()).encode('utf-8'))
-            md5.update(str(os.getpid()).encode('utf-8'))
+            md5 = hashlib.md5()  # noqa: S324
+            md5.update(str(time.time()).encode("utf-8"))
+            md5.update(str(os.getpid()).encode("utf-8"))
             sentinel = md5.hexdigest()[:10]
-            sentinel = 'TEST%s' % sentinel
+            sentinel = f"TEST{sentinel}"
         return sentinel
 
-    def _add_sentinel_to_flow_name(self, flow, sentinel=None):
+    def _add_sentinel_to_flow_name(
+        self,
+        flow: openml.flows.OpenMLFlow,
+        sentinel: str | None = None,
+    ) -> tuple[openml.flows.OpenMLFlow, str]:
         sentinel = self._get_sentinel(sentinel=sentinel)
-        flows_to_visit = list()
+        flows_to_visit = []
         flows_to_visit.append(flow)
         while len(flows_to_visit) > 0:
             current_flow = flows_to_visit.pop()
-            current_flow.name = '%s%s' % (sentinel, current_flow.name)
+            current_flow.name = f"{sentinel}{current_flow.name}"
             for subflow in current_flow.components.values():
                 flows_to_visit.append(subflow)
 
         return flow, sentinel
 
-    def _check_dataset(self, dataset):
-        self.assertEqual(type(dataset), dict)
-        self.assertGreaterEqual(len(dataset), 2)
-        self.assertIn('did', dataset)
-        self.assertIsInstance(dataset['did'], int)
-        self.assertIn('status', dataset)
-        self.assertIsInstance(dataset['status'], str)
-        self.assertIn(dataset['status'], ['in_preparation', 'active',
-                                          'deactivated'])
-
-    def _check_fold_timing_evaluations(
+    def _check_dataset(self, dataset: dict[str, str | int]) -> None:
+        _check_dataset(dataset)
+        assert isinstance(dataset, dict)
+        assert len(dataset) >= 2
+        assert "did" in dataset
+        assert isinstance(dataset["did"], int)
+        assert "status" in dataset
+        assert isinstance(dataset["status"], str)
+        assert dataset["status"] in ["in_preparation", "active", "deactivated"]
+
+    def _check_fold_timing_evaluations(  # noqa: PLR0913
         self,
-        fold_evaluations: Dict,
+        fold_evaluations: dict[str, dict[int, dict[int, float]]],
         num_repeats: int,
         num_folds: int,
+        *,
         max_time_allowed: float = 60000.0,
-        task_type: int = TaskTypeEnum.SUPERVISED_CLASSIFICATION,
+        task_type: TaskType = TaskType.SUPERVISED_CLASSIFICATION,
         check_scores: bool = True,
-    ):
+    ) -> None:
         """
         Checks whether the right timing measures are attached to the run
         (before upload). Test is only performed for versions >= Python3.3
@@ -279,46 +239,93 @@ def _check_fold_timing_evaluations(
         default max_time_allowed (per fold, in milli seconds) = 1 minute,
         quite pessimistic
         """
-
         # a dict mapping from openml measure to a tuple with the minimum and
         # maximum allowed value
         check_measures = {
             # should take at least one millisecond (?)
-            'usercpu_time_millis_testing': (0, max_time_allowed),
-            'usercpu_time_millis_training': (0, max_time_allowed),
-            'usercpu_time_millis': (0, max_time_allowed),
-            'wall_clock_time_millis_training': (0, max_time_allowed),
-            'wall_clock_time_millis_testing': (0, max_time_allowed),
-            'wall_clock_time_millis': (0, max_time_allowed),
+            "usercpu_time_millis_testing": (0, max_time_allowed),
+            "usercpu_time_millis_training": (0, max_time_allowed),
+            "usercpu_time_millis": (0, max_time_allowed),
+            "wall_clock_time_millis_training": (0, max_time_allowed),
+            "wall_clock_time_millis_testing": (0, max_time_allowed),
+            "wall_clock_time_millis": (0, max_time_allowed),
         }
 
         if check_scores:
-            if task_type in (TaskTypeEnum.SUPERVISED_CLASSIFICATION, TaskTypeEnum.LEARNING_CURVE):
-                check_measures['predictive_accuracy'] = (0, 1.)
-            elif task_type == TaskTypeEnum.SUPERVISED_REGRESSION:
-                check_measures['mean_absolute_error'] = (0, float("inf"))
-
-        self.assertIsInstance(fold_evaluations, dict)
-        if sys.version_info[:2] >= (3, 3):
-            # this only holds if we are allowed to record time (otherwise some
-            # are missing)
-            self.assertEqual(set(fold_evaluations.keys()),
-                             set(check_measures.keys()))
-
-        for measure in check_measures.keys():
+            if task_type in (TaskType.SUPERVISED_CLASSIFICATION, TaskType.LEARNING_CURVE):
+                check_measures["predictive_accuracy"] = (0, 1.0)
+            elif task_type == TaskType.SUPERVISED_REGRESSION:
+                check_measures["mean_absolute_error"] = (0, float("inf"))
+
+        assert isinstance(fold_evaluations, dict)
+        assert set(fold_evaluations.keys()) == set(check_measures.keys())
+
+        for measure in check_measures:
             if measure in fold_evaluations:
                 num_rep_entrees = len(fold_evaluations[measure])
-                self.assertEqual(num_rep_entrees, num_repeats)
+                assert num_rep_entrees == num_repeats
                 min_val = check_measures[measure][0]
                 max_val = check_measures[measure][1]
                 for rep in range(num_rep_entrees):
                     num_fold_entrees = len(fold_evaluations[measure][rep])
-                    self.assertEqual(num_fold_entrees, num_folds)
+                    assert num_fold_entrees == num_folds
                     for fold in range(num_fold_entrees):
                         evaluation = fold_evaluations[measure][rep][fold]
-                        self.assertIsInstance(evaluation, float)
-                        self.assertGreaterEqual(evaluation, min_val)
-                        self.assertLessEqual(evaluation, max_val)
+                        assert isinstance(evaluation, float)
+                        assert evaluation >= min_val
+                        assert evaluation <= max_val
+
+
+def check_task_existence(
+    task_type: TaskType,
+    dataset_id: int,
+    target_name: str,
+    **kwargs: dict[str, str | int | dict[str, str | int | openml.tasks.TaskType]],
+) -> int | None:
+    """Checks if any task exists on the test server that matches the metadata.
+
+    Parameters
+    ----------
+    task_type : openml.tasks.TaskType
+    dataset_id : int
+    target_name : str
+
+    Returns
+    -------
+    int, None
+    """
+    return_val = None
+    tasks = openml.tasks.list_tasks(task_type=task_type)
+    if len(tasks) == 0:
+        return None
+    tasks = tasks.loc[tasks["did"] == dataset_id]
+    if len(tasks) == 0:
+        return None
+    tasks = tasks.loc[tasks["target_feature"] == target_name]
+    if len(tasks) == 0:
+        return None
+    task_match = []
+    for task_id in tasks["tid"].to_list():
+        task_match.append(task_id)
+        try:
+            task = openml.tasks.get_task(task_id)
+        except OpenMLServerException:
+            # can fail if the task was deleted by another unit test running in parallel
+            task_match.pop(-1)
+            return_val = None
+            continue
+        for k, v in kwargs.items():
+            if getattr(task, k) != v:
+                # if even one metadata key mismatches, task_id is not a match
+                task_match.pop(-1)
+                break
+        # if task_id is retained in the task_match list, it passed all meta key-value matches
+        if len(task_match) == 1:
+            return_val = task_id
+            break
+    if len(task_match) == 0:
+        return_val = None
+    return return_val
 
 
 try:
@@ -327,4 +334,31 @@ def _check_fold_timing_evaluations(
     from sklearn.preprocessing import Imputer as SimpleImputer
 
 
-__all__ = ['TestBase', 'SimpleImputer']
+class CustomImputer(SimpleImputer):
+    """Duplicate class alias for sklearn's SimpleImputer
+
+    Helps bypass the sklearn extension duplicate operation check
+    """
+
+
+def create_request_response(
+    *,
+    status_code: int,
+    content_filepath: pathlib.Path,
+) -> requests.Response:
+    with content_filepath.open("r") as xml_response:
+        response_body = xml_response.read()
+
+    response = requests.Response()
+    response.status_code = status_code
+    response._content = response_body.encode()
+    return response
+
+
+__all__ = [
+    "CustomImputer",
+    "SimpleImputer",
+    "TestBase",
+    "check_task_existence",
+    "create_request_response",
+]
diff --git a/openml/utils.py b/openml/utils.py
deleted file mode 100644
index f6cc81ff7..000000000
--- a/openml/utils.py
+++ /dev/null
@@ -1,390 +0,0 @@
-import os
-import hashlib
-import xmltodict
-import shutil
-import warnings
-import pandas as pd
-from functools import wraps
-import collections
-
-import openml._api_calls
-import openml.exceptions
-from . import config
-
-oslo_installed = False
-try:
-    # Currently, importing oslo raises a lot of warning that it will stop working
-    # under python3.8; remove this once they disappear
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        from oslo_concurrency import lockutils
-        oslo_installed = True
-except ImportError:
-    pass
-
-
-def extract_xml_tags(xml_tag_name, node, allow_none=True):
-    """Helper to extract xml tags from xmltodict.
-
-    Parameters
-    ----------
-    xml_tag_name : str
-        Name of the xml tag to extract from the node.
-
-    node : object
-        Node object returned by ``xmltodict`` from which ``xml_tag_name``
-        should be extracted.
-
-    allow_none : bool
-        If ``False``, the tag needs to exist in the node. Will raise a
-        ``ValueError`` if it does not.
-
-    Returns
-    -------
-    object
-    """
-    if xml_tag_name in node and node[xml_tag_name] is not None:
-        if isinstance(node[xml_tag_name], dict):
-            rval = [node[xml_tag_name]]
-        elif isinstance(node[xml_tag_name], str):
-            rval = [node[xml_tag_name]]
-        elif isinstance(node[xml_tag_name], list):
-            rval = node[xml_tag_name]
-        else:
-            raise ValueError('Received not string and non list as tag item')
-
-        return rval
-    else:
-        if allow_none:
-            return None
-        else:
-            raise ValueError("Could not find tag '%s' in node '%s'" %
-                             (xml_tag_name, str(node)))
-
-
-def _tag_entity(entity_type, entity_id, tag, untag=False):
-    """
-    Function that tags or untags a given entity on OpenML. As the OpenML
-    API tag functions all consist of the same format, this function covers
-    all entity types (currently: dataset, task, flow, setup, run). Could
-    be used in a partial to provide dataset_tag, dataset_untag, etc.
-
-    Parameters
-    ----------
-    entity_type : str
-        Name of the entity to tag (e.g., run, flow, data)
-
-    entity_id : int
-        OpenML id of the entity
-
-    tag : str
-        The tag
-
-    untag : bool
-        Set to true if needed to untag, rather than tag
-
-    Returns
-    -------
-    tags : list
-        List of tags that the entity is (still) tagged with
-    """
-    legal_entities = {'data', 'task', 'flow', 'setup', 'run'}
-    if entity_type not in legal_entities:
-        raise ValueError('Can\'t tag a %s' % entity_type)
-
-    uri = '%s/tag' % entity_type
-    main_tag = 'oml:%s_tag' % entity_type
-    if untag:
-        uri = '%s/untag' % entity_type
-        main_tag = 'oml:%s_untag' % entity_type
-
-    post_variables = {'%s_id' % entity_type: entity_id, 'tag': tag}
-    result_xml = openml._api_calls._perform_api_call(uri,
-                                                     'post',
-                                                     post_variables)
-
-    result = xmltodict.parse(result_xml, force_list={'oml:tag'})[main_tag]
-
-    if 'oml:tag' in result:
-        return result['oml:tag']
-    else:
-        # no tags, return empty list
-        return []
-
-
-def _delete_entity(entity_type, entity_id):
-    """
-    Function that deletes a given entity on OpenML. As the OpenML
-    API tag functions all consist of the same format, this function covers
-    all entity types that can be deleted (currently: dataset, task, flow,
-    run, study and user).
-
-    Parameters
-    ----------
-    entity_type : str
-        Name of the entity to tag (e.g., run, flow, data)
-
-    entity_id : int
-        OpenML id of the entity
-
-    Returns
-    -------
-    bool
-        True iff the deletion was successful. False otherwse
-    """
-    legal_entities = {
-        'data',
-        'flow',
-        'task',
-        'run',
-        'study',
-        'user',
-    }
-    if entity_type not in legal_entities:
-        raise ValueError('Can\'t delete a %s' % entity_type)
-
-    url_suffix = '%s/%d' % (entity_type, entity_id)
-    result_xml = openml._api_calls._perform_api_call(url_suffix,
-                                                     'delete')
-    result = xmltodict.parse(result_xml)
-    if 'oml:%s_delete' % entity_type in result:
-        return True
-    else:
-        return False
-
-
-def _list_all(listing_call, output_format='dict', *args, **filters):
-    """Helper to handle paged listing requests.
-
-    Example usage:
-
-    ``evaluations = list_all(list_evaluations, "predictive_accuracy", task=mytask)``
-
-    Parameters
-    ----------
-    listing_call : callable
-        Call listing, e.g. list_evaluations.
-    output_format : str, optional (default='dict')
-        The parameter decides the format of the output.
-        - If 'dict' the output is a dict of dict
-        - If 'dataframe' the output is a pandas DataFrame
-    *args : Variable length argument list
-        Any required arguments for the listing call.
-    **filters : Arbitrary keyword arguments
-        Any filters that can be applied to the listing function.
-        additionally, the batch_size can be specified. This is
-        useful for testing purposes.
-    Returns
-    -------
-    dict or dataframe
-    """
-
-    # eliminate filters that have a None value
-    active_filters = {key: value for key, value in filters.items()
-                      if value is not None}
-    page = 0
-    result = collections.OrderedDict()
-    if output_format == 'dataframe':
-        result = pd.DataFrame()
-
-    # Default batch size per paging.
-    # This one can be set in filters (batch_size), but should not be
-    # changed afterwards. The derived batch_size can be changed.
-    BATCH_SIZE_ORIG = 10000
-    if 'batch_size' in active_filters:
-        BATCH_SIZE_ORIG = active_filters['batch_size']
-        del active_filters['batch_size']
-
-    # max number of results to be shown
-    LIMIT = None
-    offset = 0
-    if 'size' in active_filters:
-        LIMIT = active_filters['size']
-        del active_filters['size']
-
-    if LIMIT is not None and BATCH_SIZE_ORIG > LIMIT:
-        BATCH_SIZE_ORIG = LIMIT
-
-    if 'offset' in active_filters:
-        offset = active_filters['offset']
-        del active_filters['offset']
-
-    batch_size = BATCH_SIZE_ORIG
-    while True:
-        try:
-            current_offset = offset + BATCH_SIZE_ORIG * page
-            new_batch = listing_call(
-                *args,
-                limit=batch_size,
-                offset=current_offset,
-                output_format=output_format,
-                **active_filters
-            )
-        except openml.exceptions.OpenMLServerNoResult:
-            # we want to return an empty dict in this case
-            break
-        if output_format == 'dataframe':
-            if len(result) == 0:
-                result = new_batch
-            else:
-                result = result.append(new_batch, ignore_index=True)
-        else:
-            # For output_format = 'dict' or 'object'
-            result.update(new_batch)
-        if len(new_batch) < batch_size:
-            break
-        page += 1
-        if LIMIT is not None:
-            # check if the number of required results has been achieved
-            # always do a 'bigger than' check,
-            # in case of bugs to prevent infinite loops
-            if len(result) >= LIMIT:
-                break
-            # check if there are enough results to fulfill a batch
-            if BATCH_SIZE_ORIG > LIMIT - len(result):
-                batch_size = LIMIT - len(result)
-
-    return result
-
-
-def _create_cache_directory(key):
-    cache = config.get_cache_directory()
-    cache_dir = os.path.join(cache, key)
-    try:
-        os.makedirs(cache_dir)
-    except OSError:
-        pass
-    return cache_dir
-
-
-def _create_cache_directory_for_id(key, id_):
-    """Create the cache directory for a specific ID
-
-    In order to have a clearer cache structure and because every task
-    is cached in several files (description, split), there
-    is a directory for each task witch the task ID being the directory
-    name. This function creates this cache directory.
-
-    This function is NOT thread/multiprocessing safe.
-
-    Parameters
-    ----------
-    key : str
-
-    id_ : int
-
-    Returns
-    -------
-    str
-        Path of the created dataset cache directory.
-    """
-    cache_dir = os.path.join(
-        _create_cache_directory(key), str(id_)
-    )
-    if os.path.exists(cache_dir) and os.path.isdir(cache_dir):
-        pass
-    elif os.path.exists(cache_dir) and not os.path.isdir(cache_dir):
-        raise ValueError('%s cache dir exists but is not a directory!' % key)
-    else:
-        os.makedirs(cache_dir)
-    return cache_dir
-
-
-def _remove_cache_dir_for_id(key, cache_dir):
-    """Remove the task cache directory
-
-    This function is NOT thread/multiprocessing safe.
-
-    Parameters
-    ----------
-    key : str
-
-    cache_dir : str
-    """
-    try:
-        shutil.rmtree(cache_dir)
-    except (OSError, IOError):
-        raise ValueError('Cannot remove faulty %s cache directory %s.'
-                         'Please do this manually!' % (key, cache_dir))
-
-
-def thread_safe_if_oslo_installed(func):
-    if oslo_installed:
-        @wraps(func)
-        def safe_func(*args, **kwargs):
-            # Lock directories use the id that is passed as either positional or keyword argument.
-            id_parameters = [parameter_name for parameter_name in kwargs if '_id' in parameter_name]
-            if len(id_parameters) == 1:
-                id_ = kwargs[id_parameters[0]]
-            elif len(args) > 0:
-                id_ = args[0]
-            else:
-                raise RuntimeError("An id must be specified for {}, was passed: ({}, {}).".format(
-                    func.__name__, args, kwargs
-                ))
-            # The [7:] gets rid of the 'openml.' prefix
-            lock_name = "{}.{}:{}".format(func.__module__[7:], func.__name__, id_)
-            with lockutils.external_lock(name=lock_name, lock_path=_create_lockfiles_dir()):
-                return func(*args, **kwargs)
-        return safe_func
-    else:
-        return func
-
-
-def _create_lockfiles_dir():
-    dir = os.path.join(config.get_cache_directory(), 'locks')
-    try:
-        os.makedirs(dir)
-    except OSError:
-        pass
-    return dir
-
-
-def _download_text_file(source: str,
-                        output_path: str,
-                        md5_checksum: str = None,
-                        exists_ok: bool = True,
-                        encoding: str = 'utf8',
-                        ) -> None:
-    """ Download the text file at `source` and store it in `output_path`.
-
-    By default, do nothing if a file already exists in `output_path`.
-    The downloaded file can be checked against an expected md5 checksum.
-
-    Parameters
-    ----------
-    source : str
-        url of the file to be downloaded
-    output_path : str
-        full path, including filename, of where the file should be stored.
-    md5_checksum : str, optional (default=None)
-        If not None, should be a string of hexidecimal digits of the expected digest value.
-    exists_ok : bool, optional (default=True)
-        If False, raise an FileExistsError if there already exists a file at `output_path`.
-    encoding : str, optional (default='utf8')
-        The encoding with which the file should be stored.
-    """
-    try:
-        with open(output_path, encoding=encoding):
-            if exists_ok:
-                return
-            else:
-                raise FileExistsError
-    except FileNotFoundError:
-        pass
-
-    downloaded_file = openml._api_calls._read_url(source, request_method='get')
-
-    if md5_checksum is not None:
-        md5 = hashlib.md5()
-        md5.update(downloaded_file.encode('utf-8'))
-        md5_checksum_download = md5.hexdigest()
-        if md5_checksum != md5_checksum_download:
-            raise openml.exceptions.OpenMLHashException(
-                'Checksum {} of downloaded file is unequal to the expected checksum {}.'
-                .format(md5_checksum_download, md5_checksum))
-
-    with open(output_path, "w", encoding=encoding) as fh:
-        fh.write(downloaded_file)
-
-    del downloaded_file
diff --git a/openml/utils/__init__.py b/openml/utils/__init__.py
new file mode 100644
index 000000000..1e74a3684
--- /dev/null
+++ b/openml/utils/__init__.py
@@ -0,0 +1,39 @@
+"""Utilities module."""
+
+from openml.utils._openml import (
+    ProgressBar,
+    ReprMixin,
+    _create_cache_directory,
+    _create_cache_directory_for_id,
+    _create_lockfiles_dir,
+    _delete_entity,
+    _get_cache_dir_for_id,
+    _get_cache_dir_for_key,
+    _get_rest_api_type_alias,
+    _list_all,
+    _remove_cache_dir_for_id,
+    _tag_entity,
+    _tag_openml_base,
+    extract_xml_tags,
+    get_cache_size,
+    thread_safe_if_oslo_installed,
+)
+
+__all__ = [
+    "ProgressBar",
+    "ReprMixin",
+    "_create_cache_directory",
+    "_create_cache_directory_for_id",
+    "_create_lockfiles_dir",
+    "_delete_entity",
+    "_get_cache_dir_for_id",
+    "_get_cache_dir_for_key",
+    "_get_rest_api_type_alias",
+    "_list_all",
+    "_remove_cache_dir_for_id",
+    "_tag_entity",
+    "_tag_openml_base",
+    "extract_xml_tags",
+    "get_cache_size",
+    "thread_safe_if_oslo_installed",
+]
diff --git a/openml/utils/_openml.py b/openml/utils/_openml.py
new file mode 100644
index 000000000..2bf54690e
--- /dev/null
+++ b/openml/utils/_openml.py
@@ -0,0 +1,544 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
+import contextlib
+import re
+import shutil
+import warnings
+from abc import ABC, abstractmethod
+from collections.abc import Callable, Iterable, Mapping, Sequence, Sized
+from functools import wraps
+from pathlib import Path
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Literal,
+    TypeVar,
+    overload,
+)
+from typing_extensions import ParamSpec
+
+import numpy as np
+import xmltodict
+from minio.helpers import ProgressType
+from tqdm import tqdm
+
+import openml
+import openml._api_calls
+import openml.exceptions
+
+# Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles
+if TYPE_CHECKING:
+    from openml.base import OpenMLBase
+
+    P = ParamSpec("P")
+    R = TypeVar("R")
+    _SizedT = TypeVar("_SizedT", bound=Sized)
+
+
+@overload
+def extract_xml_tags(
+    xml_tag_name: str,
+    node: Mapping[str, Any],
+    *,
+    allow_none: Literal[True] = ...,
+) -> Any | None: ...
+
+
+@overload
+def extract_xml_tags(
+    xml_tag_name: str,
+    node: Mapping[str, Any],
+    *,
+    allow_none: Literal[False],
+) -> Any: ...
+
+
+def extract_xml_tags(
+    xml_tag_name: str,
+    node: Mapping[str, Any],
+    *,
+    allow_none: bool = True,
+) -> Any | None:
+    """Helper to extract xml tags from xmltodict.
+
+    Parameters
+    ----------
+    xml_tag_name : str
+        Name of the xml tag to extract from the node.
+
+    node : Mapping[str, Any]
+        Node object returned by ``xmltodict`` from which ``xml_tag_name``
+        should be extracted.
+
+    allow_none : bool
+        If ``False``, the tag needs to exist in the node. Will raise a
+        ``ValueError`` if it does not.
+
+    Returns
+    -------
+    object
+    """
+    if xml_tag_name in node and node[xml_tag_name] is not None:
+        if isinstance(node[xml_tag_name], (dict, str)):
+            return [node[xml_tag_name]]
+        if isinstance(node[xml_tag_name], list):
+            return node[xml_tag_name]
+
+        raise ValueError("Received not string and non list as tag item")
+
+    if allow_none:
+        return None
+
+    raise ValueError(f"Could not find tag '{xml_tag_name}' in node '{node!s}'")
+
+
+def _get_rest_api_type_alias(oml_object: OpenMLBase) -> str:
+    """Return the alias of the openml entity as it is defined for the REST API."""
+    rest_api_mapping: list[tuple[type | tuple, str]] = [
+        (openml.datasets.OpenMLDataset, "data"),
+        (openml.flows.OpenMLFlow, "flow"),
+        (openml.tasks.OpenMLTask, "task"),
+        (openml.runs.OpenMLRun, "run"),
+        ((openml.study.OpenMLStudy, openml.study.OpenMLBenchmarkSuite), "study"),
+    ]
+    _, api_type_alias = next(
+        (python_type, api_alias)
+        for (python_type, api_alias) in rest_api_mapping
+        if isinstance(oml_object, python_type)
+    )
+    return api_type_alias
+
+
+def _tag_openml_base(oml_object: OpenMLBase, tag: str, untag: bool = False) -> None:  # noqa: FBT002
+    api_type_alias = _get_rest_api_type_alias(oml_object)
+    if oml_object.id is None:
+        raise openml.exceptions.ObjectNotPublishedError(
+            f"Cannot tag an {api_type_alias} that has not been published yet."
+            "Please publish the object first before being able to tag it."
+            f"\n{oml_object}",
+        )
+    _tag_entity(entity_type=api_type_alias, entity_id=oml_object.id, tag=tag, untag=untag)
+
+
+def _tag_entity(entity_type: str, entity_id: int, tag: str, *, untag: bool = False) -> list[str]:
+    """
+    Function that tags or untags a given entity on OpenML. As the OpenML
+    API tag functions all consist of the same format, this function covers
+    all entity types (currently: dataset, task, flow, setup, run). Could
+    be used in a partial to provide dataset_tag, dataset_untag, etc.
+
+    Parameters
+    ----------
+    entity_type : str
+        Name of the entity to tag (e.g., run, flow, data)
+
+    entity_id : int
+        OpenML id of the entity
+
+    tag : str
+        The tag
+
+    untag : bool
+        Set to true if needed to untag, rather than tag
+
+    Returns
+    -------
+    tags : list
+        List of tags that the entity is (still) tagged with
+    """
+    legal_entities = {"data", "task", "flow", "setup", "run"}
+    if entity_type not in legal_entities:
+        raise ValueError(f"Can't tag a {entity_type}")
+
+    if untag:
+        uri = f"{entity_type}/untag"
+        main_tag = f"oml:{entity_type}_untag"
+    else:
+        uri = f"{entity_type}/tag"
+        main_tag = f"oml:{entity_type}_tag"
+
+    result_xml = openml._api_calls._perform_api_call(
+        uri,
+        "post",
+        {f"{entity_type}_id": entity_id, "tag": tag},
+    )
+
+    result = xmltodict.parse(result_xml, force_list={"oml:tag"})[main_tag]
+
+    if "oml:tag" in result:
+        return result["oml:tag"]  # type: ignore
+
+    # no tags, return empty list
+    return []
+
+
+# TODO(eddiebergman): Maybe this can be made more specific with a Literal
+def _delete_entity(entity_type: str, entity_id: int) -> bool:
+    """
+    Function that deletes a given entity on OpenML. As the OpenML
+    API delete functions all consist of the same format, this function covers
+    all entity types that can be deleted (currently: dataset, task, flow,
+    run, study and user).
+
+    Parameters
+    ----------
+    entity_type : str
+        Name of the entity to delete (e.g., run, flow, data)
+
+    entity_id : int
+        OpenML id of the entity
+
+    Returns
+    -------
+    bool
+        True iff the deletion was successful. False otherwise
+    """
+    legal_entities = {
+        "data",
+        "flow",
+        "task",
+        "run",
+        "study",
+        "user",
+    }
+    if entity_type not in legal_entities:
+        raise ValueError(f"Can't delete a {entity_type}")
+
+    url_suffix = f"{entity_type}/{entity_id}"
+    try:
+        result_xml = openml._api_calls._perform_api_call(url_suffix, "delete")
+        result = xmltodict.parse(result_xml)
+        return f"oml:{entity_type}_delete" in result
+    except openml.exceptions.OpenMLServerException as e:
+        # https://github.com/openml/OpenML/blob/21f6188d08ac24fcd2df06ab94cf421c946971b0/openml_OS/views/pages/api_new/v1/xml/pre.php
+        # Most exceptions are descriptive enough to be raised as their standard
+        # OpenMLServerException, however there are two cases where we add information:
+        #  - a generic "failed" message, we direct them to the right issue board
+        #  - when the user successfully authenticates with the server,
+        #    but user is not allowed to take the requested action,
+        #    in which case we specify a OpenMLNotAuthorizedError.
+        by_other_user = [323, 353, 393, 453, 594]
+        has_dependent_entities = [324, 326, 327, 328, 354, 454, 464, 595]
+        unknown_reason = [325, 355, 394, 455, 593]
+        if e.code in by_other_user:
+            raise openml.exceptions.OpenMLNotAuthorizedError(
+                message=(
+                    f"The {entity_type} can not be deleted because it was not uploaded by you."
+                ),
+            ) from e
+        if e.code in has_dependent_entities:
+            raise openml.exceptions.OpenMLNotAuthorizedError(
+                message=(
+                    f"The {entity_type} can not be deleted because "
+                    f"it still has associated entities: {e.message}"
+                ),
+            ) from e
+        if e.code in unknown_reason:
+            raise openml.exceptions.OpenMLServerError(
+                message=(
+                    f"The {entity_type} can not be deleted for unknown reason,"
+                    " please open an issue at: https://github.com/openml/openml/issues/new"
+                ),
+            ) from e
+        raise e
+
+
+def _list_all(  # noqa: C901
+    listing_call: Callable[[int, int], _SizedT],
+    *,
+    limit: int | None = None,
+    offset: int | None = None,
+    batch_size: int | None = 10_000,
+) -> list[_SizedT]:
+    """Helper to handle paged listing requests.
+
+    Example usage:
+
+    ``evaluations = list_all(list_evaluations, "predictive_accuracy", task=mytask)``
+
+    Parameters
+    ----------
+    listing_call : callable
+        Call listing, e.g. list_evaluations. Takes two positional
+        arguments: batch_size and offset.
+    batch_size : int, optional
+        The batch size to use for the listing call.
+    offset : int, optional
+        The initial offset to use for the listing call.
+    limit : int, optional
+        The total size of the listing. If not provided, the function will
+        request the first batch and then continue until no more results are
+        returned
+
+    Returns
+    -------
+    List of batches, each of the type returned by the listing call
+    """
+    page = 0
+    results: list[_SizedT] = []
+
+    offset = offset if offset is not None else 0
+    batch_size = batch_size if batch_size is not None else 10_000
+
+    LIMIT = limit
+    BATCH_SIZE_ORIG = batch_size
+
+    # Default batch size per paging.
+    # This one can be set in filters (batch_size), but should not be
+    # changed afterwards. The derived batch_size can be changed.
+    if not isinstance(BATCH_SIZE_ORIG, int):
+        raise ValueError(f"'batch_size' should be an integer but got {BATCH_SIZE_ORIG}")
+
+    if (LIMIT is not None) and (not isinstance(LIMIT, int)) and (not np.isinf(LIMIT)):
+        raise ValueError(f"'limit' should be an integer or inf but got {LIMIT}")
+
+    # If our batch size is larger than the limit, we should only
+    # request one batch of size of LIMIT
+    if LIMIT is not None and BATCH_SIZE_ORIG > LIMIT:
+        BATCH_SIZE_ORIG = LIMIT
+
+    if not isinstance(offset, int):
+        raise ValueError(f"'offset' should be an integer but got {offset}")
+
+    batch_size = BATCH_SIZE_ORIG
+    while True:
+        try:
+            current_offset = offset + BATCH_SIZE_ORIG * page
+            new_batch = listing_call(batch_size, current_offset)
+        except openml.exceptions.OpenMLServerNoResult:
+            # NOTE: This above statement may not actually happen, but we could just return here
+            # to enforce it...
+            break
+
+        results.append(new_batch)
+
+        # If the batch is less than our requested batch_size, that's the last batch
+        # and we can bail out.
+        if len(new_batch) < batch_size:
+            break
+
+        page += 1
+        if LIMIT is not None:
+            # check if the number of required results has been achieved
+            # always do a 'bigger than' check,
+            # in case of bugs to prevent infinite loops
+            n_received = sum(len(result) for result in results)
+            if n_received >= LIMIT:
+                break
+
+            # check if there are enough results to fulfill a batch
+            if LIMIT - n_received < BATCH_SIZE_ORIG:
+                batch_size = LIMIT - n_received
+
+    return results
+
+
+def _get_cache_dir_for_key(key: str) -> Path:
+    return Path(openml.config.get_cache_directory()) / key
+
+
+def _create_cache_directory(key: str) -> Path:
+    cache_dir = _get_cache_dir_for_key(key)
+
+    try:
+        cache_dir.mkdir(exist_ok=True, parents=True)
+    except Exception as e:
+        raise openml.exceptions.OpenMLCacheException(
+            f"Cannot create cache directory {cache_dir}."
+        ) from e
+
+    return cache_dir
+
+
+def _get_cache_dir_for_id(key: str, id_: int, create: bool = False) -> Path:  # noqa: FBT002
+    cache_dir = _create_cache_directory(key) if create else _get_cache_dir_for_key(key)
+    return Path(cache_dir) / str(id_)
+
+
+def _create_cache_directory_for_id(key: str, id_: int) -> Path:
+    """Create the cache directory for a specific ID
+
+    In order to have a clearer cache structure and because every task
+    is cached in several files (description, split), there
+    is a directory for each task with the task ID being the directory
+    name. This function creates this cache directory.
+
+    This function is NOT thread/multiprocessing safe.
+
+    Parameters
+    ----------
+    key : str
+
+    id_ : int
+
+    Returns
+    -------
+    cache_dir : Path
+        Path of the created dataset cache directory.
+    """
+    cache_dir = _get_cache_dir_for_id(key, id_, create=True)
+    if cache_dir.exists() and not cache_dir.is_dir():
+        raise ValueError(f"{key} cache dir exists but is not a directory!")
+
+    cache_dir.mkdir(exist_ok=True, parents=True)
+    return cache_dir
+
+
+def _remove_cache_dir_for_id(key: str, cache_dir: Path) -> None:
+    """Remove the task cache directory
+
+    This function is NOT thread/multiprocessing safe.
+
+    Parameters
+    ----------
+    key : str
+
+    cache_dir : Path
+    """
+    try:
+        shutil.rmtree(cache_dir)
+    except OSError as e:
+        raise ValueError(
+            f"Cannot remove faulty {key} cache directory {cache_dir}. Please do this manually!",
+        ) from e
+
+
+def thread_safe_if_oslo_installed(func: Callable[P, R]) -> Callable[P, R]:
+    try:
+        # Currently, importing oslo raises a lot of warnings that it will stop working
+        # under python3.8; remove this once they disappear
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            from oslo_concurrency import lockutils
+
+        @wraps(func)
+        def safe_func(*args: P.args, **kwargs: P.kwargs) -> R:
+            # Lock directories use the id that is passed as either positional or keyword argument.
+            id_parameters = [parameter_name for parameter_name in kwargs if "_id" in parameter_name]
+            if len(id_parameters) == 1:
+                id_ = kwargs[id_parameters[0]]
+            elif len(args) > 0:
+                id_ = args[0]
+            else:
+                raise RuntimeError(
+                    f"An id must be specified for {func.__name__}, was passed: ({args}, {kwargs}).",
+                )
+            # The [7:] gets rid of the 'openml.' prefix
+            lock_name = f"{func.__module__[7:]}.{func.__name__}:{id_}"
+            with lockutils.external_lock(name=lock_name, lock_path=_create_lockfiles_dir()):
+                return func(*args, **kwargs)
+
+        return safe_func
+    except ImportError:
+        return func
+
+
+def get_cache_size() -> int:
+    """Calculate the size of OpenML cache directory
+
+    Returns
+    -------
+    cache_size: int
+        Total size of cache in bytes
+    """
+    path = Path(openml.config.get_cache_directory())
+    return sum(f.stat().st_size for f in path.rglob("*") if f.is_file())
+
+
+def _create_lockfiles_dir() -> Path:
+    path = Path(openml.config.get_cache_directory()) / "locks"
+    # TODO(eddiebergman): Not sure why this is allowed to error and ignore???
+    with contextlib.suppress(OSError):
+        path.mkdir(exist_ok=True, parents=True)
+    return path
+
+
+class ProgressBar(ProgressType):
+    """Progressbar for MinIO function's `progress` parameter."""
+
+    def __init__(self) -> None:
+        self._object_name = ""
+        self._progress_bar: tqdm | None = None
+
+    def set_meta(self, object_name: str, total_length: int) -> None:
+        """Initializes the progress bar.
+
+        Parameters
+        ----------
+        object_name: str
+          Stored on the instance; not used by the progress bar itself.
+
+        total_length: int
+          File size of the object in bytes.
+        """
+        self._object_name = object_name
+        self._progress_bar = tqdm(total=total_length, unit_scale=True, unit="B")
+
+    def update(self, length: int) -> None:
+        """Updates the progress bar.
+
+        Parameters
+        ----------
+        length: int
+          Number of bytes downloaded since last `update` call.
+        """
+        if not self._progress_bar:
+            raise RuntimeError("Call `set_meta` before calling `update`.")
+        self._progress_bar.update(length)
+        if self._progress_bar.total <= self._progress_bar.n:
+            self._progress_bar.close()
+
+
+class ReprMixin(ABC):
+    """A mixin class that provides a customizable string representation for OpenML objects.
+
+    This mixin standardizes the __repr__ output format across OpenML classes.
+    Classes inheriting from this mixin should implement the
+    _get_repr_body_fields method to specify which fields to display.
+    """
+
+    def __repr__(self) -> str:
+        body_fields = self._get_repr_body_fields()
+        return self._apply_repr_template(body_fields)
+
+    @abstractmethod
+    def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]:
+        """Collect all information to display in the __repr__ body.
+
+        Returns
+        -------
+        body_fields : List[Tuple[str, Union[str, int, List[str], None]]]
+            A list of (name, value) pairs to display in the body of the __repr__.
+            E.g.: [('metric', 'accuracy'), ('dataset', 'iris')]
+            If value is a List of str, then each item of the list will appear in a separate row.
+        """
+        # Abstract: must be implemented by each concrete subclass.
+
+    def _apply_repr_template(
+        self,
+        body_fields: Iterable[tuple[str, str | int | list[str] | None]],
+    ) -> str:
+        """Generates the header and formats the body for string representation of the object.
+
+        Parameters
+        ----------
+        body_fields: Iterable[Tuple[str, Union[str, int, List[str], None]]]
+           A list of (name, value) pairs to display in the body of the __repr__.
+        """
+        # We add spaces between capitals, e.g. ClassificationTask -> Classification Task
+        name_with_spaces = re.sub(
+            r"(\w)([A-Z])",
+            r"\1 \2",
+            self.__class__.__name__[len("OpenML") :],
+        )
+        header_text = f"OpenML {name_with_spaces}"
+        header = f"{header_text}\n{'=' * len(header_text)}\n"
+
+        _body_fields: list[tuple[str, str | int | list[str]]] = [
+            (k, "None" if v is None else v) for k, v in body_fields
+        ]
+        longest_field_name_length = max(len(name) for name, _ in _body_fields)
+        field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}"
+        body = "\n".join(field_line_format.format(name, value) for name, value in _body_fields)
+        return header + body
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 000000000..8c463968b
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,341 @@
+# -*- coding: utf-8 -*-
+
+# License: BSD 3-Clause
+[build-system]
+requires = ["setuptools >= 61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "openml"
+dynamic = ["version"]  # Will take it from the __version__ file, update there
+dependencies = [
+  "liac-arff>=2.4.0",
+  "xmltodict",
+  "requests",
+  "scikit-learn>=0.18",
+  "python-dateutil",  # Installed through pandas anyway.
+  "pandas>=1.0.0",
+  "scipy>=0.13.3",
+  "numpy>=1.6.2",
+  "minio",
+  "pyarrow",
+  "tqdm",  # For MinIO download progress bars
+]
+requires-python = ">=3.10,<3.15"    
+maintainers = [
+  { name = "Pieter Gijsbers", email="p.gijsbers@tue.nl"},
+  { name = "Lennart Purucker"},
+]
+authors = [
+  { name = "Matthias Feurer"},
+  { name = "Jan van Rijn" },
+  { name = "Arlind Kadra" },
+  { name = "Pieter Gijsbers" },
+  { name = "Neeratyoy Mallik" },
+  { name = "Sahithya Ravi" },
+  { name = "Andreas Müller" },
+  { name = "Joaquin Vanschoren " },
+  { name = "Frank Hutter" },
+]
+readme = "README.md"
+description = "Python API for OpenML"
+classifiers = [
+  "Intended Audience :: Science/Research",
+  "Intended Audience :: Developers",
+  "License :: OSI Approved :: BSD License",
+  "Programming Language :: Python",
+  "Topic :: Software Development",
+  "Topic :: Scientific/Engineering",
+  "Operating System :: POSIX",
+  "Operating System :: Unix",
+  "Operating System :: MacOS",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
+  "Programming Language :: Python :: 3.14",
+]
+license = { file = "LICENSE" }
+
+[project.scripts]
+openml = "openml.cli:main"
+
+[project.optional-dependencies]
+test=[
+    "nbconvert",
+    "jupyter_client",
+    "matplotlib",
+    "pytest",
+    "pytest-xdist",
+    "pytest-timeout",
+    "nbformat",
+    "oslo.concurrency",
+    "flaky",
+    "pre-commit",
+    "pytest-cov",
+    "pytest-rerunfailures",
+    "mypy",
+    "ruff",
+    "requests-mock",
+    "openml-sklearn",
+    "packaging",
+    "pytest-mock",
+    "openml-sklearn",
+]
+examples=[
+    "matplotlib",
+    "jupyter",
+    "notebook",
+    "nbconvert",
+    "nbformat",
+    "jupyter_client",
+    "ipython",
+    "ipykernel",
+    "seaborn",
+]
+docs=[
+    "mkdocs",
+    "numpydoc",
+    "mkdocs-material",
+    "mkdocs-autorefs",
+    "mkdocstrings[python]",
+    "mkdocs-gen-files",
+    "mkdocs-literate-nav",
+    "mkdocs-section-index",
+    "mkdocs-jupyter",
+    "mkdocs-linkcheck",
+    "mike"
+]
+
+[project.urls]
+home="https://openml.org/"
+documentation = "https://openml.github.io/openml-python/"
+source = "https://github.com/openml/openml-python"
+
+[tool.setuptools.packages.find]
+where = [""]
+include = ["openml*"]
+namespaces = false
+
+[tool.setuptools.package-data]
+openml = ["*.txt", "*.md", "py.typed"]
+
+[tool.setuptools.dynamic]
+version = {attr = "openml.__version__.__version__"}
+
+# https://docs.pytest.org/en/7.2.x/reference/reference.html#ini-options-ref
+[tool.pytest.ini_options]
+log_level="DEBUG"
+testpaths = ["tests"]
+minversion = "7.0"
+xfail_strict = true
+filterwarnings=[
+    "ignore:the matrix subclass:PendingDeprecationWarning"
+]
+markers = [
+  "upload: anything that uploads to a server",
+  "production_server: any interaction with the production server",
+  "cache: anything that interacts with the (test) cache",
+  "test_server: tests that require the OpenML test server",
+]
+
+# https://github.com/charliermarsh/ruff
+[tool.ruff]
+target-version = "py310"
+line-length = 100
+output-format = "grouped"
+src = ["openml", "tests", "examples"]
+unsafe-fixes = true
+
+exclude = [
+  # TODO(eddiebergman): Tests should be re-enabled after the refactor
+  "tests",
+  #
+  ".bzr",
+  ".direnv",
+  ".eggs",
+  ".git",
+  ".hg",
+  ".mypy_cache",
+  ".nox",
+  ".pants.d",
+  ".ruff_cache",
+  ".svn",
+  ".tox",
+  ".venv",
+  "__pypackages__",
+  "_build",
+  "buck-out",
+  "build",
+  "dist",
+  "node_modules",
+  "venv",
+  "docs",
+]
+
+# Lint rules that are ignored for specific files or directories.
+[tool.ruff.lint.per-file-ignores]
+"tests/*.py" = [
+  "D100",   # Undocumented public module
+  "D101",   # Missing docstring in public class
+  "D102",   # Missing docstring in public method
+  "D103",   # Missing docstring in public function
+  "S101",   # Use of assert
+  "ANN201", # Missing return type annotation for public function
+  "FBT001", # Positional boolean argument
+  "PLR2004",# No use of magic numbers
+  "PD901",  #  X is a bad variable name. (pandas)
+  "TCH",    # https://docs.astral.sh/ruff/rules/#flake8-type-checking-tch
+  "N803",   # Argument name {name} should be lowercase
+]
+"openml/cli.py" = [
+  "T201",   # print found
+  "T203",   # pprint found
+]
+"openml/__version__.py" = [
+  "D100",   # Undocumented public module
+]
+"__init__.py" = [
+  "I002",   # Missing required import (i.e. from __future__ import annotations)
+]
+"examples/*.py" = [
+  "D101",   # Missing docstring in public class
+  "D102",   # Missing docstring in public method
+  "D103",   # Missing docstring in public function
+  "D415",   # First line should end with a . or ? or !
+  "INP001", # File is part of an implicit namespace package, add an __init__.py
+  "I002",   # Missing required import (i.e. from __future__ import annotations)
+  "E741",   # Ambiguous variable name
+  "T201",   # print found
+  "T203",   # pprint found
+  "ERA001", # found commented out code
+  "E402",   # Module level import not at top of cell
+  "E501",   # Line too long
+]
+
+[tool.ruff.lint]
+# Allow unused variables when underscore-prefixed.
+dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
+
+select = [
+  "A",
+  # "ANN", # Handled by mypy
+  "ARG",
+  "B",
+  "BLE",
+  "COM",
+  "C4",
+  "D",
+  # "DTZ",  # One day I should know how to utilize timezones and dates...
+  "E",
+  # "EXE", Meh
+  "ERA",
+  "F",
+  "FBT",
+  "I",
+  # "ISC",  # Favours implicit string concatenation
+  "INP",
+  # "INT", # I don't understand this one
+  "N",
+  "NPY",
+  "PD",
+  "PLC",
+  "PLE",
+  "PLR",
+  "PLW",
+  "PIE",
+  "PT",
+  "PTH",
+  # "PYI", # Specific to .pyi files for type stubs
+  "Q",
+  "PGH004",
+  "RET",
+  "RUF",
+  "C90",
+  "S",
+  # "SLF",    # Private member accessed (sure, it's python)
+  "SIM",
+  # "TRY", # Good in principle, would take a lot of work to satisfy
+  "T10",
+  "T20",
+  "TID",
+  "TCH",
+  "UP",
+  "N",
+  "W",
+  "YTT",
+]
+
+ignore = [
+  "D105",    # Missing docstring in magic method
+  "D401",    # First line of docstring should be in imperative mood
+  "N806",    # Variable X in function should be lowercase
+  "E731",    # Do not assign a lambda expression, use a def
+  "S101",    # Use of assert detected.
+  "W292",    # No newline at end of file
+  "PLC1901", # "" can be simplified to be falsey
+  "TC003",  # Move stdlib import into TYPE_CHECKING
+  "COM812",  # Trailing comma missing (handled by linter, ruff recommends disabling if using formatter)
+  "N803",    # Argument should be lowercase (but we accept things like `X`)
+  "PLC0415", # Allow imports inside functions / non-top-level scope
+  "FBT001",  # Allow Boolean-typed positional argument in function definition
+
+  # TODO(@eddiebergman): These should be enabled
+  "D100",    # Missing docstring in public module
+  "D103",    # Missing docstring in public function
+  "D104",    # Missing docstring in public package
+
+  # TODO(@eddiebergman): Maybe fix
+   "PLR2004", # Magic value used in comparison, consider replacing 2 with a constant variable
+  "D400",    # First line must end with a period (@eddiebergman too many to fix so ignoring this for now)
+  "D203",    # 1 blank line required before class docstring
+  "D205",    # 1 blank line between summary and description
+
+  # TODO(@eddiebergman): Could be backwards breaking
+  "N802",    # Public function name should be lower case (i.e. get_X())
+]
+
+
+
+[tool.ruff.lint.isort]
+known-first-party = ["openml"]
+no-lines-before = ["future"]
+required-imports = ["from __future__ import annotations"]
+combine-as-imports = true
+extra-standard-library = ["typing_extensions"]
+force-wrap-aliases = true
+
+[tool.ruff.lint.pydocstyle]
+convention = "numpy"
+
+[tool.mypy]
+python_version = "3.10"
+packages = ["openml", "tests"]
+
+show_error_codes = true
+
+warn_unused_configs = true # warn about unused [tool.mypy] lines
+
+follow_imports = "normal"      # Type check top level api code we use from imports
+ignore_missing_imports = false # prefer explicit ignores
+
+disallow_untyped_defs = true       # All functions must have types
+disallow_untyped_decorators = true # ... even decorators
+disallow_incomplete_defs = true    # ...all types
+
+no_implicit_optional = true
+check_untyped_defs = true
+
+warn_return_any = true
+
+
+[[tool.mypy.overrides]]
+module = ["tests.*", "openml.extensions.sklearn.*"]
+
+# TODO(eddiebergman): This should be re-enabled after tests get refactored
+ignore_errors = true
+#disallow_untyped_defs = false          # Sometimes we just want to ignore verbose types
+#disallow_untyped_decorators = false    # Test decorators are not properly typed
+#disallow_incomplete_defs = false       # Sometimes we just want to ignore verbose types
+#disable_error_code = ["var-annotated"]
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 000000000..000969b80
--- /dev/null
+++ b/scripts/__init__.py
@@ -0,0 +1 @@
+"""Package for scripts and utilities."""
diff --git a/scripts/gen_ref_pages.py b/scripts/gen_ref_pages.py
new file mode 100644
index 000000000..22a873a4a
--- /dev/null
+++ b/scripts/gen_ref_pages.py
@@ -0,0 +1,58 @@
+"""Generate the code reference pages.
+
+based on https://github.com/mkdocstrings/mkdocstrings/blob/33aa573efb17b13e7b9da77e29aeccb3fbddd8e8/docs/recipes.md
+but modified for lack of "src/" file structure.
+
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import mkdocs_gen_files
+
+nav = mkdocs_gen_files.Nav()
+
+root = Path(__file__).parent.parent
+src = root / "openml"
+
+for path in sorted(src.rglob("*.py")):
+    module_path = path.relative_to(root).with_suffix("")
+    doc_path = path.relative_to(src).with_suffix(".md")
+    full_doc_path = Path("reference", doc_path)
+
+    parts = tuple(module_path.parts)
+
+    if parts[-1] == "__init__":
+        parts = parts[:-1]
+        doc_path = doc_path.with_name("index.md")
+        full_doc_path = full_doc_path.with_name("index.md")
+    elif parts[-1] == "__main__":
+        continue
+
+    nav[parts] = doc_path.as_posix()
+
+    with mkdocs_gen_files.open(full_doc_path, "w") as fd:
+        identifier = ".".join(parts)
+        print("::: " + identifier, file=fd)
+
+    mkdocs_gen_files.set_edit_path(full_doc_path, path.relative_to(root))
+
+    with mkdocs_gen_files.open("reference/SUMMARY.md", "w") as nav_file:
+        nav_file.writelines(nav.build_literate_nav())
+
+nav = mkdocs_gen_files.Nav()
+examples_dir = root / "examples"
+examples_doc_dir = root / "docs" / "examples"
+for path in sorted(examples_dir.rglob("*.py")):
+    if "_external_or_deprecated" in path.parts:
+        continue
+    dest_path = Path("examples") / path.relative_to(examples_dir)
+    with mkdocs_gen_files.open(dest_path, "w") as dest_file:
+        print(path.read_text(), file=dest_file)
+
+    new_relative_location = Path("../") / dest_path
+    nav[new_relative_location.parts[2:]] = new_relative_location.as_posix()
+
+    with mkdocs_gen_files.open("examples/SUMMARY.md", "w") as nav_file:
+        nav_file.writelines(nav.build_literate_nav())
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index 726c8fa73..000000000
--- a/setup.cfg
+++ /dev/null
@@ -1,6 +0,0 @@
-[metadata]
-description-file = README.md
-
-[tool:pytest]
-filterwarnings =
-    ignore:the matrix subclass:PendingDeprecationWarning
diff --git a/setup.py b/setup.py
deleted file mode 100644
index b1700073f..000000000
--- a/setup.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# -*- coding: utf-8 -*-
-
-import setuptools
-import sys
-
-with open("openml/__version__.py") as fh:
-    version = fh.readlines()[-1].split()[-1].strip("\"'")
-
-if sys.version_info < (3, 5):
-    raise ValueError(
-        'Unsupported Python version {}.{}.{} found. OpenML requires Python 3.5 or higher.'
-        .format(sys.version_info.major, sys.version_info.minor, sys.version_info.micro)
-    )
-
-setuptools.setup(name="openml",
-                 author="Matthias Feurer, Jan van Rijn, Arlind Kadra, Andreas Müller, "
-                        "Pieter Gijsbers and Joaquin Vanschoren",
-                 author_email="feurerm@informatik.uni-freiburg.de",
-                 maintainer="Matthias Feurer",
-                 maintainer_email="feurerm@informatik.uni-freiburg.de",
-                 description="Python API for OpenML",
-                 license="BSD 3-clause",
-                 url="http://openml.org/",
-                 project_urls={
-                     "Documentation": "https://openml.github.io/openml-python/",
-                     "Source Code": "https://github.com/openml/openml-python"
-                 },
-                 version=version,
-                 packages=setuptools.find_packages(),
-                 package_data={'': ['*.txt', '*.md']},
-                 install_requires=[
-                     'liac-arff>=2.4.0',
-                     'xmltodict',
-                     'requests',
-                     'scikit-learn>=0.18',
-                     'python-dateutil',  # Installed through pandas anyway.
-                     'pandas>=0.19.2',
-                     'scipy>=0.13.3',
-                     'numpy>=1.6.2'
-                 ],
-                 extras_require={
-                     'test': [
-                         'nbconvert',
-                         'jupyter_client',
-                         'matplotlib',
-                         'pytest',
-                         'pytest-xdist',
-                         'pytest-timeout',
-                         'nbformat',
-                         'oslo.concurrency'
-                     ],
-                     'examples': [
-                         'matplotlib',
-                         'jupyter',
-                         'notebook',
-                         'nbconvert',
-                         'nbformat',
-                         'jupyter_client',
-                         'ipython',
-                         'ipykernel',
-                         'seaborn'
-                     ]
-                 },
-                 test_suite="pytest",
-                 classifiers=['Intended Audience :: Science/Research',
-                              'Intended Audience :: Developers',
-                              'License :: OSI Approved :: BSD License',
-                              'Programming Language :: Python',
-                              'Topic :: Software Development',
-                              'Topic :: Scientific/Engineering',
-                              'Operating System :: POSIX',
-                              'Operating System :: Unix',
-                              'Operating System :: MacOS',
-                              'Programming Language :: Python :: 3',
-                              'Programming Language :: Python :: 3.4',
-                              'Programming Language :: Python :: 3.5',
-                              'Programming Language :: Python :: 3.6',
-                              'Programming Language :: Python :: 3.7'])
diff --git a/tests/__init__.py b/tests/__init__.py
index dc5287024..245c252db 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1,3 +1,5 @@
+# License: BSD 3-Clause
+
 # Dummy to allow mock classes in the test files to have a version number for
 # their parent module
-__version__ = '0.1'
+__version__ = "0.1"
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 000000000..1967f1fad
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,318 @@
+"""This file is recognized by pytest for defining specified behaviour
+
+'conftest.py' files are directory-scope files that are shared by all
+sub-directories from where this file is placed. pytest recognises
+'conftest.py' for any unit test executed from within this directory
+tree. This file is used to define fixtures, hooks, plugins, and other
+functionality that can be shared by the unit tests.
+
+This file has been created for the OpenML testing to primarily make use
+of the pytest hooks 'pytest_sessionstart' and 'pytest_sessionfinish',
+which are being used for managing the deletion of local and remote files
+created by the unit tests, run across more than one process.
+
+This design allows one to comment or remove the conftest.py file to
+disable file deletions, without editing any of the test case files.
+
+
+Possible Future: class TestBase from openml/testing.py can be included
+    under this file and there would not be any requirements to import
+    testing.py in each of the unit test modules.
+"""
+
+# License: BSD 3-Clause
+from __future__ import annotations
+
+import multiprocessing
+
+multiprocessing.set_start_method("spawn", force=True)
+
+from collections.abc import Iterator
+import logging
+import os
+import shutil
+from pathlib import Path
+import pytest
+import openml_sklearn
+
+import openml
+from openml.testing import TestBase
+
+import inspect
+
+# creating logger for unit test file deletion status
+logger = logging.getLogger("unit_tests")
+logger.setLevel(logging.DEBUG)
+
+file_list = []
+
+
+def worker_id() -> str:
+    """Returns the name of the worker process owning this function call.
+
+    The name is read from the PYTEST_XDIST_WORKER environment variable.
+
+    :return: str
+        Possible outputs from the set of {'master', 'gw0', 'gw1', ..., 'gw(n-1)'}
+        where n is the number of workers being used by pytest-xdist
+    """
+    # Looking the name up merely because PYTEST_XDIST_WORKER_COUNT is set would raise
+    # KeyError when the name itself is absent, so fall back to 'master' instead.
+    return os.environ.get("PYTEST_XDIST_WORKER", "master")
+
+
+def read_file_list() -> list[Path]:
+    """Collects, recursively, every regular file in the 'files' directory beside this file
+
+    :return: List[Path]
+    """
+    files_root = Path(__file__).parent / "files"
+    return [entry for entry in files_root.rglob("*") if entry.is_file()]
+
+
+def compare_delete_files(old_list: list[Path], new_list: list[Path]) -> None:
+    """Removes from disk every file listed in new_list that is absent from old_list
+
+    :param old_list: List[Path]
+    :param new_list: List[Path]
+    :return: None
+    """
+    created_since_snapshot = set(new_list) - set(old_list)
+    for created in created_since_snapshot:
+        os.remove(created)
+        logger.info(f"Deleted from local: {created}")
+
+
+def delete_remote_files(tracker, flow_names) -> None:
+    """Function that deletes the entities passed as input, from the OpenML test server
+
+    The TestBase class in openml/testing.py has an attribute called publish_tracker.
+    This function expects a dictionary of the same structure: keys are entity types and
+    values are lists of entity IDs. Flow entries may be plain IDs or (ID, flow name) tuples;
+    ``tracker["flow"]`` is rewritten in place to the ordered list of IDs to delete.
+    Iteratively, multiple POST requests are made to the OpenML test server using
+    openml.utils._delete_entity() to remove the entities uploaded by all the unit tests.
+
+    :param tracker: Dict
+    :param flow_names: names of the flows, aligned by position with ``tracker["flow"]``
+    :return: None
+    """
+    openml.config.server = TestBase.test_server
+    openml.config.apikey = TestBase.user_key
+
+    # reordering to delete sub flows at the end of flows
+    # sub-flows have shorter names, hence, sorting by descending order of flow name length
+    if "flow" in tracker:
+        to_sort = list(zip(tracker["flow"], flow_names))
+        flow_deletion_order = [
+            entity_id for entity_id, _ in sorted(to_sort, key=lambda x: len(x[1]), reverse=True)
+        ]
+        tracker["flow"] = [f[0] if isinstance(f, tuple) else f for f in flow_deletion_order]
+
+    # deleting all collected entities published to test server
+    # 'run's are deleted first to prevent dependency issue of entities on deletion
+    logger.info(f"Entity Types: {['run', 'data', 'flow', 'task', 'study']}")
+    for entity_type in ["run", "data", "flow", "task", "study"]:
+        logger.info(f"Deleting {entity_type}s...")
+        for entity in tracker.get(entity_type, []):
+            try:
+                openml.utils._delete_entity(entity_type, entity)
+                logger.info(f"Deleted ({entity_type}, {entity})")
+            except Exception as e:
+                logger.warning(f"Cannot delete ({entity_type},{entity}): {e}")
+
+
+def pytest_sessionstart() -> None:
+    """Pytest hook that is executed before any unit test starts
+
+    Every worker process, as well as the master process, calls this function once it is
+    spawned, which happens even before the unit tests are collected.
+    With n=4 workers there are 5 calls in total (1 master + 4 workers) before the first
+    unit test is executed. The master pytest process is named 'master', whereas the
+    worker processes are named 'gw{i}' with i = 0, 1, ..., n-1.
+    The order of process spawning is: 'master' -> random ordering of the 'gw{i}' workers.
+
+    Since master always runs first, only the 'master' process stores the list of paths of
+    all files in the test files directory (the pre-unit test snapshot).
+
+    :return: None
+    """
+    # the module-level snapshot is rebound here and read again in pytest_sessionfinish()
+    global file_list
+    if worker_id() != "master":
+        return
+    file_list = read_file_list()
+
+
+def pytest_sessionfinish() -> None:
+    """Pytest hook that is executed after all unit tests of a worker have ended
+
+    Every worker process, as well as the master process, calls this function once it is
+    done with the unit tests allocated to it.
+    With n=4 workers there are 5 calls in total (1 master + 4 workers), all of them made
+    after the unit tests have run. The master pytest process is named 'master', whereas
+    the worker processes are named 'gw{i}' with i = 0, 1, ..., n-1.
+    The order of invocation is: random ordering of the 'gw{i}' workers -> 'master'.
+
+    Every process deletes the entities recorded in TestBase.publish_tracker from the test
+    server. Since master is always executed last, only the 'master' process additionally
+    compares the file list with the pre-unit test snapshot and deletes the new local files.
+
+    :return: None
+    """
+    # file_list holds the snapshot taken in pytest_sessionstart(); it is only read here
+    global file_list
+    current_worker = worker_id()
+    logger.info(f"Finishing worker {current_worker}")
+
+    # Remote clean-up, carried out by every process and not only by 'master'
+    logger.info(f"Deleting files uploaded to test server for worker {current_worker}")
+    delete_remote_files(TestBase.publish_tracker, TestBase.flow_name_tracker)
+
+    if current_worker == "master":
+        # Local clean-up: everything that appeared since the snapshot is removed
+        files_after_tests = read_file_list()
+        compare_delete_files(file_list, files_after_tests)
+
+        # Delete any test dirs that remain
+        # In edge cases due to a mixture of pytest parametrization and oslo concurrency,
+        # some file locks are created after leaving the test. This removes these files!
+        package_dir = Path(__file__).parent.parent / "openml"
+        for leftover in package_dir.glob("tests.*"):
+            if leftover.is_dir():
+                shutil.rmtree(leftover)
+
+        logger.info("Local files deleted")
+
+    logger.info(f"{current_worker} is killed")
+
+
+def pytest_configure(config: pytest.Config) -> None:  # registers the 'sklearn' marker
+    config.addinivalue_line("markers", "sklearn: marks tests that use scikit-learn")
+
+
+def pytest_addoption(parser: pytest.Parser) -> None:  # adds the --long command line flag
+    parser.addoption(
+        "--long",
+        action="store_true",
+        default=False,
+        help="Run the long version of tests which support both short and long scenarios.",
+    )
+
+
+def _expected_static_cache_state(root_dir: Path) -> list[Path]:
+    """List every directory and file that is expected inside the static test cache."""
+    server_dir = root_dir / "org" / "openml" / "test"
+    datasets_dir = server_dir / "datasets"
+    tasks_dir = server_dir / "tasks"
+
+    # top-level layout: the cache root, the server directory and one folder per entity type
+    expected = [root_dir, server_dir]
+    expected += [server_dir / kind for kind in ("datasets", "tasks", "runs", "setups")]
+
+    # datasets that are cached together with all of their metadata files
+    dataset_files = ("dataset.arff", "features.xml", "qualities.xml", "description.xml")
+    expected += [
+        datasets_dir / dataset_id / file_name
+        for dataset_id in ("-1", "2")
+        for file_name in dataset_files
+    ]
+
+    # entries that consist of a single file
+    expected += [
+        datasets_dir / "30" / "dataset_30.pq",
+        server_dir / "runs" / "1" / "description.xml",
+        server_dir / "setups" / "1" / "description.xml",
+    ]
+
+    # tasks that are cached with their splits and their description
+    expected += [
+        tasks_dir / task_id / file_name
+        for task_id in ("1", "3", "1882")
+        for file_name in ("datasplits.arff", "task.xml")
+    ]
+    return expected
+
+
+def assert_static_test_cache_correct(root_dir: Path) -> None:
+    for p in _expected_static_cache_state(root_dir):  # fails on the first missing entry
+        assert p.exists(), f"Expected path {p} exists"
+
+
+@pytest.fixture(scope="class")
+def long_version(request: pytest.FixtureRequest) -> None:  # copies --long onto the test class
+    request.cls.long_version = request.config.getoption("--long")
+
+
+@pytest.fixture(scope="session")
+def test_files_directory() -> Path:  # the 'files' directory located next to this conftest.py
+    return Path(__file__).parent / "files"
+
+
+@pytest.fixture(scope="session")
+def test_api_key() -> str:  # the API key stored on TestBase.user_key
+    return TestBase.user_key
+
+
+@pytest.fixture(autouse=True, scope="function")
+def verify_cache_state(test_files_directory: Path) -> Iterator[None]:
+    assert_static_test_cache_correct(test_files_directory)  # checked before the test ...
+    yield
+    assert_static_test_cache_correct(test_files_directory)  # ... and once more after it
+
+
+@pytest.fixture(autouse=True, scope="session")
+def as_robot() -> Iterator[None]:
+    # remember the configured policy so that it can be reinstated when the session ends
+    previous = (openml.config.retry_policy, openml.config.connection_n_retries)
+    openml.config.set_retry_policy("robot", n_retries=20)
+    yield
+    openml.config.set_retry_policy(*previous)
+
+
+@pytest.fixture(autouse=True)
+def with_server(request):
+    if os.getenv("OPENML_USE_LOCAL_SERVICES") == "true":
+        openml.config.TEST_SERVER_URL = "http://localhost:8000"
+    if "production_server" in request.keywords:
+        openml.config.server = "https://www.openml.org/api/v1/xml"
+        openml.config.apikey = None
+    else:
+        openml.config.server = f"{openml.config.TEST_SERVER_URL}/api/v1/xml"
+        openml.config.apikey = TestBase.user_key
+    # a single yield serves both branches; nothing is restored once the test is over
+    yield
+
+
+@pytest.fixture(autouse=True)
+def with_test_cache(test_files_directory, request):
+    # TestBase subclasses set up and tear down their own cache directory in
+    # setUp()/tearDown(); letting this fixture change the same global config as well
+    # causes race conditions, so it does nothing for them.
+    if isinstance(request.instance, TestBase):
+        yield
+        return
+
+    if not test_files_directory.exists():
+        message = f"Cannot find test cache dir, expected it to be {test_files_directory!s}!"
+        raise ValueError(message)
+    previous_root = openml.config._root_cache_directory
+    # one cache directory per test, named after the node id with its separators as dots
+    cache_dir = test_files_directory / request.node.nodeid.replace("/", ".").replace("::", ".")
+    openml.config.set_root_cache_directory(cache_dir)
+    yield
+    openml.config.set_root_cache_directory(previous_root)
+    if cache_dir.exists():
+        shutil.rmtree(cache_dir)
+        
+
+@pytest.fixture
+def static_cache_dir() -> Path:  # function-scoped; same path as test_files_directory
+    return Path(__file__).parent / "files" 
+
+@pytest.fixture
+def workdir(tmp_path):
+    starting_dir = Path.cwd()  # switched back to once the test is over
+    os.chdir(tmp_path)
+    yield tmp_path
+    os.chdir(starting_dir)
\ No newline at end of file
diff --git a/tests/files/localhost_8000 b/tests/files/localhost_8000
new file mode 120000
index 000000000..334c709ef
--- /dev/null
+++ b/tests/files/localhost_8000
@@ -0,0 +1 @@
+org/openml/test
\ No newline at end of file
diff --git a/tests/files/misc/features_with_whitespaces.xml b/tests/files/misc/features_with_whitespaces.xml
new file mode 100644
index 000000000..2b542d167
--- /dev/null
+++ b/tests/files/misc/features_with_whitespaces.xml
@@ -0,0 +1,22 @@
+<oml:data_features xmlns:oml="http://openml.org/openml">
+    <oml:feature>
+        <oml:index>0</oml:index>
+        <oml:name>V1</oml:name>
+        <oml:data_type>numeric</oml:data_type>
+            <oml:is_target>false</oml:is_target>
+        <oml:is_ignore>false</oml:is_ignore>
+        <oml:is_row_identifier>false</oml:is_row_identifier>
+        <oml:number_of_missing_values>0</oml:number_of_missing_values>
+    </oml:feature>
+    <oml:feature>
+        <oml:index>1</oml:index>
+        <oml:name>V42</oml:name>
+        <oml:data_type>nominal</oml:data_type>
+              <oml:nominal_value> - 50000.</oml:nominal_value>
+              <oml:nominal_value> 50000+.</oml:nominal_value>
+            <oml:is_target>false</oml:is_target>
+        <oml:is_ignore>false</oml:is_ignore>
+        <oml:is_row_identifier>false</oml:is_row_identifier>
+        <oml:number_of_missing_values>0</oml:number_of_missing_values>
+    </oml:feature>
+</oml:data_features>
diff --git a/tests/files/mock_responses/datasets/data_delete_has_tasks.xml b/tests/files/mock_responses/datasets/data_delete_has_tasks.xml
new file mode 100644
index 000000000..fc866047c
--- /dev/null
+++ b/tests/files/mock_responses/datasets/data_delete_has_tasks.xml
@@ -0,0 +1,4 @@
+<oml:error xmlns:oml="http://openml.org/openml">
+	<oml:code>354</oml:code>
+	<oml:message>Dataset is in use by other content. Can not be deleted</oml:message>
+</oml:error>
diff --git a/tests/files/mock_responses/datasets/data_delete_not_exist.xml b/tests/files/mock_responses/datasets/data_delete_not_exist.xml
new file mode 100644
index 000000000..b3b212fbe
--- /dev/null
+++ b/tests/files/mock_responses/datasets/data_delete_not_exist.xml
@@ -0,0 +1,4 @@
+<oml:error xmlns:oml="http://openml.org/openml">
+	<oml:code>352</oml:code>
+	<oml:message>Dataset does not exist</oml:message>
+</oml:error>
diff --git a/tests/files/mock_responses/datasets/data_delete_not_owned.xml b/tests/files/mock_responses/datasets/data_delete_not_owned.xml
new file mode 100644
index 000000000..7d412d48e
--- /dev/null
+++ b/tests/files/mock_responses/datasets/data_delete_not_owned.xml
@@ -0,0 +1,4 @@
+<oml:error xmlns:oml="http://openml.org/openml">
+	<oml:code>353</oml:code>
+	<oml:message>Dataset is not owned by you</oml:message>
+</oml:error>
\ No newline at end of file
diff --git a/tests/files/mock_responses/datasets/data_delete_successful.xml b/tests/files/mock_responses/datasets/data_delete_successful.xml
new file mode 100644
index 000000000..9df47c1a2
--- /dev/null
+++ b/tests/files/mock_responses/datasets/data_delete_successful.xml
@@ -0,0 +1,3 @@
+<oml:data_delete xmlns:oml="http://openml.org/openml">
+  <oml:id>40000</oml:id>
+</oml:data_delete>
diff --git a/tests/files/mock_responses/datasets/data_description_61.xml b/tests/files/mock_responses/datasets/data_description_61.xml
new file mode 100644
index 000000000..fc25e5861
--- /dev/null
+++ b/tests/files/mock_responses/datasets/data_description_61.xml
@@ -0,0 +1,30 @@
+<oml:data_set_description xmlns:oml="http://openml.org/openml">
+  <oml:id>61</oml:id>
+  <oml:name>iris</oml:name>
+  <oml:version>1</oml:version>
+  <oml:description>**Author**: R.A. Fisher  
+**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Iris) - 1936 - Donated by Michael Marshall  
+**Please cite**:   
+
+**Iris Plants Database**  
+This is perhaps the best known database to be found in the pattern recognition literature.  Fisher's paper is a classic in the field and is referenced frequently to this day.  (See Duda &amp; Hart, for example.)  The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant.  One class is     linearly separable from the other 2; the latter are NOT linearly separable from each other.
+
+Predicted attribute: class of iris plant.  
+This is an exceedingly simple domain.  
+ 
+### Attribute Information:
+    1. sepal length in cm
+    2. sepal width in cm
+    3. petal length in cm
+    4. petal width in cm
+    5. class: 
+       -- Iris Setosa
+       -- Iris Versicolour
+       -- Iris Virginica</oml:description>
+  <oml:description_version>4</oml:description_version>
+  <oml:format>ARFF</oml:format>
+  <oml:creator>R.A. Fisher</oml:creator>     <oml:collection_date>1936</oml:collection_date>  <oml:upload_date>2014-04-06T23:23:39</oml:upload_date>
+  <oml:language>English</oml:language>  <oml:licence>Public</oml:licence>  <oml:url>https://api.openml.org/data/v1/download/61/iris.arff</oml:url>
+  <oml:parquet_url>https://data.openml.org/datasets/0000/0061/dataset_61.pq</oml:parquet_url>  <oml:file_id>61</oml:file_id>  <oml:default_target_attribute>class</oml:default_target_attribute>      <oml:version_label>1</oml:version_label>  <oml:citation>https://archive.ics.uci.edu/ml/citation_policy.html</oml:citation>  <oml:tag>Botany</oml:tag><oml:tag>Ecology</oml:tag><oml:tag>Kaggle</oml:tag><oml:tag>Machine Learning</oml:tag><oml:tag>study_1</oml:tag><oml:tag>study_25</oml:tag><oml:tag>study_4</oml:tag><oml:tag>study_41</oml:tag><oml:tag>study_50</oml:tag><oml:tag>study_52</oml:tag><oml:tag>study_7</oml:tag><oml:tag>study_86</oml:tag><oml:tag>study_88</oml:tag><oml:tag>study_89</oml:tag><oml:tag>uci</oml:tag>  <oml:visibility>public</oml:visibility>  <oml:original_data_url>https://archive.ics.uci.edu/ml/datasets/Iris</oml:original_data_url>  <oml:paper_url>http://digital.library.adelaide.edu.au/dspace/handle/2440/15227</oml:paper_url>  <oml:minio_url>https://data.openml.org/datasets/0000/0061/dataset_61.pq</oml:minio_url>  <oml:status>active</oml:status>
+  <oml:processing_date>2020-11-20 19:02:18</oml:processing_date>      <oml:md5_checksum>ad484452702105cbf3d30f8deaba39a9</oml:md5_checksum>
+</oml:data_set_description>
diff --git a/tests/files/mock_responses/flows/flow_delete_has_runs.xml b/tests/files/mock_responses/flows/flow_delete_has_runs.xml
new file mode 100644
index 000000000..5c8530e75
--- /dev/null
+++ b/tests/files/mock_responses/flows/flow_delete_has_runs.xml
@@ -0,0 +1,5 @@
+<oml:error xmlns:oml="http://openml.org/openml">
+	<oml:code>324</oml:code>
+	<oml:message>flow is in use by other content (runs). Can not be deleted</oml:message>
+    <oml:additional_information>{10716, 10707} ()</oml:additional_information>
+</oml:error>
diff --git a/tests/files/mock_responses/flows/flow_delete_is_subflow.xml b/tests/files/mock_responses/flows/flow_delete_is_subflow.xml
new file mode 100644
index 000000000..ddc314ae4
--- /dev/null
+++ b/tests/files/mock_responses/flows/flow_delete_is_subflow.xml
@@ -0,0 +1,5 @@
+<oml:error xmlns:oml="http://openml.org/openml">
+	<oml:code>328</oml:code>
+	<oml:message>flow is in use by other content (it is a subflow). Can not be deleted</oml:message>
+    <oml:additional_information>{37661}</oml:additional_information>
+</oml:error>
diff --git a/tests/files/mock_responses/flows/flow_delete_not_exist.xml b/tests/files/mock_responses/flows/flow_delete_not_exist.xml
new file mode 100644
index 000000000..4df49149f
--- /dev/null
+++ b/tests/files/mock_responses/flows/flow_delete_not_exist.xml
@@ -0,0 +1,4 @@
+<oml:error xmlns:oml="http://openml.org/openml">
+	<oml:code>322</oml:code>
+	<oml:message>flow does not exist</oml:message>
+</oml:error>
diff --git a/tests/files/mock_responses/flows/flow_delete_not_owned.xml b/tests/files/mock_responses/flows/flow_delete_not_owned.xml
new file mode 100644
index 000000000..3aa9a9ef2
--- /dev/null
+++ b/tests/files/mock_responses/flows/flow_delete_not_owned.xml
@@ -0,0 +1,4 @@
+<oml:error xmlns:oml="http://openml.org/openml">
+	<oml:code>323</oml:code>
+	<oml:message>flow is not owned by you</oml:message>
+</oml:error>
diff --git a/tests/files/mock_responses/flows/flow_delete_successful.xml b/tests/files/mock_responses/flows/flow_delete_successful.xml
new file mode 100644
index 000000000..7638e942d
--- /dev/null
+++ b/tests/files/mock_responses/flows/flow_delete_successful.xml
@@ -0,0 +1,3 @@
+<oml:flow_delete xmlns:oml="http://openml.org/openml">
+    <oml:id>33364</oml:id>
+</oml:flow_delete>
diff --git a/tests/files/mock_responses/runs/run_delete_not_exist.xml b/tests/files/mock_responses/runs/run_delete_not_exist.xml
new file mode 100644
index 000000000..855c223fa
--- /dev/null
+++ b/tests/files/mock_responses/runs/run_delete_not_exist.xml
@@ -0,0 +1,4 @@
+<oml:error xmlns:oml="http://openml.org/openml">
+	<oml:code>392</oml:code>
+	<oml:message>Run does not exist</oml:message>
+</oml:error>
diff --git a/tests/files/mock_responses/runs/run_delete_not_owned.xml b/tests/files/mock_responses/runs/run_delete_not_owned.xml
new file mode 100644
index 000000000..551252e22
--- /dev/null
+++ b/tests/files/mock_responses/runs/run_delete_not_owned.xml
@@ -0,0 +1,4 @@
+<oml:error xmlns:oml="http://openml.org/openml">
+	<oml:code>393</oml:code>
+	<oml:message>Run is not owned by you</oml:message>
+</oml:error>
diff --git a/tests/files/mock_responses/runs/run_delete_successful.xml b/tests/files/mock_responses/runs/run_delete_successful.xml
new file mode 100644
index 000000000..fe4233afa
--- /dev/null
+++ b/tests/files/mock_responses/runs/run_delete_successful.xml
@@ -0,0 +1,3 @@
+<oml:run_delete xmlns:oml="http://openml.org/openml">
+  <oml:id>10591880</oml:id>
+</oml:run_delete>
diff --git a/tests/files/mock_responses/tasks/task_delete_has_runs.xml b/tests/files/mock_responses/tasks/task_delete_has_runs.xml
new file mode 100644
index 000000000..87a92540d
--- /dev/null
+++ b/tests/files/mock_responses/tasks/task_delete_has_runs.xml
@@ -0,0 +1,4 @@
+<oml:error xmlns:oml="http://openml.org/openml">
+	<oml:code>454</oml:code>
+	<oml:message>Task is executed in some runs. Delete these first</oml:message>
+	</oml:error>
diff --git a/tests/files/mock_responses/tasks/task_delete_not_exist.xml b/tests/files/mock_responses/tasks/task_delete_not_exist.xml
new file mode 100644
index 000000000..8a262af29
--- /dev/null
+++ b/tests/files/mock_responses/tasks/task_delete_not_exist.xml
@@ -0,0 +1,4 @@
+<oml:error xmlns:oml="http://openml.org/openml">
+	<oml:code>452</oml:code>
+	<oml:message>Task does not exist</oml:message>
+</oml:error>
diff --git a/tests/files/mock_responses/tasks/task_delete_not_owned.xml b/tests/files/mock_responses/tasks/task_delete_not_owned.xml
new file mode 100644
index 000000000..3d504772b
--- /dev/null
+++ b/tests/files/mock_responses/tasks/task_delete_not_owned.xml
@@ -0,0 +1,4 @@
+<oml:error xmlns:oml="http://openml.org/openml">
+	<oml:code>453</oml:code>
+	<oml:message>Task is not owned by you</oml:message>
+	</oml:error>
diff --git a/tests/files/mock_responses/tasks/task_delete_successful.xml b/tests/files/mock_responses/tasks/task_delete_successful.xml
new file mode 100644
index 000000000..594b6e992
--- /dev/null
+++ b/tests/files/mock_responses/tasks/task_delete_successful.xml
@@ -0,0 +1,3 @@
+<oml:task_delete xmlns:oml="http://openml.org/openml">
+  <oml:id>361323</oml:id>
+</oml:task_delete>
diff --git a/tests/files/org/openml/test/datasets/30/dataset_30.pq b/tests/files/org/openml/test/datasets/30/dataset_30.pq
new file mode 100644
index 000000000..b35597281
Binary files /dev/null and b/tests/files/org/openml/test/datasets/30/dataset_30.pq differ
diff --git a/tests/files/org/openml/test/tasks/1/task.xml b/tests/files/org/openml/test/tasks/1/task.xml
index c70baaff3..38325bc24 100644
--- a/tests/files/org/openml/test/tasks/1/task.xml
+++ b/tests/files/org/openml/test/tasks/1/task.xml
@@ -9,6 +9,7 @@
 </oml:data_set>  </oml:input>
 	      <oml:input name="estimation_procedure">
     <oml:estimation_procedure>
+        <oml:id>1</oml:id>
 <oml:type>crossvalidation</oml:type>
 <oml:data_splits_url>http://www.openml.org/api_splits/get/1/Task_1_splits.arff</oml:data_splits_url>
 <oml:parameter name="number_repeats">1</oml:parameter>
diff --git a/tests/files/org/openml/test/tasks/1882/task.xml b/tests/files/org/openml/test/tasks/1882/task.xml
index 4a744b397..07e63d969 100644
--- a/tests/files/org/openml/test/tasks/1882/task.xml
+++ b/tests/files/org/openml/test/tasks/1882/task.xml
@@ -9,6 +9,7 @@
 </oml:data_set>  </oml:input>
 	      <oml:input name="estimation_procedure">
     <oml:estimation_procedure>
+        <oml:id>3</oml:id>
 <oml:type>crossvalidation</oml:type>
 <oml:data_splits_url>http://capa.win.tue.nl/api_splits/get/1882/Task_1882_splits.arff</oml:data_splits_url>
 <oml:parameter name="number_repeats">10</oml:parameter>
diff --git a/tests/files/org/openml/test/tasks/3/task.xml b/tests/files/org/openml/test/tasks/3/task.xml
index ef538330d..e73bbc75a 100644
--- a/tests/files/org/openml/test/tasks/3/task.xml
+++ b/tests/files/org/openml/test/tasks/3/task.xml
@@ -9,6 +9,7 @@
 </oml:data_set>  </oml:input>
 	      <oml:input name="estimation_procedure">
     <oml:estimation_procedure>
+        <oml:id>1</oml:id>
 <oml:type>crossvalidation</oml:type>
 <oml:data_splits_url>http://www.openml.org/api_splits/get/3/Task_3_splits.arff</oml:data_splits_url>
 <oml:parameter name="number_repeats">1</oml:parameter>
diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py
index cabad9565..c651845fb 100644
--- a/tests/test_datasets/test_dataset.py
+++ b/tests/test_datasets/test_dataset.py
@@ -1,5 +1,9 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
+import os
+import unittest.mock
 from time import time
-from warnings import filterwarnings, catch_warnings
 
 import numpy as np
 import pandas as pd
@@ -7,317 +11,474 @@
 from scipy import sparse
 
 import openml
-from openml.testing import TestBase
+from openml.datasets import OpenMLDataFeature, OpenMLDataset
 from openml.exceptions import PyOpenMLError
+from openml.testing import TestBase
+
+import pytest
 
 
+@pytest.mark.production_server()
 class OpenMLDatasetTest(TestBase):
     _multiprocess_can_split_ = True
 
     def setUp(self):
-        super(OpenMLDatasetTest, self).setUp()
-        openml.config.server = self.production_server
+        super().setUp()
+        self.use_production_server()
 
         # Load dataset id 2 - dataset 2 is interesting because it contains
         # missing values, categorical features etc.
-        self.dataset = openml.datasets.get_dataset(2, download_data=False)
+        self._dataset = None
         # titanic as missing values, categories, and string
-        self.titanic = openml.datasets.get_dataset(40945, download_data=False)
+        self._titanic = None
         # these datasets have some boolean features
-        self.pc4 = openml.datasets.get_dataset(1049, download_data=False)
-        self.jm1 = openml.datasets.get_dataset(1053, download_data=False)
-
-    def test_get_data_array(self):
-        # Basic usage
-        rval, _, categorical, attribute_names = self.dataset.get_data(dataset_format='array')
-        self.assertIsInstance(rval, np.ndarray)
-        self.assertEqual(rval.dtype, np.float32)
-        self.assertEqual((898, 39), rval.shape)
-        self.assertEqual(len(categorical), 39)
-        self.assertTrue(all([isinstance(cat, bool) for cat in categorical]))
-        self.assertEqual(len(attribute_names), 39)
-        self.assertTrue(all([isinstance(att, str)
-                             for att in attribute_names]))
-        self.assertIsNone(_)
-
-        # check that an error is raised when the dataset contains string
-        err_msg = "PyOpenML cannot handle string when returning numpy arrays"
-        with pytest.raises(PyOpenMLError, match=err_msg):
-            self.titanic.get_data(dataset_format='array')
+        self._pc4 = None
+        self._jm1 = None
+        self._iris = None
+
+    @property
+    def dataset(self):
+        if self._dataset is None:
+            self._dataset = openml.datasets.get_dataset(2, download_data=False)
+        return self._dataset
+
+    @property
+    def titanic(self):
+        if self._titanic is None:
+            self._titanic = openml.datasets.get_dataset(40945, download_data=False)
+        return self._titanic
+
+    @property
+    def pc4(self):
+        if self._pc4 is None:
+            self._pc4 = openml.datasets.get_dataset(1049, download_data=False)
+        return self._pc4
+
+    @property
+    def jm1(self):
+        if self._jm1 is None:
+            self._jm1 = openml.datasets.get_dataset(1053, download_data=False)
+        return self._jm1
+
+    @property
+    def iris(self):
+        if self._iris is None:
+            self._iris = openml.datasets.get_dataset(61, download_data=False)
+        return self._iris
+
+    def test_repr(self):
+        # create a bare-bones dataset as would be returned by
+        # create_dataset
+        data = openml.datasets.OpenMLDataset(name="somename", description="a description")
+        str(data)
+
+    def test_init_string_validation(self):
+        with pytest.raises(ValueError, match="Invalid symbols ' ' in name"):
+            openml.datasets.OpenMLDataset(name="some name", description="a description")
+
+        with pytest.raises(ValueError, match="Invalid symbols 'ï' in description"):
+            openml.datasets.OpenMLDataset(name="somename", description="a descriptïon")
+
+        with pytest.raises(ValueError, match="Invalid symbols 'ü' in citation"):
+            openml.datasets.OpenMLDataset(
+                name="somename",
+                description="a description",
+                citation="Something by Müller",
+            )
+
+    def test__unpack_categories_with_nan_likes(self):
+        # _unpack_categories decodes numeric categorical values according to the header.
+        # NaN-like entries (None / NaN) among the header categories shouldn't lead to failure.
+        categories = ["a", "b", None, float("nan"), np.nan]
+        series = pd.Series([0, 1, None, float("nan"), np.nan, 1, 0])
+        clean_series = OpenMLDataset._unpack_categories(series, categories)
+
+        expected_values = ["a", "b", np.nan, np.nan, np.nan, "b", "a"]
+        self.assertListEqual(list(clean_series.values), expected_values)
+        self.assertListEqual(list(clean_series.cat.categories.values), list("ab"))
 
     def test_get_data_pandas(self):
-        data, _, _, _ = self.titanic.get_data(dataset_format='dataframe')
-        self.assertTrue(isinstance(data, pd.DataFrame))
-        self.assertEqual(data.shape[1], len(self.titanic.features))
-        self.assertEqual(data.shape[0], 1309)
+        data, _, _, _ = self.titanic.get_data()
+        assert isinstance(data, pd.DataFrame)
+        assert data.shape[1] == len(self.titanic.features)
+        assert data.shape[0] == 1309
+        # Dynamically detect what this version of Pandas calls string columns.
+        str_dtype = data["name"].dtype.name
+
         col_dtype = {
-            'pclass': 'float64',
-            'survived': 'category',
-            'name': 'object',
-            'sex': 'category',
-            'age': 'float64',
-            'sibsp': 'float64',
-            'parch': 'float64',
-            'ticket': 'object',
-            'fare': 'float64',
-            'cabin': 'object',
-            'embarked': 'category',
-            'boat': 'object',
-            'body': 'float64',
-            'home.dest': 'object'
+            "pclass": "uint8",
+            "survived": "category",
+            "name": str_dtype,
+            "sex": "category",
+            "age": "float64",
+            "sibsp": "uint8",
+            "parch": "uint8",
+            "ticket": str_dtype,
+            "fare": "float64",
+            "cabin": str_dtype,
+            "embarked": "category",
+            "boat": str_dtype,
+            "body": "float64",
+            "home.dest": str_dtype,
         }
         for col_name in data.columns:
-            self.assertTrue(data[col_name].dtype.name == col_dtype[col_name])
+            assert data[col_name].dtype.name == col_dtype[col_name]
 
         X, y, _, _ = self.titanic.get_data(
-            dataset_format='dataframe',
-            target=self.titanic.default_target_attribute)
-        self.assertTrue(isinstance(X, pd.DataFrame))
-        self.assertTrue(isinstance(y, pd.Series))
-        self.assertEqual(X.shape, (1309, 13))
-        self.assertEqual(y.shape, (1309,))
+            target=self.titanic.default_target_attribute,
+        )
+        assert isinstance(X, pd.DataFrame)
+        assert isinstance(y, pd.Series)
+        assert X.shape == (1309, 13)
+        assert y.shape == (1309,)
         for col_name in X.columns:
-            self.assertTrue(X[col_name].dtype.name == col_dtype[col_name])
-        self.assertTrue(y.dtype.name == col_dtype['survived'])
+            assert X[col_name].dtype.name == col_dtype[col_name]
+        assert y.dtype.name == col_dtype["survived"]
 
+    @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157")
     def test_get_data_boolean_pandas(self):
         # test to check that we are converting properly True and False even
         # with some inconsistency when dumping the data on openml
         data, _, _, _ = self.jm1.get_data()
-        self.assertTrue(data['defects'].dtype.name == 'category')
-        self.assertTrue(set(data['defects'].cat.categories) == {True, False})
+        assert data["defects"].dtype.name == "category"
+        assert set(data["defects"].cat.categories) == {True, False}
 
         data, _, _, _ = self.pc4.get_data()
-        self.assertTrue(data['c'].dtype.name == 'category')
-        self.assertTrue(set(data['c'].cat.categories) == {True, False})
+        assert data["c"].dtype.name == "category"
+        assert set(data["c"].cat.categories) == {True, False}
 
-    def test_get_data_no_str_data_for_nparrays(self):
-        # check that an error is raised when the dataset contains string
-        err_msg = "PyOpenML cannot handle string when returning numpy arrays"
-        with pytest.raises(PyOpenMLError, match=err_msg):
-            self.titanic.get_data(dataset_format='array')
+    def _check_expected_type(self, dtype, is_cat, col):
+        if is_cat:
+            expected_type = "category"
+        elif not col.isna().any() and (col.astype("uint8") == col).all():
+            expected_type = "uint8"
+        else:
+            expected_type = "float64"
 
+        assert dtype.name == expected_type
+
+    @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157")
     def test_get_data_with_rowid(self):
         self.dataset.row_id_attribute = "condition"
         rval, _, categorical, _ = self.dataset.get_data(include_row_id=True)
-        self.assertIsInstance(rval, pd.DataFrame)
-        for (dtype, is_cat) in zip(rval.dtypes, categorical):
-            expected_type = 'category' if is_cat else 'float64'
-            self.assertEqual(dtype.name, expected_type)
-        self.assertEqual(rval.shape, (898, 39))
-        self.assertEqual(len(categorical), 39)
+        assert isinstance(rval, pd.DataFrame)
+        for dtype, is_cat, col in zip(rval.dtypes, categorical, rval):
+            self._check_expected_type(dtype, is_cat, rval[col])
+        assert rval.shape == (898, 39)
+        assert len(categorical) == 39
 
         rval, _, categorical, _ = self.dataset.get_data()
-        self.assertIsInstance(rval, pd.DataFrame)
-        for (dtype, is_cat) in zip(rval.dtypes, categorical):
-            expected_type = 'category' if is_cat else 'float64'
-            self.assertEqual(dtype.name, expected_type)
-        self.assertEqual(rval.shape, (898, 38))
-        self.assertEqual(len(categorical), 38)
-
-    def test_get_data_with_target_array(self):
-        X, y, _, attribute_names = self.dataset.get_data(dataset_format='array', target="class")
-        self.assertIsInstance(X, np.ndarray)
-        self.assertEqual(X.dtype, np.float32)
-        self.assertEqual(X.shape, (898, 38))
-        self.assertIn(y.dtype, [np.int32, np.int64])
-        self.assertEqual(y.shape, (898, ))
-        self.assertEqual(len(attribute_names), 38)
-        self.assertNotIn("class", attribute_names)
+        assert isinstance(rval, pd.DataFrame)
+        for dtype, is_cat, col in zip(rval.dtypes, categorical, rval):
+            self._check_expected_type(dtype, is_cat, rval[col])
+        assert rval.shape == (898, 38)
+        assert len(categorical) == 38
 
+    @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157")
     def test_get_data_with_target_pandas(self):
         X, y, categorical, attribute_names = self.dataset.get_data(target="class")
-        self.assertIsInstance(X, pd.DataFrame)
-        for (dtype, is_cat) in zip(X.dtypes, categorical):
-            expected_type = 'category' if is_cat else 'float64'
-            self.assertEqual(dtype.name, expected_type)
-        self.assertIsInstance(y, pd.Series)
-        self.assertEqual(y.dtype.name, 'category')
+        assert isinstance(X, pd.DataFrame)
+        for dtype, is_cat, col in zip(X.dtypes, categorical, X):
+            self._check_expected_type(dtype, is_cat, X[col])
+        assert isinstance(y, pd.Series)
+        assert y.dtype.name == "category"
 
-        self.assertEqual(X.shape, (898, 38))
-        self.assertEqual(len(attribute_names), 38)
-        self.assertEqual(y.shape, (898, ))
+        assert X.shape == (898, 38)
+        assert len(attribute_names) == 38
+        assert y.shape == (898,)
 
-        self.assertNotIn("class", attribute_names)
+        assert "class" not in attribute_names
 
     def test_get_data_rowid_and_ignore_and_target(self):
         self.dataset.ignore_attribute = ["condition"]
         self.dataset.row_id_attribute = ["hardness"]
         X, y, categorical, names = self.dataset.get_data(target="class")
-        self.assertEqual(X.shape, (898, 36))
-        self.assertEqual(len(categorical), 36)
+        assert X.shape == (898, 36)
+        assert len(categorical) == 36
         cats = [True] * 3 + [False, True, True, False] + [True] * 23 + [False] * 3 + [True] * 3
         self.assertListEqual(categorical, cats)
-        self.assertEqual(y.shape, (898, ))
+        assert y.shape == (898,)
 
+    @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157")
     def test_get_data_with_ignore_attributes(self):
         self.dataset.ignore_attribute = ["condition"]
         rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=True)
-        for (dtype, is_cat) in zip(rval.dtypes, categorical):
-            expected_type = 'category' if is_cat else 'float64'
-            self.assertEqual(dtype.name, expected_type)
-        self.assertEqual(rval.shape, (898, 39))
-        self.assertEqual(len(categorical), 39)
+        for dtype, is_cat, col in zip(rval.dtypes, categorical, rval):
+            self._check_expected_type(dtype, is_cat, rval[col])
+        assert rval.shape == (898, 39)
+        assert len(categorical) == 39
 
         rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=False)
-        for (dtype, is_cat) in zip(rval.dtypes, categorical):
-            expected_type = 'category' if is_cat else 'float64'
-            self.assertEqual(dtype.name, expected_type)
-        self.assertEqual(rval.shape, (898, 38))
-        self.assertEqual(len(categorical), 38)
-
-    def test_dataset_format_constructor(self):
-
-        with catch_warnings():
-            filterwarnings('error')
-            self.assertRaises(
-                DeprecationWarning,
-                openml.OpenMLDataset,
-                'Test',
-                'Test',
-                format='arff'
-            )
+        for dtype, is_cat, col in zip(rval.dtypes, categorical, rval):
+            self._check_expected_type(dtype, is_cat, rval[col])
+        assert rval.shape == (898, 38)
+        assert len(categorical) == 38
 
     def test_get_data_with_nonexisting_class(self):
         # This class is using the anneal dataset with labels [1, 2, 3, 4, 5, 'U']. However,
         # label 4 does not exist and we test that the features 5 and 'U' are correctly mapped to
         # indices 4 and 5, and that nothing is mapped to index 3.
-        _, y, _, _ = self.dataset.get_data('class', dataset_format='dataframe')
-        self.assertEqual(list(y.dtype.categories), ['1', '2', '3', '4', '5', 'U'])
-        _, y, _, _ = self.dataset.get_data('class', dataset_format='array')
-        self.assertEqual(np.min(y), 0)
-        self.assertEqual(np.max(y), 5)
-        # Check that no label is mapped to 3, since it is reserved for label '4'.
-        self.assertEqual(np.sum(y == 3), 0)
-
-
-class OpenMLDatasetTestOnTestServer(TestBase):
-    def setUp(self):
-        super(OpenMLDatasetTestOnTestServer, self).setUp()
-        # longley, really small dataset
-        self.dataset = openml.datasets.get_dataset(125, download_data=False)
-
-    def test_tagging(self):
-        tag = "testing_tag_{}_{}".format(self.id(), time())
-        ds_list = openml.datasets.list_datasets(tag=tag)
-        self.assertEqual(len(ds_list), 0)
-        self.dataset.push_tag(tag)
-        ds_list = openml.datasets.list_datasets(tag=tag)
-        self.assertEqual(len(ds_list), 1)
-        self.assertIn(125, ds_list)
-        self.dataset.remove_tag(tag)
-        ds_list = openml.datasets.list_datasets(tag=tag)
-        self.assertEqual(len(ds_list), 0)
-
+        _, y, _, _ = self.dataset.get_data("class")
+        assert list(y.dtype.categories) == ["1", "2", "3", "4", "5", "U"]
+
+    def test_get_data_corrupt_pickle(self):
+        # Lazy loaded dataset, populate cache.
+        self.iris.get_data()
+        # Corrupt pickle file, overwrite as empty.
+        with open(self.iris.data_pickle_file, "w") as fh:
+            fh.write("")
+        # Despite the corrupt file, the data should be loaded from the ARFF file.
+        # A warning message is written to the python logger.
+        xy, _, _, _ = self.iris.get_data()
+        assert isinstance(xy, pd.DataFrame)
+        assert xy.shape == (150, 5)
+
+    def test_lazy_loading_metadata(self):
+        # Initial Setup
+        did_cache_dir = openml.utils._create_cache_directory_for_id(
+            openml.datasets.functions.DATASETS_CACHE_DIR_NAME,
+            2,
+        )
+        _compare_dataset = openml.datasets.get_dataset(
+            2,
+            download_data=False,
+            download_features_meta_data=True,
+            download_qualities=True,
+        )
+        change_time = os.stat(did_cache_dir).st_mtime
+
+        # Test with cache
+        _dataset = openml.datasets.get_dataset(
+            2,
+            download_data=False,
+            download_features_meta_data=False,
+            download_qualities=False,
+        )
+        assert change_time == os.stat(did_cache_dir).st_mtime
+        assert _dataset.features == _compare_dataset.features
+        assert _dataset.qualities == _compare_dataset.qualities
+
+        # -- Test without cache
+        openml.utils._remove_cache_dir_for_id(
+            openml.datasets.functions.DATASETS_CACHE_DIR_NAME,
+            did_cache_dir,
+        )
 
+        _dataset = openml.datasets.get_dataset(
+            2,
+            download_data=False,
+            download_features_meta_data=False,
+            download_qualities=False,
+        )
+        assert ["description.xml"] == os.listdir(did_cache_dir)
+        assert change_time != os.stat(did_cache_dir).st_mtime
+        assert _dataset.features == _compare_dataset.features
+        assert _dataset.qualities == _compare_dataset.qualities
+
+    def test_equality_comparison(self):
+        self.assertEqual(self.iris, self.iris)
+        self.assertNotEqual(self.iris, self.titanic)
+        self.assertNotEqual(self.titanic, "Wrong_object")
+
+
+@pytest.mark.test_server()
+def test_tagging():
+    dataset = openml.datasets.get_dataset(125, download_data=False)
+
+    # tags can be at most 64 alphanumeric (+ underscore) chars
+    unique_indicator = str(time()).replace(".", "")
+    tag = f"test_tag_OpenMLDatasetTestOnTestServer_{unique_indicator}"
+    datasets = openml.datasets.list_datasets(tag=tag)
+    assert datasets.empty
+    dataset.push_tag(tag)
+    datasets = openml.datasets.list_datasets(tag=tag)
+    assert len(datasets) == 1
+    assert 125 in datasets["did"].tolist()  # `in` on a Series tests index labels, not values
+    dataset.remove_tag(tag)
+    datasets = openml.datasets.list_datasets(tag=tag)
+    assert datasets.empty
+
+@pytest.mark.test_server()
+def test_get_feature_with_ontology_data_id_11():
+    # test on car dataset, which has built-in ontology references
+    dataset = openml.datasets.get_dataset(11)
+    assert len(dataset.features) == 7
+    assert len(dataset.features[1].ontologies) >= 2
+    assert len(dataset.features[2].ontologies) >= 1
+    assert len(dataset.features[3].ontologies) >= 1   
+
+@pytest.mark.test_server()
+def test_add_remove_ontology_to_dataset():
+    did = 1
+    feature_index = 1
+    ontology = "https://www.openml.org/unittest/" + str(time())
+    openml.datasets.functions.data_feature_add_ontology(did, feature_index, ontology)
+    openml.datasets.functions.data_feature_remove_ontology(did, feature_index, ontology)    
+
+@pytest.mark.test_server()
+def test_add_same_ontology_multiple_features():
+    did = 1
+    ontology = "https://www.openml.org/unittest/" + str(time())
+
+    for i in range(3):
+        openml.datasets.functions.data_feature_add_ontology(did, i, ontology)    
+
+
+@pytest.mark.test_server()
+def test_add_illegal_long_ontology():
+    did = 1
+    ontology = "http://www.google.com/" + ("a" * 257)
+    # Registering the over-long ontology URL must be rejected by the server.
+    with pytest.raises(openml.exceptions.OpenMLServerException) as exc_info:
+        openml.datasets.functions.data_feature_add_ontology(did, 1, ontology)
+    # 1105 is the server error code this test expects for the over-long URL.
+    assert exc_info.value.code == 1105
+
+
+
+@pytest.mark.test_server()
+def test_add_illegal_url_ontology():
+    did = 1
+    ontology = "not_a_url" + str(time())
+    # A string that is not a URL must be rejected by the server.
+    with pytest.raises(openml.exceptions.OpenMLServerException) as exc_info:
+        openml.datasets.functions.data_feature_add_ontology(did, 1, ontology)
+    # 1106 is the server error code this test expects for the malformed URL.
+    assert exc_info.value.code == 1106
+
+
+@pytest.mark.production_server()
 class OpenMLDatasetTestSparse(TestBase):
     _multiprocess_can_split_ = True
 
     def setUp(self):
-        super(OpenMLDatasetTestSparse, self).setUp()
-        openml.config.server = self.production_server
+        super().setUp()
+        self.use_production_server()
 
         self.sparse_dataset = openml.datasets.get_dataset(4136, download_data=False)
 
-    def test_get_sparse_dataset_with_target(self):
-        X, y, _, attribute_names = self.sparse_dataset.get_data(
-            dataset_format='array', target="class"
-        )
-
-        self.assertTrue(sparse.issparse(X))
-        self.assertEqual(X.dtype, np.float32)
-        self.assertEqual(X.shape, (600, 20000))
-
-        self.assertIsInstance(y, np.ndarray)
-        self.assertIn(y.dtype, [np.int32, np.int64])
-        self.assertEqual(y.shape, (600, ))
+    def test_get_sparse_dataset_dataframe_with_target(self):
+        X, y, _, attribute_names = self.sparse_dataset.get_data(target="class")
+        assert isinstance(X, pd.DataFrame)
+        assert isinstance(X.dtypes.iloc[0], pd.SparseDtype)
+        assert X.shape == (600, 20000)
 
-        self.assertEqual(len(attribute_names), 20000)
-        self.assertNotIn("class", attribute_names)
+        assert isinstance(y, pd.Series)
+        assert isinstance(y.dtypes, pd.SparseDtype)
+        assert y.shape == (600,)
 
-    def test_get_sparse_dataset(self):
-        rval, _, categorical, attribute_names = self.sparse_dataset.get_data(dataset_format='array')
-        self.assertTrue(sparse.issparse(rval))
-        self.assertEqual(rval.dtype, np.float32)
-        self.assertEqual((600, 20001), rval.shape)
+        assert len(attribute_names) == 20000
+        assert "class" not in attribute_names
 
-        self.assertEqual(len(categorical), 20001)
-        self.assertTrue(all([isinstance(cat, bool) for cat in categorical]))
-
-        self.assertEqual(len(attribute_names), 20001)
-        self.assertTrue(all([isinstance(att, str) for att in attribute_names]))
-
-    def test_get_sparse_dataframe(self):
+    def test_get_sparse_dataset_dataframe(self):
         rval, *_ = self.sparse_dataset.get_data()
-        self.assertTrue(isinstance(rval, pd.SparseDataFrame))
-        self.assertEqual((600, 20001), rval.shape)
-
-    def test_get_sparse_dataset_with_rowid(self):
-        self.sparse_dataset.row_id_attribute = ["V256"]
-        rval, _, categorical, _ = self.sparse_dataset.get_data(
-            dataset_format='array', include_row_id=True
-        )
-        self.assertTrue(sparse.issparse(rval))
-        self.assertEqual(rval.dtype, np.float32)
-        self.assertEqual(rval.shape, (600, 20001))
-        self.assertEqual(len(categorical), 20001)
-
-        rval, _, categorical, _ = self.sparse_dataset.get_data(
-            dataset_format='array', include_row_id=False
+        assert isinstance(rval, pd.DataFrame)
+        np.testing.assert_array_equal(
+            [pd.SparseDtype(np.float32, fill_value=0.0)] * len(rval.dtypes),
+            rval.dtypes,
         )
-        self.assertTrue(sparse.issparse(rval))
-        self.assertEqual(rval.dtype, np.float32)
-        self.assertEqual(rval.shape, (600, 20000))
-        self.assertEqual(len(categorical), 20000)
-
-    def test_get_sparse_dataset_with_ignore_attributes(self):
-        self.sparse_dataset.ignore_attribute = ["V256"]
-        rval, _, categorical, _ = self.sparse_dataset.get_data(
-            dataset_format='array', include_ignore_attribute=True
-        )
-        self.assertTrue(sparse.issparse(rval))
-        self.assertEqual(rval.dtype, np.float32)
-        self.assertEqual(rval.shape, (600, 20001))
-
-        self.assertEqual(len(categorical), 20001)
-        rval, _, categorical, _ = self.sparse_dataset.get_data(
-            dataset_format='array', include_ignore_attribute=False
-        )
-        self.assertTrue(sparse.issparse(rval))
-        self.assertEqual(rval.dtype, np.float32)
-        self.assertEqual(rval.shape, (600, 20000))
-        self.assertEqual(len(categorical), 20000)
+        assert rval.shape == (600, 20001)
 
     def test_get_sparse_dataset_rowid_and_ignore_and_target(self):
         # TODO: re-add row_id and ignore attributes
         self.sparse_dataset.ignore_attribute = ["V256"]
         self.sparse_dataset.row_id_attribute = ["V512"]
         X, y, categorical, _ = self.sparse_dataset.get_data(
-            dataset_format='array',
             target="class",
             include_row_id=False,
             include_ignore_attribute=False,
         )
-        self.assertTrue(sparse.issparse(X))
-        self.assertEqual(X.dtype, np.float32)
-        self.assertIn(y.dtype, [np.int32, np.int64])
-        self.assertEqual(X.shape, (600, 19998))
+        assert all(dtype == pd.SparseDtype(np.float32, fill_value=0.0) for dtype in X.dtypes)
+        # array format returned dense, but now we only return sparse and let the user handle it.
+        assert isinstance(y.dtypes, pd.SparseDtype)
+        assert X.shape == (600, 19998)
 
-        self.assertEqual(len(categorical), 19998)
+        assert len(categorical) == 19998
         self.assertListEqual(categorical, [False] * 19998)
-        self.assertEqual(y.shape, (600, ))
-
-
-class OpenMLDatasetQualityTest(TestBase):
-    def test__check_qualities(self):
-        qualities = [{'oml:name': 'a', 'oml:value': '0.5'}]
-        qualities = openml.datasets.dataset._check_qualities(qualities)
-        self.assertEqual(qualities['a'], 0.5)
-
-        qualities = [{'oml:name': 'a', 'oml:value': 'null'}]
-        qualities = openml.datasets.dataset._check_qualities(qualities)
-        self.assertNotEqual(qualities['a'], qualities['a'])
-
-        qualities = [{'oml:name': 'a', 'oml:value': None}]
-        qualities = openml.datasets.dataset._check_qualities(qualities)
-        self.assertNotEqual(qualities['a'], qualities['a'])
+        assert y.shape == (600,)
+
+    def test_get_sparse_categorical_data_id_395(self):
+        dataset = openml.datasets.get_dataset(395, download_data=True)
+        feature = dataset.features[3758]
+        assert isinstance(dataset, OpenMLDataset)
+        assert isinstance(feature, OpenMLDataFeature)
+        assert dataset.name == "re1.wc"
+        assert feature.name == "CLASS_LABEL"
+        assert feature.data_type == "nominal"
+        assert len(feature.nominal_values) == 25
+
+
+@pytest.mark.test_server()
+def test__read_features(mocker, workdir, static_cache_dir):
+    """Test we read the features from the xml if no cache pickle is available.
+    This test also does some simple checks to verify that the features are read correctly
+    """
+    filename_mock = mocker.patch("openml.datasets.dataset._get_features_pickle_file")
+    pickle_mock = mocker.patch("openml.datasets.dataset.pickle")
+
+    filename_mock.return_value = os.path.join(workdir, "features.xml.pkl")
+    pickle_mock.load.side_effect = FileNotFoundError
+
+    features = openml.datasets.dataset._read_features(
+        os.path.join(
+            static_cache_dir,
+            "org",
+            "openml",
+            "test",
+            "datasets",
+            "2",
+            "features.xml",
+        ),
+    )
+    assert isinstance(features, dict)
+    assert len(features) == 39
+    assert isinstance(features[0], OpenMLDataFeature)
+    assert features[0].name == "family"
+    assert len(features[0].nominal_values) == 9
+    # pickle.load is never called because the features pickle file didn't exist
+    assert pickle_mock.load.call_count == 0
+    assert pickle_mock.dump.call_count == 1
+
+
+@pytest.mark.test_server()
+def test__read_qualities(static_cache_dir, workdir, mocker):
+    """Test we read the qualities from the xml if no cache pickle is available.
+    This test also does some minor checks to ensure that the qualities are read correctly.
+    """
+    filename_mock = mocker.patch("openml.datasets.dataset._get_qualities_pickle_file")
+    pickle_mock = mocker.patch("openml.datasets.dataset.pickle")
+
+    filename_mock.return_value = os.path.join(workdir, "qualities.xml.pkl")
+    pickle_mock.load.side_effect = FileNotFoundError
+
+    qualities = openml.datasets.dataset._read_qualities(
+        os.path.join(
+            static_cache_dir,
+            "org",
+            "openml",
+            "test",
+            "datasets",
+            "2",
+            "qualities.xml",
+        ),
+    )
+    assert isinstance(qualities, dict)
+    assert len(qualities) == 106
+    # Expected: no cache pickle is loaded, and the parsed qualities are dumped exactly once.
+    assert pickle_mock.load.call_count == 0
+    assert pickle_mock.dump.call_count == 1
+
+
+
+def test__check_qualities():
+    qualities = [{"oml:name": "a", "oml:value": "0.5"}]
+    qualities = openml.datasets.dataset._check_qualities(qualities)
+    assert qualities["a"] == 0.5
+
+    qualities = [{"oml:name": "a", "oml:value": "null"}]
+    qualities = openml.datasets.dataset._check_qualities(qualities)
+    assert np.isnan(qualities["a"])  # "null" is expected to be parsed to NaN
+
+    qualities = [{"oml:name": "a", "oml:value": None}]
+    qualities = openml.datasets.dataset._check_qualities(qualities)
+    assert np.isnan(qualities["a"])  # a missing value is expected to be parsed to NaN
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
index 80d7333a0..974fb36ef 100644
--- a/tests/test_datasets/test_dataset_functions.py
+++ b/tests/test_datasets/test_dataset_functions.py
@@ -1,56 +1,76 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
+import itertools
 import os
 import random
+import shutil
+import time
+import uuid
 from itertools import product
+from pathlib import Path
+from typing import Iterable
 from unittest import mock
 
 import arff
-
-import pytest
 import numpy as np
 import pandas as pd
+import pytest
+import requests
+import requests_mock
 import scipy.sparse
 from oslo_concurrency import lockutils
 
 import openml
 from openml import OpenMLDataset
-from openml.exceptions import OpenMLCacheException, OpenMLHashException, \
-    OpenMLPrivateDatasetError
-from openml.testing import TestBase
-from openml.utils import _tag_entity, _create_cache_directory_for_id
-from openml.datasets.functions import (create_dataset,
-                                       attributes_arff_from_df,
-                                       _get_cached_dataset,
-                                       _get_cached_dataset_features,
-                                       _get_cached_dataset_qualities,
-                                       _get_cached_datasets,
-                                       _get_dataset_arff,
-                                       _get_dataset_description,
-                                       _get_dataset_features,
-                                       _get_dataset_qualities,
-                                       _get_online_dataset_arff,
-                                       _get_online_dataset_format,
-                                       DATASETS_CACHE_DIR_NAME)
+from openml._api_calls import _download_minio_file
+from openml.datasets import edit_dataset, fork_dataset
+from openml.datasets.functions import (
+    DATASETS_CACHE_DIR_NAME,
+    _get_dataset_arff,
+    _get_dataset_description,
+    _get_dataset_features_file,
+    _get_dataset_parquet,
+    _get_dataset_qualities_file,
+    _get_online_dataset_arff,
+    _get_online_dataset_format,
+    _topic_add_dataset,
+    _topic_delete_dataset,
+    attributes_arff_from_df,
+    create_dataset,
+)
+from openml.exceptions import (
+    OpenMLHashException,
+    OpenMLNotAuthorizedError,
+    OpenMLPrivateDatasetError,
+    OpenMLServerException,
+    OpenMLServerNoResult,
+)
+from openml.tasks import TaskType, create_task
+from openml.testing import TestBase, create_request_response
+from openml.utils import _create_cache_directory_for_id, _tag_entity
 
 
 class TestOpenMLDataset(TestBase):
     _multiprocess_can_split_ = True
 
-    def setUp(self):
-        super(TestOpenMLDataset, self).setUp()
-
     def tearDown(self):
         self._remove_pickle_files()
-        super(TestOpenMLDataset, self).tearDown()
+        super().tearDown()
 
     def _remove_pickle_files(self):
-        self.lock_path = os.path.join(openml.config.get_cache_directory(), 'locks')
-        for did in ['-1', '2']:
+        self.lock_path = os.path.join(openml.config.get_cache_directory(), "locks")
+        for did in ["-1", "2"]:
             with lockutils.external_lock(
-                    name='datasets.functions.get_dataset:%s' % did,
-                    lock_path=self.lock_path,
+                name=f"datasets.functions.get_dataset:{did}",
+                lock_path=self.lock_path,
             ):
-                pickle_path = os.path.join(openml.config.get_cache_directory(), 'datasets',
-                                           did, 'dataset.pkl.py3')
+                pickle_path = os.path.join(
+                    openml.config.get_cache_directory(),
+                    "datasets",
+                    did,
+                    "dataset.pkl.py3",
+                )
                 try:
                     os.remove(pickle_path)
                 except (OSError, FileNotFoundError):
@@ -58,416 +78,461 @@ def _remove_pickle_files(self):
                     pass
 
     def _get_empty_param_for_dataset(self):
-
         return {
-            'name': None,
-            'description': None,
-            'creator': None,
-            'contributor': None,
-            'collection_date': None,
-            'language': None,
-            'licence': None,
-            'default_target_attribute': None,
-            'row_id_attribute': None,
-            'ignore_attribute': None,
-            'citation': None,
-            'attributes': None,
-            'data': None
+            "name": None,
+            "description": None,
+            "creator": None,
+            "contributor": None,
+            "collection_date": None,
+            "language": None,
+            "licence": None,
+            "default_target_attribute": None,
+            "row_id_attribute": None,
+            "ignore_attribute": None,
+            "citation": None,
+            "attributes": None,
+            "data": None,
         }
 
-    def test__list_cached_datasets(self):
-        openml.config.cache_directory = self.static_cache_dir
-        cached_datasets = openml.datasets.functions._list_cached_datasets()
-        self.assertIsInstance(cached_datasets, list)
-        self.assertEqual(len(cached_datasets), 2)
-        self.assertIsInstance(cached_datasets[0], int)
-
-    @mock.patch('openml.datasets.functions._list_cached_datasets')
-    def test__get_cached_datasets(self, _list_cached_datasets_mock):
-        openml.config.cache_directory = self.static_cache_dir
-        _list_cached_datasets_mock.return_value = [-1, 2]
-        datasets = _get_cached_datasets()
-        self.assertIsInstance(datasets, dict)
-        self.assertEqual(len(datasets), 2)
-        self.assertIsInstance(list(datasets.values())[0], OpenMLDataset)
-
-    def test__get_cached_dataset(self, ):
-        openml.config.cache_directory = self.static_cache_dir
-        dataset = _get_cached_dataset(2)
-        features = _get_cached_dataset_features(2)
-        qualities = _get_cached_dataset_qualities(2)
-        self.assertIsInstance(dataset, OpenMLDataset)
-        self.assertTrue(len(dataset.features) > 0)
-        self.assertTrue(len(dataset.features) == len(features['oml:feature']))
-        self.assertTrue(len(dataset.qualities) == len(qualities))
-
-    def test_get_cached_dataset_description(self):
-        openml.config.cache_directory = self.static_cache_dir
-        description = openml.datasets.functions._get_cached_dataset_description(2)
-        self.assertIsInstance(description, dict)
-
-    def test_get_cached_dataset_description_not_cached(self):
-        openml.config.cache_directory = self.static_cache_dir
-        self.assertRaisesRegex(OpenMLCacheException,
-                               "Dataset description for dataset id 3 not cached",
-                               openml.datasets.functions._get_cached_dataset_description,
-                               dataset_id=3)
-
-    def test_get_cached_dataset_arff(self):
-        openml.config.cache_directory = self.static_cache_dir
-        description = openml.datasets.functions._get_cached_dataset_arff(dataset_id=2)
-        self.assertIsInstance(description, str)
-
-    def test_get_cached_dataset_arff_not_cached(self):
-        openml.config.cache_directory = self.static_cache_dir
-        self.assertRaisesRegex(OpenMLCacheException,
-                               "ARFF file for dataset id 3 not cached",
-                               openml.datasets.functions._get_cached_dataset_arff,
-                               dataset_id=3)
-
     def _check_dataset(self, dataset):
-        self.assertEqual(type(dataset), dict)
-        self.assertGreaterEqual(len(dataset), 2)
-        self.assertIn('did', dataset)
-        self.assertIsInstance(dataset['did'], int)
-        self.assertIn('status', dataset)
-        self.assertIsInstance(dataset['status'], str)
-        self.assertIn(dataset['status'], ['in_preparation', 'active', 'deactivated'])
+        assert type(dataset) == dict
+        assert len(dataset) >= 2
+        assert "did" in dataset
+        assert isinstance(dataset["did"], int)
+        assert "status" in dataset
+        assert isinstance(dataset["status"], str)
+        assert dataset["status"] in ["in_preparation", "active", "deactivated"]
 
     def _check_datasets(self, datasets):
         for did in datasets:
             self._check_dataset(datasets[did])
 
+    @pytest.mark.test_server()
     def test_tag_untag_dataset(self):
-        tag = 'test_tag_%d' % random.randint(1, 1000000)
-        all_tags = _tag_entity('data', 1, tag)
-        self.assertTrue(tag in all_tags)
-        all_tags = _tag_entity('data', 1, tag, untag=True)
-        self.assertTrue(tag not in all_tags)
-
-    def test_list_datasets(self):
-        # We can only perform a smoke test here because we test on dynamic
-        # data from the internet...
+        tag = "test_tag_%d" % random.randint(1, 1000000)
+        all_tags = _tag_entity("data", 1, tag)
+        assert tag in all_tags
+        all_tags = _tag_entity("data", 1, tag, untag=True)
+        assert tag not in all_tags
+
+    @pytest.mark.test_server()
+    def test_list_datasets_length(self):
         datasets = openml.datasets.list_datasets()
-        # 1087 as the number of datasets on openml.org
-        self.assertGreaterEqual(len(datasets), 100)
-        self._check_datasets(datasets)
-
-    def test_list_datasets_output_format(self):
-        datasets = openml.datasets.list_datasets(output_format='dataframe')
-        self.assertIsInstance(datasets, pd.DataFrame)
-        self.assertGreaterEqual(len(datasets), 100)
-
-    def test_list_datasets_by_tag(self):
-        datasets = openml.datasets.list_datasets(tag='study_14')
-        self.assertGreaterEqual(len(datasets), 100)
-        self._check_datasets(datasets)
-
-    def test_list_datasets_by_size(self):
-        datasets = openml.datasets.list_datasets(size=10050)
-        self.assertGreaterEqual(len(datasets), 120)
-        self._check_datasets(datasets)
-
-    def test_list_datasets_by_number_instances(self):
-        datasets = openml.datasets.list_datasets(number_instances="5..100")
-        self.assertGreaterEqual(len(datasets), 4)
-        self._check_datasets(datasets)
-
-    def test_list_datasets_by_number_features(self):
-        datasets = openml.datasets.list_datasets(number_features="50..100")
-        self.assertGreaterEqual(len(datasets), 8)
-        self._check_datasets(datasets)
-
-    def test_list_datasets_by_number_classes(self):
-        datasets = openml.datasets.list_datasets(number_classes="5")
-        self.assertGreaterEqual(len(datasets), 3)
-        self._check_datasets(datasets)
-
-    def test_list_datasets_by_number_missing_values(self):
-        datasets = openml.datasets.list_datasets(number_missing_values="5..100")
-        self.assertGreaterEqual(len(datasets), 5)
-        self._check_datasets(datasets)
-
-    def test_list_datasets_combined_filters(self):
-        datasets = openml.datasets.list_datasets(tag='study_14',
-                                                 number_instances="100..1000",
-                                                 number_missing_values="800..1000")
-        self.assertGreaterEqual(len(datasets), 1)
-        self._check_datasets(datasets)
+        assert len(datasets) >= 100
 
+    @pytest.mark.test_server()
     def test_list_datasets_paginate(self):
         size = 10
         max = 100
         for i in range(0, max, size):
             datasets = openml.datasets.list_datasets(offset=i, size=size)
-            self.assertEqual(size, len(datasets))
-            self._check_datasets(datasets)
+            assert len(datasets) == size
+            assert len(datasets.columns) >= 2
+            assert "did" in datasets.columns
+            assert datasets["did"].dtype == int
+            assert "status" in datasets.columns
+            assert datasets["status"].dtype == pd.CategoricalDtype(
+                categories=["in_preparation", "active", "deactivated"],
+            )
 
+    @pytest.mark.test_server()
     def test_list_datasets_empty(self):
-        datasets = openml.datasets.list_datasets(tag='NoOneWouldUseThisTagAnyway')
-        if len(datasets) > 0:
-            raise ValueError('UnitTest Outdated, tag was already used (please remove)')
-
-        self.assertIsInstance(datasets, dict)
+        datasets = openml.datasets.list_datasets(tag="NoOneWouldUseThisTagAnyway")
+        assert datasets.empty
 
+    @pytest.mark.production_server()
     def test_check_datasets_active(self):
         # Have to test on live because there is no deactivated dataset on the test server.
-        openml.config.server = self.production_server
-        active = openml.datasets.check_datasets_active([2, 17])
-        self.assertTrue(active[2])
-        self.assertFalse(active[17])
+        self.use_production_server()
+        active = openml.datasets.check_datasets_active(
+            [2, 17, 79],
+            raise_error_if_not_exist=False,
+        )
+        assert active[2]
+        assert not active[17]
+        assert active.get(79) is None
         self.assertRaisesRegex(
             ValueError,
-            'Could not find dataset 79 in OpenML dataset list.',
+            r"Could not find dataset\(s\) 79 in OpenML dataset list.",
             openml.datasets.check_datasets_active,
             [79],
         )
         openml.config.server = self.test_server
 
-    def _datasets_retrieved_successfully(self, dids, metadata_only=True):
-        """ Checks that all files for the given dids have been downloaded.
-
-        This includes:
-            - description
-            - qualities
-            - features
-            - absence of data arff if metadata_only, else it must be present too.
-        """
-        for did in dids:
-            self.assertTrue(os.path.exists(os.path.join(
-                openml.config.get_cache_directory(), "datasets", str(did), "description.xml")))
-            self.assertTrue(os.path.exists(os.path.join(
-                openml.config.get_cache_directory(), "datasets", str(did), "qualities.xml")))
-            self.assertTrue(os.path.exists(os.path.join(
-                openml.config.get_cache_directory(), "datasets", str(did), "features.xml")))
-
-            data_assert = self.assertFalse if metadata_only else self.assertTrue
-            data_assert(os.path.exists(os.path.join(
-                openml.config.get_cache_directory(), "datasets", str(did), "dataset.arff")))
-
+    @pytest.mark.test_server()
+    def test_illegal_character_tag(self):
+        """Pushing a tag with an illegal character is rejected with server error 477."""
+        dataset = openml.datasets.get_dataset(1)
+        tag = "illegal_tag&"
+        # `pytest.raises` fails the test by itself if the server accepts the tag.
+        with pytest.raises(openml.exceptions.OpenMLServerException) as exc_info:
+            dataset.push_tag(tag)
+        assert exc_info.value.code == 477
+
+    @pytest.mark.test_server()
+    def test_illegal_length_tag(self):
+        """Pushing a 65-character tag is rejected with server error 477."""
+        dataset = openml.datasets.get_dataset(1)
+        tag = "a" * 65
+        # `pytest.raises` fails the test by itself if the server accepts the tag.
+        with pytest.raises(openml.exceptions.OpenMLServerException) as exc_info:
+            dataset.push_tag(tag)
+        assert exc_info.value.code == 477
+
+    @pytest.mark.production_server()
     def test__name_to_id_with_deactivated(self):
-        """ Check that an activated dataset is returned if an earlier deactivated one exists. """
-        openml.config.server = self.production_server
+        """Check that an activated dataset is returned if an earlier deactivated one exists."""
+        self.use_production_server()
         # /d/1 was deactivated
-        self.assertEqual(openml.datasets.functions._name_to_id('anneal'), 2)
+        assert openml.datasets.functions._name_to_id("anneal") == 2
         openml.config.server = self.test_server
 
+    @pytest.mark.production_server()
     def test__name_to_id_with_multiple_active(self):
-        """ With multiple active datasets, retrieve the least recent active. """
-        openml.config.server = self.production_server
-        self.assertEqual(openml.datasets.functions._name_to_id('iris'), 61)
+        """With multiple active datasets, retrieve the least recent active."""
+        self.use_production_server()
+        assert openml.datasets.functions._name_to_id("iris") == 61
 
+    @pytest.mark.production_server()
     def test__name_to_id_with_version(self):
-        """ With multiple active datasets, retrieve the least recent active. """
-        openml.config.server = self.production_server
-        self.assertEqual(openml.datasets.functions._name_to_id('iris', version=3), 969)
+        """With a version given, retrieve the dataset with that name and version."""
+        self.use_production_server()
+        assert openml.datasets.functions._name_to_id("iris", version=3) == 969
 
+    @pytest.mark.production_server()
     def test__name_to_id_with_multiple_active_error(self):
-        """ With multiple active datasets, retrieve the least recent active. """
+        """With multiple active datasets, raise a ValueError if `error_if_multiple` is set."""
+        self.use_production_server()
         self.assertRaisesRegex(
             ValueError,
-            "Multiple active datasets exist with name iris",
+            "Multiple active datasets exist with name 'iris'.",
             openml.datasets.functions._name_to_id,
-            dataset_name='iris',
-            error_if_multiple=True
+            dataset_name="iris",
+            error_if_multiple=True,
         )
 
+    @pytest.mark.test_server()
     def test__name_to_id_name_does_not_exist(self):
-        """ With multiple active datasets, retrieve the least recent active. """
+        """Raise a RuntimeError if no active dataset with the given name exists."""
         self.assertRaisesRegex(
             RuntimeError,
-            "No active datasets exist with name does_not_exist",
+            "No active datasets exist with name 'does_not_exist'.",
             openml.datasets.functions._name_to_id,
-            dataset_name='does_not_exist'
+            dataset_name="does_not_exist",
         )
 
+    @pytest.mark.test_server()
     def test__name_to_id_version_does_not_exist(self):
-        """ With multiple active datasets, retrieve the least recent active. """
+        """Raise a RuntimeError if no active dataset has the given name and version."""
         self.assertRaisesRegex(
             RuntimeError,
-            "No active datasets exist with name iris and version 100000",
+            "No active datasets exist with name 'iris' and version '100000'.",
             openml.datasets.functions._name_to_id,
-            dataset_name='iris',
-            version=100000
+            dataset_name="iris",
+            version=100000,
         )
 
+    @pytest.mark.test_server()
     def test_get_datasets_by_name(self):
         # did 1 and 2 on the test server:
-        dids = ['anneal', 'kr-vs-kp']
-        datasets = openml.datasets.get_datasets(dids, download_data=False)
-        self.assertEqual(len(datasets), 2)
-        self._datasets_retrieved_successfully([1, 2])
+        dids = ["anneal", "kr-vs-kp"]
+        datasets = openml.datasets.get_datasets(dids)
+        assert len(datasets) == 2
+        _assert_datasets_retrieved_successfully([1, 2])
 
+    @pytest.mark.test_server()
     def test_get_datasets_by_mixed(self):
         # did 1 and 2 on the test server:
-        dids = ['anneal', 2]
-        datasets = openml.datasets.get_datasets(dids, download_data=False)
-        self.assertEqual(len(datasets), 2)
-        self._datasets_retrieved_successfully([1, 2])
+        dids = ["anneal", 2]
+        datasets = openml.datasets.get_datasets(dids)
+        assert len(datasets) == 2
+        _assert_datasets_retrieved_successfully([1, 2])
 
+    @pytest.mark.test_server()
     def test_get_datasets(self):
         dids = [1, 2]
         datasets = openml.datasets.get_datasets(dids)
-        self.assertEqual(len(datasets), 2)
-        self._datasets_retrieved_successfully([1, 2], metadata_only=False)
-
-    def test_get_datasets_lazy(self):
-        dids = [1, 2]
-        datasets = openml.datasets.get_datasets(dids, download_data=False)
-        self.assertEqual(len(datasets), 2)
-        self._datasets_retrieved_successfully([1, 2], metadata_only=True)
-
-        datasets[0].get_data()
-        datasets[1].get_data()
-        self._datasets_retrieved_successfully([1, 2], metadata_only=False)
+        assert len(datasets) == 2
+        _assert_datasets_retrieved_successfully([1, 2])
 
+    @pytest.mark.test_server()
     def test_get_dataset_by_name(self):
-        dataset = openml.datasets.get_dataset('anneal')
-        self.assertEqual(type(dataset), OpenMLDataset)
-        self.assertEqual(dataset.dataset_id, 1)
-        self._datasets_retrieved_successfully([1], metadata_only=False)
-
-        self.assertGreater(len(dataset.features), 1)
-        self.assertGreater(len(dataset.qualities), 4)
-
-        # Issue324 Properly handle private datasets when trying to access them
-        openml.config.server = self.production_server
-        self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45)
-
-    def test_get_dataset(self):
-        # This is the only non-lazy load to ensure default behaviour works.
+        dataset = openml.datasets.get_dataset("anneal")
+        assert type(dataset) == OpenMLDataset
+        assert dataset.dataset_id == 1
+        _assert_datasets_retrieved_successfully([1])
+
+        assert len(dataset.features) > 1
+        assert len(dataset.qualities) > 4
+
+    @pytest.mark.skip("Feature is experimental, can not test against stable server.")
+    def test_get_dataset_download_all_files(self):
+        # openml.datasets.get_dataset(id, download_all_files=True)
+        # check for expected files
+        # checking that no additional files are downloaded if
+        # the default (false) is used, seems covered by
+        # test_get_dataset_lazy
+        raise NotImplementedError
+
+    @pytest.mark.test_server()
+    def test_get_dataset_uint8_dtype(self):
         dataset = openml.datasets.get_dataset(1)
-        self.assertEqual(type(dataset), OpenMLDataset)
-        self.assertEqual(dataset.name, 'anneal')
-        self._datasets_retrieved_successfully([1], metadata_only=False)
-
-        self.assertGreater(len(dataset.features), 1)
-        self.assertGreater(len(dataset.qualities), 4)
+        assert type(dataset) == OpenMLDataset
+        assert dataset.name == "anneal"
+        df, _, _, _ = dataset.get_data()
+        assert df["carbon"].dtype == "uint8"
 
+    @pytest.mark.production_server()
+    def test_get_dataset_cannot_access_private_data(self):
         # Issue324 Properly handle private datasets when trying to access them
-        openml.config.server = self.production_server
+        self.use_production_server()
         self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45)
 
-    def test_get_dataset_lazy(self):
-        dataset = openml.datasets.get_dataset(1, download_data=False)
-        self.assertEqual(type(dataset), OpenMLDataset)
-        self.assertEqual(dataset.name, 'anneal')
-        self._datasets_retrieved_successfully([1], metadata_only=True)
-
-        self.assertGreater(len(dataset.features), 1)
-        self.assertGreater(len(dataset.qualities), 4)
-
-        dataset.get_data()
-        self._datasets_retrieved_successfully([1], metadata_only=False)
-
-        # Issue324 Properly handle private datasets when trying to access them
-        openml.config.server = self.production_server
-        self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45, False)
+    @pytest.mark.skip("Need to find dataset name of private dataset")
+    def test_dataset_by_name_cannot_access_private_data(self):
+        self.use_production_server()
+        self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE")
 
+    @pytest.mark.test_server()
     def test_get_dataset_lazy_all_functions(self):
-        """ Test that all expected functionality is available without downloading the dataset. """
-        dataset = openml.datasets.get_dataset(1, download_data=False)
+        """Test that all expected functionality is available without downloading the dataset."""
+        dataset = openml.datasets.get_dataset(1)
-        # We only tests functions as general integrity is tested by test_get_dataset_lazy
+        # We only test functions as general integrity is tested by test_get_dataset_lazy
 
         def ensure_absence_of_real_data():
-            self.assertFalse(os.path.exists(os.path.join(
-                openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")))
+            assert not os.path.exists(
+                os.path.join(openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")
+            )
 
-        tag = 'test_lazy_tag_%d' % random.randint(1, 1000000)
+        tag = "test_lazy_tag_%d" % random.randint(1, 1000000)
         dataset.push_tag(tag)
         ensure_absence_of_real_data()
 
         dataset.remove_tag(tag)
         ensure_absence_of_real_data()
 
-        nominal_indices = dataset.get_features_by_type('nominal')
+        nominal_indices = dataset.get_features_by_type("nominal")
+        # fmt: off
         correct = [0, 1, 2, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                    20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 35, 36, 37, 38]
-        self.assertEqual(nominal_indices, correct)
+        # fmt: on
+        assert nominal_indices == correct
         ensure_absence_of_real_data()
 
         classes = dataset.retrieve_class_labels()
-        self.assertEqual(classes, ['1', '2', '3', '4', '5', 'U'])
+        assert classes == ["1", "2", "3", "4", "5", "U"]
         ensure_absence_of_real_data()
 
+    @pytest.mark.test_server()
     def test_get_dataset_sparse(self):
-        dataset = openml.datasets.get_dataset(102, download_data=False)
-        X, *_ = dataset.get_data(dataset_format='array')
-        self.assertIsInstance(X, scipy.sparse.csr_matrix)
+        dataset = openml.datasets.get_dataset(102)
+        X, *_ = dataset.get_data()
+        assert isinstance(X, pd.DataFrame)
+        assert all(isinstance(col, pd.SparseDtype) for col in X.dtypes)
 
+    @pytest.mark.test_server()
     def test_download_rowid(self):
         # Smoke test which checks that the dataset has the row-id set correctly
         did = 44
-        dataset = openml.datasets.get_dataset(did, download_data=False)
-        self.assertEqual(dataset.row_id_attribute, 'Counter')
+        dataset = openml.datasets.get_dataset(did)
+        assert dataset.row_id_attribute == "Counter"
 
+    @pytest.mark.test_server()
     def test__get_dataset_description(self):
         description = _get_dataset_description(self.workdir, 2)
-        self.assertIsInstance(description, dict)
-        description_xml_path = os.path.join(self.workdir,
-                                            'description.xml')
-        self.assertTrue(os.path.exists(description_xml_path))
+        assert isinstance(description, dict)
+        description_xml_path = os.path.join(self.workdir, "description.xml")
+        assert os.path.exists(description_xml_path)
 
+    @pytest.mark.test_server()
     def test__getarff_path_dataset_arff(self):
-        openml.config.cache_directory = self.static_cache_dir
-        description = openml.datasets.functions._get_cached_dataset_description(2)
+        openml.config.set_root_cache_directory(self.static_cache_dir)
+        description = _get_dataset_description(self.workdir, 2)
         arff_path = _get_dataset_arff(description, cache_directory=self.workdir)
-        self.assertIsInstance(arff_path, str)
-        self.assertTrue(os.path.exists(arff_path))
+        assert isinstance(arff_path, Path)
+        assert arff_path.exists()
+
+    def test__download_minio_file_object_does_not_exist(self):
+        self.assertRaisesRegex(
+            FileNotFoundError,
+            r"Object at .* does not exist",
+            _download_minio_file,
+            source="http://data.openml.org/dataset20/i_do_not_exist.pq",
+            destination=self.workdir,
+            exists_ok=True,
+        )
+
+    def test__download_minio_file_to_directory(self):
+        _download_minio_file(
+            source="http://data.openml.org/dataset20/dataset_20.pq",
+            destination=self.workdir,
+            exists_ok=True,
+        )
+        assert os.path.isfile(
+            os.path.join(self.workdir, "dataset_20.pq")
+        ), "_download_minio_file can save to a folder by copying the object name"
+
+    def test__download_minio_file_to_path(self):
+        """Download a MinIO object to an explicit file path instead of a directory."""
+        file_destination = os.path.join(self.workdir, "custom.pq")
+        _download_minio_file(
+            source="http://data.openml.org/dataset20/dataset_20.pq",
+            destination=file_destination,
+            exists_ok=True,
+        )
+        # The destination names a file, so the object must be stored under that exact name.
+        assert os.path.isfile(file_destination), "_download_minio_file can save to a file path"
+
+    def test__download_minio_file_raises_FileExists_if_destination_in_use(self):
+        file_destination = Path(self.workdir, "custom.pq")
+        file_destination.touch()
+
+        self.assertRaises(
+            FileExistsError,
+            _download_minio_file,
+            source="http://data.openml.org/dataset20/dataset_20.pq",
+            destination=str(file_destination),
+            exists_ok=False,
+        )
+
+    def test__download_minio_file_works_with_bucket_subdirectory(self):
+        file_destination = Path(self.workdir, "custom.pq")
+        _download_minio_file(
+            source="http://data.openml.org/dataset61/dataset_61.pq",
+            destination=file_destination,
+            exists_ok=True,
+        )
+        assert os.path.isfile(
+            file_destination
+        ), "_download_minio_file can download from subdirectories"
+
+
+    @mock.patch("openml._api_calls._download_minio_file")
+    @pytest.mark.test_server()
+    def test__get_dataset_parquet_is_cached(self, patch):
+        """A parquet file already in the static cache is returned without downloading it."""
+        openml.config.set_root_cache_directory(self.static_cache_dir)
+        message = "_download_minio_file should not be called when loading from cache"
+        patch.side_effect = RuntimeError(message)
+        description = {
+            "oml:parquet_url": "http://data.openml.org/dataset30/dataset_30.pq",
+            "oml:id": "30",
+        }
+        path = _get_dataset_parquet(description, cache_directory=None)
+        assert isinstance(path, Path), "_get_dataset_parquet returns a path"
+        assert path.is_file(), "_get_dataset_parquet returns path to real file"
+
+    def test__get_dataset_parquet_file_does_not_exist(self):
+        description = {
+            "oml:parquet_url": "http://data.openml.org/dataset20/does_not_exist.pq",
+            "oml:id": "20",
+        }
+        path = _get_dataset_parquet(description, cache_directory=self.workdir)
+        assert path is None, "_get_dataset_parquet returns None if no file is found"
 
     def test__getarff_md5_issue(self):
         description = {
-            'oml:id': 5,
-            'oml:md5_checksum': 'abc',
-            'oml:url': 'https://www.openml.org/data/download/61',
+            "oml:id": 5,
+            "oml:md5_checksum": "abc",
+            "oml:url": "https://www.openml.org/data/download/61",
         }
+        n = openml.config.connection_n_retries
+        openml.config.connection_n_retries = 1
+
         self.assertRaisesRegex(
             OpenMLHashException,
-            'Checksum ad484452702105cbf3d30f8deaba39a9 of downloaded file '
-            'is unequal to the expected checksum abc. '
-            'Raised when downloading dataset 5.',
+            "Checksum of downloaded file is unequal to the expected checksum abc when downloading "
+            "https://www.openml.org/data/download/61. Raised when downloading dataset 5.",
             _get_dataset_arff,
             description,
         )
 
+        openml.config.connection_n_retries = n
+
+    @pytest.mark.test_server()
     def test__get_dataset_features(self):
-        features = _get_dataset_features(self.workdir, 2)
-        self.assertIsInstance(features, dict)
-        features_xml_path = os.path.join(self.workdir, 'features.xml')
-        self.assertTrue(os.path.exists(features_xml_path))
+        features_file = _get_dataset_features_file(self.workdir, 2)
+        assert isinstance(features_file, Path)
+        features_xml_path = self.workdir / "features.xml"
+        assert features_xml_path.exists()
 
+    @pytest.mark.test_server()
     def test__get_dataset_qualities(self):
-        # Only a smoke check
-        qualities = _get_dataset_qualities(self.workdir, 2)
-        self.assertIsInstance(qualities, list)
+        qualities = _get_dataset_qualities_file(self.workdir, 2)
+        assert isinstance(qualities, Path)
+        qualities_xml_path = self.workdir / "qualities.xml"
+        assert qualities_xml_path.exists()
+
+    @pytest.mark.test_server()
+    def test_get_dataset_force_refresh_cache(self):
+        did_cache_dir = _create_cache_directory_for_id(
+            DATASETS_CACHE_DIR_NAME,
+            2,
+        )
+        openml.datasets.get_dataset(2)
+        change_time = os.stat(did_cache_dir).st_mtime
+
+        # Test default
+        openml.datasets.get_dataset(2)
+        assert change_time == os.stat(did_cache_dir).st_mtime
+
+        # Test refresh
+        openml.datasets.get_dataset(2, force_refresh_cache=True)
+        assert change_time != os.stat(did_cache_dir).st_mtime
+
+        # Final clean up
+        openml.utils._remove_cache_dir_for_id(
+            DATASETS_CACHE_DIR_NAME,
+            did_cache_dir,
+        )
+
+    @pytest.mark.test_server()
+    def test_get_dataset_force_refresh_cache_clean_start(self):
+        did_cache_dir = _create_cache_directory_for_id(
+            DATASETS_CACHE_DIR_NAME,
+            2,
+        )
+        # Clean up
+        openml.utils._remove_cache_dir_for_id(
+            DATASETS_CACHE_DIR_NAME,
+            did_cache_dir,
+        )
+
+        # Test clean start
+        openml.datasets.get_dataset(2, force_refresh_cache=True)
+        assert os.path.exists(did_cache_dir)
+
+        # Final clean up
+        openml.utils._remove_cache_dir_for_id(
+            DATASETS_CACHE_DIR_NAME,
+            did_cache_dir,
+        )
 
     def test_deletion_of_cache_dir(self):
         # Simple removal
         did_cache_dir = _create_cache_directory_for_id(
-            DATASETS_CACHE_DIR_NAME, 1,
+            DATASETS_CACHE_DIR_NAME,
+            1,
         )
-        self.assertTrue(os.path.exists(did_cache_dir))
+        assert os.path.exists(did_cache_dir)
         openml.utils._remove_cache_dir_for_id(
-            DATASETS_CACHE_DIR_NAME, did_cache_dir,
+            DATASETS_CACHE_DIR_NAME,
+            did_cache_dir,
         )
-        self.assertFalse(os.path.exists(did_cache_dir))
+        assert not os.path.exists(did_cache_dir)
 
-    # Use _get_dataset_arff to load the description, trigger an exception in the
-    # test target and have a slightly higher coverage
-    @mock.patch('openml.datasets.functions._get_dataset_arff')
+    # get_dataset_description is the only data guaranteed to be downloaded
+    @mock.patch("openml.datasets.functions._get_dataset_description")
+    @pytest.mark.test_server()
     def test_deletion_of_cache_dir_faulty_download(self, patch):
-        patch.side_effect = Exception('Boom!')
-        self.assertRaisesRegex(Exception, 'Boom!', openml.datasets.get_dataset, dataset_id=1)
-        datasets_cache_dir = os.path.join(
-            self.workdir, 'org', 'openml', 'test', 'datasets'
-        )
-        self.assertEqual(len(os.listdir(datasets_cache_dir)), 0)
+        patch.side_effect = Exception("Boom!")
+        self.assertRaisesRegex(Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1)
+        datasets_cache_dir = os.path.join(openml.config.get_cache_directory(), "datasets")
+        assert len(os.listdir(datasets_cache_dir)) == 0
 
+    @pytest.mark.test_server()
     def test_publish_dataset(self):
-        # lazy loading not possible as we need the arff-file.
-        openml.datasets.get_dataset(3)
-        file_path = os.path.join(openml.config.get_cache_directory(),
-                                 "datasets", "3", "dataset.arff")
+        arff_file_path = self.static_cache_dir / "org" / "openml" / "test" / "datasets" / "2" / "dataset.arff"
         dataset = OpenMLDataset(
             "anneal",
             "test",
@@ -475,104 +540,135 @@ def test_publish_dataset(self):
             version=1,
             licence="public",
             default_target_attribute="class",
-            data_file=file_path,
+            data_file=arff_file_path,
         )
         dataset.publish()
-        TestBase._mark_entity_for_removal('data', dataset.dataset_id)
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                            dataset.dataset_id))
-        self.assertIsInstance(dataset.dataset_id, int)
+        TestBase._mark_entity_for_removal("data", dataset.dataset_id)
+        TestBase.logger.info(
+            f"collected from {__file__.split('/')[-1]}: {dataset.dataset_id}",
+        )
+        assert isinstance(dataset.dataset_id, int)
 
+    @pytest.mark.test_server()
     def test__retrieve_class_labels(self):
-        openml.config.cache_directory = self.static_cache_dir
-        labels = openml.datasets.get_dataset(2, download_data=False).retrieve_class_labels()
-        self.assertEqual(labels, ['1', '2', '3', '4', '5', 'U'])
-        labels = openml.datasets.get_dataset(2, download_data=False).retrieve_class_labels(
-            target_name='product-type')
-        self.assertEqual(labels, ['C', 'H', 'G'])
+        openml.config.set_root_cache_directory(self.static_cache_dir)
+        labels = openml.datasets.get_dataset(2).retrieve_class_labels()
+        assert labels == ["1", "2", "3", "4", "5", "U"]
 
-    def test_upload_dataset_with_url(self):
+        labels = openml.datasets.get_dataset(2).retrieve_class_labels(
+            target_name="product-type",
+        )
+        assert labels == ["C", "H", "G"]
+
+        # Test workaround for string-typed class labels
+        custom_ds = openml.datasets.get_dataset(2)
+        custom_ds.features[31].data_type = "string"
+        labels = custom_ds.retrieve_class_labels(target_name=custom_ds.features[31].name)
+        assert labels == ["COIL", "SHEET"]
 
+    @pytest.mark.test_server()
+    def test_upload_dataset_with_url(self):
         dataset = OpenMLDataset(
-            "%s-UploadTestWithURL" % self._get_sentinel(),
+            f"{self._get_sentinel()}-UploadTestWithURL",
             "test",
             data_format="arff",
             version=1,
             url="https://www.openml.org/data/download/61/dataset_61_iris.arff",
         )
         dataset.publish()
-        TestBase._mark_entity_for_removal('data', dataset.dataset_id)
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                            dataset.dataset_id))
-        self.assertIsInstance(dataset.dataset_id, int)
+        TestBase._mark_entity_for_removal("data", dataset.dataset_id)
+        TestBase.logger.info(
+            f"collected from {__file__.split('/')[-1]}: {dataset.dataset_id}",
+        )
+        assert isinstance(dataset.dataset_id, int)
 
+    def _assert_status_of_dataset(self, *, did: int, status: str):
+        """Asserts there is exactly one dataset with id `did` and its current status is `status`"""
+        # need to use listing fn, as this is immune to cache
+        result = openml.datasets.list_datasets(data_id=[did], status="all")
+        result = result.to_dict(orient="index")
+        # TODO: consider dropping the check that exactly one result is returned;
+        # the server should presumably never return multiple results for one id.
+        assert len(result) == 1
+        assert result[did]["status"] == status
+
+    @pytest.mark.skipif(
+        not os.environ.get(openml.config.OPENML_TEST_SERVER_ADMIN_KEY_ENV_VAR),
+        reason="Test requires admin key. Set OPENML_TEST_SERVER_ADMIN_KEY environment variable.",
+    )
+    @pytest.mark.flaky()
+    @pytest.mark.test_server()
     def test_data_status(self):
         dataset = OpenMLDataset(
-            "%s-UploadTestWithURL" % self._get_sentinel(),
-            "test", "ARFF",
+            f"{self._get_sentinel()}-UploadTestWithURL",
+            "test",
+            "ARFF",
             version=1,
-            url="https://www.openml.org/data/download/61/dataset_61_iris.arff")
+            url="https://www.openml.org/data/download/61/dataset_61_iris.arff",
+        )
         dataset.publish()
-        TestBase._mark_entity_for_removal('data', dataset.dataset_id)
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                            dataset.dataset_id))
-        did = dataset.dataset_id
+        TestBase._mark_entity_for_removal("data", dataset.id)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {dataset.id}")
+        did = dataset.id
 
-        # admin key for test server (only adminds can activate datasets.
+        # admin key for test server (only admins can activate datasets.
         # all users can deactivate their own datasets)
-        openml.config.apikey = 'd488d8afd93b32331cf6ea9d7003d4c3'
+        openml.config.apikey = TestBase.admin_key
 
-        openml.datasets.status_update(did, 'active')
-        # need to use listing fn, as this is immune to cache
-        result = openml.datasets.list_datasets(data_id=did, status='all')
-        self.assertEqual(len(result), 1)
-        self.assertEqual(result[did]['status'], 'active')
-        openml.datasets.status_update(did, 'deactivated')
-        # need to use listing fn, as this is immune to cache
-        result = openml.datasets.list_datasets(data_id=did, status='all')
-        self.assertEqual(len(result), 1)
-        self.assertEqual(result[did]['status'], 'deactivated')
-        openml.datasets.status_update(did, 'active')
-        # need to use listing fn, as this is immune to cache
-        result = openml.datasets.list_datasets(data_id=did, status='all')
-        self.assertEqual(len(result), 1)
-        self.assertEqual(result[did]['status'], 'active')
-        with self.assertRaises(ValueError):
-            openml.datasets.status_update(did, 'in_preparation')
-        # need to use listing fn, as this is immune to cache
-        result = openml.datasets.list_datasets(data_id=did, status='all')
-        self.assertEqual(len(result), 1)
-        self.assertEqual(result[did]['status'], 'active')
+        openml.datasets.status_update(did, "active")
+        self._assert_status_of_dataset(did=did, status="active")
+
+        openml.datasets.status_update(did, "deactivated")
+        self._assert_status_of_dataset(did=did, status="deactivated")
+
+        openml.datasets.status_update(did, "active")
+        self._assert_status_of_dataset(did=did, status="active")
+
+        with pytest.raises(ValueError):
+            openml.datasets.status_update(did, "in_preparation")
+        self._assert_status_of_dataset(did=did, status="active")
 
     def test_attributes_arff_from_df(self):
         # DataFrame case
         df = pd.DataFrame(
-            [[1, 1.0, 'xxx', 'A', True], [2, 2.0, 'yyy', 'B', False]],
-            columns=['integer', 'floating', 'string', 'category', 'boolean']
+            [[1, 1.0, "xxx", "A", True], [2, 2.0, "yyy", "B", False]],
+            columns=["integer", "floating", "string", "category", "boolean"],
+        )
+        df["category"] = df["category"].astype("category")
+        attributes = attributes_arff_from_df(df)
+        assert attributes == [
+            ("integer", "INTEGER"),
+            ("floating", "REAL"),
+            ("string", "STRING"),
+            ("category", ["A", "B"]),
+            ("boolean", ["True", "False"]),
+        ]
+        # DataFrame with Sparse columns case
+        df = pd.DataFrame(
+            {
+                "integer": pd.arrays.SparseArray([1, 2, 0], fill_value=0),
+                "floating": pd.arrays.SparseArray([1.0, 2.0, 0], fill_value=0.0),
+            },
         )
-        df['category'] = df['category'].astype('category')
+        df["integer"] = df["integer"].astype(np.int64)
         attributes = attributes_arff_from_df(df)
-        self.assertEqual(attributes, [('integer', 'INTEGER'),
-                                      ('floating', 'REAL'),
-                                      ('string', 'STRING'),
-                                      ('category', ['A', 'B']),
-                                      ('boolean', ['True', 'False'])])
-        # SparseDataFrame case
-        df = pd.SparseDataFrame([[1, 1.0],
-                                 [2, 2.0],
-                                 [0, 0]],
-                                columns=['integer', 'floating'],
-                                default_fill_value=0)
-        df['integer'] = df['integer'].astype(np.int64)
+        assert attributes == [("integer", "INTEGER"), ("floating", "REAL")]
+
+    def test_attributes_arff_from_df_numeric_column(self):
+        # Test column names are automatically converted to str if needed (#819)
+        df = pd.DataFrame({0: [1, 2, 3], 0.5: [4, 5, 6], "target": [0, 1, 1]})
         attributes = attributes_arff_from_df(df)
-        self.assertEqual(attributes, [('integer', 'INTEGER'),
-                                      ('floating', 'REAL')])
+        assert attributes == [
+            ("0", "INTEGER"),
+            ("0.5", "INTEGER"),
+            ("target", "INTEGER"),
+        ]
 
     def test_attributes_arff_from_df_mixed_dtype_categories(self):
         # liac-arff imposed categorical attributes to be of sting dtype. We
         # raise an error if this is not the case.
-        df = pd.DataFrame([[1], ['2'], [3.]])
-        df[0] = df[0].astype('category')
+        df = pd.DataFrame([[1], ["2"], [3.0]])
+        df[0] = df[0].astype("category")
         err_msg = "The column '0' of the dataframe is of 'category' dtype."
         with pytest.raises(ValueError, match=err_msg):
             attributes_arff_from_df(df)
@@ -581,312 +677,287 @@ def test_attributes_arff_from_df_unknown_dtype(self):
         # check that an error is raised when the dtype is not supptagorted by
         # liac-arff
         data = [
-            [[1], ['2'], [3.]],
-            [pd.Timestamp('2012-05-01'), pd.Timestamp('2012-05-02')],
-        ]
-        dtype = [
-            'mixed-integer',
-            'datetime64'
+            [[1], ["2"], [3.0]],
+            [pd.Timestamp("2012-05-01"), pd.Timestamp("2012-05-02")],
         ]
+        dtype = ["mixed-integer", "datetime64"]
         for arr, dt in zip(data, dtype):
             df = pd.DataFrame(arr)
-            err_msg = ("The dtype '{}' of the column '0' is not currently "
-                       "supported by liac-arff".format(dt))
+            err_msg = (
+                f"The dtype '{dt}' of the column '0' is not currently " "supported by liac-arff"
+            )
             with pytest.raises(ValueError, match=err_msg):
                 attributes_arff_from_df(df)
 
+    @pytest.mark.test_server()
     def test_create_dataset_numpy(self):
+        data = np.array([[1, 2, 3], [1.2, 2.5, 3.8], [2, 5, 8], [0, 1, 0]]).T
 
-        data = np.array(
-            [
-                [1, 2, 3],
-                [1.2, 2.5, 3.8],
-                [2, 5, 8],
-                [0, 1, 0]
-            ]
-        ).T
-
-        attributes = [('col_{}'.format(i), 'REAL')
-                      for i in range(data.shape[1])]
+        attributes = [(f"col_{i}", "REAL") for i in range(data.shape[1])]
 
         dataset = create_dataset(
-            name='%s-NumPy_testing_dataset' % self._get_sentinel(),
-            description='Synthetic dataset created from a NumPy array',
-            creator='OpenML tester',
+            name=f"{self._get_sentinel()}-NumPy_testing_dataset",
+            description="Synthetic dataset created from a NumPy array",
+            creator="OpenML tester",
             contributor=None,
-            collection_date='01-01-2018',
-            language='English',
-            licence='MIT',
-            default_target_attribute='col_{}'.format(data.shape[1] - 1),
+            collection_date="01-01-2018",
+            language="English",
+            licence="MIT",
+            default_target_attribute=f"col_{data.shape[1] - 1}",
             row_id_attribute=None,
             ignore_attribute=None,
-            citation='None',
+            citation="None",
             attributes=attributes,
             data=data,
-            version_label='test',
-            original_data_url='http://openml.github.io/openml-python',
-            paper_url='http://openml.github.io/openml-python'
+            version_label="test",
+            original_data_url="http://openml.github.io/openml-python",
+            paper_url="http://openml.github.io/openml-python",
         )
 
-        upload_did = dataset.publish()
-        TestBase._mark_entity_for_removal('data', upload_did)
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                            upload_did))
+        dataset.publish()
+        TestBase._mark_entity_for_removal("data", dataset.id)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {dataset.id}")
 
-        self.assertEqual(
-            _get_online_dataset_arff(upload_did),
-            dataset._dataset,
-            "Uploaded arff does not match original one"
-        )
-        self.assertEqual(
-            _get_online_dataset_format(upload_did),
-            'arff',
-            "Wrong format for dataset"
-        )
+        assert (
+            _get_online_dataset_arff(dataset.id) == dataset._dataset
+        ), "Uploaded arff does not match original one"
+        assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset"
 
+    @pytest.mark.test_server()
     def test_create_dataset_list(self):
-
         data = [
-            ['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'],
-            ['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'],
-            ['c', 'overcast', 83.0, 86.0, 'FALSE', 'yes'],
-            ['d', 'rainy', 70.0, 96.0, 'FALSE', 'yes'],
-            ['e', 'rainy', 68.0, 80.0, 'FALSE', 'yes'],
-            ['f', 'rainy', 65.0, 70.0, 'TRUE', 'no'],
-            ['g', 'overcast', 64.0, 65.0, 'TRUE', 'yes'],
-            ['h', 'sunny', 72.0, 95.0, 'FALSE', 'no'],
-            ['i', 'sunny', 69.0, 70.0, 'FALSE', 'yes'],
-            ['j', 'rainy', 75.0, 80.0, 'FALSE', 'yes'],
-            ['k', 'sunny', 75.0, 70.0, 'TRUE', 'yes'],
-            ['l', 'overcast', 72.0, 90.0, 'TRUE', 'yes'],
-            ['m', 'overcast', 81.0, 75.0, 'FALSE', 'yes'],
-            ['n', 'rainy', 71.0, 91.0, 'TRUE', 'no'],
+            ["a", "sunny", 85.0, 85.0, "FALSE", "no"],
+            ["b", "sunny", 80.0, 90.0, "TRUE", "no"],
+            ["c", "overcast", 83.0, 86.0, "FALSE", "yes"],
+            ["d", "rainy", 70.0, 96.0, "FALSE", "yes"],
+            ["e", "rainy", 68.0, 80.0, "FALSE", "yes"],
+            ["f", "rainy", 65.0, 70.0, "TRUE", "no"],
+            ["g", "overcast", 64.0, 65.0, "TRUE", "yes"],
+            ["h", "sunny", 72.0, 95.0, "FALSE", "no"],
+            ["i", "sunny", 69.0, 70.0, "FALSE", "yes"],
+            ["j", "rainy", 75.0, 80.0, "FALSE", "yes"],
+            ["k", "sunny", 75.0, 70.0, "TRUE", "yes"],
+            ["l", "overcast", 72.0, 90.0, "TRUE", "yes"],
+            ["m", "overcast", 81.0, 75.0, "FALSE", "yes"],
+            ["n", "rainy", 71.0, 91.0, "TRUE", "no"],
         ]
 
         attributes = [
-            ('rnd_str', 'STRING'),
-            ('outlook', ['sunny', 'overcast', 'rainy']),
-            ('temperature', 'REAL'),
-            ('humidity', 'REAL'),
-            ('windy', ['TRUE', 'FALSE']),
-            ('play', ['yes', 'no']),
+            ("rnd_str", "STRING"),
+            ("outlook", ["sunny", "overcast", "rainy"]),
+            ("temperature", "REAL"),
+            ("humidity", "REAL"),
+            ("windy", ["TRUE", "FALSE"]),
+            ("play", ["yes", "no"]),
         ]
 
         dataset = create_dataset(
-            name="%s-ModifiedWeather" % self._get_sentinel(),
-            description=(
-                'Testing dataset upload when the data is a list of lists'
-            ),
-            creator='OpenML test',
+            name=f"{self._get_sentinel()}-ModifiedWeather",
+            description=("Testing dataset upload when the data is a list of lists"),
+            creator="OpenML test",
             contributor=None,
-            collection_date='21-09-2018',
-            language='English',
-            licence='MIT',
-            default_target_attribute='play',
+            collection_date="21-09-2018",
+            language="English",
+            licence="MIT",
+            default_target_attribute="play",
             row_id_attribute=None,
             ignore_attribute=None,
-            citation='None',
+            citation="None",
             attributes=attributes,
             data=data,
-            version_label='test',
-            original_data_url='http://openml.github.io/openml-python',
-            paper_url='http://openml.github.io/openml-python'
-        )
-
-        upload_did = dataset.publish()
-        TestBase._mark_entity_for_removal('data', upload_did)
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                            upload_did))
-        self.assertEqual(
-            _get_online_dataset_arff(upload_did),
-            dataset._dataset,
-            "Uploaded ARFF does not match original one"
-        )
-        self.assertEqual(
-            _get_online_dataset_format(upload_did),
-            'arff',
-            "Wrong format for dataset"
+            version_label="test",
+            original_data_url="http://openml.github.io/openml-python",
+            paper_url="http://openml.github.io/openml-python",
         )
 
+        dataset.publish()
+        TestBase._mark_entity_for_removal("data", dataset.id)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {dataset.id}")
+        assert (
+            _get_online_dataset_arff(dataset.id) == dataset._dataset
+        ), "Uploaded ARFF does not match original one"
+        assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset"
+
+    @pytest.mark.test_server()
     def test_create_dataset_sparse(self):
-
         # test the scipy.sparse.coo_matrix
-        sparse_data = scipy.sparse.coo_matrix((
-            [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
-            ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
-        ))
+        sparse_data = scipy.sparse.coo_matrix(
+            (
+                [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+                ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1]),
+            ),
+        )
 
         column_names = [
-            ('input1', 'REAL'),
-            ('input2', 'REAL'),
-            ('y', 'REAL'),
+            ("input1", "REAL"),
+            ("input2", "REAL"),
+            ("y", "REAL"),
         ]
 
         xor_dataset = create_dataset(
-            name="%s-XOR" % self._get_sentinel(),
-            description='Dataset representing the XOR operation',
+            name=f"{self._get_sentinel()}-XOR",
+            description="Dataset representing the XOR operation",
             creator=None,
             contributor=None,
             collection_date=None,
-            language='English',
+            language="English",
             licence=None,
-            default_target_attribute='y',
+            default_target_attribute="y",
             row_id_attribute=None,
             ignore_attribute=None,
             citation=None,
             attributes=column_names,
             data=sparse_data,
-            version_label='test',
+            version_label="test",
         )
 
-        upload_did = xor_dataset.publish()
-        TestBase._mark_entity_for_removal('data', upload_did)
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                            upload_did))
-        self.assertEqual(
-            _get_online_dataset_arff(upload_did),
-            xor_dataset._dataset,
-            "Uploaded ARFF does not match original one"
-        )
-        self.assertEqual(
-            _get_online_dataset_format(upload_did),
-            'sparse_arff',
-            "Wrong format for dataset"
+        xor_dataset.publish()
+        TestBase._mark_entity_for_removal("data", xor_dataset.id)
+        TestBase.logger.info(
+            f"collected from {__file__.split('/')[-1]}: {xor_dataset.id}",
         )
+        assert (
+            _get_online_dataset_arff(xor_dataset.id) == xor_dataset._dataset
+        ), "Uploaded ARFF does not match original one"
+        assert (
+            _get_online_dataset_format(xor_dataset.id) == "sparse_arff"
+        ), "Wrong format for dataset"
 
         # test the list of dicts sparse representation
-        sparse_data = [
-            {0: 0.0},
-            {1: 1.0, 2: 1.0},
-            {0: 1.0, 2: 1.0},
-            {0: 1.0, 1: 1.0}
-        ]
+        sparse_data = [{0: 0.0}, {1: 1.0, 2: 1.0}, {0: 1.0, 2: 1.0}, {0: 1.0, 1: 1.0}]
 
         xor_dataset = create_dataset(
-            name="%s-XOR" % self._get_sentinel(),
-            description='Dataset representing the XOR operation',
+            name=f"{self._get_sentinel()}-XOR",
+            description="Dataset representing the XOR operation",
             creator=None,
             contributor=None,
             collection_date=None,
-            language='English',
+            language="English",
             licence=None,
-            default_target_attribute='y',
+            default_target_attribute="y",
             row_id_attribute=None,
             ignore_attribute=None,
             citation=None,
             attributes=column_names,
             data=sparse_data,
-            version_label='test',
+            version_label="test",
         )
 
-        upload_did = xor_dataset.publish()
-        TestBase._mark_entity_for_removal('data', upload_did)
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                            upload_did))
-        self.assertEqual(
-            _get_online_dataset_arff(upload_did),
-            xor_dataset._dataset,
-            "Uploaded ARFF does not match original one"
-        )
-        self.assertEqual(
-            _get_online_dataset_format(upload_did),
-            'sparse_arff',
-            "Wrong format for dataset"
+        xor_dataset.publish()
+        TestBase._mark_entity_for_removal("data", xor_dataset.id)
+        TestBase.logger.info(
+            f"collected from {__file__.split('/')[-1]}: {xor_dataset.id}",
         )
+        assert (
+            _get_online_dataset_arff(xor_dataset.id) == xor_dataset._dataset
+        ), "Uploaded ARFF does not match original one"
+        assert (
+            _get_online_dataset_format(xor_dataset.id) == "sparse_arff"
+        ), "Wrong format for dataset"
 
     def test_create_invalid_dataset(self):
-
         data = [
-            'sunny',
-            'overcast',
-            'overcast',
-            'rainy',
-            'rainy',
-            'rainy',
-            'overcast',
-            'sunny',
-            'sunny',
-            'rainy',
-            'sunny',
-            'overcast',
-            'overcast',
-            'rainy',
+            "sunny",
+            "overcast",
+            "overcast",
+            "rainy",
+            "rainy",
+            "rainy",
+            "overcast",
+            "sunny",
+            "sunny",
+            "rainy",
+            "sunny",
+            "overcast",
+            "overcast",
+            "rainy",
         ]
 
         param = self._get_empty_param_for_dataset()
-        param['data'] = data
+        param["data"] = data
 
-        self.assertRaises(
-            ValueError,
-            create_dataset,
-            **param
-        )
+        self.assertRaises(ValueError, create_dataset, **param)
 
-        param['data'] = data[0]
-        self.assertRaises(
-            ValueError,
-            create_dataset,
-            **param
-        )
+        param["data"] = data[0]
+        self.assertRaises(ValueError, create_dataset, **param)
 
+    @pytest.mark.test_server()
     def test_get_online_dataset_arff(self):
-        dataset_id = 100  # Australian
+        dataset_id = 128  # iris -- one of the few datasets without parquet file
         # lazy loading not used as arff file is checked.
-        dataset = openml.datasets.get_dataset(dataset_id)
+        dataset = openml.datasets.get_dataset(dataset_id, download_data=True)
         decoder = arff.ArffDecoder()
         # check if the arff from the dataset is
         # the same as the arff from _get_arff function
         d_format = (dataset.format).lower()
 
-        self.assertEqual(
-            dataset._get_arff(d_format),
-            decoder.decode(
-                _get_online_dataset_arff(dataset_id),
-                encode_nominal=True,
-                return_type=arff.DENSE
-                if d_format == 'arff' else arff.COO
-            ),
-            "ARFF files are not equal"
+        assert dataset._get_arff(d_format) == decoder.decode(
+            _get_online_dataset_arff(dataset_id),
+            encode_nominal=True,
+            return_type=arff.DENSE if d_format == "arff" else arff.COO,
+        ), "ARFF files are not equal"
+
+    @pytest.mark.test_server()
+    def test_topic_api_error(self):
+        # Check server exception when a non-admin accesses the topic-add API
+        self.assertRaisesRegex(
+            OpenMLServerException,
+            "Topic can only be added/removed by admin.",
+            _topic_add_dataset,
+            data_id=31,
+            topic="business",
+        )
+        # Check server exception when a non-admin accesses the topic-delete API
+        self.assertRaisesRegex(
+            OpenMLServerException,
+            "Topic can only be added/removed by admin.",
+            _topic_delete_dataset,
+            data_id=31,
+            topic="business",
         )
 
+    @pytest.mark.test_server()
     def test_get_online_dataset_format(self):
-
         # Phoneme dataset
         dataset_id = 77
-        dataset = openml.datasets.get_dataset(dataset_id, download_data=False)
+        dataset = openml.datasets.get_dataset(dataset_id)
 
-        self.assertEqual(
-            (dataset.format).lower(),
-            _get_online_dataset_format(dataset_id),
-            "The format of the ARFF files is different"
-        )
+        assert dataset.format.lower() == _get_online_dataset_format(
+            dataset_id
+        ), "The format of the ARFF files is different"
 
+    @pytest.mark.test_server()
     def test_create_dataset_pandas(self):
         data = [
-            ['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'],
-            ['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'],
-            ['c', 'overcast', 83.0, 86.0, 'FALSE', 'yes'],
-            ['d', 'rainy', 70.0, 96.0, 'FALSE', 'yes'],
-            ['e', 'rainy', 68.0, 80.0, 'FALSE', 'yes']
+            ["a", "sunny", 85.0, 85.0, "FALSE", "no"],
+            ["b", "sunny", 80.0, 90.0, "TRUE", "no"],
+            ["c", "overcast", 83.0, 86.0, "FALSE", "yes"],
+            ["d", "rainy", 70.0, 96.0, "FALSE", "yes"],
+            ["e", "rainy", 68.0, 80.0, "FALSE", "yes"],
+        ]
+        column_names = [
+            "rnd_str",
+            "outlook",
+            "temperature",
+            "humidity",
+            "windy",
+            "play",
         ]
-        column_names = ['rnd_str', 'outlook', 'temperature', 'humidity',
-                        'windy', 'play']
         df = pd.DataFrame(data, columns=column_names)
         # enforce the type of each column
-        df['outlook'] = df['outlook'].astype('category')
-        df['windy'] = df['windy'].astype('bool')
-        df['play'] = df['play'].astype('category')
+        df["outlook"] = df["outlook"].astype("category")
+        df["windy"] = df["windy"].astype("bool")
+        df["play"] = df["play"].astype("category")
         # meta-information
-        name = '%s-pandas_testing_dataset' % self._get_sentinel()
-        description = 'Synthetic dataset created from a Pandas DataFrame'
-        creator = 'OpenML tester'
-        collection_date = '01-01-2018'
-        language = 'English'
-        licence = 'MIT'
-        default_target_attribute = 'play'
-        citation = 'None'
-        original_data_url = 'http://openml.github.io/openml-python'
-        paper_url = 'http://openml.github.io/openml-python'
+        name = f"{self._get_sentinel()}-pandas_testing_dataset"
+        description = "Synthetic dataset created from a Pandas DataFrame"
+        creator = "OpenML tester"
+        collection_date = "01-01-2018"
+        language = "English"
+        licence = "MIT"
+        citation = "None"
+        original_data_url = "http://openml.github.io/openml-python"
+        paper_url = "http://openml.github.io/openml-python"
         dataset = openml.datasets.functions.create_dataset(
             name=name,
             description=description,
@@ -895,35 +966,34 @@ def test_create_dataset_pandas(self):
             collection_date=collection_date,
             language=language,
             licence=licence,
-            default_target_attribute=default_target_attribute,
+            default_target_attribute="play",
             row_id_attribute=None,
             ignore_attribute=None,
             citation=citation,
-            attributes='auto',
+            attributes="auto",
             data=df,
-            version_label='test',
+            version_label="test",
             original_data_url=original_data_url,
-            paper_url=paper_url
+            paper_url=paper_url,
         )
-        upload_did = dataset.publish()
-        TestBase._mark_entity_for_removal('data', upload_did)
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                            upload_did))
-        self.assertEqual(
-            _get_online_dataset_arff(upload_did),
-            dataset._dataset,
-            "Uploaded ARFF does not match original one"
+        dataset.publish()
+        TestBase._mark_entity_for_removal("data", dataset.id)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {dataset.id}")
+        assert (
+            _get_online_dataset_arff(dataset.id) == dataset._dataset
+        ), "Uploaded ARFF does not match original one"
+
+        # Check that DataFrames with Sparse columns are supported properly
+        sparse_data = scipy.sparse.coo_matrix(
+            (
+                [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+                ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1]),
+            ),
         )
-
-        # Check that SparseDataFrame are supported properly
-        sparse_data = scipy.sparse.coo_matrix((
-            [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
-            ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
-        ))
-        column_names = ['input1', 'input2', 'y']
-        df = pd.SparseDataFrame(sparse_data, columns=column_names)
+        column_names = ["input1", "input2", "y"]
+        df = pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names)
         # meta-information
-        description = 'Synthetic dataset created from a Pandas SparseDataFrame'
+        description = "Synthetic dataset created from a Pandas DataFrame with Sparse columns"
         dataset = openml.datasets.functions.create_dataset(
             name=name,
             description=description,
@@ -932,37 +1002,30 @@ def test_create_dataset_pandas(self):
             collection_date=collection_date,
             language=language,
             licence=licence,
-            default_target_attribute=default_target_attribute,
+            default_target_attribute="y",
             row_id_attribute=None,
             ignore_attribute=None,
             citation=citation,
-            attributes='auto',
+            attributes="auto",
             data=df,
-            version_label='test',
+            version_label="test",
             original_data_url=original_data_url,
-            paper_url=paper_url
-        )
-        upload_did = dataset.publish()
-        TestBase._mark_entity_for_removal('data', upload_did)
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                            upload_did))
-        self.assertEqual(
-            _get_online_dataset_arff(upload_did),
-            dataset._dataset,
-            "Uploaded ARFF does not match original one"
-        )
-        self.assertEqual(
-            _get_online_dataset_format(upload_did),
-            'sparse_arff',
-            "Wrong format for dataset"
+            paper_url=paper_url,
         )
+        dataset.publish()
+        TestBase._mark_entity_for_removal("data", dataset.id)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {dataset.id}")
+        assert (
+            _get_online_dataset_arff(dataset.id) == dataset._dataset
+        ), "Uploaded ARFF does not match original one"
+        assert _get_online_dataset_format(dataset.id) == "sparse_arff", "Wrong format for dataset"
 
         # Check that we can overwrite the attributes
-        data = [['a'], ['b'], ['c'], ['d'], ['e']]
-        column_names = ['rnd_str']
+        data = [["a"], ["b"], ["c"], ["d"], ["e"]]
+        column_names = ["rnd_str"]
         df = pd.DataFrame(data, columns=column_names)
-        df['rnd_str'] = df['rnd_str'].astype('category')
-        attributes = {'rnd_str': ['a', 'b', 'c', 'd', 'e', 'f', 'g']}
+        df["rnd_str"] = df["rnd_str"].astype("category")
+        attributes = {"rnd_str": ["a", "b", "c", "d", "e", "f", "g"]}
         dataset = openml.datasets.functions.create_dataset(
             name=name,
             description=description,
@@ -971,55 +1034,55 @@ def test_create_dataset_pandas(self):
             collection_date=collection_date,
             language=language,
             licence=licence,
-            default_target_attribute=default_target_attribute,
+            default_target_attribute="rnd_str",
             row_id_attribute=None,
             ignore_attribute=None,
             citation=citation,
             attributes=attributes,
             data=df,
-            version_label='test',
+            version_label="test",
             original_data_url=original_data_url,
-            paper_url=paper_url
-        )
-        upload_did = dataset.publish()
-        TestBase._mark_entity_for_removal('data', upload_did)
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                            upload_did))
-        downloaded_data = _get_online_dataset_arff(upload_did)
-        self.assertEqual(
-            downloaded_data,
-            dataset._dataset,
-            "Uploaded ARFF does not match original one"
+            paper_url=paper_url,
         )
-        self.assertTrue(
-            '@ATTRIBUTE rnd_str {a, b, c, d, e, f, g}' in downloaded_data)
+        dataset.publish()
+        TestBase._mark_entity_for_removal("data", dataset.id)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {dataset.id}")
+        downloaded_data = _get_online_dataset_arff(dataset.id)
+        assert downloaded_data == dataset._dataset, "Uploaded ARFF does not match original one"
+        assert "@ATTRIBUTE rnd_str {a, b, c, d, e, f, g}" in downloaded_data
 
     def test_ignore_attributes_dataset(self):
         data = [
-            ['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'],
-            ['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'],
-            ['c', 'overcast', 83.0, 86.0, 'FALSE', 'yes'],
-            ['d', 'rainy', 70.0, 96.0, 'FALSE', 'yes'],
-            ['e', 'rainy', 68.0, 80.0, 'FALSE', 'yes']
+            ["a", "sunny", 85.0, 85.0, "FALSE", "no"],
+            ["b", "sunny", 80.0, 90.0, "TRUE", "no"],
+            ["c", "overcast", 83.0, 86.0, "FALSE", "yes"],
+            ["d", "rainy", 70.0, 96.0, "FALSE", "yes"],
+            ["e", "rainy", 68.0, 80.0, "FALSE", "yes"],
+        ]
+        column_names = [
+            "rnd_str",
+            "outlook",
+            "temperature",
+            "humidity",
+            "windy",
+            "play",
         ]
-        column_names = ['rnd_str', 'outlook', 'temperature', 'humidity',
-                        'windy', 'play']
         df = pd.DataFrame(data, columns=column_names)
         # enforce the type of each column
-        df['outlook'] = df['outlook'].astype('category')
-        df['windy'] = df['windy'].astype('bool')
-        df['play'] = df['play'].astype('category')
+        df["outlook"] = df["outlook"].astype("category")
+        df["windy"] = df["windy"].astype("bool")
+        df["play"] = df["play"].astype("category")
         # meta-information
-        name = '%s-pandas_testing_dataset' % self._get_sentinel()
-        description = 'Synthetic dataset created from a Pandas DataFrame'
-        creator = 'OpenML tester'
-        collection_date = '01-01-2018'
-        language = 'English'
-        licence = 'MIT'
-        default_target_attribute = 'play'
-        citation = 'None'
-        original_data_url = 'http://openml.github.io/openml-python'
-        paper_url = 'http://openml.github.io/openml-python'
+        name = f"{self._get_sentinel()}-pandas_testing_dataset"
+        description = "Synthetic dataset created from a Pandas DataFrame"
+        creator = "OpenML tester"
+        collection_date = "01-01-2018"
+        language = "English"
+        licence = "MIT"
+        default_target_attribute = "play"
+        citation = "None"
+        original_data_url = "http://openml.github.io/openml-python"
+        paper_url = "http://openml.github.io/openml-python"
 
         # we use the create_dataset function which call the OpenMLDataset
         # constructor
@@ -1034,18 +1097,18 @@ def test_ignore_attributes_dataset(self):
             licence=licence,
             default_target_attribute=default_target_attribute,
             row_id_attribute=None,
-            ignore_attribute='outlook',
+            ignore_attribute="outlook",
             citation=citation,
-            attributes='auto',
+            attributes="auto",
             data=df,
-            version_label='test',
+            version_label="test",
             original_data_url=original_data_url,
-            paper_url=paper_url
+            paper_url=paper_url,
         )
-        self.assertEqual(dataset.ignore_attribute, ['outlook'])
+        assert dataset.ignore_attribute == ["outlook"]
 
         # pass a list to ignore_attribute
-        ignore_attribute = ['outlook', 'windy']
+        ignore_attribute = ["outlook", "windy"]
         dataset = openml.datasets.functions.create_dataset(
             name=name,
             description=description,
@@ -1058,16 +1121,16 @@ def test_ignore_attributes_dataset(self):
             row_id_attribute=None,
             ignore_attribute=ignore_attribute,
             citation=citation,
-            attributes='auto',
+            attributes="auto",
             data=df,
-            version_label='test',
+            version_label="test",
             original_data_url=original_data_url,
-            paper_url=paper_url
+            paper_url=paper_url,
         )
-        self.assertEqual(dataset.ignore_attribute, ignore_attribute)
+        assert dataset.ignore_attribute == ignore_attribute
 
         # raise an error if unknown type
-        err_msg = 'Wrong data type for ignore_attribute. Should be list.'
+        err_msg = "Wrong data type for ignore_attribute. Should be list."
         with pytest.raises(ValueError, match=err_msg):
             openml.datasets.functions.create_dataset(
                 name=name,
@@ -1079,59 +1142,52 @@ def test_ignore_attributes_dataset(self):
                 licence=licence,
                 default_target_attribute=default_target_attribute,
                 row_id_attribute=None,
-                ignore_attribute=tuple(['outlook', 'windy']),
+                ignore_attribute=("outlook", "windy"),
                 citation=citation,
-                attributes='auto',
+                attributes="auto",
                 data=df,
-                version_label='test',
+                version_label="test",
                 original_data_url=original_data_url,
-                paper_url=paper_url
+                paper_url=paper_url,
             )
 
-    def test___publish_fetch_ignore_attribute(self):
-        """(Part 1) Test to upload and retrieve dataset and check ignore_attributes
-
-        DEPENDS on test_publish_fetch_ignore_attribute() to be executed after this
-        This test is split into two parts:
-        1) test___publish_fetch_ignore_attribute()
-            This will be executed earlier, owing to alphabetical sorting.
-            This test creates and publish() a dataset and checks for a valid ID.
-        2) test_publish_fetch_ignore_attribute()
-            This will be executed after test___publish_fetch_ignore_attribute(),
-            owing to alphabetical sorting. The time gap is to allow the server
-            more time time to compute data qualities.
-            The dataset ID obtained previously is used to fetch the dataset.
-            The retrieved dataset is checked for valid ignore_attributes.
-        """
-        # the returned fixt
+    @pytest.mark.test_server()
+    def test_publish_fetch_ignore_attribute(self):
+        """Test to upload and retrieve dataset and check ignore_attributes"""
         data = [
-            ['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'],
-            ['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'],
-            ['c', 'overcast', 83.0, 86.0, 'FALSE', 'yes'],
-            ['d', 'rainy', 70.0, 96.0, 'FALSE', 'yes'],
-            ['e', 'rainy', 68.0, 80.0, 'FALSE', 'yes']
+            ["a", "sunny", 85.0, 85.0, "FALSE", "no"],
+            ["b", "sunny", 80.0, 90.0, "TRUE", "no"],
+            ["c", "overcast", 83.0, 86.0, "FALSE", "yes"],
+            ["d", "rainy", 70.0, 96.0, "FALSE", "yes"],
+            ["e", "rainy", 68.0, 80.0, "FALSE", "yes"],
+        ]
+        column_names = [
+            "rnd_str",
+            "outlook",
+            "temperature",
+            "humidity",
+            "windy",
+            "play",
         ]
-        column_names = ['rnd_str', 'outlook', 'temperature', 'humidity',
-                        'windy', 'play']
         df = pd.DataFrame(data, columns=column_names)
         # enforce the type of each column
-        df['outlook'] = df['outlook'].astype('category')
-        df['windy'] = df['windy'].astype('bool')
-        df['play'] = df['play'].astype('category')
+        df["outlook"] = df["outlook"].astype("category")
+        df["windy"] = df["windy"].astype("bool")
+        df["play"] = df["play"].astype("category")
         # meta-information
-        name = '%s-pandas_testing_dataset' % self._get_sentinel()
-        description = 'Synthetic dataset created from a Pandas DataFrame'
-        creator = 'OpenML tester'
-        collection_date = '01-01-2018'
-        language = 'English'
-        licence = 'MIT'
-        default_target_attribute = 'play'
-        citation = 'None'
-        original_data_url = 'http://openml.github.io/openml-python'
-        paper_url = 'http://openml.github.io/openml-python'
+        name = f"{self._get_sentinel()}-pandas_testing_dataset"
+        description = "Synthetic dataset created from a Pandas DataFrame"
+        creator = "OpenML tester"
+        collection_date = "01-01-2018"
+        language = "English"
+        licence = "MIT"
+        default_target_attribute = "play"
+        citation = "None"
+        original_data_url = "http://openml.github.io/openml-python"
+        paper_url = "http://openml.github.io/openml-python"
 
         # pass a list to ignore_attribute
-        ignore_attribute = ['outlook', 'windy']
+        ignore_attribute = ["outlook", "windy"]
         dataset = openml.datasets.functions.create_dataset(
             name=name,
             description=description,
@@ -1144,81 +1200,56 @@ def test___publish_fetch_ignore_attribute(self):
             row_id_attribute=None,
             ignore_attribute=ignore_attribute,
             citation=citation,
-            attributes='auto',
+            attributes="auto",
             data=df,
-            version_label='test',
+            version_label="test",
             original_data_url=original_data_url,
-            paper_url=paper_url
+            paper_url=paper_url,
         )
 
         # publish dataset
-        upload_did = dataset.publish()
-        TestBase._mark_entity_for_removal('data', upload_did)
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                            upload_did))
+        dataset.publish()
+        TestBase._mark_entity_for_removal("data", dataset.id)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {dataset.id}")
         # test if publish was successful
-        self.assertIsInstance(upload_did, int)
-        # variables to carry forward for test_publish_fetch_ignore_attribute()
-        self.__class__.test_publish_fetch_ignore_attribute_did = upload_did
-        self.__class__.test_publish_fetch_ignore_attribute_list = ignore_attribute
+        assert isinstance(dataset.id, int)
 
-    def test_publish_fetch_ignore_attribute(self):
-        """(Part 2) Test to upload and retrieve dataset and check ignore_attributes
-
-        DEPENDS on test___publish_fetch_ignore_attribute() to be executed first
-        This will be executed after test___publish_fetch_ignore_attribute(),
-        owing to alphabetical sorting. The time gap is to allow the server
-        more time time to compute data qualities.
-        The dataset ID obtained previously is used to fetch the dataset.
-        The retrieved dataset is checked for valid ignore_attributes.
-        """
-        # Retrieving variables from test___publish_fetch_ignore_attribute()
-        upload_did = self.__class__.test_publish_fetch_ignore_attribute_did
-        ignore_attribute = self.__class__.test_publish_fetch_ignore_attribute_list
-        trials = 1
-        timeout_limit = 200
-        dataset = None
-        # fetching from server
-        # loop till timeout or fetch not successful
-        while True:
-            if trials > timeout_limit:
-                break
+        downloaded_dataset = self._wait_for_dataset_being_processed(dataset.id)
+        assert downloaded_dataset.ignore_attribute == ignore_attribute
+
+    def _wait_for_dataset_being_processed(
+        self, dataset_id, poll_delay: int = 10, max_waiting_time_seconds: int = 600
+    ):
+        """Return the dataset once the server can serve its qualities; ValueError on timeout."""
+        # monotonic clock: the timeout must not be affected by system clock adjustments
+        deadline = time.monotonic() + max_waiting_time_seconds
+        while time.monotonic() < deadline:
             try:
-                dataset = openml.datasets.get_dataset(upload_did)
-                break
-            except Exception as e:
-                # returned code 273: Dataset not processed yet
-                # returned code 362: No qualities found
-                print("Trial {}/{}: ".format(trials, timeout_limit))
-                print("\tFailed to fetch dataset:{} with '{}'.".format(upload_did, str(e)))
-                trials += 1
-                continue
-        if dataset is None:
-            raise ValueError("TIMEOUT: Failed to fetch uploaded dataset - {}".format(upload_did))
-        self.assertEqual(dataset.ignore_attribute, ignore_attribute)
+                # being able to download qualities is a sign that the dataset is processed
+                return openml.datasets.get_dataset(dataset_id, download_qualities=True)
+            except OpenMLServerException as e:
+                TestBase.logger.error(f"Failed to fetch dataset:{dataset_id} with '{e!s}'.")
+                time.sleep(poll_delay)
+        raise ValueError(f"TIMEOUT: Failed to fetch uploaded dataset - {dataset_id}")
 
     def test_create_dataset_row_id_attribute_error(self):
         # meta-information
-        name = '%s-pandas_testing_dataset' % self._get_sentinel()
-        description = 'Synthetic dataset created from a Pandas DataFrame'
-        creator = 'OpenML tester'
-        collection_date = '01-01-2018'
-        language = 'English'
-        licence = 'MIT'
-        default_target_attribute = 'target'
-        citation = 'None'
-        original_data_url = 'http://openml.github.io/openml-python'
-        paper_url = 'http://openml.github.io/openml-python'
+        name = f"{self._get_sentinel()}-pandas_testing_dataset"
+        description = "Synthetic dataset created from a Pandas DataFrame"
+        creator = "OpenML tester"
+        collection_date = "01-01-2018"
+        language = "English"
+        licence = "MIT"
+        default_target_attribute = "target"
+        citation = "None"
+        original_data_url = "http://openml.github.io/openml-python"
+        paper_url = "http://openml.github.io/openml-python"
         # Check that the index name is well inferred.
-        data = [['a', 1, 0],
-                ['b', 2, 1],
-                ['c', 3, 0],
-                ['d', 4, 1],
-                ['e', 5, 0]]
-        column_names = ['rnd_str', 'integer', 'target']
+        data = [["a", 1, 0], ["b", 2, 1], ["c", 3, 0], ["d", 4, 1], ["e", 5, 0]]
+        column_names = ["rnd_str", "integer", "target"]
         df = pd.DataFrame(data, columns=column_names)
         # affecting row_id_attribute to an unknown column should raise an error
-        err_msg = ("should be one of the data attribute.")
+        err_msg = "should be one of the data attribute."
         with pytest.raises(ValueError, match=err_msg):
             openml.datasets.functions.create_dataset(
                 name=name,
@@ -1231,40 +1262,38 @@ def test_create_dataset_row_id_attribute_error(self):
                 default_target_attribute=default_target_attribute,
                 ignore_attribute=None,
                 citation=citation,
-                attributes='auto',
+                attributes="auto",
                 data=df,
-                row_id_attribute='unknown_row_id',
-                version_label='test',
+                row_id_attribute="unknown_row_id",
+                version_label="test",
                 original_data_url=original_data_url,
-                paper_url=paper_url
+                paper_url=paper_url,
             )
 
+    @pytest.mark.test_server()
     def test_create_dataset_row_id_attribute_inference(self):
         # meta-information
-        name = '%s-pandas_testing_dataset' % self._get_sentinel()
-        description = 'Synthetic dataset created from a Pandas DataFrame'
-        creator = 'OpenML tester'
-        collection_date = '01-01-2018'
-        language = 'English'
-        licence = 'MIT'
-        default_target_attribute = 'target'
-        citation = 'None'
-        original_data_url = 'http://openml.github.io/openml-python'
-        paper_url = 'http://openml.github.io/openml-python'
+        name = f"{self._get_sentinel()}-pandas_testing_dataset"
+        description = "Synthetic dataset created from a Pandas DataFrame"
+        creator = "OpenML tester"
+        collection_date = "01-01-2018"
+        language = "English"
+        licence = "MIT"
+        default_target_attribute = "target"
+        citation = "None"
+        original_data_url = "http://openml.github.io/openml-python"
+        paper_url = "http://openml.github.io/openml-python"
         # Check that the index name is well inferred.
-        data = [['a', 1, 0],
-                ['b', 2, 1],
-                ['c', 3, 0],
-                ['d', 4, 1],
-                ['e', 5, 0]]
-        column_names = ['rnd_str', 'integer', 'target']
+        data = [["a", 1, 0], ["b", 2, 1], ["c", 3, 0], ["d", 4, 1], ["e", 5, 0]]
+        column_names = ["rnd_str", "integer", "target"]
         df = pd.DataFrame(data, columns=column_names)
-        row_id_attr = [None, 'integer']
-        df_index_name = [None, 'index_name']
-        expected_row_id = [None, 'index_name', 'integer', 'integer']
-        for output_row_id, (row_id, index_name) in zip(expected_row_id,
-                                                       product(row_id_attr,
-                                                               df_index_name)):
+        row_id_attr = [None, "integer"]
+        df_index_name = [None, "index_name"]
+        expected_row_id = [None, "index_name", "integer", "integer"]
+        for output_row_id, (row_id, index_name) in zip(
+            expected_row_id,
+            product(row_id_attr, df_index_name),
+        ):
             df.index.name = index_name
             dataset = openml.datasets.functions.create_dataset(
                 name=name,
@@ -1277,42 +1306,40 @@ def test_create_dataset_row_id_attribute_inference(self):
                 default_target_attribute=default_target_attribute,
                 ignore_attribute=None,
                 citation=citation,
-                attributes='auto',
+                attributes="auto",
                 data=df,
                 row_id_attribute=row_id,
-                version_label='test',
+                version_label="test",
                 original_data_url=original_data_url,
-                paper_url=paper_url
+                paper_url=paper_url,
+            )
+            assert dataset.row_id_attribute == output_row_id
+            dataset.publish()
+            TestBase._mark_entity_for_removal("data", dataset.id)
+            TestBase.logger.info(
+                f"collected from {__file__.split('/')[-1]}: {dataset.id}",
             )
-            self.assertEqual(dataset.row_id_attribute, output_row_id)
-            upload_did = dataset.publish()
-            TestBase._mark_entity_for_removal('data', upload_did)
-            TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                                upload_did))
-            arff_dataset = arff.loads(_get_online_dataset_arff(upload_did))
-            arff_data = np.array(arff_dataset['data'], dtype=object)
+            arff_dataset = arff.loads(_get_online_dataset_arff(dataset.id))
+            arff_data = np.array(arff_dataset["data"], dtype=object)
             # if we set the name of the index then the index will be added to
             # the data
             expected_shape = (5, 3) if index_name is None else (5, 4)
-            self.assertEqual(arff_data.shape, expected_shape)
+            assert arff_data.shape == expected_shape
 
     def test_create_dataset_attributes_auto_without_df(self):
         # attributes cannot be inferred without passing a dataframe
-        data = np.array([[1, 2, 3],
-                         [1.2, 2.5, 3.8],
-                         [2, 5, 8],
-                         [0, 1, 0]]).T
-        attributes = 'auto'
-        name = 'NumPy_testing_dataset'
-        description = 'Synthetic dataset created from a NumPy array'
-        creator = 'OpenML tester'
-        collection_date = '01-01-2018'
-        language = 'English'
-        licence = 'MIT'
-        default_target_attribute = 'col_{}'.format(data.shape[1] - 1)
-        citation = 'None'
-        original_data_url = 'http://openml.github.io/openml-python'
-        paper_url = 'http://openml.github.io/openml-python'
+        data = np.array([[1, 2, 3], [1.2, 2.5, 3.8], [2, 5, 8], [0, 1, 0]]).T
+        attributes = "auto"
+        name = "NumPy_testing_dataset"
+        description = "Synthetic dataset created from a NumPy array"
+        creator = "OpenML tester"
+        collection_date = "01-01-2018"
+        language = "English"
+        licence = "MIT"
+        default_target_attribute = f"col_{data.shape[1] - 1}"
+        citation = "None"
+        original_data_url = "http://openml.github.io/openml-python"
+        paper_url = "http://openml.github.io/openml-python"
         err_msg = "Automatically inferring attributes requires a pandas"
         with pytest.raises(ValueError, match=err_msg):
             openml.datasets.functions.create_dataset(
@@ -1329,12 +1356,656 @@ def test_create_dataset_attributes_auto_without_df(self):
                 citation=citation,
                 attributes=attributes,
                 data=data,
-                version_label='test',
+                version_label="test",
                 original_data_url=original_data_url,
-                paper_url=paper_url
+                paper_url=paper_url,
             )
 
+    @pytest.mark.test_server()
     def test_list_qualities(self):
         qualities = openml.datasets.list_qualities()
-        self.assertEqual(isinstance(qualities, list), True)
-        self.assertEqual(all([isinstance(q, str) for q in qualities]), True)
+        assert isinstance(qualities, list) is True
+        assert all(isinstance(q, str) for q in qualities) is True
+
+    @pytest.mark.test_server()
+    def test_get_dataset_cache_format_pickle(self):
+        dataset = openml.datasets.get_dataset(1)
+        dataset.get_data()
+
+        assert type(dataset) == OpenMLDataset
+        assert dataset.name == "anneal"
+        assert len(dataset.features) > 1
+        assert len(dataset.qualities) > 4
+
+        X, y, categorical, attribute_names = dataset.get_data()
+        assert isinstance(X, pd.DataFrame)
+        assert X.shape == (898, 39)
+        assert len(categorical) == X.shape[1]
+        assert len(attribute_names) == X.shape[1]
+
+    @pytest.mark.test_server()
+    def test_get_dataset_cache_format_feather(self):
+        # This test crashed due to using the parquet file by default, which is downloaded
+        # from minio. However, there is a mismatch between OpenML test server and minio IDs.
+        # The parquet file on minio with ID 128 is not the iris dataset from the test server.
+        dataset = openml.datasets.get_dataset(128, cache_format="feather")
+        # Workaround
+        dataset._parquet_url = None
+        dataset.parquet_file = None
+        dataset.get_data()
+
+        # Check if dataset is written to cache directory using feather (existence first)
+        cache_dir = openml.config.get_cache_directory()
+        cache_dir_for_id = os.path.join(cache_dir, "datasets", "128")
+        feather_file = os.path.join(cache_dir_for_id, "dataset.feather")
+        pickle_file = os.path.join(cache_dir_for_id, "dataset.feather.attributes.pkl.py3")
+        assert os.path.isfile(feather_file), "Feather file is missing"
+        assert os.path.isfile(pickle_file), "Attributes pickle file is missing"
+        data = pd.read_feather(feather_file)
+        assert data.shape == (150, 5)
+
+        # Check if get_data is able to retrieve feather data
+        assert isinstance(dataset, OpenMLDataset)
+        assert dataset.name == "iris"
+        assert len(dataset.features) > 1
+        assert len(dataset.qualities) > 4
+
+        X, y, categorical, attribute_names = dataset.get_data()
+        assert isinstance(X, pd.DataFrame)
+        assert X.shape == (150, 5)
+        assert len(categorical) == X.shape[1]
+        assert len(attribute_names) == X.shape[1]
+
+    @pytest.mark.test_server()
+    def test_data_edit_non_critical_field(self):
+        # Case 1
+        # All users can edit non-critical fields of datasets
+        desc = (
+            "This data sets consists of 3 different types of irises' "
+            "(Setosa, Versicolour, and Virginica) petal and sepal length,"
+            " stored in a 150x4 numpy.ndarray"
+        )
+        did = 128
+        result = edit_dataset(
+            did,
+            description=desc,
+            creator="R.A.Fisher",
+            collection_date="1937",
+            citation="The use of multiple measurements in taxonomic problems",
+            language="English",
+        )
+        assert did == result
+        edited_dataset = openml.datasets.get_dataset(did)
+        assert edited_dataset.description == desc
+
+    @pytest.mark.test_server()
+    def test_data_edit_critical_field(self):
+        # Case 2
+        # only owners (or admin) can edit all critical fields of datasets
+        # for this, we need to first clone a dataset to do changes
+        did = fork_dataset(1)
+        self._wait_for_dataset_being_processed(did)
+        result = edit_dataset(did, default_target_attribute="shape", ignore_attribute="oil")
+        assert did == result
+
+        n_tries = 10
+        # we need to wait for the edit to be reflected on the server
+        for i in range(n_tries):
+            edited_dataset = openml.datasets.get_dataset(did)
+            try:
+                assert edited_dataset.default_target_attribute == "shape", edited_dataset
+                assert edited_dataset.ignore_attribute == ["oil"], edited_dataset
+                break
+            except AssertionError:
+                if i == n_tries - 1:
+                    raise
+                time.sleep(10)
+                # Delete the cache dir to get the newer version of the dataset
+                # when get_dataset is called again on the next loop iteration
+                shutil.rmtree(
+                    os.path.join(openml.config.get_cache_directory(), "datasets", str(did)),
+                )
+
+    @pytest.mark.test_server()
+    def test_data_edit_requires_field(self):
+        # Check server exception when no field to edit is provided
+        self.assertRaisesRegex(
+            OpenMLServerException,
+            "Please provide atleast one field among description, creator, "
+            "contributor, collection_date, language, citation, "
+            "original_data_url, default_target_attribute, row_id_attribute, "
+            "ignore_attribute or paper_url to edit.",
+            edit_dataset,
+            data_id=64,  # blood-transfusion-service-center
+        )
+
+    @pytest.mark.test_server()
+    def test_data_edit_requires_valid_dataset(self):
+        # Check server exception when unknown dataset is provided
+        self.assertRaisesRegex(
+            OpenMLServerException,
+            "Unknown dataset",
+            edit_dataset,
+            data_id=999999,
+            description="xor operation dataset",
+        )
+
+    @pytest.mark.test_server()
+    def test_data_edit_cannot_edit_critical_field_if_dataset_has_task(self):
+        # Need to own a dataset to be able to edit meta-data
+        # Will be creating a forked version of an existing dataset to allow the unit test user
+        #  to edit meta-data of a dataset
+        did = fork_dataset(1)
+        self._wait_for_dataset_being_processed(did)
+        TestBase._mark_entity_for_removal("data", did)
+        # Need to upload a task attached to this data to test edit failure
+        task = create_task(
+            task_type=TaskType.SUPERVISED_CLASSIFICATION,
+            dataset_id=did,
+            target_name="class",
+            estimation_procedure_id=1,
+        )
+        task = task.publish()
+        TestBase._mark_entity_for_removal("task", task.task_id)
+        # Check server exception when owner/admin edits critical fields of dataset with tasks
+        self.assertRaisesRegex(
+            OpenMLServerException,
+            "Critical features default_target_attribute, row_id_attribute and ignore_attribute "
+            "can only be edited for datasets without any tasks.",
+            edit_dataset,
+            data_id=did,
+            default_target_attribute="y",
+        )
+
+    @pytest.mark.test_server()
+    def test_edit_data_user_cannot_edit_critical_field_of_other_users_dataset(self):
+        # Check server exception when a non-owner or non-admin tries to edit critical fields
+        self.assertRaisesRegex(
+            OpenMLServerException,
+            "Critical features default_target_attribute, row_id_attribute and ignore_attribute "
+            "can be edited only by the owner. Fork the dataset if changes are required.",
+            edit_dataset,
+            data_id=128,
+            default_target_attribute="y",
+        )
+
+    @pytest.mark.test_server()
+    def test_data_fork(self):
+        did = 1
+        result = fork_dataset(did)
+        assert did != result
+        # Check server exception when unknown dataset is provided
+        self.assertRaisesRegex(
+            OpenMLServerException,
+            "Unknown dataset",
+            fork_dataset,
+            data_id=999999,
+        )
+
+
+    @pytest.mark.production_server()
+    def test_list_datasets_with_high_size_parameter(self):
+        # Testing on prod since concurrent deletion of uploaded datasets makes the test fail
+        self.use_production_server()
+
+        datasets_a = openml.datasets.list_datasets()
+        datasets_b = openml.datasets.list_datasets(size=np.inf)
+
+        # Reverting to test server
+        openml.config.server = self.test_server
+        assert len(datasets_a) == len(datasets_b)
+
+
+@pytest.mark.parametrize(
+    ("default_target_attribute", "row_id_attribute", "ignore_attribute"),
+    [
+        ("wrong", None, None),
+        (None, "wrong", None),
+        (None, None, "wrong"),
+        ("wrong,sunny", None, None),
+        (None, None, "wrong,sunny"),
+        (["wrong", "sunny"], None, None),
+        (None, None, ["wrong", "sunny"]),
+    ],
+)
+def test_invalid_attribute_validations(
+    default_target_attribute,
+    row_id_attribute,
+    ignore_attribute,
+):
+    data = [
+        ["a", "sunny", 85.0, 85.0, "FALSE", "no"],
+        ["b", "sunny", 80.0, 90.0, "TRUE", "no"],
+        ["c", "overcast", 83.0, 86.0, "FALSE", "yes"],
+        ["d", "rainy", 70.0, 96.0, "FALSE", "yes"],
+        ["e", "rainy", 68.0, 80.0, "FALSE", "yes"],
+    ]
+    column_names = ["rnd_str", "outlook", "temperature", "humidity", "windy", "play"]
+    df = pd.DataFrame(data, columns=column_names)
+    # enforce the type of each column
+    df["outlook"] = df["outlook"].astype("category")
+    df["windy"] = df["windy"].astype("bool")
+    df["play"] = df["play"].astype("category")
+    # meta-information
+    name = "pandas_testing_dataset"
+    description = "Synthetic dataset created from a Pandas DataFrame"
+    creator = "OpenML tester"
+    collection_date = "01-01-2018"
+    language = "English"
+    licence = "MIT"
+    citation = "None"
+    original_data_url = "http://openml.github.io/openml-python"
+    paper_url = "http://openml.github.io/openml-python"
+    with pytest.raises(ValueError, match="should be one of the data attribute"):
+        _ = openml.datasets.functions.create_dataset(
+            name=name,
+            description=description,
+            creator=creator,
+            contributor=None,
+            collection_date=collection_date,
+            language=language,
+            licence=licence,
+            default_target_attribute=default_target_attribute,
+            row_id_attribute=row_id_attribute,
+            ignore_attribute=ignore_attribute,
+            citation=citation,
+            attributes="auto",
+            data=df,
+            version_label="test",
+            original_data_url=original_data_url,
+            paper_url=paper_url,
+        )
+
+
+@pytest.mark.parametrize(
+    ("default_target_attribute", "row_id_attribute", "ignore_attribute"),
+    [
+        ("outlook", None, None),
+        (None, "outlook", None),
+        (None, None, "outlook"),
+        ("outlook,windy", None, None),
+        (None, None, "outlook,windy"),
+        (["outlook", "windy"], None, None),
+        (None, None, ["outlook", "windy"]),
+    ],
+)
+def test_valid_attribute_validations(default_target_attribute, row_id_attribute, ignore_attribute):
+    data = [
+        ["a", "sunny", 85.0, 85.0, "FALSE", "no"],
+        ["b", "sunny", 80.0, 90.0, "TRUE", "no"],
+        ["c", "overcast", 83.0, 86.0, "FALSE", "yes"],
+        ["d", "rainy", 70.0, 96.0, "FALSE", "yes"],
+        ["e", "rainy", 68.0, 80.0, "FALSE", "yes"],
+    ]
+    column_names = ["rnd_str", "outlook", "temperature", "humidity", "windy", "play"]
+    df = pd.DataFrame(data, columns=column_names)
+    # enforce the type of each column
+    df["outlook"] = df["outlook"].astype("category")
+    df["windy"] = df["windy"].astype("bool")
+    df["play"] = df["play"].astype("category")
+    # meta-information
+    name = "pandas_testing_dataset"
+    description = "Synthetic dataset created from a Pandas DataFrame"
+    creator = "OpenML tester"
+    collection_date = "01-01-2018"
+    language = "English"
+    licence = "MIT"
+    citation = "None"
+    original_data_url = "http://openml.github.io/openml-python"
+    paper_url = "http://openml.github.io/openml-python"
+    _ = openml.datasets.functions.create_dataset(
+        name=name,
+        description=description,
+        creator=creator,
+        contributor=None,
+        collection_date=collection_date,
+        language=language,
+        licence=licence,
+        default_target_attribute=default_target_attribute,
+        row_id_attribute=row_id_attribute,
+        ignore_attribute=ignore_attribute,
+        citation=citation,
+        attributes="auto",
+        data=df,
+        version_label="test",
+        original_data_url=original_data_url,
+        paper_url=paper_url,
+    )
+    # NOTE(review): the nested def below is never called or collected by pytest (dead test).
+    def test_delete_dataset(self):
+        data = [
+            ["a", "sunny", 85.0, 85.0, "FALSE", "no"],
+            ["b", "sunny", 80.0, 90.0, "TRUE", "no"],
+            ["c", "overcast", 83.0, 86.0, "FALSE", "yes"],
+            ["d", "rainy", 70.0, 96.0, "FALSE", "yes"],
+            ["e", "rainy", 68.0, 80.0, "FALSE", "yes"],
+        ]
+        column_names = [
+            "rnd_str",
+            "outlook",
+            "temperature",
+            "humidity",
+            "windy",
+            "play",
+        ]
+        df = pd.DataFrame(data, columns=column_names)
+        # enforce the type of each column
+        df["outlook"] = df["outlook"].astype("category")
+        df["windy"] = df["windy"].astype("bool")
+        df["play"] = df["play"].astype("category")
+        # meta-information
+        name = f"{self._get_sentinel()}-pandas_testing_dataset"
+        description = "Synthetic dataset created from a Pandas DataFrame"
+        creator = "OpenML tester"
+        collection_date = "01-01-2018"
+        language = "English"
+        licence = "MIT"
+        citation = "None"
+        original_data_url = "http://openml.github.io/openml-python"
+        paper_url = "http://openml.github.io/openml-python"
+        dataset = openml.datasets.functions.create_dataset(
+            name=name,
+            description=description,
+            creator=creator,
+            contributor=None,
+            collection_date=collection_date,
+            language=language,
+            licence=licence,
+            default_target_attribute="play",
+            row_id_attribute=None,
+            ignore_attribute=None,
+            citation=citation,
+            attributes="auto",
+            data=df,
+            version_label="test",
+            original_data_url=original_data_url,
+            paper_url=paper_url,
+        )
+        dataset.publish()
+        _dataset_id = dataset.id
+        assert openml.datasets.delete_dataset(_dataset_id)
+
+
+@mock.patch.object(requests.Session, "delete")
+def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_api_key):
+    content_file = (
+        test_files_directory / "mock_responses" / "datasets" / "data_delete_not_owned.xml"
+    )
+    mock_delete.return_value = create_request_response(
+        status_code=412,
+        content_filepath=content_file,
+    )
+
+    with pytest.raises(
+        OpenMLNotAuthorizedError,
+        match="The data can not be deleted because it was not uploaded by you.",
+    ):
+        openml.datasets.delete_dataset(40_000)
+
+    dataset_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/data/40000"
+    assert dataset_url == mock_delete.call_args.args[0]
+    assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
+
+
+@mock.patch.object(requests.Session, "delete")
+def test_delete_dataset_with_run(mock_delete, test_files_directory, test_api_key):
+    content_file = (
+        test_files_directory / "mock_responses" / "datasets" / "data_delete_has_tasks.xml"
+    )
+    mock_delete.return_value = create_request_response(
+        status_code=412,
+        content_filepath=content_file,
+    )
+
+    with pytest.raises(
+        OpenMLNotAuthorizedError,
+        match="The data can not be deleted because it still has associated entities:",
+    ):
+        openml.datasets.delete_dataset(40_000)
+
+    dataset_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/data/40000"
+    assert dataset_url == mock_delete.call_args.args[0]
+    assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
+
+
+@mock.patch.object(requests.Session, "delete")
+def test_delete_dataset_success(mock_delete, test_files_directory, test_api_key):
+    content_file = (
+        test_files_directory / "mock_responses" / "datasets" / "data_delete_successful.xml"
+    )
+    mock_delete.return_value = create_request_response(
+        status_code=200,
+        content_filepath=content_file,
+    )
+
+    success = openml.datasets.delete_dataset(40000)
+    assert success
+
+    dataset_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/data/40000"
+    assert dataset_url == mock_delete.call_args.args[0]
+    assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
+
+
+@mock.patch.object(requests.Session, "delete")
+def test_delete_unknown_dataset(mock_delete, test_files_directory, test_api_key):
+    content_file = (
+        test_files_directory / "mock_responses" / "datasets" / "data_delete_not_exist.xml"
+    )
+    mock_delete.return_value = create_request_response(
+        status_code=412,
+        content_filepath=content_file,
+    )
+
+    with pytest.raises(
+        OpenMLServerException,
+        match="Dataset does not exist",
+    ):
+        openml.datasets.delete_dataset(9_999_999)
+
+    dataset_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/data/9999999"
+    assert dataset_url == mock_delete.call_args.args[0]
+    assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
+
+
+def _assert_datasets_have_id_and_valid_status(datasets: pd.DataFrame):
+    assert pd.api.types.is_integer_dtype(datasets["did"])
+    assert {"in_preparation", "active", "deactivated"} >= set(datasets["status"])
+
+
+@pytest.fixture(scope="module")
+def all_datasets():
+    return openml.datasets.list_datasets()
+
+
+@pytest.mark.test_server()
+def test_list_datasets(all_datasets: pd.DataFrame):
+    # We can only perform a smoke test here because we test on dynamic
+    # data from the internet...
+    # 1087 as the number of datasets on openml.org
+    assert len(all_datasets) >= 100
+    _assert_datasets_have_id_and_valid_status(all_datasets)
+
+
+@pytest.mark.test_server()
+def test_list_datasets_by_tag(all_datasets: pd.DataFrame):
+    tag_datasets = openml.datasets.list_datasets(tag="study_14")
+    assert 0 < len(tag_datasets) < len(all_datasets)
+    _assert_datasets_have_id_and_valid_status(tag_datasets)
+
+
+@pytest.mark.test_server()
+def test_list_datasets_by_size():
+    datasets = openml.datasets.list_datasets(size=5)
+    assert len(datasets) == 5
+    _assert_datasets_have_id_and_valid_status(datasets)
+
+
+@pytest.mark.test_server()
+def test_list_datasets_by_number_instances(all_datasets: pd.DataFrame):
+    small_datasets = openml.datasets.list_datasets(number_instances="5..100")
+    assert 0 < len(small_datasets) <= len(all_datasets)
+    _assert_datasets_have_id_and_valid_status(small_datasets)
+
+
+@pytest.mark.test_server()
+def test_list_datasets_by_number_features(all_datasets: pd.DataFrame):
+    wide_datasets = openml.datasets.list_datasets(number_features="50..100")
+    assert 8 <= len(wide_datasets) < len(all_datasets)
+    _assert_datasets_have_id_and_valid_status(wide_datasets)
+
+
+@pytest.mark.test_server()
+def test_list_datasets_by_number_classes(all_datasets: pd.DataFrame):
+    five_class_datasets = openml.datasets.list_datasets(number_classes="5")
+    assert 3 <= len(five_class_datasets) < len(all_datasets)
+    _assert_datasets_have_id_and_valid_status(five_class_datasets)
+
+
+@pytest.mark.test_server()
+def test_list_datasets_by_number_missing_values(all_datasets: pd.DataFrame):
+    na_datasets = openml.datasets.list_datasets(number_missing_values="5..100")
+    assert 5 <= len(na_datasets) < len(all_datasets)
+    _assert_datasets_have_id_and_valid_status(na_datasets)
+
+
+@pytest.mark.test_server()
+def test_list_datasets_combined_filters(all_datasets: pd.DataFrame):
+    combined_filter_datasets = openml.datasets.list_datasets(
+        tag="study_14",
+        number_instances="100..1000",
+        number_missing_values="800..1000",
+    )
+    assert 1 <= len(combined_filter_datasets) < len(all_datasets)
+    _assert_datasets_have_id_and_valid_status(combined_filter_datasets)
+
+
+def _dataset_file_is_downloaded(did: int, file: str):
+    cache_directory = Path(openml.config.get_cache_directory()) / "datasets" / str(did)
+    return (cache_directory / file).exists()
+
+
+def _dataset_description_is_downloaded(did: int):
+    return _dataset_file_is_downloaded(did, "description.xml")
+
+
+def _dataset_qualities_is_downloaded(did: int):
+    return _dataset_file_is_downloaded(did, "qualities.xml")
+
+
+def _dataset_features_is_downloaded(did: int):
+    return _dataset_file_is_downloaded(did, "features.xml")
+
+
+def _dataset_data_file_is_downloaded(did: int):
+    cache_directory = Path(openml.config.get_cache_directory()) / "datasets" / str(did)
+    return any(f.suffix in (".pq", ".arff") for f in cache_directory.iterdir())
+
+
+def _assert_datasets_retrieved_successfully(
+    dids: Iterable[int],
+    with_qualities: bool = False,
+    with_features: bool = False,
+    with_data: bool = False,
+):
+    """Check that exactly the expected cache files exist for each dataset id.
+
+    For every id in ``dids``:
+        - description.xml must always be present;
+        - qualities.xml must be present iff ``with_qualities`` is True;
+        - features.xml must be present iff ``with_features`` is True;
+        - a data file (.pq or .arff) must be present iff ``with_data`` is True.
+    """
+    for did in dids:
+        assert _dataset_description_is_downloaded(did)
+
+        has_qualities = _dataset_qualities_is_downloaded(did)
+        assert has_qualities if with_qualities else not has_qualities
+
+        has_features = _dataset_features_is_downloaded(did)
+        assert has_features if with_features else not has_features
+
+        has_data = _dataset_data_file_is_downloaded(did)
+        assert has_data if with_data else not has_data
+
+
+@pytest.fixture()
+def isolate_for_test():
+    t = TestOpenMLDataset()
+    t.setUp(tmpdir_suffix=uuid.uuid4().hex)
+    yield
+    t.tearDown()
+
+
+@pytest.mark.parametrize(
+    ("with_data", "with_qualities", "with_features"),
+    itertools.product([True, False], repeat=3),
+)
+@pytest.mark.test_server()
+def test_get_dataset_lazy_behavior(
+    isolate_for_test, with_data: bool, with_qualities: bool, with_features: bool
+):
+    dataset = openml.datasets.get_dataset(
+        1,
+        download_data=with_data,
+        download_qualities=with_qualities,
+        download_features_meta_data=with_features,
+    )
+    assert type(dataset) == OpenMLDataset
+    assert dataset.name == "anneal"
+
+    _assert_datasets_retrieved_successfully(
+        [1],
+        with_qualities=with_qualities,
+        with_features=with_features,
+        with_data=with_data,
+    )
+    assert dataset.features, "Features should be downloaded on-demand if not during get_dataset"
+    assert dataset.qualities, "Qualities should be downloaded on-demand if not during get_dataset"
+    assert dataset.get_data(), "Data should be downloaded on-demand if not during get_dataset"
+    _assert_datasets_retrieved_successfully(
+        [1], with_qualities=True, with_features=True, with_data=True
+    )
+
+
+@pytest.mark.test_server()
+def test_get_dataset_with_invalid_id() -> None:
+    invalid_id = 123819023109238  # Well, at some point this will probably be valid...
+    with pytest.raises(OpenMLServerNoResult, match="Unknown dataset") as e:
+        openml.datasets.get_dataset(invalid_id)
+    assert e.value.code == 111  # checked after the block; inside it, this never runs
+
+
+def test__get_dataset_parquet_not_cached():
+    description = {
+        "oml:parquet_url": "http://data.openml.org/dataset20/dataset_20.pq",
+        "oml:id": "20",
+    }
+    path = _get_dataset_parquet(description, cache_directory=Path(openml.config.get_cache_directory()))
+    assert isinstance(path, Path), "_get_dataset_parquet returns a path"
+    assert path.is_file(), "_get_dataset_parquet returns path to real file"
+
+
+def test_read_features_from_xml_with_whitespace() -> None:
+    from openml.datasets.dataset import _read_features
+
+    features_file = (
+        Path(__file__).parent.parent / "files" / "misc" / "features_with_whitespaces.xml"
+    )
+    features = _read_features(features_file)  # avoid shadowing the builtin `dict`
+    assert features[1].nominal_values == [" - 50000.", " 50000+."]
+
+
+@pytest.mark.test_server()
+def test_get_dataset_parquet(requests_mock, test_files_directory):
+    # Parquet functionality is disabled on the test server
+    # There is no parquet-copy of the test server yet.
+    content_file = (
+            test_files_directory / "mock_responses" / "datasets" / "data_description_61.xml"
+    )
+    # While the mocked example is from production, unit tests by default connect to the test server.
+    requests_mock.get(f"{openml.config.TEST_SERVER_URL}/api/v1/xml/data/61", text=content_file.read_text())
+    dataset = openml.datasets.get_dataset(61, download_data=True)
+    assert dataset._parquet_url is not None
+    assert dataset.parquet_file is not None
+    assert os.path.isfile(dataset.parquet_file)
+    assert dataset.data_file is None  # is alias for arff path
\ No newline at end of file
diff --git a/tests/test_examples/__init__.py b/tests/test_evaluations/__init__.py
similarity index 100%
rename from tests/test_examples/__init__.py
rename to tests/test_evaluations/__init__.py
diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py
index fecf4b60c..e15556d7b 100644
--- a/tests/test_evaluations/test_evaluation_functions.py
+++ b/tests/test_evaluations/test_evaluation_functions.py
@@ -1,144 +1,266 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
+import pytest
+
 import openml
 import openml.evaluations
 from openml.testing import TestBase
 
 
+@pytest.mark.usefixtures("long_version")
 class TestEvaluationFunctions(TestBase):
     _multiprocess_can_split_ = True
 
+    def _check_list_evaluation_setups(self, **kwargs):  # kwargs go to both listing calls
+        evals_setups = openml.evaluations.list_evaluations_setups(
+            "predictive_accuracy",
+            **kwargs,
+            sort_order="desc",
+        )
+        evals = openml.evaluations.list_evaluations(
+            "predictive_accuracy",
+            **kwargs,
+            sort_order="desc",
+            output_format="dataframe",
+        )
+
+        # The setups listing must return at least one row
+        assert len(evals_setups) > 0
+        # Both listings must return the same number of rows
+        assert len(evals_setups) == len(evals)
+        # With sort_order="desc" the "value" column must already be in descending order
+        self.assertSequenceEqual(
+            sorted(evals_setups["value"].tolist(), reverse=True),
+            evals_setups["value"].tolist(),
+        )
+
+        # Run ids must match those from list_evaluations, in the same order
+        self.assertSequenceEqual(evals_setups["run_id"].tolist(), evals["run_id"].tolist())
+
+        if not self.long_version:
+            evals_setups = evals_setups.head(1)  # short mode: verify a single row only
+
+        # Each row's "parameters" values must equal the parameter settings of its run
+        for _index, row in evals_setups.iterrows():
+            params = openml.runs.get_run(row["run_id"]).parameter_settings
+            list1 = [param["oml:value"] for param in params]
+            list2 = list(row["parameters"].values())
+            # Compare sorted copies so that ordering differences are ignored
+            self.assertSequenceEqual(sorted(list1), sorted(list2))
+        return evals_setups
+
+    @pytest.mark.production_server()
     def test_evaluation_list_filter_task(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         task_id = 7312
 
-        evaluations = openml.evaluations.list_evaluations("predictive_accuracy",
-                                                          task=[task_id])
-
-        self.assertGreater(len(evaluations), 100)
-        for run_id in evaluations.keys():
-            self.assertEqual(evaluations[run_id].task_id, task_id)
+        evaluations = openml.evaluations.list_evaluations(
+            "predictive_accuracy",
+            size=110,  # just above the > 100 threshold asserted below
+            tasks=[task_id],
+        )
+
+        assert len(evaluations) > 100
+        for evaluation in evaluations.values():
+            assert evaluation.task_id == task_id
             # default behaviour of this method: return aggregated results (not
             # per fold)
-            self.assertIsNotNone(evaluations[run_id].value)
-            self.assertIsNone(evaluations[run_id].values)
+            assert evaluation.value is not None
+            assert evaluation.values is None
 
+    @pytest.mark.production_server()
     def test_evaluation_list_filter_uploader_ID_16(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         uploader_id = 16
-        evaluations = openml.evaluations.list_evaluations("predictive_accuracy",
-                                                          uploader=[uploader_id])
+        evaluations = openml.evaluations.list_evaluations(
+            "predictive_accuracy",
+            size=60,  # just above the > 50 threshold asserted below
+            uploaders=[uploader_id],
+            output_format="dataframe",
+        )
+        assert evaluations["uploader"].unique().tolist() == [uploader_id]  # list, not array, compare
 
-        self.assertGreater(len(evaluations), 50)
+        assert len(evaluations) > 50
 
+    @pytest.mark.production_server()
     def test_evaluation_list_filter_uploader_ID_10(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         setup_id = 10
-        evaluations = openml.evaluations.list_evaluations("predictive_accuracy",
-                                                          setup=[setup_id])
-
-        self.assertGreater(len(evaluations), 50)
-        for run_id in evaluations.keys():
-            self.assertEqual(evaluations[run_id].setup_id, setup_id)
+        evaluations = openml.evaluations.list_evaluations(
+            "predictive_accuracy",
+            size=60,
+            setups=[setup_id],  # NOTE: despite the test name, the filter is on the setup id
+        )
+
+        assert len(evaluations) > 50
+        for evaluation in evaluations.values():
+            assert evaluation.setup_id == setup_id
             # default behaviour of this method: return aggregated results (not
             # per fold)
-            self.assertIsNotNone(evaluations[run_id].value)
-            self.assertIsNone(evaluations[run_id].values)
+            assert evaluation.value is not None
+            assert evaluation.values is None
 
+    @pytest.mark.production_server()
     def test_evaluation_list_filter_flow(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         flow_id = 100
 
-        evaluations = openml.evaluations.list_evaluations("predictive_accuracy",
-                                                          flow=[flow_id])
-
-        self.assertGreater(len(evaluations), 2)
-        for run_id in evaluations.keys():
-            self.assertEqual(evaluations[run_id].flow_id, flow_id)
+        evaluations = openml.evaluations.list_evaluations(
+            "predictive_accuracy",
+            size=10,  # small page; only more than 2 results are required below
+            flows=[flow_id],
+        )
+
+        assert len(evaluations) > 2
+        for evaluation in evaluations.values():
+            assert evaluation.flow_id == flow_id
             # default behaviour of this method: return aggregated results (not
             # per fold)
-            self.assertIsNotNone(evaluations[run_id].value)
-            self.assertIsNone(evaluations[run_id].values)
+            assert evaluation.value is not None
+            assert evaluation.values is None
 
+    @pytest.mark.production_server()
     def test_evaluation_list_filter_run(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         run_id = 12
 
-        evaluations = openml.evaluations.list_evaluations("predictive_accuracy",
-                                                          id=[run_id])
-
-        self.assertEqual(len(evaluations), 1)
-        for run_id in evaluations.keys():
-            self.assertEqual(evaluations[run_id].run_id, run_id)
+        evaluations = openml.evaluations.list_evaluations(
+            "predictive_accuracy",
+            size=2,  # one more than expected, so an extra result would be noticed
+            runs=[run_id],
+        )
+
+        assert len(evaluations) == 1
+        for evaluation in evaluations.values():
+            assert evaluation.run_id == run_id  # compare with the requested id, not the dict key
             # default behaviour of this method: return aggregated results (not
             # per fold)
-            self.assertIsNotNone(evaluations[run_id].value)
-            self.assertIsNone(evaluations[run_id].values)
+            assert evaluation.value is not None
+            assert evaluation.values is None
 
+    @pytest.mark.production_server()
     def test_evaluation_list_limit(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
-        evaluations = openml.evaluations.list_evaluations("predictive_accuracy",
-                                                          size=100, offset=100)
-        self.assertEqual(len(evaluations), 100)
+        evaluations = openml.evaluations.list_evaluations(
+            "predictive_accuracy",
+            size=100,
+            offset=100,
+        )
+        assert len(evaluations) == 100
 
+    @pytest.mark.test_server()
     def test_list_evaluations_empty(self):
-        evaluations = openml.evaluations.list_evaluations('unexisting_measure')
+        evaluations = openml.evaluations.list_evaluations("unexisting_measure")
         if len(evaluations) > 0:
-            raise ValueError('UnitTest Outdated, got somehow results')
+            raise ValueError("UnitTest Outdated, got somehow results")
 
-        self.assertIsInstance(evaluations, dict)
+        assert isinstance(evaluations, dict)
 
+    @pytest.mark.production_server()
     def test_evaluation_list_per_fold(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         size = 1000
         task_ids = [6]
         uploader_ids = [1]
         flow_ids = [6969]
 
         evaluations = openml.evaluations.list_evaluations(
-            "predictive_accuracy", size=size, offset=0, task=task_ids,
-            flow=flow_ids, uploader=uploader_ids, per_fold=True)
-
-        self.assertEqual(len(evaluations), size)
-        for run_id in evaluations.keys():
-            self.assertIsNone(evaluations[run_id].value)
-            self.assertIsNotNone(evaluations[run_id].values)
+            "predictive_accuracy",
+            size=size,
+            offset=0,
+            tasks=task_ids,
+            flows=flow_ids,
+            uploaders=uploader_ids,
+            per_fold=True,
+        )
+
+        assert len(evaluations) == size
+        for run_id in evaluations:
+            assert evaluations[run_id].value is None
+            assert evaluations[run_id].values is not None
             # potentially we could also test array values, but these might be
             # added in the future
 
         evaluations = openml.evaluations.list_evaluations(
-            "predictive_accuracy", size=size, offset=0, task=task_ids,
-            flow=flow_ids, uploader=uploader_ids, per_fold=False)
-        for run_id in evaluations.keys():
-            self.assertIsNotNone(evaluations[run_id].value)
-            self.assertIsNone(evaluations[run_id].values)
-
+            "predictive_accuracy",
+            size=size,
+            offset=0,
+            tasks=task_ids,
+            flows=flow_ids,
+            uploaders=uploader_ids,
+            per_fold=False,
+        )
+        for run_id in evaluations:
+            assert evaluations[run_id].value is not None
+            assert evaluations[run_id].values is None
+
+    @pytest.mark.production_server()
     def test_evaluation_list_sort(self):
+        self.use_production_server()
         size = 10
-        task_id = 115
+        task_id = 6
         # Get all evaluations of the task
         unsorted_eval = openml.evaluations.list_evaluations(
-            "predictive_accuracy", offset=0, task=[task_id])
+            "predictive_accuracy",
+            size=None,
+            offset=0,
+            tasks=[task_id],
+        )
         # Get top 10 evaluations of the same task
         sorted_eval = openml.evaluations.list_evaluations(
-            "predictive_accuracy", size=size, offset=0, task=[task_id], sort_order="desc")
-        self.assertEqual(len(sorted_eval), size)
-        self.assertGreater(len(unsorted_eval), 0)
+            "predictive_accuracy",
+            size=size,
+            offset=0,
+            tasks=[task_id],
+            sort_order="desc",
+        )
+        assert len(sorted_eval) == size
+        assert len(unsorted_eval) > 0
         sorted_output = [evaluation.value for evaluation in sorted_eval.values()]
         unsorted_output = [evaluation.value for evaluation in unsorted_eval.values()]
 
         # Check if output from sort is sorted in the right order
-        self.assertTrue(sorted(sorted_output, reverse=True) == sorted_output)
+        assert sorted(sorted_output, reverse=True) == sorted_output
 
         # Compare manual sorting against sorted output
         test_output = sorted(unsorted_output, reverse=True)
-        self.assertTrue(test_output[:size] == sorted_output)
+        assert test_output[:size] == sorted_output
 
+    @pytest.mark.test_server()
     def test_list_evaluation_measures(self):
         measures = openml.evaluations.list_evaluation_measures()
-        self.assertEqual(isinstance(measures, list), True)
-        self.assertEqual(all([isinstance(s, str) for s in measures]), True)
+        assert isinstance(measures, list)  # the measures come back as a plain list
+        assert all(isinstance(s, str) for s in measures)  # and every entry is a string
+
+    @pytest.mark.production_server()
+    def test_list_evaluations_setups_filter_flow(self):
+        self.use_production_server()
+        flows, limit = [405], 100
+        # Baseline listing: hyper-parameters are kept together in the "parameters" column.
+        evals = self._check_list_evaluation_setups(flows=flows, size=limit)
+        # Same query again, this time asking for one column per hyper-parameter.
+        expanded = openml.evaluations.list_evaluations_setups(
+            "predictive_accuracy",
+            flows=flows,
+            size=limit,
+            sort_order="desc",
+            parameters_in_separate_columns=True,
+        )
+        column_names = list(expanded.columns)
+        parameter_names = evals["parameters"].values[0].keys()
+        assert all(name in column_names for name in parameter_names)
+
+    @pytest.mark.production_server()
+    @pytest.mark.xfail(reason="failures_issue_1544", strict=False)
+    def test_list_evaluations_setups_filter_task(self):
+        self.use_production_server()
+        task_id = [6]
+        size = 121
+        self._check_list_evaluation_setups(tasks=task_id, size=size)
diff --git a/tests/test_evaluations/test_evaluations_example.py b/tests/test_evaluations/test_evaluations_example.py
new file mode 100644
index 000000000..b321f475d
--- /dev/null
+++ b/tests/test_evaluations/test_evaluations_example.py
@@ -0,0 +1,45 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
+import unittest
+
+import openml
+
+class TestEvaluationsExample(unittest.TestCase):
+    def test_example_python_paper(self):
+        # Example script which will appear in the upcoming OpenML-Python paper
+        # This test ensures that the example will keep running!
+        with openml.config.overwrite_config_context(  # noqa: F823
+            {
+                "server": "https://www.openml.org/api/v1/xml",
+                "apikey": None,
+            }
+        ):
+            import matplotlib.pyplot as plt
+            import numpy as np
+
+            df = openml.evaluations.list_evaluations_setups(
+                "predictive_accuracy",
+                flows=[8353],
+                tasks=[6],
+                parameters_in_separate_columns=True,
+            )  # Choose an SVM flow, for example 8353, and a task.
+
+            assert len(df) > 0, (
+                "No evaluation found for flow 8353 on task 6, could "
+                "be that this task is not available on the production server."
+            )
+
+            hp_names = ["sklearn.svm.classes.SVC(16)_C", "sklearn.svm.classes.SVC(16)_gamma"]
+            df[hp_names] = df[hp_names].astype(float).apply(np.log10)  # matches axis labels
+            C, gamma, score = df[hp_names[0]], df[hp_names[1]], df["value"]
+
+            cntr = plt.tricontourf(C, gamma, score, levels=12, cmap="RdBu_r")
+            plt.colorbar(cntr, label="accuracy")
+            plt.xlim((min(C), max(C)))
+            plt.ylim((min(gamma), max(gamma)))
+            plt.xlabel("C (log10)", size=16)
+            plt.ylabel("gamma (log10)", size=16)
+            plt.title("SVM performance landscape", size=20)
+
+            plt.tight_layout()
diff --git a/tests/test_examples/test_OpenMLDemo.py b/tests/test_examples/test_OpenMLDemo.py
deleted file mode 100644
index 64c710873..000000000
--- a/tests/test_examples/test_OpenMLDemo.py
+++ /dev/null
@@ -1,82 +0,0 @@
-import os
-import shutil
-import sys
-
-import matplotlib
-matplotlib.use('AGG')
-import nbformat
-from nbconvert.exporters import export
-from nbconvert.exporters.python import PythonExporter
-
-import unittest.mock as mock
-
-from unittest import skip
-import openml._api_calls
-import openml.config
-from openml.testing import TestBase
-
-_perform_api_call = openml._api_calls._perform_api_call
-
-
-class OpenMLDemoTest(TestBase):
-    def setUp(self):
-        super(OpenMLDemoTest, self).setUp()
-
-        python_version = sys.version_info[0]
-        self.kernel_name = 'python%d' % python_version
-        self.this_file_directory = os.path.dirname(__file__)
-        self.notebook_output_directory = os.path.join(
-            self.this_file_directory, '.out')
-
-        try:
-            shutil.rmtree(self.notebook_output_directory)
-        except OSError:
-            pass
-
-        try:
-            os.makedirs(self.notebook_output_directory)
-        except OSError:
-            pass
-
-    def _tst_notebook(self, notebook_name):
-
-        notebook_filename = os.path.abspath(os.path.join(
-            self.this_file_directory, '..', '..', 'examples', notebook_name))
-
-        with open(notebook_filename) as f:
-            nb = nbformat.read(f, as_version=4)
-
-        python_nb, metadata = export(PythonExporter, nb)
-
-        # Remove magic lines manually
-        python_nb = '\n'.join([
-            line for line in python_nb.split('\n')
-            if 'get_ipython().run_line_magic(' not in line
-        ])
-
-        exec(python_nb)
-
-    @skip
-    @mock.patch('openml._api_calls._perform_api_call')
-    def test_tutorial_openml(self, patch):
-        def side_effect(*args, **kwargs):
-            if (
-                args[0].endswith('/run/')
-                and kwargs['file_elements'] is not None
-            ):
-                return """<oml:upload_run>
-    <oml:run_id>1</oml:run_id>
-</oml:upload_run>
-                """
-            else:
-                return _perform_api_call(*args, **kwargs)
-        patch.side_effect = side_effect
-
-        openml.config.server = self.production_server
-        self._tst_notebook('OpenML_Tutorial.ipynb')
-        self.assertGreater(patch.call_count, 100)
-
-    @skip("Deleted tutorial file")
-    def test_tutorial_dataset(self):
-
-        self._tst_notebook('Dataset_import.ipynb')
diff --git a/tests/test_extensions/test_functions.py b/tests/test_extensions/test_functions.py
index 76b1f9d0c..90fbaa9f1 100644
--- a/tests/test_extensions/test_functions.py
+++ b/tests/test_extensions/test_functions.py
@@ -1,12 +1,21 @@
-import inspect
+# License: BSD 3-Clause
+from __future__ import annotations
 
-import openml.testing
+from collections import OrderedDict
 
-from openml.extensions import get_extension_by_model, get_extension_by_flow, register_extension
+import inspect
+import numpy as np
+import pytest
+from unittest.mock import patch
+import openml.testing
+from openml.extensions import Extension, get_extension_by_flow, get_extension_by_model, register_extension
 
 
 class DummyFlow:
-    external_version = 'DummyFlow==0.1'
+    external_version = "DummyFlow==0.1"
+    name = "Dummy Flow"
+    flow_id = 1
+    dependencies = None
 
 
 class DummyModel:
@@ -14,22 +23,16 @@ class DummyModel:
 
 
 class DummyExtension1:
-
     @staticmethod
     def can_handle_flow(flow):
-        if not inspect.stack()[2].filename.endswith('test_functions.py'):
-            return False
-        return True
+        return inspect.stack()[2].filename.endswith("test_functions.py")
 
     @staticmethod
     def can_handle_model(model):
-        if not inspect.stack()[2].filename.endswith('test_functions.py'):
-            return False
-        return True
+        return inspect.stack()[2].filename.endswith("test_functions.py")
 
 
 class DummyExtension2:
-
     @staticmethod
     def can_handle_flow(flow):
         return False
@@ -39,57 +42,197 @@ def can_handle_model(model):
         return False
 
 
-def _unregister():
-    # "Un-register" the test extensions
-    while True:
-        rem_dum_ext1 = False
-        rem_dum_ext2 = False
-        try:
-            openml.extensions.extensions.remove(DummyExtension1)
-            rem_dum_ext1 = True
-        except ValueError:
-            pass
-        try:
-            openml.extensions.extensions.remove(DummyExtension2)
-            rem_dum_ext2 = True
-        except ValueError:
-            pass
-        if not rem_dum_ext1 and not rem_dum_ext2:
-            break
+class DummyExtension(Extension):  # test double; handles only DummyFlow / DummyModel
+    @classmethod
+    def can_handle_flow(cls, flow):
+        return isinstance(flow, DummyFlow)
+
+    @classmethod
+    def can_handle_model(cls, model):
+        return isinstance(model, DummyModel)
+
+    def flow_to_model(
+        self,
+        flow,
+        initialize_with_defaults=False,
+        strict_version=True,
+    ):
+        if not isinstance(flow, DummyFlow):
+            raise ValueError("Invalid flow")
+
+        model = DummyModel()
+        model.defaults = initialize_with_defaults  # stored on the model so tests can inspect it
+        model.strict_version = strict_version  # likewise stored for later assertions
+        return model
+
+    def model_to_flow(self, model):
+        if not isinstance(model, DummyModel):
+            raise ValueError("Invalid model")
+        return DummyFlow()
+
+    def get_version_information(self):
+        return ["dummy==1.0"]
+
+    def create_setup_string(self, model):
+        return "DummyModel()"
+
+    def is_estimator(self, model):
+        return isinstance(model, DummyModel)
+
+    def seed_model(self, model, seed):
+        model.seed = seed  # only records the seed as an attribute
+        return model
+
+    def _run_model_on_fold(
+        self,
+        model,
+        task,
+        X_train,
+        rep_no,
+        fold_no,
+        y_train=None,
+        X_test=None,
+    ):
+        preds = np.zeros(len(X_train))  # all-zero output with one entry per row of X_train
+        probs = None
+        measures = OrderedDict()
+        trace = None
+        return preds, probs, measures, trace
+
+    def obtain_parameter_values(self, flow, model=None):
+        return []
+
+    def check_if_model_fitted(self, model):
+        return False
 
+    def instantiate_model_from_hpo_class(self, model, trace_iteration):
+        return DummyModel()
 
-class TestInit(openml.testing.TestBase):
 
-    def setUp(self):
-        super().setUp()
-        _unregister()
+
+class TestInit(openml.testing.TestBase):
 
     def test_get_extension_by_flow(self):
-        self.assertIsNone(get_extension_by_flow(DummyFlow()))
-        with self.assertRaisesRegex(ValueError, 'No extension registered which can handle flow:'):
-            get_extension_by_flow(DummyFlow(), raise_if_no_extension=True)
-        register_extension(DummyExtension1)
-        self.assertIsInstance(get_extension_by_flow(DummyFlow()), DummyExtension1)
-        register_extension(DummyExtension2)
-        self.assertIsInstance(get_extension_by_flow(DummyFlow()), DummyExtension1)
-        register_extension(DummyExtension1)
-        with self.assertRaisesRegex(
-            ValueError,
-            'Multiple extensions registered which can handle flow:',
-        ):
-            get_extension_by_flow(DummyFlow())
+        # Swap the global registry for an empty list for the duration of this block only
+        with patch("openml.extensions.extensions", []):
+            assert get_extension_by_flow(DummyFlow()) is None
+
+            with pytest.raises(ValueError, match="No extension registered which can handle flow:"):
+                get_extension_by_flow(DummyFlow(), raise_if_no_extension=True)
+
+            register_extension(DummyExtension1)
+            assert isinstance(get_extension_by_flow(DummyFlow()), DummyExtension1)
+
+            register_extension(DummyExtension2)  # never matches, so the result is unchanged
+            assert isinstance(get_extension_by_flow(DummyFlow()), DummyExtension1)
+
+            register_extension(DummyExtension1)  # second matching entry makes the lookup ambiguous
+            with pytest.raises(
+                ValueError, match="Multiple extensions registered which can handle flow:"
+            ):
+                get_extension_by_flow(DummyFlow())
 
     def test_get_extension_by_model(self):
-        self.assertIsNone(get_extension_by_model(DummyModel()))
-        with self.assertRaisesRegex(ValueError, 'No extension registered which can handle model:'):
-            get_extension_by_model(DummyModel(), raise_if_no_extension=True)
-        register_extension(DummyExtension1)
-        self.assertIsInstance(get_extension_by_model(DummyModel()), DummyExtension1)
-        register_extension(DummyExtension2)
-        self.assertIsInstance(get_extension_by_model(DummyModel()), DummyExtension1)
-        register_extension(DummyExtension1)
-        with self.assertRaisesRegex(
-            ValueError,
-            'Multiple extensions registered which can handle model:',
-        ):
-            get_extension_by_model(DummyModel())
+        # Swap the global registry for an empty list for the duration of this block only
+        with patch("openml.extensions.extensions", []):
+            assert get_extension_by_model(DummyModel()) is None
+
+            with pytest.raises(ValueError, match="No extension registered which can handle model:"):
+                get_extension_by_model(DummyModel(), raise_if_no_extension=True)
+
+            register_extension(DummyExtension1)
+            assert isinstance(get_extension_by_model(DummyModel()), DummyExtension1)
+
+            register_extension(DummyExtension2)  # never matches, so the result is unchanged
+            assert isinstance(get_extension_by_model(DummyModel()), DummyExtension1)
+
+            register_extension(DummyExtension1)  # second matching entry makes the lookup ambiguous
+            with pytest.raises(
+                ValueError, match="Multiple extensions registered which can handle model:"
+            ):
+                get_extension_by_model(DummyModel())
+
+
+def test_flow_to_model_with_defaults():
+    """Test flow_to_model with initialize_with_defaults=True."""
+    ext = DummyExtension()
+    flow = DummyFlow()
+
+    model = ext.flow_to_model(flow, initialize_with_defaults=True)
+
+    assert isinstance(model, DummyModel)
+    assert model.defaults is True
+
+def test_flow_to_model_strict_version():
+    """Test flow_to_model with strict_version parameter."""
+    ext = DummyExtension()
+    flow = DummyFlow()
+
+    model_strict = ext.flow_to_model(flow, strict_version=True)
+    model_non_strict = ext.flow_to_model(flow, strict_version=False)
+
+    assert isinstance(model_strict, DummyModel)
+    assert model_strict.strict_version is True
+
+    assert isinstance(model_non_strict, DummyModel)
+    assert model_non_strict.strict_version is False
+
+def test_model_to_flow_conversion():
+    """Test converting a model back to flow representation."""
+    ext = DummyExtension()
+    model = DummyModel()
+
+    flow = ext.model_to_flow(model)
+
+    assert isinstance(flow, DummyFlow)
+
+
+def test_invalid_flow_raises_error():
+    """Test that invalid flow raises appropriate error."""
+    class InvalidFlow:
+        pass
+
+    ext = DummyExtension()
+    flow = InvalidFlow()
+
+    with pytest.raises(ValueError, match="Invalid flow"):
+        ext.flow_to_model(flow)
+
+
+@patch("openml.extensions.extensions", [])
+def test_extension_not_found_error_message():
+    """Test error message contains helpful information."""
+    class UnknownModel:
+        pass
+
+    with pytest.raises(ValueError, match="No extension registered"):
+        get_extension_by_model(UnknownModel(), raise_if_no_extension=True)
+
+ 
+def test_register_same_extension_twice():
+    """Registering one extension class twice keeps both entries in the registry."""
+    # The registry is patched with a fresh list so nothing leaks into other tests.
+    with patch("openml.extensions.extensions", []):
+        for _ in range(2):
+            register_extension(DummyExtension)
+
+        # Count entries by identity so only this exact class is matched.
+        occurrences = sum(
+            1 for registered in openml.extensions.extensions if registered is DummyExtension
+        )
+        assert occurrences == 2
+
+
+@patch("openml.extensions.extensions", [])
+def test_extension_priority_order():
+    """Registered extensions are stored in the order they were registered."""
+    class FirstRegistered(DummyExtension):
+        """Subclass registered first."""
+
+    class SecondRegistered(DummyExtension):
+        """Subclass registered second."""
+
+    register_extension(FirstRegistered)
+    register_extension(SecondRegistered)
+    assert openml.extensions.extensions[0] is FirstRegistered
+    assert openml.extensions.extensions[1] is SecondRegistered
\ No newline at end of file
diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py
deleted file mode 100644
index 8bc615516..000000000
--- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py
+++ /dev/null
@@ -1,1691 +0,0 @@
-import collections
-import json
-import re
-import os
-import sys
-import unittest
-from distutils.version import LooseVersion
-from collections import OrderedDict
-from unittest import mock
-import warnings
-
-import numpy as np
-import scipy.optimize
-import scipy.stats
-import sklearn.base
-import sklearn.datasets
-import sklearn.decomposition
-import sklearn.dummy
-import sklearn.ensemble
-import sklearn.feature_selection
-import sklearn.gaussian_process
-import sklearn.linear_model
-import sklearn.model_selection
-import sklearn.naive_bayes
-import sklearn.neural_network
-import sklearn.pipeline
-import sklearn.preprocessing
-import sklearn.tree
-import sklearn.cluster
-
-
-import openml
-from openml.extensions.sklearn import SklearnExtension
-from openml.exceptions import PyOpenMLError
-from openml.flows import OpenMLFlow
-from openml.flows.functions import assert_flows_equal
-from openml.runs.trace import OpenMLRunTrace
-from openml.testing import TestBase, SimpleImputer
-
-
-this_directory = os.path.dirname(os.path.abspath(__file__))
-sys.path.append(this_directory)
-
-
-__version__ = 0.1
-
-
-class Model(sklearn.base.BaseEstimator):
-    def __init__(self, boolean, integer, floating_point_value):
-        self.boolean = boolean
-        self.integer = integer
-        self.floating_point_value = floating_point_value
-
-    def fit(self, X, y):
-        pass
-
-
-class TestSklearnExtensionFlowFunctions(TestBase):
-    # Splitting not helpful, these test's don't rely on the server and take less
-    # than 1 seconds
-
-    def setUp(self):
-        super().setUp(n_levels=2)
-        iris = sklearn.datasets.load_iris()
-        self.X = iris.data
-        self.y = iris.target
-
-        self.extension = SklearnExtension()
-
-    def test_serialize_model(self):
-        with mock.patch.object(self.extension, '_check_dependencies') as check_dependencies_mock:
-            model = sklearn.tree.DecisionTreeClassifier(criterion='entropy',
-                                                        max_features='auto',
-                                                        max_leaf_nodes=2000)
-
-            fixture_name = 'sklearn.tree.tree.DecisionTreeClassifier'
-            fixture_short_name = 'sklearn.DecisionTreeClassifier'
-            fixture_description = 'Automatically created scikit-learn flow.'
-            version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \
-                              % sklearn.__version__
-            # min_impurity_decrease has been introduced in 0.20
-            # min_impurity_split has been deprecated in 0.20
-            if LooseVersion(sklearn.__version__) < "0.19":
-                fixture_parameters = \
-                    OrderedDict((('class_weight', 'null'),
-                                ('criterion', '"entropy"'),
-                                ('max_depth', 'null'),
-                                ('max_features', '"auto"'),
-                                ('max_leaf_nodes', '2000'),
-                                ('min_impurity_split', '1e-07'),
-                                ('min_samples_leaf', '1'),
-                                ('min_samples_split', '2'),
-                                ('min_weight_fraction_leaf', '0.0'),
-                                ('presort', 'false'),
-                                ('random_state', 'null'),
-                                ('splitter', '"best"')))
-            else:
-                fixture_parameters = \
-                    OrderedDict((('class_weight', 'null'),
-                                ('criterion', '"entropy"'),
-                                ('max_depth', 'null'),
-                                ('max_features', '"auto"'),
-                                ('max_leaf_nodes', '2000'),
-                                ('min_impurity_decrease', '0.0'),
-                                ('min_impurity_split', 'null'),
-                                ('min_samples_leaf', '1'),
-                                ('min_samples_split', '2'),
-                                ('min_weight_fraction_leaf', '0.0'),
-                                ('presort', 'false'),
-                                ('random_state', 'null'),
-                                ('splitter', '"best"')))
-            structure_fixture = {'sklearn.tree.tree.DecisionTreeClassifier': []}
-
-            serialization = self.extension.model_to_flow(model)
-            structure = serialization.get_structure('name')
-
-            self.assertEqual(serialization.name, fixture_name)
-            self.assertEqual(serialization.class_name, fixture_name)
-            self.assertEqual(serialization.custom_name, fixture_short_name)
-            self.assertEqual(serialization.description, fixture_description)
-            self.assertEqual(serialization.parameters, fixture_parameters)
-            self.assertEqual(serialization.dependencies, version_fixture)
-            self.assertDictEqual(structure, structure_fixture)
-
-            new_model = self.extension.flow_to_model(serialization)
-            # compares string representations of the dict, as it potentially
-            # contains complex objects that can not be compared with == op
-            # Only in Python 3.x, as Python 2 has Unicode issues
-            if sys.version_info[0] >= 3:
-                self.assertEqual(str(model.get_params()), str(new_model.get_params()))
-
-            self.assertEqual(type(new_model), type(model))
-            self.assertIsNot(new_model, model)
-
-            self.assertEqual(new_model.get_params(), model.get_params())
-            new_model.fit(self.X, self.y)
-
-            self.assertEqual(check_dependencies_mock.call_count, 1)
-
-    def test_serialize_model_clustering(self):
-        with mock.patch.object(self.extension, '_check_dependencies') as check_dependencies_mock:
-            model = sklearn.cluster.KMeans()
-
-            fixture_name = 'sklearn.cluster.k_means_.KMeans'
-            fixture_short_name = 'sklearn.KMeans'
-            fixture_description = 'Automatically created scikit-learn flow.'
-            version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \
-                              % sklearn.__version__
-            # n_jobs default has changed to None in 0.20
-            if LooseVersion(sklearn.__version__) < "0.20":
-                fixture_parameters = \
-                    OrderedDict((('algorithm', '"auto"'),
-                                 ('copy_x', 'true'),
-                                 ('init', '"k-means++"'),
-                                 ('max_iter', '300'),
-                                 ('n_clusters', '8'),
-                                 ('n_init', '10'),
-                                 ('n_jobs', '1'),
-                                 ('precompute_distances', '"auto"'),
-                                 ('random_state', 'null'),
-                                 ('tol', '0.0001'),
-                                 ('verbose', '0')))
-            else:
-                fixture_parameters = \
-                    OrderedDict((('algorithm', '"auto"'),
-                                 ('copy_x', 'true'),
-                                 ('init', '"k-means++"'),
-                                 ('max_iter', '300'),
-                                 ('n_clusters', '8'),
-                                 ('n_init', '10'),
-                                 ('n_jobs', 'null'),
-                                 ('precompute_distances', '"auto"'),
-                                 ('random_state', 'null'),
-                                 ('tol', '0.0001'),
-                                 ('verbose', '0')))
-            fixture_structure = {'sklearn.cluster.k_means_.KMeans': []}
-
-            serialization = self.extension.model_to_flow(model)
-            structure = serialization.get_structure('name')
-
-            self.assertEqual(serialization.name, fixture_name)
-            self.assertEqual(serialization.class_name, fixture_name)
-            self.assertEqual(serialization.custom_name, fixture_short_name)
-            self.assertEqual(serialization.description, fixture_description)
-            self.assertEqual(serialization.parameters, fixture_parameters)
-            self.assertEqual(serialization.dependencies, version_fixture)
-            self.assertDictEqual(structure, fixture_structure)
-
-            new_model = self.extension.flow_to_model(serialization)
-            # compares string representations of the dict, as it potentially
-            # contains complex objects that can not be compared with == op
-            self.assertEqual(str(model.get_params()), str(new_model.get_params()))
-
-            self.assertEqual(type(new_model), type(model))
-            self.assertIsNot(new_model, model)
-
-            self.assertEqual(new_model.get_params(), model.get_params())
-            new_model.fit(self.X)
-
-            self.assertEqual(check_dependencies_mock.call_count, 1)
-
-    def test_serialize_model_with_subcomponent(self):
-        model = sklearn.ensemble.AdaBoostClassifier(
-            n_estimators=100, base_estimator=sklearn.tree.DecisionTreeClassifier())
-
-        fixture_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier' \
-                       '(base_estimator=sklearn.tree.tree.DecisionTreeClassifier)'
-        fixture_class_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'
-        fixture_short_name = 'sklearn.AdaBoostClassifier'
-        fixture_description = 'Automatically created scikit-learn flow.'
-        fixture_subcomponent_name = 'sklearn.tree.tree.DecisionTreeClassifier'
-        fixture_subcomponent_class_name = 'sklearn.tree.tree.DecisionTreeClassifier'
-        fixture_subcomponent_description = 'Automatically created scikit-learn flow.'
-        fixture_structure = {
-            fixture_name: [],
-            'sklearn.tree.tree.DecisionTreeClassifier': ['base_estimator']
-        }
-
-        serialization = self.extension.model_to_flow(model)
-        structure = serialization.get_structure('name')
-
-        self.assertEqual(serialization.name, fixture_name)
-        self.assertEqual(serialization.class_name, fixture_class_name)
-        self.assertEqual(serialization.custom_name, fixture_short_name)
-        self.assertEqual(serialization.description, fixture_description)
-        self.assertEqual(serialization.parameters['algorithm'], '"SAMME.R"')
-        self.assertIsInstance(serialization.parameters['base_estimator'], str)
-        self.assertEqual(serialization.parameters['learning_rate'], '1.0')
-        self.assertEqual(serialization.parameters['n_estimators'], '100')
-        self.assertEqual(serialization.components['base_estimator'].name,
-                         fixture_subcomponent_name)
-        self.assertEqual(serialization.components['base_estimator'].class_name,
-                         fixture_subcomponent_class_name)
-        self.assertEqual(serialization.components['base_estimator'].description,
-                         fixture_subcomponent_description)
-        self.assertDictEqual(structure, fixture_structure)
-
-        new_model = self.extension.flow_to_model(serialization)
-        # compares string representations of the dict, as it potentially
-        # contains complex objects that can not be compared with == op
-        self.assertEqual(str(model.get_params()), str(new_model.get_params()))
-
-        self.assertEqual(type(new_model), type(model))
-        self.assertIsNot(new_model, model)
-
-        self.assertIsNot(new_model.base_estimator, model.base_estimator)
-        self.assertEqual(new_model.base_estimator.get_params(),
-                         model.base_estimator.get_params())
-        new_model_params = new_model.get_params()
-        del new_model_params['base_estimator']
-        model_params = model.get_params()
-        del model_params['base_estimator']
-
-        self.assertEqual(new_model_params, model_params)
-        new_model.fit(self.X, self.y)
-
-    def test_serialize_pipeline(self):
-        scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
-        dummy = sklearn.dummy.DummyClassifier(strategy='prior')
-        model = sklearn.pipeline.Pipeline(steps=[
-            ('scaler', scaler), ('dummy', dummy)])
-
-        fixture_name = 'sklearn.pipeline.Pipeline(' \
-                       'scaler=sklearn.preprocessing.data.StandardScaler,' \
-                       'dummy=sklearn.dummy.DummyClassifier)'
-        fixture_short_name = 'sklearn.Pipeline(StandardScaler,DummyClassifier)'
-        fixture_description = 'Automatically created scikit-learn flow.'
-        fixture_structure = {
-            fixture_name: [],
-            'sklearn.preprocessing.data.StandardScaler': ['scaler'],
-            'sklearn.dummy.DummyClassifier': ['dummy']
-        }
-
-        serialization = self.extension.model_to_flow(model)
-        structure = serialization.get_structure('name')
-
-        self.assertEqual(serialization.name, fixture_name)
-        self.assertEqual(serialization.custom_name, fixture_short_name)
-        self.assertEqual(serialization.description, fixture_description)
-        self.assertDictEqual(structure, fixture_structure)
-
-        # Comparing the pipeline
-        # The parameters only have the name of base objects(not the whole flow)
-        # as value
-        # memory parameter has been added in 0.19, verbose in 0.21
-        if LooseVersion(sklearn.__version__) < "0.19":
-            self.assertEqual(len(serialization.parameters), 1)
-        elif LooseVersion(sklearn.__version__) < "0.21":
-            self.assertEqual(len(serialization.parameters), 2)
-        else:
-            self.assertEqual(len(serialization.parameters), 3)
-
-        # Hard to compare two representations of a dict due to possibly
-        # different sorting. Making a json makes it easier
-        self.assertEqual(
-            json.loads(serialization.parameters['steps']),
-            [
-                {
-                    'oml-python:serialized_object':
-                        'component_reference',
-                    'value': {'key': 'scaler', 'step_name': 'scaler'}
-                },
-                {
-                    'oml-python:serialized_object':
-                        'component_reference',
-                    'value': {'key': 'dummy', 'step_name': 'dummy'}
-                }
-            ]
-        )
-
-        # Checking the sub-component
-        self.assertEqual(len(serialization.components), 2)
-        self.assertIsInstance(serialization.components['scaler'],
-                              OpenMLFlow)
-        self.assertIsInstance(serialization.components['dummy'],
-                              OpenMLFlow)
-
-        new_model = self.extension.flow_to_model(serialization)
-        # compares string representations of the dict, as it potentially
-        # contains complex objects that can not be compared with == op
-        # Only in Python 3.x, as Python 2 has Unicode issues
-        if sys.version_info[0] >= 3:
-            self.assertEqual(str(model.get_params()),
-                             str(new_model.get_params()))
-
-        self.assertEqual(type(new_model), type(model))
-        self.assertIsNot(new_model, model)
-
-        self.assertEqual([step[0] for step in new_model.steps],
-                         [step[0] for step in model.steps])
-        self.assertIsNot(new_model.steps[0][1], model.steps[0][1])
-        self.assertIsNot(new_model.steps[1][1], model.steps[1][1])
-
-        new_model_params = new_model.get_params()
-        del new_model_params['scaler']
-        del new_model_params['dummy']
-        del new_model_params['steps']
-        fu_params = model.get_params()
-        del fu_params['scaler']
-        del fu_params['dummy']
-        del fu_params['steps']
-
-        self.assertEqual(new_model_params, fu_params)
-        new_model.fit(self.X, self.y)
-
-    def test_serialize_pipeline_clustering(self):
-        scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
-        km = sklearn.cluster.KMeans()
-        model = sklearn.pipeline.Pipeline(steps=[
-            ('scaler', scaler), ('clusterer', km)])
-
-        fixture_name = 'sklearn.pipeline.Pipeline(' \
-                       'scaler=sklearn.preprocessing.data.StandardScaler,' \
-                       'clusterer=sklearn.cluster.k_means_.KMeans)'
-        fixture_short_name = 'sklearn.Pipeline(StandardScaler,KMeans)'
-        fixture_description = 'Automatically created scikit-learn flow.'
-        fixture_structure = {
-            fixture_name: [],
-            'sklearn.preprocessing.data.StandardScaler': ['scaler'],
-            'sklearn.cluster.k_means_.KMeans': ['clusterer']
-        }
-
-        serialization = self.extension.model_to_flow(model)
-        structure = serialization.get_structure('name')
-
-        self.assertEqual(serialization.name, fixture_name)
-        self.assertEqual(serialization.custom_name, fixture_short_name)
-        self.assertEqual(serialization.description, fixture_description)
-        self.assertDictEqual(structure, fixture_structure)
-
-        # Comparing the pipeline
-        # The parameters only have the name of base objects(not the whole flow)
-        # as value
-        # memory parameter has been added in 0.19
-        if LooseVersion(sklearn.__version__) < "0.19":
-            self.assertEqual(len(serialization.parameters), 1)
-        elif LooseVersion(sklearn.__version__) < "0.21":
-            self.assertEqual(len(serialization.parameters), 2)
-        else:
-            self.assertEqual(len(serialization.parameters), 3)
-        # Hard to compare two representations of a dict due to possibly
-        # different sorting. Making a json makes it easier
-        self.assertEqual(
-            json.loads(serialization.parameters['steps']),
-            [
-                {
-                    'oml-python:serialized_object': 'component_reference',
-                    'value': {'key': 'scaler', 'step_name': 'scaler'}
-                },
-                {
-                    'oml-python:serialized_object': 'component_reference',
-                    'value': {'key': 'clusterer', 'step_name': 'clusterer'}
-                },
-            ]
-        )
-
-        # Checking the sub-component
-        self.assertEqual(len(serialization.components), 2)
-        self.assertIsInstance(serialization.components['scaler'],
-                              OpenMLFlow)
-        self.assertIsInstance(serialization.components['clusterer'],
-                              OpenMLFlow)
-
-        # del serialization.model
-        new_model = self.extension.flow_to_model(serialization)
-        # compares string representations of the dict, as it potentially
-        # contains complex objects that can not be compared with == op
-        # Only in Python 3.x, as Python 2 has Unicode issues
-        if sys.version_info[0] >= 3:
-            self.assertEqual(str(model.get_params()),
-                             str(new_model.get_params()))
-
-        self.assertEqual(type(new_model), type(model))
-        self.assertIsNot(new_model, model)
-
-        self.assertEqual([step[0] for step in new_model.steps],
-                         [step[0] for step in model.steps])
-        self.assertIsNot(new_model.steps[0][1], model.steps[0][1])
-        self.assertIsNot(new_model.steps[1][1], model.steps[1][1])
-
-        new_model_params = new_model.get_params()
-        del new_model_params['scaler']
-        del new_model_params['clusterer']
-        del new_model_params['steps']
-        fu_params = model.get_params()
-        del fu_params['scaler']
-        del fu_params['clusterer']
-        del fu_params['steps']
-
-        self.assertEqual(new_model_params, fu_params)
-        new_model.fit(self.X, self.y)
-
-    @unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
-                     reason="columntransformer introduction in 0.20.0")
-    def test_serialize_column_transformer(self):
-        # temporary local import, dependend on version 0.20
-        import sklearn.compose
-        model = sklearn.compose.ColumnTransformer(
-            transformers=[
-                ('numeric', sklearn.preprocessing.StandardScaler(), [0, 1, 2]),
-                ('nominal', sklearn.preprocessing.OneHotEncoder(
-                    handle_unknown='ignore'), [3, 4, 5])],
-            remainder='passthrough')
-        fixture = 'sklearn.compose._column_transformer.ColumnTransformer(' \
-                  'numeric=sklearn.preprocessing.data.StandardScaler,' \
-                  'nominal=sklearn.preprocessing._encoders.OneHotEncoder)'
-        fixture_short_name = 'sklearn.ColumnTransformer'
-        fixture_description = 'Automatically created scikit-learn flow.'
-        fixture_structure = {
-            fixture: [],
-            'sklearn.preprocessing.data.StandardScaler': ['numeric'],
-            'sklearn.preprocessing._encoders.OneHotEncoder': ['nominal']
-        }
-
-        serialization = self.extension.model_to_flow(model)
-        structure = serialization.get_structure('name')
-        self.assertEqual(serialization.name, fixture)
-        self.assertEqual(serialization.custom_name, fixture_short_name)
-        self.assertEqual(serialization.description, fixture_description)
-        self.assertDictEqual(structure, fixture_structure)
-        # del serialization.model
-        new_model = self.extension.flow_to_model(serialization)
-        # compares string representations of the dict, as it potentially
-        # contains complex objects that can not be compared with == op
-        # Only in Python 3.x, as Python 2 has Unicode issues
-        if sys.version_info[0] >= 3:
-            self.assertEqual(str(model.get_params()),
-                             str(new_model.get_params()))
-        self.assertEqual(type(new_model), type(model))
-        self.assertIsNot(new_model, model)
-        serialization2 = self.extension.model_to_flow(new_model)
-        assert_flows_equal(serialization, serialization2)
-
-    @unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
-                     reason="columntransformer introduction in 0.20.0")
-    def test_serialize_column_transformer_pipeline(self):
-        # temporary local import, dependend on version 0.20
-        import sklearn.compose
-        inner = sklearn.compose.ColumnTransformer(
-            transformers=[
-                ('numeric', sklearn.preprocessing.StandardScaler(), [0, 1, 2]),
-                ('nominal', sklearn.preprocessing.OneHotEncoder(
-                    handle_unknown='ignore'), [3, 4, 5])],
-            remainder='passthrough')
-        model = sklearn.pipeline.Pipeline(
-            steps=[('transformer', inner),
-                   ('classifier', sklearn.tree.DecisionTreeClassifier())])
-        fixture_name = \
-            'sklearn.pipeline.Pipeline('\
-            'transformer=sklearn.compose._column_transformer.'\
-            'ColumnTransformer('\
-            'numeric=sklearn.preprocessing.data.StandardScaler,'\
-            'nominal=sklearn.preprocessing._encoders.OneHotEncoder),'\
-            'classifier=sklearn.tree.tree.DecisionTreeClassifier)'
-        fixture_structure = {
-            'sklearn.preprocessing.data.StandardScaler':
-                ['transformer', 'numeric'],
-            'sklearn.preprocessing._encoders.OneHotEncoder':
-                ['transformer', 'nominal'],
-            'sklearn.compose._column_transformer.ColumnTransformer(numeric='
-            'sklearn.preprocessing.data.StandardScaler,nominal=sklearn.'
-            'preprocessing._encoders.OneHotEncoder)': ['transformer'],
-            'sklearn.tree.tree.DecisionTreeClassifier': ['classifier'],
-            fixture_name: [],
-        }
-
-        fixture_description = 'Automatically created scikit-learn flow.'
-        serialization = self.extension.model_to_flow(model)
-        structure = serialization.get_structure('name')
-        self.assertEqual(serialization.name, fixture_name)
-        self.assertEqual(serialization.description, fixture_description)
-        self.assertDictEqual(structure, fixture_structure)
-        # del serialization.model
-        new_model = self.extension.flow_to_model(serialization)
-        # compares string representations of the dict, as it potentially
-        # contains complex objects that can not be compared with == op
-        self.assertEqual(str(model.get_params()), str(new_model.get_params()))
-        self.assertEqual(type(new_model), type(model))
-        self.assertIsNot(new_model, model)
-        serialization2 = self.extension.model_to_flow(new_model)
-        assert_flows_equal(serialization, serialization2)
-
-    def test_serialize_feature_union(self):
-        ohe_params = {'sparse': False}
-        if LooseVersion(sklearn.__version__) >= "0.20":
-            ohe_params['categories'] = 'auto'
-        ohe = sklearn.preprocessing.OneHotEncoder(**ohe_params)
-        scaler = sklearn.preprocessing.StandardScaler()
-
-        fu = sklearn.pipeline.FeatureUnion(
-            transformer_list=[('ohe', ohe), ('scaler', scaler)]
-        )
-        serialization = self.extension.model_to_flow(fu)
-        structure = serialization.get_structure('name')
-        # OneHotEncoder was moved to _encoders module in 0.20
-        module_name_encoder = ('_encoders'
-                               if LooseVersion(sklearn.__version__) >= "0.20"
-                               else 'data')
-        fixture_name = ('sklearn.pipeline.FeatureUnion('
-                        'ohe=sklearn.preprocessing.{}.OneHotEncoder,'
-                        'scaler=sklearn.preprocessing.data.StandardScaler)'
-                        .format(module_name_encoder))
-        fixture_structure = {
-            fixture_name: [],
-            'sklearn.preprocessing.{}.'
-            'OneHotEncoder'.format(module_name_encoder): ['ohe'],
-            'sklearn.preprocessing.data.StandardScaler': ['scaler']
-        }
-        self.assertEqual(serialization.name, fixture_name)
-        self.assertDictEqual(structure, fixture_structure)
-        new_model = self.extension.flow_to_model(serialization)
-        # compares string representations of the dict, as it potentially
-        # contains complex objects that can not be compared with == op
-        # Only in Python 3.x, as Python 2 has Unicode issues
-        if sys.version_info[0] >= 3:
-            self.assertEqual(str(fu.get_params()),
-                             str(new_model.get_params()))
-
-        self.assertEqual(type(new_model), type(fu))
-        self.assertIsNot(new_model, fu)
-        self.assertEqual(new_model.transformer_list[0][0],
-                         fu.transformer_list[0][0])
-        self.assertEqual(new_model.transformer_list[0][1].get_params(),
-                         fu.transformer_list[0][1].get_params())
-        self.assertEqual(new_model.transformer_list[1][0],
-                         fu.transformer_list[1][0])
-        self.assertEqual(new_model.transformer_list[1][1].get_params(),
-                         fu.transformer_list[1][1].get_params())
-
-        self.assertEqual([step[0] for step in new_model.transformer_list],
-                         [step[0] for step in fu.transformer_list])
-        self.assertIsNot(new_model.transformer_list[0][1],
-                         fu.transformer_list[0][1])
-        self.assertIsNot(new_model.transformer_list[1][1],
-                         fu.transformer_list[1][1])
-
-        new_model_params = new_model.get_params()
-        del new_model_params['ohe']
-        del new_model_params['scaler']
-        del new_model_params['transformer_list']
-        fu_params = fu.get_params()
-        del fu_params['ohe']
-        del fu_params['scaler']
-        del fu_params['transformer_list']
-
-        self.assertEqual(new_model_params, fu_params)
-        new_model.fit(self.X, self.y)
-
-        fu.set_params(scaler=None)
-        serialization = self.extension.model_to_flow(fu)
-        self.assertEqual(serialization.name,
-                         'sklearn.pipeline.FeatureUnion('
-                         'ohe=sklearn.preprocessing.{}.OneHotEncoder)'
-                         .format(module_name_encoder))
-        new_model = self.extension.flow_to_model(serialization)
-        self.assertEqual(type(new_model), type(fu))
-        self.assertIsNot(new_model, fu)
-        self.assertIs(new_model.transformer_list[1][1], None)
-
-    def test_serialize_feature_union_switched_names(self):
-        ohe_params = ({'categories': 'auto'}
-                      if LooseVersion(sklearn.__version__) >= "0.20" else {})
-        ohe = sklearn.preprocessing.OneHotEncoder(**ohe_params)
-        scaler = sklearn.preprocessing.StandardScaler()
-        fu1 = sklearn.pipeline.FeatureUnion(
-            transformer_list=[('ohe', ohe), ('scaler', scaler)])
-        fu2 = sklearn.pipeline.FeatureUnion(
-            transformer_list=[('scaler', ohe), ('ohe', scaler)])
-        fu1_serialization = self.extension.model_to_flow(fu1)
-        fu2_serialization = self.extension.model_to_flow(fu2)
-        # OneHotEncoder was moved to _encoders module in 0.20
-        module_name_encoder = ('_encoders'
-                               if LooseVersion(sklearn.__version__) >= "0.20"
-                               else 'data')
-        self.assertEqual(
-            fu1_serialization.name,
-            "sklearn.pipeline.FeatureUnion("
-            "ohe=sklearn.preprocessing.{}.OneHotEncoder,"
-            "scaler=sklearn.preprocessing.data.StandardScaler)"
-            .format(module_name_encoder))
-        self.assertEqual(
-            fu2_serialization.name,
-            "sklearn.pipeline.FeatureUnion("
-            "scaler=sklearn.preprocessing.{}.OneHotEncoder,"
-            "ohe=sklearn.preprocessing.data.StandardScaler)"
-            .format(module_name_encoder))
-
-    def test_serialize_complex_flow(self):
-        ohe = sklearn.preprocessing.OneHotEncoder()
-        scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
-        boosting = sklearn.ensemble.AdaBoostClassifier(
-            base_estimator=sklearn.tree.DecisionTreeClassifier())
-        model = sklearn.pipeline.Pipeline(steps=[
-            ('ohe', ohe), ('scaler', scaler), ('boosting', boosting)])
-        parameter_grid = {
-            'base_estimator__max_depth': scipy.stats.randint(1, 10),
-            'learning_rate': scipy.stats.uniform(0.01, 0.99),
-            'n_estimators': [1, 5, 10, 100]
-        }
-        # convert to ordered dict, sorted by keys) due to param grid check
-        parameter_grid = OrderedDict(sorted(parameter_grid.items()))
-        cv = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True)
-        rs = sklearn.model_selection.RandomizedSearchCV(
-            estimator=model, param_distributions=parameter_grid, cv=cv)
-        serialized = self.extension.model_to_flow(rs)
-        structure = serialized.get_structure('name')
-        # OneHotEncoder was moved to _encoders module in 0.20
-        module_name_encoder = ('_encoders'
-                               if LooseVersion(sklearn.__version__) >= "0.20"
-                               else 'data')
-        ohe_name = 'sklearn.preprocessing.%s.OneHotEncoder' % \
-                   module_name_encoder
-        scaler_name = 'sklearn.preprocessing.data.StandardScaler'
-        tree_name = 'sklearn.tree.tree.DecisionTreeClassifier'
-        boosting_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier' \
-                        '(base_estimator=%s)' % tree_name
-        pipeline_name = 'sklearn.pipeline.Pipeline(ohe=%s,scaler=%s,' \
-                        'boosting=%s)' % (ohe_name, scaler_name, boosting_name)
-        fixture_name = 'sklearn.model_selection._search.RandomizedSearchCV' \
-                       '(estimator=%s)' % pipeline_name
-        fixture_structure = {
-            ohe_name: ['estimator', 'ohe'],
-            scaler_name: ['estimator', 'scaler'],
-            tree_name: ['estimator', 'boosting', 'base_estimator'],
-            boosting_name: ['estimator', 'boosting'],
-            pipeline_name: ['estimator'],
-            fixture_name: []
-        }
-        self.assertEqual(serialized.name, fixture_name)
-        self.assertEqual(structure, fixture_structure)
-
-        # now do deserialization
-        deserialized = self.extension.flow_to_model(serialized)
-        # compares string representations of the dict, as it potentially
-        # contains complex objects that can not be compared with == op
-        # JvR: compare str length, due to memory address of distribution
-        self.assertEqual(len(str(rs.get_params())), len(str(deserialized.get_params())))
-
-        # Checks that sklearn_to_flow is idempotent.
-        serialized2 = self.extension.model_to_flow(deserialized)
-        self.assertNotEqual(rs, deserialized)
-        # Would raise an exception if the flows would be unequal
-        assert_flows_equal(serialized, serialized2)
-
-    def test_serialize_type(self):
-        supported_types = [float, np.float, np.float32, np.float64,
-                           int, np.int, np.int32, np.int64]
-
-        for supported_type in supported_types:
-            serialized = self.extension.model_to_flow(supported_type)
-            deserialized = self.extension.flow_to_model(serialized)
-            self.assertEqual(deserialized, supported_type)
-
-    def test_serialize_rvs(self):
-        supported_rvs = [scipy.stats.norm(loc=1, scale=5),
-                         scipy.stats.expon(loc=1, scale=5),
-                         scipy.stats.randint(low=-3, high=15)]
-
-        for supported_rv in supported_rvs:
-            serialized = self.extension.model_to_flow(supported_rv)
-            deserialized = self.extension.flow_to_model(serialized)
-            self.assertEqual(type(deserialized.dist), type(supported_rv.dist))
-            del deserialized.dist
-            del supported_rv.dist
-            self.assertEqual(deserialized.__dict__,
-                             supported_rv.__dict__)
-
-    def test_serialize_function(self):
-        serialized = self.extension.model_to_flow(sklearn.feature_selection.chi2)
-        deserialized = self.extension.flow_to_model(serialized)
-        self.assertEqual(deserialized, sklearn.feature_selection.chi2)
-
-    def test_serialize_cvobject(self):
-        methods = [sklearn.model_selection.KFold(3),
-                   sklearn.model_selection.LeaveOneOut()]
-        fixtures = [
-            OrderedDict([
-                ('oml-python:serialized_object', 'cv_object'),
-                ('value', OrderedDict([
-                    ('name', 'sklearn.model_selection._split.KFold'),
-                    ('parameters', OrderedDict([
-                        ('n_splits', '3'),
-                        ('random_state', 'null'),
-                        ('shuffle', 'false'),
-                    ]))
-                ]))
-            ]),
-            OrderedDict([
-                ('oml-python:serialized_object', 'cv_object'),
-                ('value', OrderedDict([
-                    ('name', 'sklearn.model_selection._split.LeaveOneOut'),
-                    ('parameters', OrderedDict())
-                ]))
-            ]),
-        ]
-        for method, fixture in zip(methods, fixtures):
-            m = self.extension.model_to_flow(method)
-            self.assertEqual(m, fixture)
-
-            m_new = self.extension.flow_to_model(m)
-            self.assertIsNot(m_new, m)
-            self.assertIsInstance(m_new, type(method))
-
-    def test_serialize_simple_parameter_grid(self):
-
-        # We cannot easily test for scipy random variables in here, but they
-        # should be covered
-
-        # Examples from the scikit-learn documentation
-        models = [sklearn.svm.SVC(), sklearn.ensemble.RandomForestClassifier()]
-        grids = \
-            [[OrderedDict([('C', [1, 10, 100, 1000]), ('kernel', ['linear'])]),
-              OrderedDict([('C', [1, 10, 100, 1000]), ('gamma', [0.001, 0.0001]),
-                           ('kernel', ['rbf'])])],
-             OrderedDict([("bootstrap", [True, False]),
-                          ("criterion", ["gini", "entropy"]),
-                          ("max_depth", [3, None]),
-                          ("max_features", [1, 3, 10]),
-                          ("min_samples_leaf", [1, 3, 10]),
-                          ("min_samples_split", [1, 3, 10])
-                          ])]
-
-        for grid, model in zip(grids, models):
-            serialized = self.extension.model_to_flow(grid)
-            deserialized = self.extension.flow_to_model(serialized)
-
-            self.assertEqual(deserialized, grid)
-            self.assertIsNot(deserialized, grid)
-            # providing error_score because nan != nan
-            hpo = sklearn.model_selection.GridSearchCV(
-                param_grid=grid, estimator=model, error_score=-1000)
-
-            serialized = self.extension.model_to_flow(hpo)
-            deserialized = self.extension.flow_to_model(serialized)
-            self.assertEqual(hpo.param_grid, deserialized.param_grid)
-            self.assertEqual(hpo.estimator.get_params(),
-                             deserialized.estimator.get_params())
-            hpo_params = hpo.get_params(deep=False)
-            deserialized_params = deserialized.get_params(deep=False)
-            del hpo_params['estimator']
-            del deserialized_params['estimator']
-            self.assertEqual(hpo_params, deserialized_params)
-
-    @unittest.skip('This feature needs further reworking. If we allow several '
-                   'components, we need to register them all in the downstream '
-                   'flows. This is so far not implemented.')
-    def test_serialize_advanced_grid(self):
-        # TODO instead a GridSearchCV object should be serialized
-
-        # This needs to be in its own function because we cannot simply check
-        # for the equality of the grid, because scikit-learn objects don't
-        # really support the equality operator
-        # This will only work with sklearn==0.18
-        N_FEATURES_OPTIONS = [2, 4, 8]
-        C_OPTIONS = [1, 10, 100, 1000]
-        grid = [{'reduce_dim': [sklearn.decomposition.PCA(iterated_power=7),
-                                sklearn.decomposition.NMF()],
-                 'reduce_dim__n_components': N_FEATURES_OPTIONS,
-                 'classify__C': C_OPTIONS},
-                {'reduce_dim': [sklearn.feature_selection.SelectKBest(
-                                sklearn.feature_selection.chi2)],
-                 'reduce_dim__k': N_FEATURES_OPTIONS,
-                 'classify__C': C_OPTIONS}]
-
-        serialized = self.extension.model_to_flow(grid)
-        deserialized = self.extension.flow_to_model(serialized)
-
-        self.assertEqual(grid[0]['reduce_dim'][0].get_params(),
-                         deserialized[0]['reduce_dim'][0].get_params())
-        self.assertIsNot(grid[0]['reduce_dim'][0],
-                         deserialized[0]['reduce_dim'][0])
-        self.assertEqual(grid[0]['reduce_dim'][1].get_params(),
-                         deserialized[0]['reduce_dim'][1].get_params())
-        self.assertIsNot(grid[0]['reduce_dim'][1],
-                         deserialized[0]['reduce_dim'][1])
-        self.assertEqual(grid[0]['reduce_dim__n_components'],
-                         deserialized[0]['reduce_dim__n_components'])
-        self.assertEqual(grid[0]['classify__C'],
-                         deserialized[0]['classify__C'])
-        self.assertEqual(grid[1]['reduce_dim'][0].get_params(),
-                         deserialized[1]['reduce_dim'][0].get_params())
-        self.assertIsNot(grid[1]['reduce_dim'][0],
-                         deserialized[1]['reduce_dim'][0])
-        self.assertEqual(grid[1]['reduce_dim__k'],
-                         deserialized[1]['reduce_dim__k'])
-        self.assertEqual(grid[1]['classify__C'],
-                         deserialized[1]['classify__C'])
-
-    def test_serialize_advanced_grid_fails(self):
-        # This unit test is checking that the test we skip above would actually fail
-
-        param_grid = {
-            "base_estimator": [
-                sklearn.tree.DecisionTreeClassifier(),
-                sklearn.tree.ExtraTreeClassifier()]
-        }
-
-        clf = sklearn.model_selection.GridSearchCV(
-            sklearn.ensemble.BaggingClassifier(),
-            param_grid=param_grid,
-        )
-        with self.assertRaisesRegex(
-            TypeError,
-                re.compile(r".*OpenML.*Flow.*is not JSON serializable",
-                           flags=re.DOTALL)
-        ):
-            self.extension.model_to_flow(clf)
-
-    def test_serialize_resampling(self):
-        kfold = sklearn.model_selection.StratifiedKFold(
-            n_splits=4, shuffle=True)
-        serialized = self.extension.model_to_flow(kfold)
-        deserialized = self.extension.flow_to_model(serialized)
-        # Best approximation to get_params()
-        self.assertEqual(str(deserialized), str(kfold))
-        self.assertIsNot(deserialized, kfold)
-
-    def test_hypothetical_parameter_values(self):
-        # The hypothetical parameter values of true, 1, 0.1 formatted as a
-        # string (and their correct serialization and deserialization) an only
-        #  be checked inside a model
-
-        model = Model('true', '1', '0.1')
-
-        serialized = self.extension.model_to_flow(model)
-        serialized.external_version = 'sklearn==test123'
-        deserialized = self.extension.flow_to_model(serialized)
-        self.assertEqual(deserialized.get_params(), model.get_params())
-        self.assertIsNot(deserialized, model)
-
-    def test_gaussian_process(self):
-        opt = scipy.optimize.fmin_l_bfgs_b
-        kernel = sklearn.gaussian_process.kernels.Matern()
-        gp = sklearn.gaussian_process.GaussianProcessClassifier(
-            kernel=kernel, optimizer=opt)
-        with self.assertRaisesRegex(
-            TypeError,
-            r"Matern\(length_scale=1, nu=1.5\), <class 'sklearn.gaussian_process.kernels.Matern'>",
-        ):
-            self.extension.model_to_flow(gp)
-
-    def test_error_on_adding_component_multiple_times_to_flow(self):
-        # this function implicitly checks
-        # - openml.flows._check_multiple_occurence_of_component_in_flow()
-        pca = sklearn.decomposition.PCA()
-        pca2 = sklearn.decomposition.PCA()
-        pipeline = sklearn.pipeline.Pipeline((('pca1', pca), ('pca2', pca2)))
-        fixture = "Found a second occurence of component .*.PCA when trying to serialize Pipeline"
-        with self.assertRaisesRegex(ValueError, fixture):
-            self.extension.model_to_flow(pipeline)
-
-        fu = sklearn.pipeline.FeatureUnion((('pca1', pca), ('pca2', pca2)))
-        fixture = "Found a second occurence of component .*.PCA when trying " \
-                  "to serialize FeatureUnion"
-        with self.assertRaisesRegex(ValueError, fixture):
-            self.extension.model_to_flow(fu)
-
-        fs = sklearn.feature_selection.SelectKBest()
-        fu2 = sklearn.pipeline.FeatureUnion((('pca1', pca), ('fs', fs)))
-        pipeline2 = sklearn.pipeline.Pipeline((('fu', fu2), ('pca2', pca2)))
-        fixture = "Found a second occurence of component .*.PCA when trying to serialize Pipeline"
-        with self.assertRaisesRegex(ValueError, fixture):
-            self.extension.model_to_flow(pipeline2)
-
-    def test_subflow_version_propagated(self):
-        this_directory = os.path.dirname(os.path.abspath(__file__))
-        tests_directory = os.path.abspath(os.path.join(this_directory,
-                                                       '..', '..'))
-        sys.path.append(tests_directory)
-        import tests.test_flows.dummy_learn.dummy_forest
-        pca = sklearn.decomposition.PCA()
-        dummy = tests.test_flows.dummy_learn.dummy_forest.DummyRegressor()
-        pipeline = sklearn.pipeline.Pipeline((('pca', pca), ('dummy', dummy)))
-        flow = self.extension.model_to_flow(pipeline)
-        # In python2.7, the unit tests work differently on travis-ci; therefore,
-        # I put the alternative travis-ci answer here as well. While it has a
-        # different value, it is still correct as it is a propagation of the
-        # subclasses' module name
-        self.assertEqual(flow.external_version, '%s,%s,%s' % (
-            self.extension._format_external_version('openml', openml.__version__),
-            self.extension._format_external_version('sklearn', sklearn.__version__),
-            self.extension._format_external_version('tests', '0.1')))
-
-    @mock.patch('warnings.warn')
-    def test_check_dependencies(self, warnings_mock):
-        dependencies = ['sklearn==0.1', 'sklearn>=99.99.99',
-                        'sklearn>99.99.99']
-        for dependency in dependencies:
-            self.assertRaises(ValueError, self.extension._check_dependencies, dependency)
-
-    def test_illegal_parameter_names(self):
-        # illegal name: estimators
-        clf1 = sklearn.ensemble.VotingClassifier(
-            estimators=[
-                ('estimators', sklearn.ensemble.RandomForestClassifier()),
-                ('whatevs', sklearn.ensemble.ExtraTreesClassifier())])
-        clf2 = sklearn.ensemble.VotingClassifier(
-            estimators=[
-                ('whatevs', sklearn.ensemble.RandomForestClassifier()),
-                ('estimators', sklearn.ensemble.ExtraTreesClassifier())])
-        cases = [clf1, clf2]
-
-        for case in cases:
-            self.assertRaises(PyOpenMLError, self.extension.model_to_flow, case)
-
-    def test_illegal_parameter_names_pipeline(self):
-        # illegal name: steps
-        steps = [
-            ('Imputer', SimpleImputer(strategy='median')),
-            ('OneHotEncoder',
-             sklearn.preprocessing.OneHotEncoder(sparse=False,
-                                                 handle_unknown='ignore')),
-            ('steps', sklearn.ensemble.BaggingClassifier(
-                base_estimator=sklearn.tree.DecisionTreeClassifier))
-        ]
-        self.assertRaises(ValueError, sklearn.pipeline.Pipeline, steps=steps)
-
-    def test_illegal_parameter_names_featureunion(self):
-        # illegal name: transformer_list
-        transformer_list = [
-            ('transformer_list',
-             SimpleImputer(strategy='median')),
-            ('OneHotEncoder',
-             sklearn.preprocessing.OneHotEncoder(sparse=False,
-                                                 handle_unknown='ignore'))
-        ]
-        self.assertRaises(ValueError, sklearn.pipeline.FeatureUnion,
-                          transformer_list=transformer_list)
-
-    def test_paralizable_check(self):
-        # using this model should pass the test (if param distribution is
-        # legal)
-        singlecore_bagging = sklearn.ensemble.BaggingClassifier()
-        # using this model should return false (if param distribution is legal)
-        multicore_bagging = sklearn.ensemble.BaggingClassifier(n_jobs=5)
-        # using this param distribution should raise an exception
-        illegal_param_dist = {"base__n_jobs": [-1, 0, 1]}
-        # using this param distribution should not raise an exception
-        legal_param_dist = {"base__max_depth": [2, 3, 4]}
-
-        legal_models = [
-            sklearn.ensemble.RandomForestClassifier(),
-            sklearn.ensemble.RandomForestClassifier(n_jobs=5),
-            sklearn.ensemble.RandomForestClassifier(n_jobs=-1),
-            sklearn.pipeline.Pipeline(
-                steps=[('bag', sklearn.ensemble.BaggingClassifier(n_jobs=1))]),
-            sklearn.pipeline.Pipeline(
-                steps=[('bag', sklearn.ensemble.BaggingClassifier(n_jobs=5))]),
-            sklearn.pipeline.Pipeline(
-                steps=[('bag', sklearn.ensemble.BaggingClassifier(n_jobs=-1))]),
-            sklearn.model_selection.GridSearchCV(singlecore_bagging,
-                                                 legal_param_dist),
-            sklearn.model_selection.GridSearchCV(multicore_bagging,
-                                                 legal_param_dist),
-            sklearn.ensemble.BaggingClassifier(
-                n_jobs=-1,
-                base_estimator=sklearn.ensemble.RandomForestClassifier(n_jobs=5)
-            )
-        ]
-        illegal_models = [
-            sklearn.model_selection.GridSearchCV(singlecore_bagging,
-                                                 illegal_param_dist),
-            sklearn.model_selection.GridSearchCV(multicore_bagging,
-                                                 illegal_param_dist)
-        ]
-
-        can_measure_cputime_answers = [True, False, False, True, False, False, True, False, False]
-        can_measure_walltime_answers = [True, True, False, True, True, False, True, True, False]
-
-        for model, allowed_cputime, allowed_walltime in zip(legal_models,
-                                                            can_measure_cputime_answers,
-                                                            can_measure_walltime_answers):
-            self.assertEqual(self.extension._can_measure_cputime(model), allowed_cputime)
-            self.assertEqual(self.extension._can_measure_wallclocktime(model), allowed_walltime)
-
-        for model in illegal_models:
-            with self.assertRaises(PyOpenMLError):
-                self.extension._prevent_optimize_n_jobs(model)
-
-    def test__get_fn_arguments_with_defaults(self):
-        sklearn_version = LooseVersion(sklearn.__version__)
-        if sklearn_version < "0.19":
-            fns = [
-                (sklearn.ensemble.RandomForestRegressor.__init__, 15),
-                (sklearn.tree.DecisionTreeClassifier.__init__, 12),
-                (sklearn.pipeline.Pipeline.__init__, 0)
-            ]
-        elif sklearn_version < "0.21":
-            fns = [
-                (sklearn.ensemble.RandomForestRegressor.__init__, 16),
-                (sklearn.tree.DecisionTreeClassifier.__init__, 13),
-                (sklearn.pipeline.Pipeline.__init__, 1)
-            ]
-        else:
-            fns = [
-                (sklearn.ensemble.RandomForestRegressor.__init__, 16),
-                (sklearn.tree.DecisionTreeClassifier.__init__, 13),
-                (sklearn.pipeline.Pipeline.__init__, 2)
-            ]
-
-        for fn, num_params_with_defaults in fns:
-            defaults, defaultless = (
-                self.extension._get_fn_arguments_with_defaults(fn)
-            )
-            self.assertIsInstance(defaults, dict)
-            self.assertIsInstance(defaultless, set)
-            # check whether we have both defaults and defaultless params
-            self.assertEqual(len(defaults), num_params_with_defaults)
-            self.assertGreater(len(defaultless), 0)
-            # check no overlap
-            self.assertSetEqual(set(defaults.keys()),
-                                set(defaults.keys()) - defaultless)
-            self.assertSetEqual(defaultless,
-                                defaultless - set(defaults.keys()))
-
-    def test_deserialize_with_defaults(self):
-        # used the 'initialize_with_defaults' flag of the deserialization
-        # method to return a flow that contains default hyperparameter
-        # settings.
-        steps = [('Imputer', SimpleImputer()),
-                 ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()),
-                 ('Estimator', sklearn.tree.DecisionTreeClassifier())]
-        pipe_orig = sklearn.pipeline.Pipeline(steps=steps)
-
-        pipe_adjusted = sklearn.clone(pipe_orig)
-        params = {'Imputer__strategy': 'median',
-                  'OneHotEncoder__sparse': False,
-                  'Estimator__min_samples_leaf': 42}
-        pipe_adjusted.set_params(**params)
-        flow = self.extension.model_to_flow(pipe_adjusted)
-        pipe_deserialized = self.extension.flow_to_model(flow, initialize_with_defaults=True)
-
-        # we want to compare pipe_deserialized and pipe_orig. We use the flow
-        # equals function for this
-        assert_flows_equal(
-            self.extension.model_to_flow(pipe_orig),
-            self.extension.model_to_flow(pipe_deserialized),
-        )
-
-    def test_deserialize_adaboost_with_defaults(self):
-        # used the 'initialize_with_defaults' flag of the deserialization
-        # method to return a flow that contains default hyperparameter
-        # settings.
-        steps = [('Imputer', SimpleImputer()),
-                 ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()),
-                 ('Estimator', sklearn.ensemble.AdaBoostClassifier(
-                     sklearn.tree.DecisionTreeClassifier()))]
-        pipe_orig = sklearn.pipeline.Pipeline(steps=steps)
-
-        pipe_adjusted = sklearn.clone(pipe_orig)
-        params = {'Imputer__strategy': 'median',
-                  'OneHotEncoder__sparse': False,
-                  'Estimator__n_estimators': 10}
-        pipe_adjusted.set_params(**params)
-        flow = self.extension.model_to_flow(pipe_adjusted)
-        pipe_deserialized = self.extension.flow_to_model(flow, initialize_with_defaults=True)
-
-        # we want to compare pipe_deserialized and pipe_orig. We use the flow
-        # equals function for this
-        assert_flows_equal(
-            self.extension.model_to_flow(pipe_orig),
-            self.extension.model_to_flow(pipe_deserialized),
-        )
-
-    def test_deserialize_complex_with_defaults(self):
-        # used the 'initialize_with_defaults' flag of the deserialization
-        # method to return a flow that contains default hyperparameter
-        # settings.
-        steps = [
-            ('Imputer', SimpleImputer()),
-            ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()),
-            (
-                'Estimator',
-                sklearn.ensemble.AdaBoostClassifier(
-                    sklearn.ensemble.BaggingClassifier(
-                        sklearn.ensemble.GradientBoostingClassifier(
-                            sklearn.neighbors.KNeighborsClassifier()
-                        )
-                    )
-                )
-            ),
-        ]
-        pipe_orig = sklearn.pipeline.Pipeline(steps=steps)
-
-        pipe_adjusted = sklearn.clone(pipe_orig)
-        params = {'Imputer__strategy': 'median',
-                  'OneHotEncoder__sparse': False,
-                  'Estimator__n_estimators': 10,
-                  'Estimator__base_estimator__n_estimators': 10,
-                  'Estimator__base_estimator__base_estimator__learning_rate': 0.1,
-                  'Estimator__base_estimator__base_estimator__loss__n_neighbors': 13}
-        pipe_adjusted.set_params(**params)
-        flow = self.extension.model_to_flow(pipe_adjusted)
-        pipe_deserialized = self.extension.flow_to_model(flow, initialize_with_defaults=True)
-
-        # we want to compare pipe_deserialized and pipe_orig. We use the flow
-        # equals function for this
-        assert_flows_equal(
-            self.extension.model_to_flow(pipe_orig),
-            self.extension.model_to_flow(pipe_deserialized),
-        )
-
-    def test_openml_param_name_to_sklearn(self):
-        scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
-        boosting = sklearn.ensemble.AdaBoostClassifier(
-            base_estimator=sklearn.tree.DecisionTreeClassifier())
-        model = sklearn.pipeline.Pipeline(steps=[
-            ('scaler', scaler), ('boosting', boosting)])
-        flow = self.extension.model_to_flow(model)
-        task = openml.tasks.get_task(115)
-        run = openml.runs.run_flow_on_task(flow, task)
-        run = run.publish()
-        TestBase._mark_entity_for_removal('run', run.run_id)
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], run.run_id))
-        run = openml.runs.get_run(run.run_id)
-        setup = openml.setups.get_setup(run.setup_id)
-
-        # make sure to test enough parameters
-        self.assertGreater(len(setup.parameters), 15)
-
-        for parameter in setup.parameters.values():
-            sklearn_name = self.extension._openml_param_name_to_sklearn(parameter, flow)
-
-            # test the inverse. Currently, OpenML stores the hyperparameter
-            # fullName as flow.name + flow.version + parameter.name on the
-            # server (but this behaviour is not documented and might or might
-            # not change in the future. Hence, we won't offer this
-            # transformation functionality in the main package yet.)
-            splitted = sklearn_name.split("__")
-            if len(splitted) > 1:  # if len is 1, it is part of root flow
-                subflow = flow.get_subflow(splitted[0:-1])
-            else:
-                subflow = flow
-            openml_name = "%s(%s)_%s" % (subflow.name,
-                                         subflow.version,
-                                         splitted[-1])
-            self.assertEqual(parameter.full_name, openml_name)
-
-    def test_obtain_parameter_values_flow_not_from_server(self):
-        model = sklearn.linear_model.LogisticRegression(solver='lbfgs')
-        flow = self.extension.model_to_flow(model)
-        msg = 'Flow sklearn.linear_model.logistic.LogisticRegression has no ' \
-              'flow_id!'
-
-        with self.assertRaisesRegex(ValueError, msg):
-            self.extension.obtain_parameter_values(flow)
-
-        model = sklearn.ensemble.AdaBoostClassifier(
-            base_estimator=sklearn.linear_model.LogisticRegression(
-                solver='lbfgs',
-            )
-        )
-        flow = self.extension.model_to_flow(model)
-        flow.flow_id = 1
-        with self.assertRaisesRegex(ValueError, msg):
-            self.extension.obtain_parameter_values(flow)
-
-    def test_obtain_parameter_values(self):
-
-        model = sklearn.model_selection.RandomizedSearchCV(
-            estimator=sklearn.ensemble.RandomForestClassifier(n_estimators=5),
-            param_distributions={
-                "max_depth": [3, None],
-                "max_features": [1, 2, 3, 4],
-                "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
-                "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-                "bootstrap": [True, False], "criterion": ["gini", "entropy"]},
-            cv=sklearn.model_selection.StratifiedKFold(n_splits=2,
-                                                       random_state=1),
-            n_iter=5)
-        flow = self.extension.model_to_flow(model)
-        flow.flow_id = 1
-        flow.components['estimator'].flow_id = 2
-        parameters = self.extension.obtain_parameter_values(flow)
-        for parameter in parameters:
-            self.assertIsNotNone(parameter['oml:component'], msg=parameter)
-            if parameter['oml:name'] == 'n_estimators':
-                self.assertEqual(parameter['oml:value'], '5')
-                self.assertEqual(parameter['oml:component'], 2)
-
-    def test_numpy_type_allowed_in_flow(self):
-        """ Simple numpy types should be serializable. """
-        dt = sklearn.tree.DecisionTreeClassifier(
-            max_depth=np.float64(3.0),
-            min_samples_leaf=np.int32(5)
-        )
-        self.extension.model_to_flow(dt)
-
-    def test_numpy_array_not_allowed_in_flow(self):
-        """ Simple numpy arrays should not be serializable. """
-        bin = sklearn.preprocessing.MultiLabelBinarizer(classes=np.asarray([1, 2, 3]))
-        with self.assertRaises(TypeError):
-            self.extension.model_to_flow(bin)
-
-
-class TestSklearnExtensionRunFunctions(TestBase):
-    _multiprocess_can_split_ = True
-
-    def setUp(self):
-        super().setUp(n_levels=2)
-        self.extension = SklearnExtension()
-
-    ################################################################################################
-    # Test methods for performing runs with this extension module
-
-    def test_run_model_on_task(self):
-        class MyPipe(sklearn.pipeline.Pipeline):
-            pass
-        task = openml.tasks.get_task(1)
-        pipe = MyPipe([('imp', SimpleImputer()),
-                       ('dummy', sklearn.dummy.DummyClassifier())])
-        openml.runs.run_model_on_task(pipe, task)
-
-    def test_seed_model(self):
-        # randomized models that are initialized without seeds, can be seeded
-        randomized_clfs = [
-            sklearn.ensemble.BaggingClassifier(),
-            sklearn.model_selection.RandomizedSearchCV(
-                sklearn.ensemble.RandomForestClassifier(),
-                {
-                    "max_depth": [3, None],
-                    "max_features": [1, 2, 3, 4],
-                    "bootstrap": [True, False],
-                    "criterion": ["gini", "entropy"],
-                    "random_state": [-1, 0, 1, 2],
-                },
-                cv=sklearn.model_selection.StratifiedKFold(n_splits=2, shuffle=True),
-            ),
-            sklearn.dummy.DummyClassifier()
-        ]
-
-        for idx, clf in enumerate(randomized_clfs):
-            const_probe = 42
-            all_params = clf.get_params()
-            params = [key for key in all_params if
-                      key.endswith('random_state')]
-            self.assertGreater(len(params), 0)
-
-            # before param value is None
-            for param in params:
-                self.assertIsNone(all_params[param])
-
-            # now seed the params
-            clf_seeded = self.extension.seed_model(clf, const_probe)
-            new_params = clf_seeded.get_params()
-
-            randstate_params = [key for key in new_params if
-                                key.endswith('random_state')]
-
-            # afterwards, param value is set
-            for param in randstate_params:
-                self.assertIsInstance(new_params[param], int)
-                self.assertIsNotNone(new_params[param])
-
-            if idx == 1:
-                self.assertEqual(clf.cv.random_state, 56422)
-
-    def test_seed_model_raises(self):
-        # the _set_model_seed_where_none should raise exception if random_state is
-        # anything else than an int
-        randomized_clfs = [
-            sklearn.ensemble.BaggingClassifier(random_state=np.random.RandomState(42)),
-            sklearn.dummy.DummyClassifier(random_state="OpenMLIsGreat")
-        ]
-
-        for clf in randomized_clfs:
-            with self.assertRaises(ValueError):
-                self.extension.seed_model(model=clf, seed=42)
-
-    def test_run_model_on_fold_classification_1(self):
-        task = openml.tasks.get_task(1)
-
-        X, y = task.get_X_and_y()
-        train_indices, test_indices = task.get_train_test_split_indices(
-            repeat=0, fold=0, sample=0)
-        X_train = X[train_indices]
-        y_train = y[train_indices]
-        X_test = X[test_indices]
-        y_test = y[test_indices]
-
-        pipeline = sklearn.pipeline.Pipeline(steps=[
-            ('imp', SimpleImputer()),
-            ('clf', sklearn.tree.DecisionTreeClassifier()),
-        ])
-        # TODO add some mocking here to actually test the innards of this function, too!
-        res = self.extension._run_model_on_fold(
-            model=pipeline,
-            task=task,
-            fold_no=0,
-            rep_no=0,
-            X_train=X_train,
-            y_train=y_train,
-            X_test=X_test,
-        )
-
-        y_hat, y_hat_proba, user_defined_measures, trace = res
-
-        # predictions
-        self.assertIsInstance(y_hat, np.ndarray)
-        self.assertEqual(y_hat.shape, y_test.shape)
-        self.assertIsInstance(y_hat_proba, np.ndarray)
-        self.assertEqual(y_hat_proba.shape, (y_test.shape[0], 6))
-        np.testing.assert_array_almost_equal(np.sum(y_hat_proba, axis=1), np.ones(y_test.shape))
-        # The class '4' (at index 3) is not present in the training data. We check that the
-        # predicted probabilities for that class are zero!
-        np.testing.assert_array_almost_equal(y_hat_proba[:, 3], np.zeros(y_test.shape))
-        for i in (0, 1, 2, 4, 5):
-            self.assertTrue(np.any(y_hat_proba[:, i] != np.zeros(y_test.shape)))
-
-        # check user defined measures
-        fold_evaluations = collections.defaultdict(lambda: collections.defaultdict(dict))
-        for measure in user_defined_measures:
-            fold_evaluations[measure][0][0] = user_defined_measures[measure]
-
-        # trace. SGD does not produce any
-        self.assertIsNone(trace)
-
-        self._check_fold_timing_evaluations(
-            fold_evaluations,
-            num_repeats=1,
-            num_folds=1,
-            task_type=task.task_type_id,
-            check_scores=False,
-        )
-
-    def test_run_model_on_fold_classification_2(self):
-        task = openml.tasks.get_task(7)
-
-        X, y = task.get_X_and_y()
-        train_indices, test_indices = task.get_train_test_split_indices(
-            repeat=0, fold=0, sample=0)
-        X_train = X[train_indices]
-        y_train = y[train_indices]
-        X_test = X[test_indices]
-        y_test = y[test_indices]
-
-        pipeline = sklearn.model_selection.GridSearchCV(
-            sklearn.tree.DecisionTreeClassifier(),
-            {
-                "max_depth": [1, 2],
-            },
-        )
-        # TODO add some mocking here to actually test the innards of this function, too!
-        res = self.extension._run_model_on_fold(
-            model=pipeline,
-            task=task,
-            fold_no=0,
-            rep_no=0,
-            X_train=X_train,
-            y_train=y_train,
-            X_test=X_test,
-        )
-
-        y_hat, y_hat_proba, user_defined_measures, trace = res
-
-        # predictions
-        self.assertIsInstance(y_hat, np.ndarray)
-        self.assertEqual(y_hat.shape, y_test.shape)
-        self.assertIsInstance(y_hat_proba, np.ndarray)
-        self.assertEqual(y_hat_proba.shape, (y_test.shape[0], 2))
-        np.testing.assert_array_almost_equal(np.sum(y_hat_proba, axis=1), np.ones(y_test.shape))
-        for i in (0, 1):
-            self.assertTrue(np.any(y_hat_proba[:, i] != np.zeros(y_test.shape)))
-
-        # check user defined measures
-        fold_evaluations = collections.defaultdict(lambda: collections.defaultdict(dict))
-        for measure in user_defined_measures:
-            fold_evaluations[measure][0][0] = user_defined_measures[measure]
-
-        # check that it produced and returned a trace object of the correct length
-        self.assertIsInstance(trace, OpenMLRunTrace)
-        self.assertEqual(len(trace.trace_iterations), 2)
-
-        self._check_fold_timing_evaluations(
-            fold_evaluations,
-            num_repeats=1,
-            num_folds=1,
-            task_type=task.task_type_id,
-            check_scores=False,
-        )
-
-    def test_run_model_on_fold_classification_3(self):
-
-        class HardNaiveBayes(sklearn.naive_bayes.GaussianNB):
-            # class for testing a naive bayes classifier that does not allow soft
-            # predictions
-            def __init__(self, priors=None):
-                super(HardNaiveBayes, self).__init__(priors)
-
-            def predict_proba(*args, **kwargs):
-                raise AttributeError('predict_proba is not available when '
-                                     'probability=False')
-
-        # task 1 (test server) is important: it is a task with an unused class
-        tasks = [1, 3, 115]
-        flow = unittest.mock.Mock()
-        flow.name = 'dummy'
-
-        for task_id in tasks:
-            task = openml.tasks.get_task(task_id)
-            X, y = task.get_X_and_y()
-            train_indices, test_indices = task.get_train_test_split_indices(
-                repeat=0, fold=0, sample=0)
-            X_train = X[train_indices]
-            y_train = y[train_indices]
-            X_test = X[test_indices]
-            clf1 = sklearn.pipeline.Pipeline(steps=[
-                ('imputer', SimpleImputer()),
-                ('estimator', sklearn.naive_bayes.GaussianNB())
-            ])
-            clf2 = sklearn.pipeline.Pipeline(steps=[
-                ('imputer', SimpleImputer()),
-                ('estimator', HardNaiveBayes())
-            ])
-
-            pred_1, proba_1, _, _ = self.extension._run_model_on_fold(
-                model=clf1,
-                task=task,
-                X_train=X_train,
-                y_train=y_train,
-                X_test=X_test,
-                fold_no=0,
-                rep_no=0,
-            )
-            pred_2, proba_2, _, _ = self.extension._run_model_on_fold(
-                model=clf2,
-                task=task,
-                X_train=X_train,
-                y_train=y_train,
-                X_test=X_test,
-                fold_no=0,
-                rep_no=0,
-            )
-
-            # verifies that the predictions are identical
-            np.testing.assert_array_equal(pred_1, pred_2)
-            np.testing.assert_array_almost_equal(np.sum(proba_1, axis=1), np.ones(X_test.shape[0]))
-            # Test that there are predictions other than ones and zeros
-            self.assertLess(
-                np.sum(proba_1 == 0) + np.sum(proba_1 == 1),
-                X_test.shape[0] * len(task.class_labels),
-            )
-
-            np.testing.assert_array_almost_equal(np.sum(proba_2, axis=1), np.ones(X_test.shape[0]))
-            # Test that there are only ones and zeros predicted
-            self.assertEqual(
-                np.sum(proba_2 == 0) + np.sum(proba_2 == 1),
-                X_test.shape[0] * len(task.class_labels),
-            )
-
-    def test_run_model_on_fold_regression(self):
-        # There aren't any regression tasks on the test server
-        openml.config.server = self.production_server
-        task = openml.tasks.get_task(2999)
-
-        X, y = task.get_X_and_y()
-        train_indices, test_indices = task.get_train_test_split_indices(
-            repeat=0, fold=0, sample=0)
-        X_train = X[train_indices]
-        y_train = y[train_indices]
-        X_test = X[test_indices]
-        y_test = y[test_indices]
-
-        pipeline = sklearn.pipeline.Pipeline(steps=[
-            ('imp', SimpleImputer()),
-            ('clf', sklearn.tree.DecisionTreeRegressor()),
-        ])
-        # TODO add some mocking here to actually test the innards of this function, too!
-        res = self.extension._run_model_on_fold(
-            model=pipeline,
-            task=task,
-            fold_no=0,
-            rep_no=0,
-            X_train=X_train,
-            y_train=y_train,
-            X_test=X_test,
-        )
-
-        y_hat, y_hat_proba, user_defined_measures, trace = res
-
-        # predictions
-        self.assertIsInstance(y_hat, np.ndarray)
-        self.assertEqual(y_hat.shape, y_test.shape)
-        self.assertIsNone(y_hat_proba)
-
-        # check user defined measures
-        fold_evaluations = collections.defaultdict(lambda: collections.defaultdict(dict))
-        for measure in user_defined_measures:
-            fold_evaluations[measure][0][0] = user_defined_measures[measure]
-
-        # trace. SGD does not produce any
-        self.assertIsNone(trace)
-
-        self._check_fold_timing_evaluations(
-            fold_evaluations,
-            num_repeats=1,
-            num_folds=1,
-            task_type=task.task_type_id,
-            check_scores=False,
-        )
-
-    def test_run_model_on_fold_clustering(self):
-        # There aren't any regression tasks on the test server
-        openml.config.server = self.production_server
-        task = openml.tasks.get_task(126033)
-
-        X = task.get_X(dataset_format='array')
-
-        pipeline = sklearn.pipeline.Pipeline(steps=[
-            ('imp', SimpleImputer()),
-            ('clf', sklearn.cluster.KMeans()),
-        ])
-        # TODO add some mocking here to actually test the innards of this function, too!
-        res = self.extension._run_model_on_fold(
-            model=pipeline,
-            task=task,
-            fold_no=0,
-            rep_no=0,
-            X_train=X,
-        )
-
-        y_hat, y_hat_proba, user_defined_measures, trace = res
-
-        # predictions
-        self.assertIsInstance(y_hat, np.ndarray)
-        self.assertEqual(y_hat.shape, (X.shape[0], ))
-        self.assertIsNone(y_hat_proba)
-
-        # check user defined measures
-        fold_evaluations = collections.defaultdict(lambda: collections.defaultdict(dict))
-        for measure in user_defined_measures:
-            fold_evaluations[measure][0][0] = user_defined_measures[measure]
-
-        # trace. SGD does not produce any
-        self.assertIsNone(trace)
-
-        self._check_fold_timing_evaluations(
-            fold_evaluations,
-            num_repeats=1,
-            num_folds=1,
-            task_type=task.task_type_id,
-            check_scores=False,
-        )
-
-    def test__extract_trace_data(self):
-
-        param_grid = {"hidden_layer_sizes": [[5, 5], [10, 10], [20, 20]],
-                      "activation": ['identity', 'logistic', 'tanh', 'relu'],
-                      "learning_rate_init": [0.1, 0.01, 0.001, 0.0001],
-                      "max_iter": [10, 20, 40, 80]}
-        num_iters = 10
-        task = openml.tasks.get_task(20)
-        clf = sklearn.model_selection.RandomizedSearchCV(
-            sklearn.neural_network.MLPClassifier(),
-            param_grid,
-            num_iters,
-        )
-        # just run the task on the model (without invoking any fancy extension & openml code)
-        train, _ = task.get_train_test_split_indices(0, 0)
-        X, y = task.get_X_and_y()
-        with warnings.catch_warnings():
-            warnings.simplefilter('ignore')
-            clf.fit(X[train], y[train])
-
-        # check num layers of MLP
-        self.assertIn(clf.best_estimator_.hidden_layer_sizes, param_grid['hidden_layer_sizes'])
-
-        trace_list = self.extension._extract_trace_data(clf, rep_no=0, fold_no=0)
-        trace = self.extension._obtain_arff_trace(clf, trace_list)
-
-        self.assertIsInstance(trace, OpenMLRunTrace)
-        self.assertIsInstance(trace_list, list)
-        self.assertEqual(len(trace_list), num_iters)
-
-        for trace_iteration in iter(trace):
-            self.assertEqual(trace_iteration.repeat, 0)
-            self.assertEqual(trace_iteration.fold, 0)
-            self.assertGreaterEqual(trace_iteration.iteration, 0)
-            self.assertLessEqual(trace_iteration.iteration, num_iters)
-            self.assertIsNone(trace_iteration.setup_string)
-            self.assertIsInstance(trace_iteration.evaluation, float)
-            self.assertTrue(np.isfinite(trace_iteration.evaluation))
-            self.assertIsInstance(trace_iteration.selected, bool)
-
-            self.assertEqual(len(trace_iteration.parameters), len(param_grid))
-            for param in param_grid:
-
-                # Prepend with the "parameter_" prefix
-                param_in_trace = "parameter_%s" % param
-                self.assertIn(param_in_trace, trace_iteration.parameters)
-                param_value = json.loads(trace_iteration.parameters[param_in_trace])
-                self.assertTrue(param_value in param_grid[param])
-
-    def test_trim_flow_name(self):
-        import re
-        long = """sklearn.pipeline.Pipeline(
-                    columntransformer=sklearn.compose._column_transformer.ColumnTransformer(
-                        numeric=sklearn.pipeline.Pipeline(
-                            SimpleImputer=sklearn.preprocessing.imputation.Imputer,
-                            standardscaler=sklearn.preprocessing.data.StandardScaler),
-                        nominal=sklearn.pipeline.Pipeline(
-                            simpleimputer=sklearn.impute.SimpleImputer,
-                            onehotencoder=sklearn.preprocessing._encoders.OneHotEncoder)),
-                    variancethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold,
-                    svc=sklearn.svm.classes.SVC)"""
-        short = "sklearn.Pipeline(ColumnTransformer,VarianceThreshold,SVC)"
-        shorter = "sklearn.Pipeline(...,SVC)"
-        long_stripped, _ = re.subn(r'\s', '', long)
-        self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped))
-        self.assertEqual(shorter,
-                         SklearnExtension.trim_flow_name(long_stripped, extra_trim_length=50))
-
-        long = """sklearn.pipeline.Pipeline(
-                    imputation=openmlstudy14.preprocessing.ConditionalImputer,
-                    hotencoding=sklearn.preprocessing.data.OneHotEncoder,
-                    variencethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold,
-                    classifier=sklearn.ensemble.forest.RandomForestClassifier)"""
-        short = "sklearn.Pipeline(ConditionalImputer,OneHotEncoder,VarianceThreshold,RandomForestClassifier)"  # noqa: E501
-        long_stripped, _ = re.subn(r'\s', '', long)
-        self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped))
-
-        long = """sklearn.pipeline.Pipeline(
-                    SimpleImputer=sklearn.preprocessing.imputation.Imputer,
-                    VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, # noqa: E501
-                    Estimator=sklearn.model_selection._search.RandomizedSearchCV(
-                        estimator=sklearn.tree.tree.DecisionTreeClassifier))"""
-        short = "sklearn.Pipeline(Imputer,VarianceThreshold,RandomizedSearchCV(DecisionTreeClassifier))"  # noqa: E501
-        long_stripped, _ = re.subn(r'\s', '', long)
-        self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped))
-
-        long = """sklearn.model_selection._search.RandomizedSearchCV(
-                    estimator=sklearn.pipeline.Pipeline(
-                        SimpleImputer=sklearn.preprocessing.imputation.Imputer,
-                        classifier=sklearn.ensemble.forest.RandomForestClassifier))"""
-        short = "sklearn.RandomizedSearchCV(Pipeline(Imputer,RandomForestClassifier))"
-        long_stripped, _ = re.subn(r'\s', '', long)
-        self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped))
-
-        long = """sklearn.pipeline.FeatureUnion(
-                    pca=sklearn.decomposition.pca.PCA,
-                    svd=sklearn.decomposition.truncated_svd.TruncatedSVD)"""
-        short = "sklearn.FeatureUnion(PCA,TruncatedSVD)"
-        long_stripped, _ = re.subn(r'\s', '', long)
-        self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped))
-
-        long = "sklearn.ensemble.forest.RandomForestClassifier"
-        short = "sklearn.RandomForestClassifier"
-        self.assertEqual(short, SklearnExtension.trim_flow_name(long))
-
-        self.assertEqual("weka.IsolationForest",
-                         SklearnExtension.trim_flow_name("weka.IsolationForest"))
diff --git a/tests/test_flows/dummy_learn/dummy_forest.py b/tests/test_flows/dummy_learn/dummy_forest.py
index 06eaab62e..65e79e760 100644
--- a/tests/test_flows/dummy_learn/dummy_forest.py
+++ b/tests/test_flows/dummy_learn/dummy_forest.py
@@ -1,4 +1,8 @@
-class DummyRegressor(object):
+# License: BSD 3-Clause
+from __future__ import annotations
+
+
+class DummyRegressor:
     def fit(self, X, y):
         return self
 
diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py
index 25e2dacfb..4e391fd3b 100644
--- a/tests/test_flows/test_flow.py
+++ b/tests/test_flows/test_flow.py
@@ -1,11 +1,16 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
 import collections
 import copy
-from distutils.version import LooseVersion
 import hashlib
 import re
+import os
 import time
+from packaging.version import Version
 from unittest import mock
 
+import pytest
 import scipy.stats
 import sklearn
 import sklearn.datasets
@@ -14,19 +19,19 @@
 import sklearn.ensemble
 import sklearn.feature_selection
 import sklearn.model_selection
+import sklearn.naive_bayes
 import sklearn.pipeline
 import sklearn.preprocessing
-import sklearn.naive_bayes
 import sklearn.tree
-
 import xmltodict
 
+from openml_sklearn import SklearnExtension
+
 import openml
-from openml._api_calls import _perform_api_call
 import openml.exceptions
-import openml.extensions.sklearn
-from openml.testing import TestBase, SimpleImputer
 import openml.utils
+from openml._api_calls import _perform_api_call
+from openml.testing import SimpleImputer, TestBase
 
 
 class TestFlow(TestBase):
@@ -34,114 +39,135 @@ class TestFlow(TestBase):
 
     def setUp(self):
         super().setUp()
-        self.extension = openml.extensions.sklearn.SklearnExtension()
+        self.extension = SklearnExtension()
 
     def tearDown(self):
         super().tearDown()
 
+    @pytest.mark.production_server()
     def test_get_flow(self):
         # We need to use the production server here because 4024 is not the
         # test server
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         flow = openml.flows.get_flow(4024)
-        self.assertIsInstance(flow, openml.OpenMLFlow)
-        self.assertEqual(flow.flow_id, 4024)
-        self.assertEqual(len(flow.parameters), 24)
-        self.assertEqual(len(flow.components), 1)
-
-        subflow_1 = list(flow.components.values())[0]
-        self.assertIsInstance(subflow_1, openml.OpenMLFlow)
-        self.assertEqual(subflow_1.flow_id, 4025)
-        self.assertEqual(len(subflow_1.parameters), 14)
-        self.assertEqual(subflow_1.parameters['E'], 'CC')
-        self.assertEqual(len(subflow_1.components), 1)
-
-        subflow_2 = list(subflow_1.components.values())[0]
-        self.assertIsInstance(subflow_2, openml.OpenMLFlow)
-        self.assertEqual(subflow_2.flow_id, 4026)
-        self.assertEqual(len(subflow_2.parameters), 13)
-        self.assertEqual(subflow_2.parameters['I'], '10')
-        self.assertEqual(len(subflow_2.components), 1)
-
-        subflow_3 = list(subflow_2.components.values())[0]
-        self.assertIsInstance(subflow_3, openml.OpenMLFlow)
-        self.assertEqual(subflow_3.flow_id, 1724)
-        self.assertEqual(len(subflow_3.parameters), 11)
-        self.assertEqual(subflow_3.parameters['L'], '-1')
-        self.assertEqual(len(subflow_3.components), 0)
-
+        assert isinstance(flow, openml.OpenMLFlow)
+        assert flow.flow_id == 4024
+        assert len(flow.parameters) == 24
+        assert len(flow.components) == 1
+
+        subflow_1 = next(iter(flow.components.values()))
+        assert isinstance(subflow_1, openml.OpenMLFlow)
+        assert subflow_1.flow_id == 4025
+        assert len(subflow_1.parameters) == 14
+        assert subflow_1.parameters["E"] == "CC"
+        assert len(subflow_1.components) == 1
+
+        subflow_2 = next(iter(subflow_1.components.values()))
+        assert isinstance(subflow_2, openml.OpenMLFlow)
+        assert subflow_2.flow_id == 4026
+        assert len(subflow_2.parameters) == 13
+        assert subflow_2.parameters["I"] == "10"
+        assert len(subflow_2.components) == 1
+
+        subflow_3 = next(iter(subflow_2.components.values()))
+        assert isinstance(subflow_3, openml.OpenMLFlow)
+        assert subflow_3.flow_id == 1724
+        assert len(subflow_3.parameters) == 11
+        assert subflow_3.parameters["L"] == "-1"
+        assert len(subflow_3.components) == 0
+
+    @pytest.mark.production_server()
+    @pytest.mark.xfail(reason="failures_issue_1544", strict=False)
     def test_get_structure(self):
         # also responsible for testing: flow.get_subflow
         # We need to use the production server here because 4024 is not the
         # test server
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         flow = openml.flows.get_flow(4024)
-        flow_structure_name = flow.get_structure('name')
-        flow_structure_id = flow.get_structure('flow_id')
+        flow_structure_name = flow.get_structure("name")
+        flow_structure_id = flow.get_structure("flow_id")
         # components: root (filteredclassifier), multisearch, loginboost,
         # reptree
-        self.assertEqual(len(flow_structure_name), 4)
-        self.assertEqual(len(flow_structure_id), 4)
+        assert len(flow_structure_name) == 4
+        assert len(flow_structure_id) == 4
 
         for sub_flow_name, structure in flow_structure_name.items():
             if len(structure) > 0:  # skip root element
                 subflow = flow.get_subflow(structure)
-                self.assertEqual(subflow.name, sub_flow_name)
+                assert subflow.name == sub_flow_name
 
         for sub_flow_id, structure in flow_structure_id.items():
             if len(structure) > 0:  # skip root element
                 subflow = flow.get_subflow(structure)
-                self.assertEqual(subflow.flow_id, sub_flow_id)
+                assert subflow.flow_id == sub_flow_id
 
+    @pytest.mark.test_server()
     def test_tagging(self):
-        flow_list = openml.flows.list_flows(size=1)
-        flow_id = list(flow_list.keys())[0]
+        flows = openml.flows.list_flows(size=1)
+        flow_id = flows["id"].iloc[0]
         flow = openml.flows.get_flow(flow_id)
-        tag = "testing_tag_{}_{}".format(self.id(), time.time())
-        flow_list = openml.flows.list_flows(tag=tag)
-        self.assertEqual(len(flow_list), 0)
+        # tags can be at most 64 alphanumeric (+ underscore) chars
+        unique_indicator = str(time.time()).replace(".", "")
+        tag = f"test_tag_TestFlow_{unique_indicator}"
+        flows = openml.flows.list_flows(tag=tag)
+        assert len(flows) == 0
         flow.push_tag(tag)
-        flow_list = openml.flows.list_flows(tag=tag)
-        self.assertEqual(len(flow_list), 1)
-        self.assertIn(flow_id, flow_list)
+        flows = openml.flows.list_flows(tag=tag)
+        assert len(flows) == 1
+        assert flow_id in flows["id"]
         flow.remove_tag(tag)
-        flow_list = openml.flows.list_flows(tag=tag)
-        self.assertEqual(len(flow_list), 0)
+        flows = openml.flows.list_flows(tag=tag)
+        assert len(flows) == 0
 
+    @pytest.mark.test_server()
     def test_from_xml_to_xml(self):
         # Get the raw xml thing
         # TODO maybe get this via get_flow(), which would have to be refactored
         # to allow getting only the xml dictionary
         # TODO: no sklearn flows.
-        for flow_id in [3, 5, 7, 9, ]:
-            flow_xml = _perform_api_call("flow/%d" % flow_id,
-                                         request_method='get')
+        for flow_id in [
+            3,
+            5,
+            7,
+            9,
+        ]:
+            flow_xml = _perform_api_call("flow/%d" % flow_id, request_method="get")
             flow_dict = xmltodict.parse(flow_xml)
 
             flow = openml.OpenMLFlow._from_dict(flow_dict)
             new_xml = flow._to_xml()
 
             flow_xml = (
-                flow_xml.replace('  ', '').replace('\t', '').
-                strip().replace('\n\n', '\n').replace('&quot;', '"')
+                flow_xml.replace("  ", "")
+                .replace("\t", "")
+                .strip()
+                .replace("\n\n", "\n")
+                .replace("&quot;", '"')
             )
-            flow_xml = re.sub(r'^$', '', flow_xml)
+            flow_xml = re.sub(r"^$", "", flow_xml)
             new_xml = (
-                new_xml.replace('  ', '').replace('\t', '').
-                strip().replace('\n\n', '\n').replace('&quot;', '"')
+                new_xml.replace("  ", "")
+                .replace("\t", "")
+                .strip()
+                .replace("\n\n", "\n")
+                .replace("&quot;", '"')
             )
-            new_xml = re.sub(r'^$', '', new_xml)
+            new_xml = re.sub(r"^$", "", new_xml)
 
-            self.assertEqual(new_xml, flow_xml)
+            assert new_xml == flow_xml
 
+    @pytest.mark.sklearn()
     def test_to_xml_from_xml(self):
         scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
+        estimator_name = (
+            "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+        )
         boosting = sklearn.ensemble.AdaBoostClassifier(
-            base_estimator=sklearn.tree.DecisionTreeClassifier())
-        model = sklearn.pipeline.Pipeline(steps=(
-            ('scaler', scaler), ('boosting', boosting)))
+            **{estimator_name: sklearn.tree.DecisionTreeClassifier()},
+        )
+        model = sklearn.pipeline.Pipeline(steps=(("scaler", scaler), ("boosting", boosting)))
         flow = self.extension.model_to_flow(model)
         flow.flow_id = -234
         # end of setup
@@ -152,206 +178,229 @@ def test_to_xml_from_xml(self):
 
         # Would raise exception if they are not legal
         openml.flows.functions.assert_flows_equal(new_flow, flow)
-        self.assertIsNot(new_flow, flow)
+        assert new_flow is not flow
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_publish_flow(self):
         flow = openml.OpenMLFlow(
-            name='sklearn.dummy.DummyClassifier',
-            class_name='sklearn.dummy.DummyClassifier',
+            name="sklearn.dummy.DummyClassifier",
+            class_name="sklearn.dummy.DummyClassifier",
             description="test description",
             model=sklearn.dummy.DummyClassifier(),
             components=collections.OrderedDict(),
             parameters=collections.OrderedDict(),
             parameters_meta_info=collections.OrderedDict(),
             external_version=self.extension._format_external_version(
-                'sklearn',
+                "sklearn",
                 sklearn.__version__,
             ),
             tags=[],
-            language='English',
+            language="English",
             dependencies=None,
         )
 
         flow, _ = self._add_sentinel_to_flow_name(flow, None)
 
         flow.publish()
-        TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                            flow.flow_id))
-        self.assertIsInstance(flow.flow_id, int)
+        TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
+        assert isinstance(flow.flow_id, int)
 
-    @mock.patch('openml.flows.functions.flow_exists')
+    @pytest.mark.sklearn()
+    @mock.patch("openml.flows.functions.flow_exists")
     def test_publish_existing_flow(self, flow_exists_mock):
         clf = sklearn.tree.DecisionTreeClassifier(max_depth=2)
         flow = self.extension.model_to_flow(clf)
         flow_exists_mock.return_value = 1
 
-        with self.assertRaises(openml.exceptions.PyOpenMLError) as context_manager:
+        with pytest.raises(openml.exceptions.PyOpenMLError, match="OpenMLFlow already exists"):
             flow.publish(raise_error_if_exists=True)
-            TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
-            TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                                flow.flow_id))
 
-        self.assertTrue('OpenMLFlow already exists' in context_manager.exception.message)
+        TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
+        TestBase.logger.info(
+            f"collected from {__file__.split('/')[-1]}: {flow.flow_id}",
+        )
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_publish_flow_with_similar_components(self):
-        clf = sklearn.ensemble.VotingClassifier([
-            ('lr', sklearn.linear_model.LogisticRegression(solver='lbfgs')),
-        ])
+        clf = sklearn.ensemble.VotingClassifier(
+            [("lr", sklearn.linear_model.LogisticRegression(solver="lbfgs"))],
+        )
         flow = self.extension.model_to_flow(clf)
         flow, _ = self._add_sentinel_to_flow_name(flow, None)
         flow.publish()
-        TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                            flow.flow_id))
+        TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
         # For a flow where both components are published together, the upload
         # date should be equal
-        self.assertEqual(
-            flow.upload_date,
-            flow.components['lr'].upload_date,
-            msg=(
-                flow.name,
-                flow.flow_id,
-                flow.components['lr'].name, flow.components['lr'].flow_id,
-            ),
+        assert flow.upload_date == flow.components["lr"].upload_date, (
+            flow.name,
+            flow.flow_id,
+            flow.components["lr"].name,
+            flow.components["lr"].flow_id,
         )
 
         clf1 = sklearn.tree.DecisionTreeClassifier(max_depth=2)
         flow1 = self.extension.model_to_flow(clf1)
         flow1, sentinel = self._add_sentinel_to_flow_name(flow1, None)
         flow1.publish()
-        TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                            flow1.flow_id))
+        TestBase._mark_entity_for_removal("flow", flow1.flow_id, flow1.name)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow1.flow_id}")
 
         # In order to assign different upload times to the flows!
         time.sleep(1)
 
         clf2 = sklearn.ensemble.VotingClassifier(
-            [('dt', sklearn.tree.DecisionTreeClassifier(max_depth=2))])
+            [("dt", sklearn.tree.DecisionTreeClassifier(max_depth=2))],
+        )
         flow2 = self.extension.model_to_flow(clf2)
         flow2, _ = self._add_sentinel_to_flow_name(flow2, sentinel)
         flow2.publish()
-        TestBase._mark_entity_for_removal('flow', (flow2.flow_id, flow2.name))
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                            flow2.flow_id))
+        TestBase._mark_entity_for_removal("flow", flow2.flow_id, flow2.name)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow2.flow_id}")
         # If one component was published before the other, the components in
         # the flow should have different upload dates
-        self.assertNotEqual(flow2.upload_date,
-                            flow2.components['dt'].upload_date)
+        assert flow2.upload_date != flow2.components["dt"].upload_date
 
-        clf3 = sklearn.ensemble.AdaBoostClassifier(
-            sklearn.tree.DecisionTreeClassifier(max_depth=3))
+        clf3 = sklearn.ensemble.AdaBoostClassifier(sklearn.tree.DecisionTreeClassifier(max_depth=3))
         flow3 = self.extension.model_to_flow(clf3)
         flow3, _ = self._add_sentinel_to_flow_name(flow3, sentinel)
         # Child flow has different parameter. Check for storing the flow
         # correctly on the server should thus not check the child's parameters!
         flow3.publish()
-        TestBase._mark_entity_for_removal('flow', (flow3.flow_id, flow3.name))
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                            flow3.flow_id))
+        TestBase._mark_entity_for_removal("flow", flow3.flow_id, flow3.name)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow3.flow_id}")
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_semi_legal_flow(self):
         # TODO: Test if parameters are set correctly!
         # should not throw error as it contains two differentiable forms of
         # Bagging i.e., Bagging(Bagging(J48)) and Bagging(J48)
+        estimator_name = (
+            "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+        )
         semi_legal = sklearn.ensemble.BaggingClassifier(
-            base_estimator=sklearn.ensemble.BaggingClassifier(
-                base_estimator=sklearn.tree.DecisionTreeClassifier()))
+            **{
+                estimator_name: sklearn.ensemble.BaggingClassifier(
+                    **{
+                        estimator_name: sklearn.tree.DecisionTreeClassifier(),
+                    }
+                )
+            }
+        )
         flow = self.extension.model_to_flow(semi_legal)
         flow, _ = self._add_sentinel_to_flow_name(flow, None)
 
         flow.publish()
-        TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                            flow.flow_id))
+        TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
 
-    @mock.patch('openml.flows.functions.get_flow')
-    @mock.patch('openml.flows.functions.flow_exists')
-    @mock.patch('openml._api_calls._perform_api_call')
+    @pytest.mark.sklearn()
+    @mock.patch("openml.flows.functions.get_flow")
+    @mock.patch("openml.flows.functions.flow_exists")
+    @mock.patch("openml._api_calls._perform_api_call")
     def test_publish_error(self, api_call_mock, flow_exists_mock, get_flow_mock):
         model = sklearn.ensemble.RandomForestClassifier()
         flow = self.extension.model_to_flow(model)
-        api_call_mock.return_value = "<oml:upload_flow>\n" \
-                                     "    <oml:id>1</oml:id>\n" \
-                                     "</oml:upload_flow>"
+        api_call_mock.return_value = (
+            "<oml:upload_flow>\n" "    <oml:id>1</oml:id>\n" "</oml:upload_flow>"
+        )
         flow_exists_mock.return_value = False
         get_flow_mock.return_value = flow
 
         flow.publish()
         # Not collecting flow_id for deletion since this is a test for failed upload
 
-        self.assertEqual(api_call_mock.call_count, 1)
-        self.assertEqual(get_flow_mock.call_count, 1)
-        self.assertEqual(flow_exists_mock.call_count, 1)
+        assert api_call_mock.call_count == 1
+        assert get_flow_mock.call_count == 1
+        assert flow_exists_mock.call_count == 1
 
         flow_copy = copy.deepcopy(flow)
         flow_copy.name = flow_copy.name[:-1]
         get_flow_mock.return_value = flow_copy
         flow_exists_mock.return_value = 1
 
-        with self.assertRaises(ValueError) as context_manager:
+        if Version(sklearn.__version__) < Version("0.22"):
+            fixture = (
+                "The flow on the server is inconsistent with the local flow. "
+                "The server flow ID is 1. Please check manually and remove "
+                "the flow if necessary! Error is:\n"
+                "'Flow sklearn.ensemble.forest.RandomForestClassifier: "
+                "values for attribute 'name' differ: "
+                "'sklearn.ensemble.forest.RandomForestClassifier'"
+                "\nvs\n'sklearn.ensemble.forest.RandomForestClassifie'.'"
+            )
+        else:
+            # sklearn.ensemble.forest -> sklearn.ensemble._forest
+            fixture = (
+                "The flow on the server is inconsistent with the local flow. "
+                "The server flow ID is 1. Please check manually and remove "
+                "the flow if necessary! Error is:\n"
+                "'Flow sklearn.ensemble._forest.RandomForestClassifier: "
+                "values for attribute 'name' differ: "
+                "'sklearn.ensemble._forest.RandomForestClassifier'"
+                "\nvs\n'sklearn.ensemble._forest.RandomForestClassifie'.'"
+            )
+        with pytest.raises(ValueError, match=re.escape(fixture)):
             flow.publish()
-            TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
-            TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                                flow.flow_id))
-
-        fixture = (
-            "The flow on the server is inconsistent with the local flow. "
-            "The server flow ID is 1. Please check manually and remove "
-            "the flow if necessary! Error is:\n"
-            "'Flow sklearn.ensemble.forest.RandomForestClassifier: "
-            "values for attribute 'name' differ: "
-            "'sklearn.ensemble.forest.RandomForestClassifier'"
-            "\nvs\n'sklearn.ensemble.forest.RandomForestClassifie'.'"
+
+        TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
+        TestBase.logger.info(
+            f"collected from {__file__.split('/')[-1]}: {flow.flow_id}",
         )
 
-        self.assertEqual(context_manager.exception.args[0], fixture)
-        self.assertEqual(get_flow_mock.call_count, 2)
+        assert get_flow_mock.call_count == 2
 
+    @pytest.mark.sklearn()
     def test_illegal_flow(self):
         # should throw error as it contains two imputers
         illegal = sklearn.pipeline.Pipeline(
             steps=[
-                ('imputer1', SimpleImputer()),
-                ('imputer2', SimpleImputer()),
-                ('classif', sklearn.tree.DecisionTreeClassifier())
-            ]
+                ("imputer1", SimpleImputer()),
+                ("imputer2", SimpleImputer()),
+                ("classif", sklearn.tree.DecisionTreeClassifier()),
+            ],
         )
         self.assertRaises(ValueError, self.extension.model_to_flow, illegal)
 
+    @pytest.mark.test_server()
     def test_nonexisting_flow_exists(self):
         def get_sentinel():
             # Create a unique prefix for the flow. Necessary because the flow
             # is identified by its name and external version online. Having a
             # unique name allows us to publish the same flow in each test run
             md5 = hashlib.md5()
-            md5.update(str(time.time()).encode('utf-8'))
+            md5.update(str(time.time()).encode("utf-8"))
             sentinel = md5.hexdigest()[:10]
-            sentinel = 'TEST%s' % sentinel
-            return sentinel
+            return f"TEST{sentinel}"
 
         name = get_sentinel() + get_sentinel()
         version = get_sentinel()
 
         flow_id = openml.flows.flow_exists(name, version)
-        self.assertFalse(flow_id)
+        assert not flow_id
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_existing_flow_exists(self):
         # create a flow
         nb = sklearn.naive_bayes.GaussianNB()
 
-        ohe_params = {'sparse': False, 'handle_unknown': 'ignore'}
-        if LooseVersion(sklearn.__version__) >= '0.20':
-            ohe_params['categories'] = 'auto'
+        sparse = "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output"
+        ohe_params = {sparse: False, "handle_unknown": "ignore"}
+        if Version(sklearn.__version__) >= Version("0.20"):
+            ohe_params["categories"] = "auto"
         steps = [
-            ('imputation', SimpleImputer(strategy='median')),
-            ('hotencoding', sklearn.preprocessing.OneHotEncoder(**ohe_params)),
+            ("imputation", SimpleImputer(strategy="median")),
+            ("hotencoding", sklearn.preprocessing.OneHotEncoder(**ohe_params)),
             (
-                'variencethreshold',
+                "variencethreshold",
                 sklearn.feature_selection.VarianceThreshold(),
             ),
-            ('classifier', sklearn.tree.DecisionTreeClassifier())
+            ("classifier", sklearn.tree.DecisionTreeClassifier()),
         ]
         complicated = sklearn.pipeline.Pipeline(steps=steps)
 
@@ -360,9 +409,10 @@ def test_existing_flow_exists(self):
             flow, _ = self._add_sentinel_to_flow_name(flow, None)
             # publish the flow
             flow = flow.publish()
-            TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
-            TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                                flow.flow_id))
+            TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
+            TestBase.logger.info(
+                f"collected from {__file__.split('/')[-1]}: {flow.flow_id}",
+            )
             # redownload the flow
             flow = openml.flows.get_flow(flow.flow_id)
 
@@ -372,42 +422,47 @@ def test_existing_flow_exists(self):
                 flow.name,
                 flow.external_version,
             )
-            self.assertEqual(downloaded_flow_id, flow.flow_id)
+            assert downloaded_flow_id == flow.flow_id
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_sklearn_to_upload_to_flow(self):
         iris = sklearn.datasets.load_iris()
         X = iris.data
         y = iris.target
 
         # Test a more complicated flow
-        ohe_params = {'handle_unknown': 'ignore'}
-        if LooseVersion(sklearn.__version__) >= "0.20":
-            ohe_params['categories'] = 'auto'
+        ohe_params = {"handle_unknown": "ignore"}
+        if Version(sklearn.__version__) >= Version("0.20"):
+            ohe_params["categories"] = "auto"
         ohe = sklearn.preprocessing.OneHotEncoder(**ohe_params)
         scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
         pca = sklearn.decomposition.TruncatedSVD()
         fs = sklearn.feature_selection.SelectPercentile(
-            score_func=sklearn.feature_selection.f_classif, percentile=30)
-        fu = sklearn.pipeline.FeatureUnion(transformer_list=[
-            ('pca', pca), ('fs', fs)])
+            score_func=sklearn.feature_selection.f_classif,
+            percentile=30,
+        )
+        fu = sklearn.pipeline.FeatureUnion(transformer_list=[("pca", pca), ("fs", fs)])
+        estimator_name = (
+            "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+        )
         boosting = sklearn.ensemble.AdaBoostClassifier(
-            base_estimator=sklearn.tree.DecisionTreeClassifier())
+            **{estimator_name: sklearn.tree.DecisionTreeClassifier()},
+        )
         model = sklearn.pipeline.Pipeline(
-            steps=[
-                ('ohe', ohe),
-                ('scaler', scaler),
-                ('fu', fu),
-                ('boosting', boosting),
-            ]
+            steps=[("ohe", ohe), ("scaler", scaler), ("fu", fu), ("boosting", boosting)],
         )
         parameter_grid = {
-            'boosting__n_estimators': [1, 5, 10, 100],
-            'boosting__learning_rate': scipy.stats.uniform(0.01, 0.99),
-            'boosting__base_estimator__max_depth': scipy.stats.randint(1, 10),
+            "boosting__n_estimators": [1, 5, 10, 100],
+            "boosting__learning_rate": scipy.stats.uniform(0.01, 0.99),
+            f"boosting__{estimator_name}__max_depth": scipy.stats.randint(1, 10),
         }
         cv = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True)
         rs = sklearn.model_selection.RandomizedSearchCV(
-            estimator=model, param_distributions=parameter_grid, cv=cv)
+            estimator=model,
+            param_distributions=parameter_grid,
+            cv=cv,
+        )
         rs.fit(X, y)
         flow = self.extension.model_to_flow(rs)
         # Tags may be sorted in any order (by the server). Just using one tag
@@ -421,10 +476,9 @@ def test_sklearn_to_upload_to_flow(self):
         flow, sentinel = self._add_sentinel_to_flow_name(flow, None)
 
         flow.publish()
-        TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                            flow.flow_id))
-        self.assertIsInstance(flow.flow_id, int)
+        TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
+        assert isinstance(flow.flow_id, int)
 
         # Check whether we can load the flow again
         # Remove the sentinel from the name again so that we can reinstantiate
@@ -434,81 +488,106 @@ def test_sklearn_to_upload_to_flow(self):
         local_xml = flow._to_xml()
         server_xml = new_flow._to_xml()
 
-        for i in range(10):
+        for _i in range(10):
             # Make sure that we replace all occurences of two newlines
-            local_xml = local_xml.replace(sentinel, '')
+            local_xml = local_xml.replace(sentinel, "")
             local_xml = (
-                local_xml.replace('  ', '').replace('\t', '').
-                strip().replace('\n\n', '\n').replace('&quot;', '"')
+                local_xml.replace("  ", "")
+                .replace("\t", "")
+                .strip()
+                .replace("\n\n", "\n")
+                .replace("&quot;", '"')
             )
-            local_xml = re.sub(r'(^$)', '', local_xml)
-            server_xml = server_xml.replace(sentinel, '')
+            local_xml = re.sub(r"(^$)", "", local_xml)
+            server_xml = server_xml.replace(sentinel, "")
             server_xml = (
-                server_xml.replace('  ', '').replace('\t', '').
-                strip().replace('\n\n', '\n').replace('&quot;', '"')
+                server_xml.replace("  ", "")
+                .replace("\t", "")
+                .strip()
+                .replace("\n\n", "\n")
+                .replace("&quot;", '"')
             )
-            server_xml = re.sub(r'^$', '', server_xml)
+            server_xml = re.sub(r"^$", "", server_xml)
 
-        self.assertEqual(server_xml, local_xml)
+        assert server_xml == local_xml
 
         # Would raise exception if they are not equal!
         openml.flows.functions.assert_flows_equal(new_flow, flow)
-        self.assertIsNot(new_flow, flow)
+        assert new_flow is not flow
 
         # OneHotEncoder was moved to _encoders module in 0.20
-        module_name_encoder = ('_encoders'
-                               if LooseVersion(sklearn.__version__) >= "0.20"
-                               else 'data')
-        fixture_name = (
-            '%ssklearn.model_selection._search.RandomizedSearchCV('
-            'estimator=sklearn.pipeline.Pipeline('
-            'ohe=sklearn.preprocessing.%s.OneHotEncoder,'
-            'scaler=sklearn.preprocessing.data.StandardScaler,'
-            'fu=sklearn.pipeline.FeatureUnion('
-            'pca=sklearn.decomposition.truncated_svd.TruncatedSVD,'
-            'fs='
-            'sklearn.feature_selection.univariate_selection.SelectPercentile),'
-            'boosting=sklearn.ensemble.weight_boosting.AdaBoostClassifier('
-            'base_estimator=sklearn.tree.tree.DecisionTreeClassifier)))'
-            % (sentinel, module_name_encoder)
+        module_name_encoder = (
+            "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data"
         )
-        self.assertEqual(new_flow.name, fixture_name)
+        if Version(sklearn.__version__) < Version("0.22"):
+            fixture_name = (
+                f"{sentinel}sklearn.model_selection._search.RandomizedSearchCV("
+                "estimator=sklearn.pipeline.Pipeline("
+                f"ohe=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder,"
+                "scaler=sklearn.preprocessing.data.StandardScaler,"
+                "fu=sklearn.pipeline.FeatureUnion("
+                "pca=sklearn.decomposition.truncated_svd.TruncatedSVD,"
+                "fs="
+                "sklearn.feature_selection.univariate_selection.SelectPercentile),"
+                "boosting=sklearn.ensemble.weight_boosting.AdaBoostClassifier("
+                "base_estimator=sklearn.tree.tree.DecisionTreeClassifier)))"
+            )
+        else:
+            # sklearn.preprocessing.data -> sklearn.preprocessing._data
+            # sklearn.decomposition.truncated_svd -> sklearn.decomposition._truncated_svd
+            # sklearn.feature_selection.univariate_selection ->
+            #     sklearn.feature_selection._univariate_selection
+            # sklearn.ensemble.weight_boosting -> sklearn.ensemble._weight_boosting
+            # sklearn.tree.tree.DecisionTree... -> sklearn.tree._classes.DecisionTree...
+            fixture_name = (
+                f"{sentinel}sklearn.model_selection._search.RandomizedSearchCV("
+                "estimator=sklearn.pipeline.Pipeline("
+                f"ohe=sklearn.preprocessing.{module_name_encoder}.OneHotEncoder,"
+                "scaler=sklearn.preprocessing._data.StandardScaler,"
+                "fu=sklearn.pipeline.FeatureUnion("
+                "pca=sklearn.decomposition._truncated_svd.TruncatedSVD,"
+                "fs="
+                "sklearn.feature_selection._univariate_selection.SelectPercentile),"
+                "boosting=sklearn.ensemble._weight_boosting.AdaBoostClassifier("
+                f"{estimator_name}=sklearn.tree._classes.DecisionTreeClassifier)))"
+            )
+        assert new_flow.name == fixture_name
         new_flow.model.fit(X, y)
 
     def test_extract_tags(self):
         flow_xml = "<oml:tag>study_14</oml:tag>"
         flow_dict = xmltodict.parse(flow_xml)
-        tags = openml.utils.extract_xml_tags('oml:tag', flow_dict)
-        self.assertEqual(tags, ['study_14'])
+        tags = openml.utils.extract_xml_tags("oml:tag", flow_dict)
+        assert tags == ["study_14"]
 
-        flow_xml = "<oml:flow><oml:tag>OpenmlWeka</oml:tag>\n" \
-                   "<oml:tag>weka</oml:tag></oml:flow>"
+        flow_xml = "<oml:flow><oml:tag>OpenmlWeka</oml:tag>\n" "<oml:tag>weka</oml:tag></oml:flow>"
         flow_dict = xmltodict.parse(flow_xml)
-        tags = openml.utils.extract_xml_tags('oml:tag', flow_dict['oml:flow'])
-        self.assertEqual(tags, ['OpenmlWeka', 'weka'])
+        tags = openml.utils.extract_xml_tags("oml:tag", flow_dict["oml:flow"])
+        assert tags == ["OpenmlWeka", "weka"]
 
+    @pytest.mark.production_server()
     def test_download_non_scikit_learn_flows(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         flow = openml.flows.get_flow(6742)
-        self.assertIsInstance(flow, openml.OpenMLFlow)
-        self.assertEqual(flow.flow_id, 6742)
-        self.assertEqual(len(flow.parameters), 19)
-        self.assertEqual(len(flow.components), 1)
-        self.assertIsNone(flow.model)
-
-        subflow_1 = list(flow.components.values())[0]
-        self.assertIsInstance(subflow_1, openml.OpenMLFlow)
-        self.assertEqual(subflow_1.flow_id, 6743)
-        self.assertEqual(len(subflow_1.parameters), 8)
-        self.assertEqual(subflow_1.parameters['U'], '0')
-        self.assertEqual(len(subflow_1.components), 1)
-        self.assertIsNone(subflow_1.model)
-
-        subflow_2 = list(subflow_1.components.values())[0]
-        self.assertIsInstance(subflow_2, openml.OpenMLFlow)
-        self.assertEqual(subflow_2.flow_id, 5888)
-        self.assertEqual(len(subflow_2.parameters), 4)
-        self.assertIsNone(subflow_2.parameters['batch-size'])
-        self.assertEqual(len(subflow_2.components), 0)
-        self.assertIsNone(subflow_2.model)
+        assert isinstance(flow, openml.OpenMLFlow)
+        assert flow.flow_id == 6742
+        assert len(flow.parameters) == 19
+        assert len(flow.components) == 1
+        assert flow.model is None
+
+        subflow_1 = next(iter(flow.components.values()))
+        assert isinstance(subflow_1, openml.OpenMLFlow)
+        assert subflow_1.flow_id == 6743
+        assert len(subflow_1.parameters) == 8
+        assert subflow_1.parameters["U"] == "0"
+        assert len(subflow_1.components) == 1
+        assert subflow_1.model is None
+
+        subflow_2 = next(iter(subflow_1.components.values()))
+        assert isinstance(subflow_2, openml.OpenMLFlow)
+        assert subflow_2.flow_id == 5888
+        assert len(subflow_2.parameters) == 4
+        assert subflow_2.parameters["batch-size"] is None
+        assert len(subflow_2.components) == 0
+        assert subflow_2.model is None
diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py
index 95b4fa3f0..14bb78060 100644
--- a/tests/test_flows/test_flow_functions.py
+++ b/tests/test_flows/test_flow_functions.py
@@ -1,112 +1,127 @@
-from collections import OrderedDict
+# License: BSD 3-Clause
+from __future__ import annotations
+
 import copy
+import functools
 import unittest
+from collections import OrderedDict
+from multiprocessing.managers import Value
 
-from distutils.version import LooseVersion
+from openml_sklearn import SklearnExtension
+from packaging.version import Version
+from unittest import mock
+from unittest.mock import patch
+
+import os
+import pandas as pd
+import pytest
+import requests
 import sklearn
 from sklearn import ensemble
-import pandas as pd
 
 import openml
-from openml.testing import TestBase
-import openml.extensions.sklearn
+from openml.exceptions import OpenMLNotAuthorizedError, OpenMLServerException
+from openml.testing import TestBase, create_request_response
 
 
+@pytest.mark.usefixtures("long_version")
 class TestFlowFunctions(TestBase):
     _multiprocess_can_split_ = True
 
     def setUp(self):
-        super(TestFlowFunctions, self).setUp()
+        super().setUp()
 
     def tearDown(self):
-        super(TestFlowFunctions, self).tearDown()
+        super().tearDown()
 
     def _check_flow(self, flow):
-        self.assertEqual(type(flow), dict)
-        self.assertEqual(len(flow), 6)
-        self.assertIsInstance(flow['id'], int)
-        self.assertIsInstance(flow['name'], str)
-        self.assertIsInstance(flow['full_name'], str)
-        self.assertIsInstance(flow['version'], str)
+        assert isinstance(flow, dict)
+        assert len(flow) == 6
+        assert isinstance(flow["id"], int)
+        assert isinstance(flow["name"], str)
+        assert isinstance(flow["full_name"], str)
+        assert isinstance(flow["version"], str)
         # There are some runs on openml.org that can have an empty external version
-        ext_version_str_or_none = (isinstance(flow['external_version'], str)
-                                   or flow['external_version'] is None)
-        self.assertTrue(ext_version_str_or_none)
+        ext_version = flow["external_version"]
+        ext_version_str_or_none = (
+            isinstance(ext_version, str) or ext_version is None or pd.isna(ext_version)
+        )
+        assert ext_version_str_or_none
 
+    @pytest.mark.production_server()
     def test_list_flows(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         # We can only perform a smoke test here because we test on dynamic
         # data from the internet...
         flows = openml.flows.list_flows()
         # 3000 as the number of flows on openml.org
-        self.assertGreaterEqual(len(flows), 1500)
-        for fid in flows:
-            self._check_flow(flows[fid])
+        assert len(flows) >= 1500
+        for flow in flows.to_dict(orient="index").values():
+            self._check_flow(flow)
 
+    @pytest.mark.production_server()
     def test_list_flows_output_format(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         # We can only perform a smoke test here because we test on dynamic
         # data from the internet...
-        flows = openml.flows.list_flows(output_format='dataframe')
-        self.assertIsInstance(flows, pd.DataFrame)
-        self.assertGreaterEqual(len(flows), 1500)
+        flows = openml.flows.list_flows()
+        assert isinstance(flows, pd.DataFrame)
+        assert len(flows) >= 1500
 
+    @pytest.mark.production_server()
     def test_list_flows_empty(self):
-        openml.config.server = self.production_server
-        flows = openml.flows.list_flows(tag='NoOneEverUsesThisTag123')
-        if len(flows) > 0:
-            raise ValueError(
-                'UnitTest Outdated, got somehow results (please adapt)'
-            )
-
-        self.assertIsInstance(flows, dict)
+        self.use_production_server()
+        flows = openml.flows.list_flows(tag="NoOneEverUsesThisTag123")
+        assert flows.empty
 
+    @pytest.mark.production_server()
     def test_list_flows_by_tag(self):
-        openml.config.server = self.production_server
-        flows = openml.flows.list_flows(tag='weka')
-        self.assertGreaterEqual(len(flows), 5)
-        for did in flows:
-            self._check_flow(flows[did])
+        self.use_production_server()
+        flows = openml.flows.list_flows(tag="weka")
+        assert len(flows) >= 5
+        for flow in flows.to_dict(orient="index").values():
+            self._check_flow(flow)
 
+    @pytest.mark.production_server()
     def test_list_flows_paginate(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         size = 10
         maximum = 100
         for i in range(0, maximum, size):
             flows = openml.flows.list_flows(offset=i, size=size)
-            self.assertGreaterEqual(size, len(flows))
-            for did in flows:
-                self._check_flow(flows[did])
+            assert size >= len(flows)
+            for flow in flows.to_dict(orient="index").values():
+                self._check_flow(flow)
 
     def test_are_flows_equal(self):
-        flow = openml.flows.OpenMLFlow(name='Test',
-                                       description='Test flow',
-                                       model=None,
-                                       components=OrderedDict(),
-                                       parameters=OrderedDict(),
-                                       parameters_meta_info=OrderedDict(),
-                                       external_version='1',
-                                       tags=['abc', 'def'],
-                                       language='English',
-                                       dependencies='abc',
-                                       class_name='Test',
-                                       custom_name='Test')
+        flow = openml.flows.OpenMLFlow(
+            name="Test",
+            description="Test flow",
+            model=None,
+            components=OrderedDict(),
+            parameters=OrderedDict(),
+            parameters_meta_info=OrderedDict(),
+            external_version="1",
+            tags=["abc", "def"],
+            language="English",
+            dependencies="abc",
+            class_name="Test",
+            custom_name="Test",
+        )
 
         # Test most important values that can be set by a user
         openml.flows.functions.assert_flows_equal(flow, flow)
-        for attribute, new_value in [('name', 'Tes'),
-                                     ('description', 'Test flo'),
-                                     ('external_version', '2'),
-                                     ('language', 'english'),
-                                     ('dependencies', 'ab'),
-                                     ('class_name', 'Tes'),
-                                     ('custom_name', 'Tes')]:
+        for attribute, new_value in [
+            ("name", "Tes"),
+            ("external_version", "2"),
+            ("language", "english"),
+            ("dependencies", "ab"),
+            ("class_name", "Tes"),
+            ("custom_name", "Tes"),
+        ]:
             new_flow = copy.deepcopy(flow)
             setattr(new_flow, attribute, new_value)
-            self.assertNotEqual(
-                getattr(flow, attribute),
-                getattr(new_flow, attribute),
-            )
+            assert getattr(flow, attribute) != getattr(new_flow, attribute)
             self.assertRaises(
                 ValueError,
                 openml.flows.functions.assert_flows_equal,
@@ -116,140 +131,163 @@ def test_are_flows_equal(self):
 
         # Test that the API ignores several keys when comparing flows
         openml.flows.functions.assert_flows_equal(flow, flow)
-        for attribute, new_value in [('flow_id', 1),
-                                     ('uploader', 1),
-                                     ('version', 1),
-                                     ('upload_date', '18.12.1988'),
-                                     ('binary_url', 'openml.org'),
-                                     ('binary_format', 'gzip'),
-                                     ('binary_md5', '12345'),
-                                     ('model', []),
-                                     ('tags', ['abc', 'de'])]:
+        for attribute, new_value in [
+            ("flow_id", 1),
+            ("uploader", 1),
+            ("version", 1),
+            ("upload_date", "18.12.1988"),
+            ("binary_url", "openml.org"),
+            ("binary_format", "gzip"),
+            ("binary_md5", "12345"),
+            ("model", []),
+            ("tags", ["abc", "de"]),
+        ]:
             new_flow = copy.deepcopy(flow)
             setattr(new_flow, attribute, new_value)
-            self.assertNotEqual(
-                getattr(flow, attribute),
-                getattr(new_flow, attribute),
-            )
+            assert getattr(flow, attribute) != getattr(new_flow, attribute)
             openml.flows.functions.assert_flows_equal(flow, new_flow)
 
         # Now test for parameters
-        flow.parameters['abc'] = 1.0
-        flow.parameters['def'] = 2.0
+        flow.parameters["abc"] = 1.0
+        flow.parameters["def"] = 2.0
         openml.flows.functions.assert_flows_equal(flow, flow)
         new_flow = copy.deepcopy(flow)
-        new_flow.parameters['abc'] = 3.0
-        self.assertRaises(ValueError, openml.flows.functions.assert_flows_equal,
-                          flow, new_flow)
+        new_flow.parameters["abc"] = 3.0
+        self.assertRaises(ValueError, openml.flows.functions.assert_flows_equal, flow, new_flow)
 
         # Now test for components (subflows)
         parent_flow = copy.deepcopy(flow)
         subflow = copy.deepcopy(flow)
-        parent_flow.components['subflow'] = subflow
+        parent_flow.components["subflow"] = subflow
         openml.flows.functions.assert_flows_equal(parent_flow, parent_flow)
-        self.assertRaises(ValueError,
-                          openml.flows.functions.assert_flows_equal,
-                          parent_flow, subflow)
+        self.assertRaises(
+            ValueError,
+            openml.flows.functions.assert_flows_equal,
+            parent_flow,
+            subflow,
+        )
         new_flow = copy.deepcopy(parent_flow)
-        new_flow.components['subflow'].name = 'Subflow name'
-        self.assertRaises(ValueError,
-                          openml.flows.functions.assert_flows_equal,
-                          parent_flow, new_flow)
+        new_flow.components["subflow"].name = "Subflow name"
+        self.assertRaises(
+            ValueError,
+            openml.flows.functions.assert_flows_equal,
+            parent_flow,
+            new_flow,
+        )
 
     def test_are_flows_equal_ignore_parameter_values(self):
-        paramaters = OrderedDict((('a', 5), ('b', 6)))
-        parameters_meta_info = OrderedDict((('a', None), ('b', None)))
+        paramaters = OrderedDict((("a", 5), ("b", 6)))
+        parameters_meta_info = OrderedDict((("a", None), ("b", None)))
 
         flow = openml.flows.OpenMLFlow(
-            name='Test',
-            description='Test flow',
+            name="Test",
+            description="Test flow",
             model=None,
             components=OrderedDict(),
             parameters=paramaters,
             parameters_meta_info=parameters_meta_info,
-            external_version='1',
-            tags=['abc', 'def'],
-            language='English',
-            dependencies='abc',
-            class_name='Test',
-            custom_name='Test',
+            external_version="1",
+            tags=["abc", "def"],
+            language="English",
+            dependencies="abc",
+            class_name="Test",
+            custom_name="Test",
         )
 
         openml.flows.functions.assert_flows_equal(flow, flow)
-        openml.flows.functions.assert_flows_equal(flow, flow,
-                                                  ignore_parameter_values=True)
+        openml.flows.functions.assert_flows_equal(flow, flow, ignore_parameter_values=True)
 
         new_flow = copy.deepcopy(flow)
-        new_flow.parameters['a'] = 7
-        self.assertRaisesRegex(
-            ValueError,
-            r"values for attribute 'parameters' differ: "
-            r"'OrderedDict\(\[\('a', 5\), \('b', 6\)\]\)'\nvs\n"
-            r"'OrderedDict\(\[\('a', 7\), \('b', 6\)\]\)'",
-            openml.flows.functions.assert_flows_equal,
-            flow, new_flow,
+        new_flow.parameters["a"] = 7
+        with pytest.raises(ValueError) as excinfo:
+            openml.flows.functions.assert_flows_equal(flow, new_flow)
+        assert str(paramaters) in str(excinfo.value) and str(new_flow.parameters) in str(
+            excinfo.value
         )
-        openml.flows.functions.assert_flows_equal(flow, new_flow,
-                                                  ignore_parameter_values=True)
 
-        del new_flow.parameters['a']
-        self.assertRaisesRegex(
-            ValueError,
-            r"values for attribute 'parameters' differ: "
-            r"'OrderedDict\(\[\('a', 5\), \('b', 6\)\]\)'\nvs\n"
-            r"'OrderedDict\(\[\('b', 6\)\]\)'",
-            openml.flows.functions.assert_flows_equal,
-            flow, new_flow,
+        openml.flows.functions.assert_flows_equal(flow, new_flow, ignore_parameter_values=True)
+
+        del new_flow.parameters["a"]
+        with pytest.raises(ValueError) as excinfo:
+            openml.flows.functions.assert_flows_equal(flow, new_flow)
+        assert str(paramaters) in str(excinfo.value) and str(new_flow.parameters) in str(
+            excinfo.value
         )
+
         self.assertRaisesRegex(
             ValueError,
             r"Flow Test: parameter set of flow differs from the parameters "
             r"stored on the server.",
             openml.flows.functions.assert_flows_equal,
-            flow, new_flow, ignore_parameter_values=True,
+            flow,
+            new_flow,
+            ignore_parameter_values=True,
         )
 
     def test_are_flows_equal_ignore_if_older(self):
-        paramaters = OrderedDict((('a', 5), ('b', 6)))
-        parameters_meta_info = OrderedDict((('a', None), ('b', None)))
-        flow_upload_date = '2017-01-31T12-01-01'
+        paramaters = OrderedDict((("a", 5), ("b", 6)))
+        parameters_meta_info = OrderedDict((("a", None), ("b", None)))
+        flow_upload_date = "2017-01-31T12-01-01"
         assert_flows_equal = openml.flows.functions.assert_flows_equal
 
-        flow = openml.flows.OpenMLFlow(name='Test',
-                                       description='Test flow',
-                                       model=None,
-                                       components=OrderedDict(),
-                                       parameters=paramaters,
-                                       parameters_meta_info=parameters_meta_info,
-                                       external_version='1',
-                                       tags=['abc', 'def'],
-                                       language='English',
-                                       dependencies='abc',
-                                       class_name='Test',
-                                       custom_name='Test',
-                                       upload_date=flow_upload_date)
+        flow = openml.flows.OpenMLFlow(
+            name="Test",
+            description="Test flow",
+            model=None,
+            components=OrderedDict(),
+            parameters=paramaters,
+            parameters_meta_info=parameters_meta_info,
+            external_version="1",
+            tags=["abc", "def"],
+            language="English",
+            dependencies="abc",
+            class_name="Test",
+            custom_name="Test",
+            upload_date=flow_upload_date,
+        )
 
         assert_flows_equal(flow, flow, ignore_parameter_values_on_older_children=flow_upload_date)
         assert_flows_equal(flow, flow, ignore_parameter_values_on_older_children=None)
         new_flow = copy.deepcopy(flow)
-        new_flow.parameters['a'] = 7
-        self.assertRaises(ValueError, assert_flows_equal, flow, new_flow,
-                          ignore_parameter_values_on_older_children=flow_upload_date)
-        self.assertRaises(ValueError, assert_flows_equal, flow, new_flow,
-                          ignore_parameter_values_on_older_children=None)
-
-        new_flow.upload_date = '2016-01-31T12-01-01'
-        self.assertRaises(ValueError, assert_flows_equal, flow, new_flow,
-                          ignore_parameter_values_on_older_children=flow_upload_date)
+        new_flow.parameters["a"] = 7
+        self.assertRaises(
+            ValueError,
+            assert_flows_equal,
+            flow,
+            new_flow,
+            ignore_parameter_values_on_older_children=flow_upload_date,
+        )
+        self.assertRaises(
+            ValueError,
+            assert_flows_equal,
+            flow,
+            new_flow,
+            ignore_parameter_values_on_older_children=None,
+        )
+
+        new_flow.upload_date = "2016-01-31T12-01-01"
+        self.assertRaises(
+            ValueError,
+            assert_flows_equal,
+            flow,
+            new_flow,
+            ignore_parameter_values_on_older_children=flow_upload_date,
+        )
         assert_flows_equal(flow, flow, ignore_parameter_values_on_older_children=None)
 
-    @unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
-                     reason="OrdinalEncoder introduced in 0.20. "
-                            "No known models with list of lists parameters in older versions.")
+    @pytest.mark.sklearn()
+    @unittest.skipIf(
+        Version(sklearn.__version__) < Version("0.20"),
+        reason="OrdinalEncoder introduced in 0.20. "
+        "No known models with list of lists parameters in older versions.",
+    )
+    @pytest.mark.test_server()
+    @pytest.mark.xfail(reason="failures_issue_1544", strict=False)
     def test_sklearn_to_flow_list_of_lists(self):
         from sklearn.preprocessing import OrdinalEncoder
+
         ordinal_encoder = OrdinalEncoder(categories=[[0, 1], [0, 1]])
-        extension = openml.extensions.sklearn.SklearnExtension()
+        extension = SklearnExtension()
 
         # Test serialization works
         flow = extension.model_to_flow(ordinal_encoder)
@@ -257,42 +295,251 @@ def test_sklearn_to_flow_list_of_lists(self):
         # Test flow is accepted by server
         self._add_sentinel_to_flow_name(flow)
         flow.publish()
-        TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], flow.flow_id))
+        TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
         # Test deserialization works
         server_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True)
-        self.assertEqual(server_flow.parameters['categories'], '[[0, 1], [0, 1]]')
-        self.assertEqual(server_flow.model.categories, flow.model.categories)
-
+        assert server_flow.parameters["categories"] == "[[0, 1], [0, 1]]"
+        assert server_flow.model.categories == flow.model.categories
+
+    @pytest.mark.production_server()
+    def test_get_flow1(self):
+        # Regression test for issue #305
+        # Basically, this checks that a flow without an external version can be loaded
+        self.use_production_server()
+        flow = openml.flows.get_flow(1)
+        assert flow.external_version is None
+
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_get_flow_reinstantiate_model(self):
         model = ensemble.RandomForestClassifier(n_estimators=33)
         extension = openml.extensions.get_extension_by_model(model)
         flow = extension.model_to_flow(model)
         flow.publish(raise_error_if_exists=False)
-        TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], flow.flow_id))
+        TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
 
         downloaded_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True)
-        self.assertIsInstance(downloaded_flow.model, sklearn.ensemble.RandomForestClassifier)
+        assert isinstance(downloaded_flow.model, sklearn.ensemble.RandomForestClassifier)
 
+    @pytest.mark.test_server()
     def test_get_flow_reinstantiate_model_no_extension(self):
         # Flow 10 is a WEKA flow
-        self.assertRaisesRegex(RuntimeError,
-                               "No extension could be found for flow 10: weka.SMO",
-                               openml.flows.get_flow,
-                               flow_id=10,
-                               reinstantiate=True)
-
-    @unittest.skipIf(LooseVersion(sklearn.__version__) == "0.19.1",
-                     reason="Target flow is from sklearn 0.19.1")
-    def test_get_flow_reinstantiate_model_wrong_version(self):
-        # Note that CI does not test against 0.19.1.
-        openml.config.server = self.production_server
-        _, sklearn_major, _ = LooseVersion(sklearn.__version__).version[:3]
+        self.assertRaisesRegex(
+            ValueError,
+            r".* flow: 10 \(weka.SMO\). ",  # raw string: "\(" is not a valid str escape
+            openml.flows.get_flow,
+            flow_id=10,
+            reinstantiate=True,
+        )
+
+    @pytest.mark.sklearn()
+    @unittest.skipIf(
+        Version(sklearn.__version__) == Version("0.19.1"),
+        reason="Requires scikit-learn!=0.19.1, because target flow is from that version.",
+    )
+    @pytest.mark.production_server()
+    def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception(self):
+        self.use_production_server()
         flow = 8175
-        expected = 'Trying to deserialize a model with dependency sklearn==0.19.1 not satisfied.'
-        self.assertRaisesRegex(ValueError,
-                               expected,
-                               openml.flows.get_flow,
-                               flow_id=flow,
-                               reinstantiate=True)
+        expected = "Trying to deserialize a model with dependency sklearn==0.19.1 not satisfied."
+        self.assertRaisesRegex(
+            ValueError,
+            expected,
+            openml.flows.get_flow,
+            flow_id=flow,
+            reinstantiate=True,
+            strict_version=True,
+        )
+
+    @pytest.mark.sklearn()
+    @unittest.skipIf(
+        Version(sklearn.__version__) >= Version("1.0.0"),
+        reason="Requires scikit-learn < 1.0.0.",
+        # Because scikit-learn dropped min_impurity_split hyperparameter in 1.0,
+        # and the requested flow is from 1.0.0 exactly.
+    )
+    @pytest.mark.production_server()
+    def test_get_flow_reinstantiate_flow_not_strict_post_1(self):
+        self.use_production_server()
+        flow = openml.flows.get_flow(flow_id=19190, reinstantiate=True, strict_version=False)
+        assert flow.flow_id is None
+        assert "sklearn==1.0.0" not in flow.dependencies
+
+    @pytest.mark.sklearn()
+    @unittest.skipIf(
+        (Version(sklearn.__version__) < Version("0.23.2"))
+        or (Version(sklearn.__version__) >= Version("1.0")),
+        reason="Requires scikit-learn 0.23.2 or ~0.24.",
+        # Because these still have min_impurity_split, but with new scikit-learn module structure.
+    )
+    @pytest.mark.production_server()
+    def test_get_flow_reinstantiate_flow_not_strict_023_and_024(self):
+        self.use_production_server()
+        flow = openml.flows.get_flow(flow_id=18587, reinstantiate=True, strict_version=False)
+        assert flow.flow_id is None
+        assert "sklearn==0.23.1" not in flow.dependencies
+
+    @pytest.mark.sklearn()
+    @unittest.skipIf(
+        Version(sklearn.__version__) > Version("0.23"),
+        reason="Requires scikit-learn<=0.23, because the scikit-learn module structure changed.",
+    )
+    @pytest.mark.production_server()
+    def test_get_flow_reinstantiate_flow_not_strict_pre_023(self):
+        self.use_production_server()
+        flow = openml.flows.get_flow(flow_id=8175, reinstantiate=True, strict_version=False)
+        assert flow.flow_id is None
+        assert "sklearn==0.19.1" not in flow.dependencies
+
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
+    def test_get_flow_id(self):
+        if self.long_version:
+            list_all = openml.utils._list_all
+        else:
+            list_all = functools.lru_cache()(openml.utils._list_all)
+        with patch("openml.utils._list_all", list_all):
+            clf = sklearn.tree.DecisionTreeClassifier()
+            flow = openml.extensions.get_extension_by_model(clf).model_to_flow(clf).publish()
+            TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
+            TestBase.logger.info(
+                f"collected from {__file__.split('/')[-1]}: {flow.flow_id}",
+            )
+
+            assert openml.flows.get_flow_id(model=clf, exact_version=True) == flow.flow_id
+            flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False)
+            assert flow.flow_id in flow_ids
+            assert len(flow_ids) > 0
+
+            # Check that the output of get_flow_id is identical if only the name is given, no matter
+            # whether exact_version is set to True or False.
+            flow_ids_exact_version_True = openml.flows.get_flow_id(
+                name=flow.name,
+                exact_version=True,
+            )
+            flow_ids_exact_version_False = openml.flows.get_flow_id(
+                name=flow.name,
+                exact_version=False,
+            )
+            assert flow.flow_id in flow_ids_exact_version_True
+            assert set(flow_ids_exact_version_True).issubset(set(flow_ids_exact_version_False))
+            # Equality of the two results is deliberately not asserted: several versions of
+            # this flow may exist on the server, so only the subset relation is guaranteed.
+            # (No trailing pytest.skip(): it would report a passing run as skipped.)
+
+    @pytest.mark.test_server()
+    def test_delete_flow(self):
+        flow = openml.OpenMLFlow(
+            name="sklearn.dummy.DummyClassifier",
+            class_name="sklearn.dummy.DummyClassifier",
+            description="test description",
+            model=sklearn.dummy.DummyClassifier(),
+            components=OrderedDict(),
+            parameters=OrderedDict(),
+            parameters_meta_info=OrderedDict(),
+            external_version="1",
+            tags=[],
+            language="English",
+            dependencies=None,
+        )
+
+        flow, _ = self._add_sentinel_to_flow_name(flow, None)
+
+        flow.publish()
+        _flow_id = flow.flow_id
+        assert openml.flows.delete_flow(_flow_id)
+
+
+@mock.patch.object(requests.Session, "delete")
+def test_delete_flow_not_owned(mock_delete, test_files_directory, test_api_key):
+    content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_not_owned.xml"
+    mock_delete.return_value = create_request_response(
+        status_code=412,
+        content_filepath=content_file,
+    )
+
+    with pytest.raises(
+        OpenMLNotAuthorizedError,
+        match="The flow can not be deleted because it was not uploaded by you.",
+    ):
+        openml.flows.delete_flow(40_000)
+
+    flow_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/flow/40000"
+    assert flow_url == mock_delete.call_args.args[0]
+    assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
+
+
+@mock.patch.object(requests.Session, "delete")
+def test_delete_flow_with_run(mock_delete, test_files_directory, test_api_key):
+    content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_has_runs.xml"
+    mock_delete.return_value = create_request_response(
+        status_code=412,
+        content_filepath=content_file,
+    )
+
+    with pytest.raises(
+        OpenMLNotAuthorizedError,
+        match="The flow can not be deleted because it still has associated entities:",
+    ):
+        openml.flows.delete_flow(40_000)
+
+    flow_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/flow/40000"
+    assert flow_url == mock_delete.call_args.args[0]
+    assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
+
+
+@mock.patch.object(requests.Session, "delete")
+def test_delete_subflow(mock_delete, test_files_directory, test_api_key):
+    content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_is_subflow.xml"
+    mock_delete.return_value = create_request_response(
+        status_code=412,
+        content_filepath=content_file,
+    )
+
+    with pytest.raises(
+        OpenMLNotAuthorizedError,
+        match="The flow can not be deleted because it still has associated entities:",
+    ):
+        openml.flows.delete_flow(40_000)
+
+    flow_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/flow/40000"
+    assert flow_url == mock_delete.call_args.args[0]
+    assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
+
+
+@mock.patch.object(requests.Session, "delete")
+def test_delete_flow_success(mock_delete, test_files_directory, test_api_key):
+    content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_successful.xml"
+    mock_delete.return_value = create_request_response(
+        status_code=200,
+        content_filepath=content_file,
+    )
+
+    success = openml.flows.delete_flow(33364)
+    assert success
+
+    flow_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/flow/33364"
+    assert flow_url == mock_delete.call_args.args[0]
+    assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
+
+
+@mock.patch.object(requests.Session, "delete")
+@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
+def test_delete_unknown_flow(mock_delete, test_files_directory, test_api_key):
+    content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_not_exist.xml"
+    mock_delete.return_value = create_request_response(
+        status_code=412,
+        content_filepath=content_file,
+    )
+
+    with pytest.raises(
+        OpenMLServerException,
+        match="flow does not exist",
+    ):
+        openml.flows.delete_flow(9_999_999)
+
+    flow_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/flow/9999999"
+    assert flow_url == mock_delete.call_args.args[0]
+    assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py
new file mode 100644
index 000000000..f2a81be9f
--- /dev/null
+++ b/tests/test_openml/test_api_calls.py
@@ -0,0 +1,128 @@
+from __future__ import annotations
+
+import unittest.mock
+from pathlib import Path
+from typing import NamedTuple, Iterable, Iterator
+from unittest import mock
+
+import minio
+import pytest
+import os
+
+import openml
+import openml.testing
+from openml._api_calls import _download_minio_bucket, API_TOKEN_HELP_LINK
+
+
+class TestConfig(openml.testing.TestBase):
+    @pytest.mark.test_server()
+    def test_too_long_uri(self):
+        with pytest.raises(openml.exceptions.OpenMLServerError, match="URI too long!"):
+            openml.datasets.list_datasets(data_id=list(range(10000)))
+
+    @unittest.mock.patch("time.sleep")
+    @unittest.mock.patch("requests.Session")
+    @pytest.mark.test_server()
+    def test_retry_on_database_error(self, Session_class_mock, _):
+        response_mock = unittest.mock.Mock()
+        response_mock.text = (
+            "<oml:error>\n"
+            "<oml:code>107</oml:code>"
+            "<oml:message>Database connection error. "
+            "Usually due to high server load. "
+            "Please wait for N seconds and try again.</oml:message>\n"
+            "</oml:error>"
+        )
+        Session_class_mock.return_value.__enter__.return_value.get.return_value = response_mock
+        with pytest.raises(openml.exceptions.OpenMLServerException, match="/abc returned code 107"):
+            openml._api_calls._send_request("get", "/abc", {})
+
+        assert Session_class_mock.return_value.__enter__.return_value.get.call_count == 20
+
+
+class FakeObject(NamedTuple):
+    object_name: str
+    etag: str
+    """We use the etag of a Minio object as the name of a marker if we already downloaded it."""
+
+
+class FakeMinio:
+    def __init__(self, objects: Iterable[FakeObject] | None = None):
+        self._objects = objects or []
+
+    def list_objects(self, *args, **kwargs) -> Iterator[FakeObject]:
+        yield from self._objects
+
+    def fget_object(self, object_name: str, file_path: str, *args, **kwargs) -> None:
+        if object_name in [obj.object_name for obj in self._objects]:
+            Path(file_path).write_text("foo")
+            return
+        raise FileNotFoundError
+
+
+@mock.patch.object(minio, "Minio")
+def test_download_all_files_observes_cache(mock_minio, tmp_path: Path) -> None:
+    some_prefix, some_filename = "some/prefix", "dataset.arff"
+    some_object_path = f"{some_prefix}/{some_filename}"
+    some_url = f"https://not.real.com/bucket/{some_object_path}"
+    mock_minio.return_value = FakeMinio(
+        objects=[
+            FakeObject(object_name=some_object_path, etag=str(hash(some_object_path))),
+        ],
+    )
+    # A cached object must not be fetched again, so the file's mtime has to stay unchanged.
+    _download_minio_bucket(source=some_url, destination=tmp_path)
+    mtime_after_first = (tmp_path / some_filename).stat().st_mtime
+    # Compare mtime with mtime: st_ctime is not a creation time on POSIX.
+    _download_minio_bucket(source=some_url, destination=tmp_path)
+    mtime_after_second = (tmp_path / some_filename).stat().st_mtime
+
+    assert mtime_after_first == mtime_after_second
+
+
+@mock.patch.object(minio, "Minio")
+def test_download_minio_failure(mock_minio, tmp_path: Path) -> None:
+    some_prefix, some_filename = "some/prefix", "dataset.arff"
+    some_object_path = f"{some_prefix}/{some_filename}"
+    some_url = f"https://not.real.com/bucket/{some_object_path}"
+    mock_minio.return_value = FakeMinio(
+        objects=[
+            FakeObject(object_name=None, etag="tmp"),
+        ],
+    )
+
+    with pytest.raises(ValueError):
+        _download_minio_bucket(source=some_url, destination=tmp_path)
+
+    mock_minio.return_value = FakeMinio(
+        objects=[
+            FakeObject(object_name="tmp", etag=None),
+        ],
+    )
+
+    with pytest.raises(ValueError):
+        _download_minio_bucket(source=some_url, destination=tmp_path)
+
+
+@pytest.mark.parametrize(
+    "endpoint, method",
+    [
+        # https://github.com/openml/OpenML/blob/develop/openml_OS/views/pages/api_new/v1/xml/pre.php
+        ("flow/exists", "post"),  # 102
+        ("dataset", "post"),  # 137
+        ("dataset/42", "delete"),  # 350
+        # ("flow/owned", "post"),  # 310 - Couldn't find what would trigger this
+        ("flow/42", "delete"),  # 320
+        ("run/42", "delete"),  # 400
+        ("task/42", "delete"),  # 460
+    ],
+)
+@pytest.mark.test_server()
+def test_authentication_endpoints_requiring_api_key_show_relevant_help_link(
+    endpoint: str,
+    method: str,
+) -> None:
+    # We need to temporarily disable the API key to test the error message
+    with openml.config.overwrite_config_context({"apikey": None}):
+        with pytest.raises(openml.exceptions.OpenMLAuthenticationError, match=API_TOKEN_HELP_LINK):
+            openml._api_calls._perform_api_call(call=endpoint, request_method=method, data=None)
diff --git a/tests/test_openml/test_cli.py b/tests/test_openml/test_cli.py
new file mode 100644
index 000000000..eb213b561
--- /dev/null
+++ b/tests/test_openml/test_cli.py
@@ -0,0 +1,44 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
+import shutil
+import subprocess
+import sys
+
+import openml
+import pytest
+
+
+def test_cli_version_prints_package_version():
+    # Invoke the CLI via module to avoid relying on console script installation
+    result = subprocess.run(
+        [sys.executable, "-m", "openml.cli", "--version"],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        check=False,
+    )
+
+    # Ensure successful exit and version present in stdout only
+    assert result.returncode == 0
+    assert result.stderr == ""
+    assert openml.__version__ in result.stdout
+
+
+def test_console_script_version_prints_package_version():
+    # Try to locate the console script; skip if not installed in PATH
+    console = shutil.which("openml")
+    if console is None:
+        pytest.skip("'openml' console script not found in PATH")
+
+    result = subprocess.run(
+        [console, "--version"],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        check=False,
+    )
+
+    assert result.returncode == 0
+    assert result.stderr == ""
+    assert openml.__version__ in result.stdout
diff --git a/tests/test_openml/test_config.py b/tests/test_openml/test_config.py
index 44cf4862f..f3feca784 100644
--- a/tests/test_openml/test_config.py
+++ b/tests/test_openml/test_config.py
@@ -1,55 +1,194 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
+from contextlib import contextmanager
 import os
+import tempfile
+import unittest.mock
+from copy import copy
+from typing import Any, Iterator
+from pathlib import Path
+import platform
+
+import pytest
 
-import openml.config
+import openml
 import openml.testing
+from openml.testing import TestBase
 
 
-class TestConfig(openml.testing.TestBase):
+@contextmanager
+def safe_environ_patcher(key: str, value: Any) -> Iterator[None]:
+    """Context manager to temporarily set an environment variable.
 
-    def test_config_loading(self):
-        self.assertTrue(os.path.exists(openml.config.config_file))
-        self.assertTrue(os.path.isdir(os.path.expanduser('~/.openml')))
+    The previous value (or its absence) is restored even if the body raises.
+    """
+    _prev = os.environ.get(key)
+    os.environ[key] = value
+    try:
+        yield
+    except Exception as e:
+        raise e
+    finally:
+        os.environ.pop(key)
+        if _prev is not None:
+            os.environ[key] = _prev
 
 
-class TestConfigurationForExamples(openml.testing.TestBase):
+class TestConfig(openml.testing.TestBase):
+    @unittest.mock.patch("openml.config.openml_logger.warning")
+    @unittest.mock.patch("openml._config.OpenMLConfigManager._create_log_handlers")
+    @unittest.skipIf(os.name == "nt", "https://github.com/openml/openml-python/issues/1033")
+    @unittest.skipIf(
+        platform.uname().release.endswith(("-Microsoft", "microsoft-standard-WSL2")),
+        "WSL does not support chmod as we would need here, see https://github.com/microsoft/WSL/issues/81",
+    )
+    def test_non_writable_home(self, log_handler_mock, warnings_mock):
+        with tempfile.TemporaryDirectory(dir=self.workdir) as td:
+            os.chmod(td, 0o444)
+            _dd = copy(openml.config._defaults)
+            _dd["cachedir"] = Path(td) / "something-else"
+            openml.config._setup(_dd)
+
+        assert warnings_mock.call_count == 1
+        assert log_handler_mock.call_count == 1
+        assert not log_handler_mock.call_args_list[0][1]["create_file_handler"]
+        assert openml.config._root_cache_directory == Path(td) / "something-else"
+
+    @unittest.skipIf(platform.system() != "Linux", "XDG only exists for Linux systems.")
+    def test_XDG_directories_do_not_exist(self):
+        with tempfile.TemporaryDirectory(dir=self.workdir) as td:
+            # Point XDG_CONFIG_HOME at a directory that does not exist yet
+            path = Path(td) / "fake_xdg_cache_home"
+            with safe_environ_patcher("XDG_CONFIG_HOME", str(path)):
+                expected_config_dir = path / "openml"
+                expected_determined_config_file_path = expected_config_dir / "config"
+
+                # Ensure that it correctly determines the path to the config file
+                determined_config_file_path = openml.config.determine_config_file_path()
+                assert determined_config_file_path == expected_determined_config_file_path
+
+                # Ensure that setup will create the config folder as the configuration
+                # will be written to that location.
+                openml.config._setup()
+                assert expected_config_dir.exists()
+
+    def test_get_config_as_dict(self):
+        """Checks if the current configuration is returned accurately as a dict."""
+        config = openml.config.get_config_as_dict()
+        _config = {}
+        _config["apikey"] = TestBase.user_key
+        _config["server"] = f"{openml.config.TEST_SERVER_URL}/api/v1/xml"
+        _config["cachedir"] = self.workdir
+        _config["avoid_duplicate_runs"] = False
+        _config["connection_n_retries"] = 20
+        _config["retry_policy"] = "robot"
+        _config["show_progress"] = False
+        assert isinstance(config, dict)
+        assert len(config) == 7
+        self.assertDictEqual(config, _config)
+
+    def test_setup_with_config(self):
+        """Checks if the OpenML configuration can be updated using _setup()."""
+        _config = {}
+        _config["apikey"] = TestBase.user_key
+        _config["server"] = "https://www.openml.org/api/v1/xml"
+        _config["cachedir"] = self.workdir
+        _config["avoid_duplicate_runs"] = True
+        _config["retry_policy"] = "human"
+        _config["connection_n_retries"] = 100
+        _config["show_progress"] = False
+        orig_config = openml.config.get_config_as_dict()
+        openml.config._setup(_config)
+        updated_config = openml.config.get_config_as_dict()
+        openml.config._setup(orig_config)  # important to not affect other unit tests
+        self.assertDictEqual(_config, updated_config)
 
+
+class TestConfigurationForExamples(openml.testing.TestBase):
+    @pytest.mark.production_server()
     def test_switch_to_example_configuration(self):
-        """ Verifies the test configuration is loaded properly. """
+        """Verifies the test configuration is loaded properly."""
         # Below is the default test key which would be used anyway, but just for clarity:
-        openml.config.apikey = "610344db6388d9ba34f6db45a3cf71de"
+        openml.config.apikey = "any-api-key"
         openml.config.server = self.production_server
 
         openml.config.start_using_configuration_for_example()
 
-        self.assertEqual(openml.config.apikey, "c0c42819af31e706efe1f4b88c23c6c1")
-        self.assertEqual(openml.config.server, self.test_server)
+        assert openml.config.apikey == TestBase.user_key
+        assert openml.config.server == self.test_server
 
+    @pytest.mark.production_server()
     def test_switch_from_example_configuration(self):
-        """ Verifies the previous configuration is loaded after stopping. """
+        """Verifies the previous configuration is loaded after stopping."""
         # Below is the default test key which would be used anyway, but just for clarity:
-        openml.config.apikey = "610344db6388d9ba34f6db45a3cf71de"
+        openml.config.apikey = TestBase.user_key
         openml.config.server = self.production_server
 
         openml.config.start_using_configuration_for_example()
         openml.config.stop_using_configuration_for_example()
-
-        self.assertEqual(openml.config.apikey, "610344db6388d9ba34f6db45a3cf71de")
-        self.assertEqual(openml.config.server, self.production_server)
+        assert openml.config.apikey == TestBase.user_key
+        assert openml.config.server == self.production_server
 
     def test_example_configuration_stop_before_start(self):
-        """ Verifies an error is raised is `stop_...` is called before `start_...`. """
+        """Verifies an error is raised if `stop_...` is called before `start_...`."""
         error_regex = ".*stop_use_example_configuration.*start_use_example_configuration.*first"
-        self.assertRaisesRegex(RuntimeError, error_regex,
-                               openml.config.stop_using_configuration_for_example)
-
+        # Tests do not reset the state of this class. Thus, we ensure it is in
+        # the original state before the test.
+        openml.config._examples._start_last_called = False
+        self.assertRaisesRegex(
+            RuntimeError,
+            error_regex,
+            openml.config.stop_using_configuration_for_example,
+        )
+
+    @pytest.mark.production_server()
     def test_example_configuration_start_twice(self):
-        """ Checks that the original config can be returned to if `start..` is called twice. """
-        openml.config.apikey = "610344db6388d9ba34f6db45a3cf71de"
+        """Checks that the original config can be returned to if `start..` is called twice."""
+        openml.config.apikey = TestBase.user_key
         openml.config.server = self.production_server
 
         openml.config.start_using_configuration_for_example()
         openml.config.start_using_configuration_for_example()
         openml.config.stop_using_configuration_for_example()
 
-        self.assertEqual(openml.config.apikey, "610344db6388d9ba34f6db45a3cf71de")
-        self.assertEqual(openml.config.server, self.production_server)
+        assert openml.config.apikey == TestBase.user_key
+        assert openml.config.server == self.production_server
+
+
+def test_configuration_file_not_overwritten_on_load():
+    """Regression test for #1337"""
+    config_file_content = "apikey = abcd"
+    with tempfile.TemporaryDirectory() as tmpdir:
+        config_file_path = Path(tmpdir) / "config"
+        with config_file_path.open("w") as config_file:
+            config_file.write(config_file_content)
+
+        read_config = openml.config._parse_config(config_file_path)
+
+        with config_file_path.open("r") as config_file:
+            new_file_content = config_file.read()
+
+    assert config_file_content == new_file_content
+    assert "abcd" == read_config["apikey"]
+
+
+def test_configuration_loads_booleans(tmp_path):
+    config_file_content = "avoid_duplicate_runs=true\nshow_progress=false"
+    tmp_file = tmp_path / "config"
+    with tmp_file.open("w") as config_file:
+        config_file.write(config_file_content)
+    read_config = openml.config._parse_config(tmp_file)
+
+    # Explicit test to avoid truthy/falsy modes of other types
+    assert read_config["avoid_duplicate_runs"] is True
+    assert read_config["show_progress"] is False
+
+
+def test_openml_cache_dir_env_var(tmp_path: Path) -> None:
+    expected_path = tmp_path / "test-cache"
+
+    with safe_environ_patcher("OPENML_CACHE_DIR", str(expected_path)):
+        openml.config._setup()
+        assert openml.config._root_cache_directory == expected_path
+        assert openml.config.get_cache_directory() == str(expected_path / "org" / "openml" / "www")
diff --git a/tests/test_openml/test_openml.py b/tests/test_openml/test_openml.py
index a3fdf541c..998046726 100644
--- a/tests/test_openml/test_openml.py
+++ b/tests/test_openml/test_openml.py
@@ -1,17 +1,20 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
 from unittest import mock
 
-from openml.testing import TestBase
 import openml
+from openml.testing import TestBase
 
 
 class TestInit(TestBase):
     # Splitting not helpful, these test's don't rely on the server and take less
     # than 1 seconds
 
-    @mock.patch('openml.tasks.functions.get_task')
-    @mock.patch('openml.datasets.functions.get_dataset')
-    @mock.patch('openml.flows.functions.get_flow')
-    @mock.patch('openml.runs.functions.get_run')
+    @mock.patch("openml.tasks.functions.get_task")
+    @mock.patch("openml.datasets.functions.get_dataset")
+    @mock.patch("openml.flows.functions.get_flow")
+    @mock.patch("openml.runs.functions.get_run")
     def test_populate_cache(
         self,
         run_mock,
@@ -19,23 +22,22 @@ def test_populate_cache(
         dataset_mock,
         task_mock,
     ):
-        openml.populate_cache(task_ids=[1, 2], dataset_ids=[3, 4],
-                              flow_ids=[5, 6], run_ids=[7, 8])
-        self.assertEqual(run_mock.call_count, 2)
+        openml.populate_cache(task_ids=[1, 2], dataset_ids=[3, 4], flow_ids=[5, 6], run_ids=[7, 8])
+        assert run_mock.call_count == 2
         for argument, fixture in zip(run_mock.call_args_list, [(7,), (8,)]):
-            self.assertEqual(argument[0], fixture)
+            assert argument[0] == fixture
 
-        self.assertEqual(flow_mock.call_count, 2)
+        assert flow_mock.call_count == 2
         for argument, fixture in zip(flow_mock.call_args_list, [(5,), (6,)]):
-            self.assertEqual(argument[0], fixture)
+            assert argument[0] == fixture
 
-        self.assertEqual(dataset_mock.call_count, 2)
+        assert dataset_mock.call_count == 2
         for argument, fixture in zip(
-                dataset_mock.call_args_list,
-                [(3,), (4,)],
+            dataset_mock.call_args_list,
+            [(3,), (4,)],
         ):
-            self.assertEqual(argument[0], fixture)
+            assert argument[0] == fixture
 
-        self.assertEqual(task_mock.call_count, 2)
+        assert task_mock.call_count == 2
         for argument, fixture in zip(task_mock.call_args_list, [(1,), (2,)]):
-            self.assertEqual(argument[0], fixture)
+            assert argument[0] == fixture
diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py
index 88fe8d6ef..22a8bc936 100644
--- a/tests/test_runs/test_run.py
+++ b/tests/test_runs/test_run.py
@@ -1,78 +1,94 @@
-import numpy as np
-import random
+# License: BSD 3-Clause
+from __future__ import annotations
+
 import os
+import random
 from time import time
 
+import numpy as np
+import pytest
+import xmltodict
+from openml_sklearn import SklearnExtension
+from sklearn.base import clone
 from sklearn.dummy import DummyClassifier
-from sklearn.tree import DecisionTreeClassifier
+from sklearn.linear_model import LinearRegression
 from sklearn.model_selection import GridSearchCV
 from sklearn.pipeline import Pipeline
+from sklearn.tree import DecisionTreeClassifier
 
-from openml.testing import TestBase, SimpleImputer
 import openml
-import openml.extensions.sklearn
-
-import pytest
+from openml import OpenMLRun
+from openml.testing import SimpleImputer, TestBase
 
 
 class TestRun(TestBase):
     # Splitting not helpful, these test's don't rely on the server and take
     # less than 1 seconds
 
+    @pytest.mark.test_server()
     def test_tagging(self):
-
         runs = openml.runs.list_runs(size=1)
-        run_id = list(runs.keys())[0]
+        assert not runs.empty, "Test server state is incorrect"
+        run_id = runs["run_id"].iloc[0]
         run = openml.runs.get_run(run_id)
-        tag = "testing_tag_{}_{}".format(self.id(), time())
-        run_list = openml.runs.list_runs(tag=tag)
-        self.assertEqual(len(run_list), 0)
+        # tags can be at most 64 alphanumeric (+ underscore) chars
+        unique_indicator = str(time()).replace(".", "")
+        tag = f"test_tag_TestRun_{unique_indicator}"
+        runs = openml.runs.list_runs(tag=tag)
+        assert len(runs) == 0
         run.push_tag(tag)
-        run_list = openml.runs.list_runs(tag=tag)
-        self.assertEqual(len(run_list), 1)
-        self.assertIn(run_id, run_list)
+        runs = openml.runs.list_runs(tag=tag)
+        assert len(runs) == 1
+        assert run_id in runs["run_id"].values  # `in` on a Series checks the index, not values
         run.remove_tag(tag)
-        run_list = openml.runs.list_runs(tag=tag)
-        self.assertEqual(len(run_list), 0)
+        runs = openml.runs.list_runs(tag=tag)
+        assert len(runs) == 0
+
+    @staticmethod
+    def _test_prediction_data_equal(run, run_prime):
+        # Determine which attributes are numeric and which not
+        num_cols = np.array(
+            [d_type == "NUMERIC" for _, d_type in run._generate_arff_dict()["attributes"]],
+        )
+        # Get run data consistently
+        #   (For run from server, .data_content does not exist)
+        run_data_content = run.predictions.values
+        run_prime_data_content = run_prime.predictions.values
+
+        # Assert numeric and string parts separately
+        numeric_part = np.array(run_data_content[:, num_cols], dtype=float)
+        numeric_part_prime = np.array(run_prime_data_content[:, num_cols], dtype=float)
+        string_part = run_data_content[:, ~num_cols]
+        string_part_prime = run_prime_data_content[:, ~num_cols]
+        np.testing.assert_array_almost_equal(numeric_part, numeric_part_prime)
+        np.testing.assert_array_equal(string_part, string_part_prime)
 
     def _test_run_obj_equals(self, run, run_prime):
-        for dictionary in ['evaluations', 'fold_evaluations',
-                           'sample_evaluations']:
+        for dictionary in ["evaluations", "fold_evaluations", "sample_evaluations"]:
             if getattr(run, dictionary) is not None:
-                self.assertDictEqual(getattr(run, dictionary),
-                                     getattr(run_prime, dictionary))
+                self.assertDictEqual(getattr(run, dictionary), getattr(run_prime, dictionary))
             else:
                 # should be none or empty
                 other = getattr(run_prime, dictionary)
                 if other is not None:
-                    self.assertDictEqual(other, dict())
-        self.assertEqual(run._create_description_xml(),
-                         run_prime._create_description_xml())
-
-        numeric_part = \
-            np.array(np.array(run.data_content)[:, 0:-2], dtype=float)
-        numeric_part_prime = \
-            np.array(np.array(run_prime.data_content)[:, 0:-2], dtype=float)
-        string_part = np.array(run.data_content)[:, -2:]
-        string_part_prime = np.array(run_prime.data_content)[:, -2:]
-        np.testing.assert_array_almost_equal(numeric_part, numeric_part_prime)
-        np.testing.assert_array_equal(string_part, string_part_prime)
+                    self.assertDictEqual(other, {})
+        assert run._to_xml() == run_prime._to_xml()
+        self._test_prediction_data_equal(run, run_prime)
 
-        if run.trace is not None:
-            run_trace_content = run.trace.trace_to_arff()['data']
-        else:
-            run_trace_content = None
+        # Test trace
+        run_trace_content = run.trace.trace_to_arff()["data"] if run.trace is not None else None
 
         if run_prime.trace is not None:
-            run_prime_trace_content = run_prime.trace.trace_to_arff()['data']
+            run_prime_trace_content = run_prime.trace.trace_to_arff()["data"]
         else:
             run_prime_trace_content = None
 
         if run_trace_content is not None:
+
             def _check_array(array, type_):
                 for line in array:
                     for entry in line:
-                        self.assertIsInstance(entry, type_)
+                        assert isinstance(entry, type_)
 
             int_part = [line[:3] for line in run_trace_content]
             _check_array(int_part, int)
@@ -90,154 +106,295 @@ def _check_array(array, type_):
             bool_part = [line[4] for line in run_trace_content]
             bool_part_prime = [line[4] for line in run_prime_trace_content]
             for bp, bpp in zip(bool_part, bool_part_prime):
-                self.assertIn(bp, ['true', 'false'])
-                self.assertIn(bpp, ['true', 'false'])
+                assert bp in ["true", "false"]
+                assert bpp in ["true", "false"]
             string_part = np.array(run_trace_content)[:, 5:]
             string_part_prime = np.array(run_prime_trace_content)[:, 5:]
 
             np.testing.assert_array_almost_equal(int_part, int_part_prime)
             np.testing.assert_array_almost_equal(float_part, float_part_prime)
-            self.assertEqual(bool_part, bool_part_prime)
+            assert bool_part == bool_part_prime
             np.testing.assert_array_equal(string_part, string_part_prime)
         else:
-            self.assertIsNone(run_prime_trace_content)
+            assert run_prime_trace_content is None
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_to_from_filesystem_vanilla(self):
-
-        model = Pipeline([
-            ('imputer', SimpleImputer(strategy='mean')),
-            ('classifier', DecisionTreeClassifier(max_depth=1)),
-        ])
-        task = openml.tasks.get_task(119)
+        model = Pipeline(
+            [
+                ("imputer", SimpleImputer(strategy="mean")),
+                ("classifier", DecisionTreeClassifier(max_depth=1)),
+            ],
+        )
+        task = openml.tasks.get_task(119)  # diabetes; crossvalidation
         run = openml.runs.run_model_on_task(
             model=model,
             task=task,
             add_local_measures=False,
-            avoid_duplicate_runs=False,
-            upload_flow=True
+            upload_flow=True,
         )
 
         cache_path = os.path.join(
             self.workdir,
-            'runs',
+            "runs",
             str(random.getrandbits(128)),
         )
         run.to_filesystem(cache_path)
 
         run_prime = openml.runs.OpenMLRun.from_filesystem(cache_path)
         # The flow has been uploaded to server, so only the reference flow_id should be present
-        self.assertTrue(run_prime.flow_id is not None)
-        self.assertTrue(run_prime.flow is None)
+        assert run_prime.flow_id is not None
+        assert run_prime.flow is None
         self._test_run_obj_equals(run, run_prime)
         run_prime.publish()
-        TestBase._mark_entity_for_removal('run', run_prime.run_id)
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                            run_prime.run_id))
+        TestBase._mark_entity_for_removal("run", run_prime.run_id)
+        TestBase.logger.info(
+            f"collected from {__file__.split('/')[-1]}: {run_prime.run_id}",
+        )
 
-    @pytest.mark.flaky(reruns=3)
+    @pytest.mark.sklearn()
+    @pytest.mark.flaky()
+    @pytest.mark.test_server()
     def test_to_from_filesystem_search(self):
-
-        model = Pipeline([
-            ('imputer', SimpleImputer(strategy='mean')),
-            ('classifier', DecisionTreeClassifier(max_depth=1)),
-        ])
+        model = Pipeline(
+            [
+                ("imputer", SimpleImputer(strategy="mean")),
+                ("classifier", DecisionTreeClassifier(max_depth=1)),
+            ],
+        )
         model = GridSearchCV(
             estimator=model,
             param_grid={
                 "classifier__max_depth": [1, 2, 3, 4, 5],
-                "imputer__strategy": ['mean', 'median'],
-            }
+                "imputer__strategy": ["mean", "median"],
+            },
         )
 
-        task = openml.tasks.get_task(119)
+        task = openml.tasks.get_task(119)  # diabetes; crossvalidation
         run = openml.runs.run_model_on_task(
             model=model,
             task=task,
             add_local_measures=False,
-            avoid_duplicate_runs=False,
         )
 
-        cache_path = os.path.join(
-            self.workdir,
-            'runs',
-            str(random.getrandbits(128)),
-        )
+        cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)))
         run.to_filesystem(cache_path)
 
         run_prime = openml.runs.OpenMLRun.from_filesystem(cache_path)
         self._test_run_obj_equals(run, run_prime)
         run_prime.publish()
-        TestBase._mark_entity_for_removal('run', run_prime.run_id)
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                            run_prime.run_id))
+        TestBase._mark_entity_for_removal("run", run_prime.run_id)
+        TestBase.logger.info(
+            f"collected from {__file__.split('/')[-1]}: {run_prime.run_id}",
+        )
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_to_from_filesystem_no_model(self):
-
-        model = Pipeline([
-            ('imputer', SimpleImputer(strategy='mean')),
-            ('classifier', DummyClassifier()),
-        ])
-        task = openml.tasks.get_task(119)
-        run = openml.runs.run_model_on_task(
-            model=model,
-            task=task,
-            add_local_measures=False,
+        model = Pipeline(
+            [("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())],
         )
+        task = openml.tasks.get_task(119)  # diabetes; crossvalidation
+        run = openml.runs.run_model_on_task(model=model, task=task, add_local_measures=False)
 
-        cache_path = os.path.join(
-            self.workdir,
-            'runs',
-            str(random.getrandbits(128)),
-        )
+        cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)))
         run.to_filesystem(cache_path, store_model=False)
         # obtain run from filesystem
         openml.runs.OpenMLRun.from_filesystem(cache_path, expect_model=False)
         # assert default behaviour is throwing an error
-        with self.assertRaises(ValueError, msg='Could not find model.pkl'):
+        with self.assertRaises(ValueError, msg="Could not find model.pkl"):
             openml.runs.OpenMLRun.from_filesystem(cache_path)
 
+    @staticmethod
+    def _cat_col_selector(X):
+        return X.select_dtypes(include=["object", "category"]).columns
+
+    @staticmethod
+    def _get_models_tasks_for_tests():
+        from sklearn.compose import ColumnTransformer
+        from sklearn.preprocessing import OneHotEncoder
+
+        basic_preprocessing = [
+            (
+                "cat_handling",
+                ColumnTransformer(
+                    transformers=[
+                        (
+                            "cat",
+                            OneHotEncoder(handle_unknown="ignore"),
+                            TestRun._cat_col_selector,
+                        )
+                    ],
+                    remainder="passthrough",
+                ),
+            ),
+            ("imp", SimpleImputer()),
+        ]
+        model_clf = Pipeline(
+            [
+                *basic_preprocessing,
+                ("classifier", DummyClassifier(strategy="prior")),
+            ],
+        )
+        model_reg = Pipeline(
+            [
+                *basic_preprocessing,
+                (
+                    "regressor",
+                    # LR because dummy does not produce enough float-like values
+                    LinearRegression(),
+                ),
+            ],
+        )
+
+        task_clf = openml.tasks.get_task(119)  # diabetes; hold out validation
+        task_reg = openml.tasks.get_task(733)  # quake; crossvalidation
+
+        return [(model_clf, task_clf), (model_reg, task_reg)]
+
+    @staticmethod
+    def assert_run_prediction_data(task, run, model):
+        # -- Get y_pred and y_true as it should be stored in the run
+        n_repeats, n_folds, n_samples = task.get_split_dimensions()
+        if (n_repeats > 1) or (n_samples > 1):
+            raise ValueError("Test does not support this task type's split dimensions.")
+
+        X, y = task.get_X_and_y()
+
+        # Check correctness of y_true and y_pred in run
+        for fold_id in range(n_folds):
+            # Get data for fold
+            _, test_indices = task.get_train_test_split_indices(repeat=0, fold=fold_id, sample=0)
+            train_mask = np.full(len(X), True)
+            train_mask[test_indices] = False
+
+            # Get train / test
+            X_train = X[train_mask]
+            y_train = y[train_mask]
+            X_test = X[~train_mask]
+            y_test = y[~train_mask]
+
+            # Get y_pred
+            y_pred = model.fit(X_train, y_train).predict(X_test)
+
+            # Get stored data for fold
+            saved_fold_data = run.predictions[run.predictions["fold"] == fold_id].sort_values(
+                by="row_id",
+            )
+            saved_y_pred = saved_fold_data["prediction"].values
+            gt_key = "truth" if "truth" in list(saved_fold_data) else "correct"
+            saved_y_test = saved_fold_data[gt_key].values
+
+            assert_method = np.testing.assert_array_almost_equal
+            if task.task_type == "Supervised Classification":
+                assert_method = np.testing.assert_array_equal
+            y_test = y_test.values
+
+            # Assert correctness
+            assert_method(y_pred, saved_y_pred)
+            assert_method(y_test, saved_y_test)
+
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_publish_with_local_loaded_flow(self):
         """
         Publish a run tied to a local flow after it has first been saved to
          and loaded from disk.
         """
-        extension = openml.extensions.sklearn.SklearnExtension()
+        extension = SklearnExtension()
+
+        for model, task in self._get_models_tasks_for_tests():
+            # Make sure the flow does not exist on the server yet.
+            flow = extension.model_to_flow(model)
+            self._add_sentinel_to_flow_name(flow)
+            assert not openml.flows.flow_exists(flow.name, flow.external_version)
+
+            run = openml.runs.run_flow_on_task(
+                flow=flow,
+                task=task,
+                add_local_measures=False,
+                upload_flow=False,
+            )
 
-        model = Pipeline([
-            ('imputer', SimpleImputer(strategy='mean')),
-            ('classifier', DummyClassifier()),
-        ])
-        task = openml.tasks.get_task(119)
+            # Make sure that the flow has not been uploaded as requested.
+            assert not openml.flows.flow_exists(flow.name, flow.external_version)
 
-        # Make sure the flow does not exist on the server yet.
-        flow = extension.model_to_flow(model)
-        self._add_sentinel_to_flow_name(flow)
-        self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version))
+            # Make sure that the prediction data stored in the run is correct.
+            self.assert_run_prediction_data(task, run, clone(model))
 
-        run = openml.runs.run_flow_on_task(
-            flow=flow,
-            task=task,
-            add_local_measures=False,
-            avoid_duplicate_runs=False,
-            upload_flow=False
-        )
+            cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)))
+            run.to_filesystem(cache_path)
+            # obtain run from filesystem
+            loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path)
+            loaded_run.publish()
+
+            # Clean up
+            TestBase._mark_entity_for_removal("run", loaded_run.run_id)
+            TestBase.logger.info(
+                f"collected from {__file__.split('/')[-1]}: {loaded_run.run_id}",
+            )
 
-        # Make sure that the flow has not been uploaded as requested.
-        self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version))
+            # make sure the flow is published as part of publishing the run.
+            assert openml.flows.flow_exists(flow.name, flow.external_version)
+            openml.runs.get_run(loaded_run.run_id)
+
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
+    @pytest.mark.skip(reason="https://github.com/openml/openml-python/issues/1586")
+    def test_offline_and_online_run_identical(self):
+        extension = SklearnExtension()
+
+        for model, task in self._get_models_tasks_for_tests():
+            # Make sure the flow does not exist on the server yet.
+            flow = extension.model_to_flow(model)
+            self._add_sentinel_to_flow_name(flow)
+            assert not openml.flows.flow_exists(flow.name, flow.external_version)
+
+            run = openml.runs.run_flow_on_task(
+                flow=flow,
+                task=task,
+                add_local_measures=False,
+                upload_flow=False,
+            )
 
-        cache_path = os.path.join(
-            self.workdir,
-            'runs',
-            str(random.getrandbits(128)),
+            # Make sure that the flow has not been uploaded as requested.
+            assert not openml.flows.flow_exists(flow.name, flow.external_version)
+
+            # Load from filesystem
+            cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)))
+            run.to_filesystem(cache_path)
+            loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path)
+
+            # Assert identical for offline - offline
+            self._test_run_obj_equals(run, loaded_run)
+
+            # Publish and test for offline - online
+            run.publish()
+            assert openml.flows.flow_exists(flow.name, flow.external_version)
+
+            try:
+                online_run = openml.runs.get_run(run.run_id, ignore_cache=True)
+                self._test_prediction_data_equal(run, online_run)
+            finally:
+                # Clean up: `run` is the published run; `loaded_run` is never published here
+                TestBase._mark_entity_for_removal("run", run.run_id)
+                TestBase.logger.info(
+                    f"collected from {__file__.split('/')[-1]}: {run.run_id}",
+                )
+
+    def test_run_setup_string_included_in_xml(self):
+        SETUP_STRING = "setup-string"
+        run = OpenMLRun(
+            task_id=0,
+            flow_id=None,  # if not none, flow parameters are required.
+            dataset_id=0,
+            setup_string=SETUP_STRING,
         )
-        run.to_filesystem(cache_path)
-        # obtain run from filesystem
-        loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path)
-        loaded_run.publish()
-        TestBase._mark_entity_for_removal('run', loaded_run.run_id)
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                            loaded_run.run_id))
-
-        # make sure the flow is published as part of publishing the run.
-        self.assertTrue(openml.flows.flow_exists(flow.name, flow.external_version))
-        openml.runs.get_run(loaded_run.run_id)
+        xml = run._to_xml()
+        run_dict = xmltodict.parse(xml)["oml:run"]
+        assert "oml:setup_string" in run_dict
+        assert run_dict["oml:setup_string"] == SETUP_STRING
+
+        recreated_run = openml.runs.functions._create_run_from_xml(xml, from_server=False)
+        assert recreated_run.setup_string == SETUP_STRING
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index 2b09ef501..8d5a00f9b 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -1,57 +1,104 @@
-import arff
-from distutils.version import LooseVersion
+# License: BSD 3-Clause
+from __future__ import annotations
+
+import ast
 import os
 import random
 import time
-import sys
-import unittest.mock
+import unittest
+import warnings
 
+from openml_sklearn import SklearnExtension, cat, cont
+from packaging.version import Version
+from unittest import mock
+
+import arff
+import joblib
 import numpy as np
+import pandas as pd
+import pytest
+import requests
+import sklearn
+from joblib import parallel_backend
+from sklearn.dummy import DummyClassifier
+from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
+from sklearn.feature_selection import VarianceThreshold
+from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
+from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
+from sklearn.model_selection._search import BaseSearchCV
+from sklearn.naive_bayes import GaussianNB
+from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.preprocessing import OneHotEncoder, StandardScaler
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.compose import ColumnTransformer
 
 import openml
-import openml.exceptions
 import openml._api_calls
-import sklearn
-import unittest
-import warnings
-import pandas as pd
-
-import openml.extensions.sklearn
-from openml.testing import TestBase, SimpleImputer
+import openml.exceptions
+from openml.exceptions import (
+    OpenMLNotAuthorizedError,
+    OpenMLServerException,
+)
+#from openml.extensions.sklearn import cat, cont
 from openml.runs.functions import (
     _run_task_get_arffcontent,
+    delete_run,
+    format_prediction,
     run_exists,
 )
 from openml.runs.trace import OpenMLRunTrace
-from openml.tasks import TaskTypeEnum
-
-from sklearn.naive_bayes import GaussianNB
-from sklearn.model_selection._search import BaseSearchCV
-from sklearn.tree import DecisionTreeClassifier
-
-from sklearn.dummy import DummyClassifier
-from sklearn.preprocessing import StandardScaler
-from sklearn.feature_selection import VarianceThreshold
-from sklearn.linear_model import LogisticRegression, SGDClassifier, \
-    LinearRegression
-from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
-from sklearn.svm import SVC
-from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, \
-    StratifiedKFold
-from sklearn.pipeline import Pipeline
+from openml.tasks import TaskType
+from openml.testing import (
+    CustomImputer,
+    SimpleImputer,
+    TestBase,
+    check_task_existence,
+    create_request_response,
+)
 
 
 class TestRun(TestBase):
     _multiprocess_can_split_ = True
-    # diabetis dataset, 768 observations, 0 missing vals, 33% holdout set
-    # (253 test obs), no nominal attributes, all numeric attributes
-    TEST_SERVER_TASK_SIMPLE = (119, 0, 253, list(), list(range(8)))
-    TEST_SERVER_TASK_REGRESSION = (738, 0, 718, list(), list(range(8)))
-    # credit-a dataset, 690 observations, 67 missing vals, 33% holdout set
-    # (227 test obs)
-    TEST_SERVER_TASK_MISSING_VALS = (96, 67, 227,
-                                     [0, 3, 4, 5, 6, 8, 9, 11, 12],
-                                     [1, 2, 7, 10, 13, 14])
+    TEST_SERVER_TASK_MISSING_VALS = {
+        "task_id": 96,
+        "n_missing_vals": 67,
+        "n_test_obs": 227,
+        "nominal_indices": [0, 3, 4, 5, 6, 8, 9, 11, 12],
+        "numeric_indices": [1, 2, 7, 10, 13, 14],
+        "task_meta_data": {
+            "task_type": TaskType.SUPERVISED_CLASSIFICATION,
+            "dataset_id": 16,  # credit-a
+            "estimation_procedure_id": 6,
+            "target_name": "class",
+        },
+    }
+    TEST_SERVER_TASK_SIMPLE = {
+        "task_id": 119,
+        "n_missing_vals": 0,
+        "n_test_obs": 253,
+        "nominal_indices": [],
+        "numeric_indices": [*range(8)],
+        "task_meta_data": {
+            "task_type": TaskType.SUPERVISED_CLASSIFICATION,
+            "dataset_id": 20,  # diabetes
+            "estimation_procedure_id": 5,
+            "target_name": "class",
+        },
+    }
+    TEST_SERVER_TASK_REGRESSION = {
+        "task_id": 1605,
+        "n_missing_vals": 0,
+        "n_test_obs": 2178,
+        "nominal_indices": [],
+        "numeric_indices": [*range(8)],
+        "task_meta_data": {
+            "task_type": TaskType.SUPERVISED_REGRESSION,
+            "dataset_id": 123,  # quake
+            "estimation_procedure_id": 7,
+            "target_name": "richter",
+        },
+    }
 
     # Suppress warnings to facilitate testing
     hide_warnings = True
@@ -62,7 +109,7 @@ class TestRun(TestBase):
 
     def setUp(self):
         super().setUp()
-        self.extension = openml.extensions.sklearn.SklearnExtension()
+        self.extension = SklearnExtension()
 
     def _wait_for_processed_run(self, run_id, max_waiting_time_seconds):
         # it can take a while for a run to be processed on the OpenML (test)
@@ -74,69 +121,100 @@ def _wait_for_processed_run(self, run_id, max_waiting_time_seconds):
         # time.time() works in seconds
         start_time = time.time()
         while time.time() - start_time < max_waiting_time_seconds:
+            try:
+                openml.runs.get_run_trace(run_id)
+            except openml.exceptions.OpenMLServerException:
+                time.sleep(10)
+                continue
+
             run = openml.runs.get_run(run_id, ignore_cache=True)
-            if len(run.evaluations) > 0:
-                return
-            else:
+            if run.evaluations is None:
                 time.sleep(10)
-        raise RuntimeError('Could not find any evaluations! Please check whether run {} was '
-                           'evaluated correctly on the server'.format(run_id))
+                continue
+
+            assert len(run.evaluations) > 0, (
+                "Expect not-None evaluations to always contain elements."
+            )
+            return
+
+        raise RuntimeError(
+            f"Could not find any evaluations! Please check whether run {run_id} was "
+            "evaluated correctly on the server",
+        )
 
-    def _compare_predictions(self, predictions, predictions_prime):
-        self.assertEqual(np.array(predictions_prime['data']).shape,
-                         np.array(predictions['data']).shape)
+    def _assert_predictions_equal(self, predictions, predictions_prime):
+        assert np.array(predictions_prime["data"]).shape == np.array(predictions["data"]).shape
 
         # The original search model does not submit confidence
         # bounds, so we can not compare the arff line
         compare_slice = [0, 1, 2, -1, -2]
-        for idx in range(len(predictions['data'])):
+        for idx in range(len(predictions["data"])):
             # depends on the assumption "predictions are in same order"
             # that does not necessarily hold.
             # But with the current code base, it holds.
             for col_idx in compare_slice:
-                val_1 = predictions['data'][idx][col_idx]
-                val_2 = predictions_prime['data'][idx][col_idx]
-                if type(val_1) == float or type(val_2) == float:
+                val_1 = predictions["data"][idx][col_idx]
+                val_2 = predictions_prime["data"][idx][col_idx]
+                if isinstance(val_1, float) or isinstance(val_2, float):
                     self.assertAlmostEqual(
                         float(val_1),
                         float(val_2),
                         places=6,
                     )
                 else:
-                    self.assertEqual(val_1, val_2)
-
-        return True
+                    assert val_1 == val_2
 
-    def _rerun_model_and_compare_predictions(self, run_id, model_prime, seed):
+    def _rerun_model_and_compare_predictions(self, run_id, model_prime, seed, create_task_obj):
         run = openml.runs.get_run(run_id)
-        task = openml.tasks.get_task(run.task_id)
 
         # TODO: assert holdout task
 
         # downloads the predictions of the old task
-        file_id = run.output_files['predictions']
+        file_id = run.output_files["predictions"]
         predictions_url = openml._api_calls._file_id_to_url(file_id)
-        response = openml._api_calls._read_url(predictions_url,
-                                               request_method='get')
+        response = openml._api_calls._download_text_file(predictions_url)
         predictions = arff.loads(response)
-        run_prime = openml.runs.run_model_on_task(
-            model=model_prime,
-            task=task,
-            avoid_duplicate_runs=False,
-            seed=seed,
-        )
+
+        # if create_task_obj=False, run_model_on_task gets the task ID, not a task object
+        if create_task_obj:
+            task = openml.tasks.get_task(run.task_id)
+            run_prime = openml.runs.run_model_on_task(
+                model=model_prime,
+                task=task,
+                seed=seed,
+            )
+        else:
+            run_prime = openml.runs.run_model_on_task(
+                model=model_prime,
+                task=run.task_id,
+                seed=seed,
+            )
+
         predictions_prime = run_prime._generate_arff_dict()
 
-        self._compare_predictions(predictions, predictions_prime)
+        self._assert_predictions_equal(predictions, predictions_prime)
+        pd.testing.assert_frame_equal(
+            run.predictions,
+            run_prime.predictions,
+            check_dtype=False,  # Loaded ARFF reads NUMERIC as float, even if integer.
+        )
 
-    def _perform_run(self, task_id, num_instances, n_missing_vals, clf,
-                     flow_expected_rsv=None, seed=1, check_setup=True,
-                     sentinel=None):
+    def _perform_run(
+        self,
+        task_id,
+        num_instances,
+        n_missing_vals,
+        clf,
+        flow_expected_rsv=None,
+        seed=1,
+        check_setup=True,
+        sentinel=None,
+    ):
         """
         Runs a classifier on a task, and performs some basic checks.
         Also uploads the run.
 
-        Parameters:
+        Parameters
         ----------
         task_id : int
 
@@ -163,20 +241,23 @@ def _perform_run(self, task_id, num_instances, n_missing_vals, clf,
         sentinel: optional, str
             in case the sentinel should be user specified
 
-        Returns:
-        --------
+        Returns
+        -------
         run: OpenMLRun
             The performed run (with run id)
         """
-        classes_without_random_state = \
-            ['sklearn.model_selection._search.GridSearchCV',
-             'sklearn.pipeline.Pipeline',
-             'sklearn.linear_model.base.LinearRegression',
-             ]
+        classes_without_random_state = [
+            "sklearn.model_selection._search.GridSearchCV",
+            "sklearn.pipeline.Pipeline",
+        ]
+        if Version(sklearn.__version__) < Version("0.22"):
+            classes_without_random_state.append("sklearn.linear_model.base.LinearRegression")
+        else:
+            classes_without_random_state.append("sklearn.linear_model._base.LinearRegression")
 
         def _remove_random_state(flow):
-            if 'random_state' in flow.parameters:
-                del flow.parameters['random_state']
+            if "random_state" in flow.parameters:
+                del flow.parameters["random_state"]
             for component in flow.components.values():
                 _remove_random_state(component)
 
@@ -184,35 +265,34 @@ def _remove_random_state(flow):
         flow, _ = self._add_sentinel_to_flow_name(flow, sentinel)
         if not openml.flows.flow_exists(flow.name, flow.external_version):
             flow.publish()
-            TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
-            TestBase.logger.info("collected from test_run_functions: {}".format(flow.flow_id))
+            TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
+            TestBase.logger.info(f"collected from test_run_functions: {flow.flow_id}")
 
         task = openml.tasks.get_task(task_id)
 
         X, y = task.get_X_and_y()
-        self.assertEqual(np.count_nonzero(np.isnan(X)), n_missing_vals)
+        assert X.isna().sum().sum() == n_missing_vals
         run = openml.runs.run_flow_on_task(
             flow=flow,
             task=task,
             seed=seed,
-            avoid_duplicate_runs=openml.config.avoid_duplicate_runs,
         )
         run_ = run.publish()
-        TestBase._mark_entity_for_removal('run', run.run_id)
-        TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id))
-        self.assertEqual(run_, run)
-        self.assertIsInstance(run.dataset_id, int)
+        TestBase._mark_entity_for_removal("run", run.run_id)
+        TestBase.logger.info(f"collected from test_run_functions: {run.run_id}")
+        assert run_ == run
+        assert isinstance(run.dataset_id, int)
 
         # This is only a smoke check right now
         # TODO add a few asserts here
-        run._create_description_xml()
+        run._to_xml()
         if run.trace is not None:
             # This is only a smoke check right now
             # TODO add a few asserts here
             run.trace.trace_to_arff()
 
         # check arff output
-        self.assertEqual(len(run.data_content), num_instances)
+        assert len(run.data_content) == num_instances
 
         if check_setup:
             # test the initialize setup function
@@ -225,18 +305,18 @@ def _remove_random_state(flow):
             flow_server = self.extension.model_to_flow(clf_server)
 
             if flow.class_name not in classes_without_random_state:
-                error_msg = 'Flow class %s (id=%d) does not have a random ' \
-                            'state parameter' % (flow.class_name, flow.flow_id)
-                self.assertIn('random_state', flow.parameters, error_msg)
+                error_msg = "Flow class %s (id=%d) does not have a random state parameter" % (
+                    flow.class_name,
+                    flow.flow_id,
+                )
+                assert "random_state" in flow.parameters, error_msg
                 # If the flow is initialized from a model without a random
                 # state, the flow is on the server without any random state
-                self.assertEqual(flow.parameters['random_state'], 'null')
+                assert flow.parameters["random_state"] == "null"
                 # As soon as a flow is run, a random state is set in the model.
                 # If a flow is re-instantiated
-                self.assertEqual(flow_local.parameters['random_state'],
-                                 flow_expected_rsv)
-                self.assertEqual(flow_server.parameters['random_state'],
-                                 flow_expected_rsv)
+                assert flow_local.parameters["random_state"] == flow_expected_rsv
+                assert flow_server.parameters["random_state"] == flow_expected_rsv
             _remove_random_state(flow_local)
             _remove_random_state(flow_server)
             openml.flows.assert_flows_equal(flow_local, flow_server)
@@ -247,8 +327,7 @@ def _remove_random_state(flow):
             )
             flow_server2 = self.extension.model_to_flow(clf_server2)
             if flow.class_name not in classes_without_random_state:
-                self.assertEqual(flow_server2.parameters['random_state'],
-                                 flow_expected_rsv)
+                assert flow_server2.parameters["random_state"] == flow_expected_rsv
 
             _remove_random_state(flow_server2)
             openml.flows.assert_flows_equal(flow_local, flow_server2)
@@ -257,7 +336,7 @@ def _remove_random_state(flow):
             # self.assertEqual(clf, clf_prime)
 
         downloaded = openml.runs.get_run(run_.run_id)
-        assert ('openml-python' in downloaded.tags)
+        assert "openml-python" in downloaded.tags
 
         # TODO make sure that these attributes are instantiated when
         # downloading a run? Or make sure that the trace object is created when
@@ -267,9 +346,14 @@ def _remove_random_state(flow):
         # self.assertEqual(run_trace, downloaded_run_trace)
         return run
 
-    def _check_sample_evaluations(self, sample_evaluations, num_repeats,
-                                  num_folds, num_samples,
-                                  max_time_allowed=60000):
+    def _check_sample_evaluations(
+        self,
+        sample_evaluations,
+        num_repeats,
+        num_folds,
+        num_samples,
+        max_time_allowed=60000,
+    ):
         """
         Checks whether the right timing measures are attached to the run
         (before upload). Test is only performed for versions >= Python3.3
@@ -279,72 +363,78 @@ def _check_sample_evaluations(self, sample_evaluations, num_repeats,
         default max_time_allowed (per fold, in milli seconds) = 1 minute,
         quite pessimistic
         """
-
         # a dict mapping from openml measure to a tuple with the minimum and
         # maximum allowed value
         check_measures = {
             # should take at least one millisecond (?)
-            'usercpu_time_millis_testing': (0, max_time_allowed),
-            'usercpu_time_millis_training': (0, max_time_allowed),
-            'usercpu_time_millis': (0, max_time_allowed),
-            'wall_clock_time_millis_training': (0, max_time_allowed),
-            'wall_clock_time_millis_testing': (0, max_time_allowed),
-            'wall_clock_time_millis': (0, max_time_allowed),
-            'predictive_accuracy': (0, 1)
+            "usercpu_time_millis_testing": (0, max_time_allowed),
+            "usercpu_time_millis_training": (0, max_time_allowed),
+            "usercpu_time_millis": (0, max_time_allowed),
+            "wall_clock_time_millis_training": (0, max_time_allowed),
+            "wall_clock_time_millis_testing": (0, max_time_allowed),
+            "wall_clock_time_millis": (0, max_time_allowed),
+            "predictive_accuracy": (0, 1),
         }
 
-        self.assertIsInstance(sample_evaluations, dict)
-        if sys.version_info[:2] >= (3, 3):
-            # this only holds if we are allowed to record time (otherwise some
-            # are missing)
-            self.assertEqual(set(sample_evaluations.keys()),
-                             set(check_measures.keys()))
+        assert isinstance(sample_evaluations, dict)
+        assert set(sample_evaluations.keys()) == set(check_measures.keys())
 
-        for measure in check_measures.keys():
+        for measure in check_measures:
             if measure in sample_evaluations:
                 num_rep_entrees = len(sample_evaluations[measure])
-                self.assertEqual(num_rep_entrees, num_repeats)
+                assert num_rep_entrees == num_repeats
                 for rep in range(num_rep_entrees):
                     num_fold_entrees = len(sample_evaluations[measure][rep])
-                    self.assertEqual(num_fold_entrees, num_folds)
+                    assert num_fold_entrees == num_folds
                     for fold in range(num_fold_entrees):
-                        num_sample_entrees = len(
-                            sample_evaluations[measure][rep][fold])
-                        self.assertEqual(num_sample_entrees, num_samples)
+                        num_sample_entrees = len(sample_evaluations[measure][rep][fold])
+                        assert num_sample_entrees == num_samples
                         for sample in range(num_sample_entrees):
-                            evaluation = sample_evaluations[measure][rep][
-                                fold][sample]
-                            self.assertIsInstance(evaluation, float)
-                            if not os.environ.get('CI_WINDOWS'):
-                                # Either Appveyor is much faster than Travis
-                                # and/or measurements are not as accurate.
-                                # Either way, windows seems to get an eval-time
-                                # of 0 sometimes.
-                                self.assertGreater(evaluation, 0)
-                            self.assertLess(evaluation, max_time_allowed)
-
+                            evaluation = sample_evaluations[measure][rep][fold][sample]
+                            assert isinstance(evaluation, float)
+                            if not (os.environ.get("CI_WINDOWS") or os.name == "nt"):
+                                # Windows seems to get an eval-time of 0 sometimes.
+                                assert evaluation > 0
+                            assert evaluation < max_time_allowed
+
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_regression_on_classif_task(self):
-        task_id = 115
+        task_id = 259  # collins; crossvalidation; has numeric targets
 
         clf = LinearRegression()
         task = openml.tasks.get_task(task_id)
-        with self.assertRaises(AttributeError):
+        # internally dataframe is loaded and targets are categorical
+        # which LinearRegression() cannot handle
+        with pytest.raises(
+            AttributeError, match="'LinearRegression' object has no attribute 'classes_'"
+        ):
             openml.runs.run_model_on_task(
                 model=clf,
                 task=task,
-                avoid_duplicate_runs=False,
             )
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_check_erronous_sklearn_flow_fails(self):
-        task_id = 115
+        task_id = 115  # diabetes; crossvalidation
         task = openml.tasks.get_task(task_id)
 
         # Invalid parameter values
-        clf = LogisticRegression(C='abc', solver='lbfgs')
-        with self.assertRaisesRegex(
-            ValueError,
-            r"Penalty term must be positive; got \(C=u?'abc'\)",  # u? for 2.7/3.4-6 compability
-        ):
+        clf = LogisticRegression(C="abc", solver="lbfgs")
+        # The exact error message depends on scikit-learn version.
+        # Because the sklearn-extension module is to be separated,
+        # only the exception type is checked here, not its message.
+        # old: r"Penalty term must be positive; got \(C=u?'abc'\)"
+        # new: sklearn.utils._param_validation.InvalidParameterError:
+        #   The 'C' parameter of LogisticRegression must be a float in the range (0, inf]. Got 'abc' instead.  # noqa: E501
+        try:
+            from sklearn.utils._param_validation import InvalidParameterError
+
+            exceptions = (ValueError, InvalidParameterError)
+        except ImportError:
+            exceptions = (ValueError,)
+        with pytest.raises(exceptions):
             openml.runs.run_model_on_task(
                 task=task,
                 model=clf,
@@ -362,12 +452,21 @@ def test_check_erronous_sklearn_flow_fails(self):
     # execution of the unit tests without the need to add an additional module
     # like unittest2
 
-    def _run_and_upload(self, clf, task_id, n_missing_vals, n_test_obs,
-                        flow_expected_rsv, num_folds=1, num_iterations=5,
-                        seed=1, metric=sklearn.metrics.accuracy_score,
-                        metric_name='predictive_accuracy',
-                        task_type=TaskTypeEnum.SUPERVISED_CLASSIFICATION,
-                        sentinel=None):
+    def _run_and_upload(
+        self,
+        clf,
+        task_id,
+        n_missing_vals,
+        n_test_obs,
+        flow_expected_rsv,
+        num_folds=1,
+        num_iterations=5,
+        seed=1,
+        metric=sklearn.metrics.accuracy_score,
+        metric_name="predictive_accuracy",
+        task_type=TaskType.SUPERVISED_CLASSIFICATION,
+        sentinel=None,
+    ):
         def determine_grid_size(param_grid):
             if isinstance(param_grid, dict):
                 grid_iterations = 1
@@ -380,38 +479,40 @@ def determine_grid_size(param_grid):
                     grid_iterations += determine_grid_size(sub_grid)
                 return grid_iterations
             else:
-                raise TypeError('Param Grid should be of type list '
-                                '(GridSearch only) or dict')
+                raise TypeError("Param Grid should be of type list (GridSearch only) or dict")
 
-        run = self._perform_run(task_id, n_test_obs, n_missing_vals, clf,
-                                flow_expected_rsv=flow_expected_rsv, seed=seed,
-                                sentinel=sentinel)
+        run = self._perform_run(
+            task_id,
+            n_test_obs,
+            n_missing_vals,
+            clf,
+            flow_expected_rsv=flow_expected_rsv,
+            seed=seed,
+            sentinel=sentinel,
+        )
 
         # obtain scores using get_metric_score:
         scores = run.get_metric_fn(metric)
         # compare with the scores in user defined measures
         scores_provided = []
-        for rep in run.fold_evaluations[metric_name].keys():
-            for fold in run.fold_evaluations[metric_name][rep].keys():
-                scores_provided.append(
-                    run.fold_evaluations[metric_name][rep][fold])
-        self.assertEqual(sum(scores_provided), sum(scores))
+        for rep in run.fold_evaluations[metric_name]:
+            for fold in run.fold_evaluations[metric_name][rep]:
+                scores_provided.append(run.fold_evaluations[metric_name][rep][fold])
+        assert sum(scores_provided) == sum(scores)
 
         if isinstance(clf, BaseSearchCV):
-            trace_content = run.trace.trace_to_arff()['data']
+            trace_content = run.trace.trace_to_arff()["data"]
             if isinstance(clf, GridSearchCV):
                 grid_iterations = determine_grid_size(clf.param_grid)
-                self.assertEqual(len(trace_content),
-                                 grid_iterations * num_folds)
+                assert len(trace_content) == grid_iterations * num_folds
             else:
-                self.assertEqual(len(trace_content),
-                                 num_iterations * num_folds)
+                assert len(trace_content) == num_iterations * num_folds
 
             # downloads the best model based on the optimization trace
             # suboptimal (slow), and not guaranteed to work if evaluation
             # engine is behind.
             # TODO: mock this? We have the arff already on the server
-            self._wait_for_processed_run(run.run_id, 200)
+            self._wait_for_processed_run(run.run_id, 600)
             try:
                 model_prime = openml.runs.initialize_model_from_trace(
                     run_id=run.run_id,
@@ -419,31 +520,69 @@ def determine_grid_size(param_grid):
                     fold=0,
                 )
             except openml.exceptions.OpenMLServerException as e:
-                e.additional = "%s; run_id %d" % (e.additional, run.run_id)
+                e.message = "%s; run_id %d" % (e.message, run.run_id)
                 raise e
 
-            self._rerun_model_and_compare_predictions(run.run_id, model_prime,
-                                                      seed)
+            self._rerun_model_and_compare_predictions(
+                run.run_id,
+                model_prime,
+                seed,
+                create_task_obj=True,
+            )
+            self._rerun_model_and_compare_predictions(
+                run.run_id,
+                model_prime,
+                seed,
+                create_task_obj=False,
+            )
         else:
             run_downloaded = openml.runs.get_run(run.run_id)
             sid = run_downloaded.setup_id
             model_prime = openml.setups.initialize_model(sid)
-            self._rerun_model_and_compare_predictions(run.run_id,
-                                                      model_prime, seed)
+            self._rerun_model_and_compare_predictions(
+                run.run_id,
+                model_prime,
+                seed,
+                create_task_obj=True,
+            )
+            self._rerun_model_and_compare_predictions(
+                run.run_id,
+                model_prime,
+                seed,
+                create_task_obj=False,
+            )
 
         # todo: check if runtime is present
-        self._check_fold_timing_evaluations(run.fold_evaluations, 1, num_folds,
-                                            task_type=task_type)
+        self._check_fold_timing_evaluations(
+            fold_evaluations=run.fold_evaluations,
+            num_repeats=1,
+            num_folds=num_folds,
+            task_type=task_type,
+        )
+
+        # Check if run string and print representation do not run into an error
+        #   The above check already verifies that all columns needed for supported
+        #   representations are present.
+        #   Supported: SUPERVISED_CLASSIFICATION, LEARNING_CURVE, SUPERVISED_REGRESSION
+        str(run)
+        self.logger.info(run)
+
         return run
 
-    def _run_and_upload_classification(self, clf, task_id, n_missing_vals,
-                                       n_test_obs, flow_expected_rsv,
-                                       sentinel=None):
+    def _run_and_upload_classification(
+        self,
+        clf,
+        task_id,
+        n_missing_vals,
+        n_test_obs,
+        flow_expected_rsv,
+        sentinel=None,
+    ):
         num_folds = 1  # because of holdout
         num_iterations = 5  # for base search algorithms
         metric = sklearn.metrics.accuracy_score  # metric class
-        metric_name = 'predictive_accuracy'  # openml metric name
-        task_type = TaskTypeEnum.SUPERVISED_CLASSIFICATION  # task type
+        metric_name = "predictive_accuracy"  # openml metric name
+        task_type = TaskType.SUPERVISED_CLASSIFICATION  # task type
 
         return self._run_and_upload(
             clf=clf,
@@ -459,14 +598,20 @@ def _run_and_upload_classification(self, clf, task_id, n_missing_vals,
             sentinel=sentinel,
         )
 
-    def _run_and_upload_regression(self, clf, task_id, n_missing_vals,
-                                   n_test_obs, flow_expected_rsv,
-                                   sentinel=None):
-        num_folds = 1  # because of holdout
+    def _run_and_upload_regression(
+        self,
+        clf,
+        task_id,
+        n_missing_vals,
+        n_test_obs,
+        flow_expected_rsv,
+        sentinel=None,
+    ):
+        num_folds = 10  # because of cross-validation
         num_iterations = 5  # for base search algorithms
         metric = sklearn.metrics.mean_absolute_error  # metric class
-        metric_name = 'mean_absolute_error'  # openml metric name
-        task_type = TaskTypeEnum.SUPERVISED_REGRESSION  # task type
+        metric_name = "mean_absolute_error"  # openml metric name
+        task_type = TaskType.SUPERVISED_REGRESSION  # task type
 
         return self._run_and_upload(
             clf=clf,
@@ -482,36 +627,66 @@ def _run_and_upload_regression(self, clf, task_id, n_missing_vals,
             sentinel=sentinel,
         )
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_and_upload_logistic_regression(self):
-        lr = LogisticRegression(solver='lbfgs')
-        task_id = self.TEST_SERVER_TASK_SIMPLE[0]
-        n_missing_vals = self.TEST_SERVER_TASK_SIMPLE[1]
-        n_test_obs = self.TEST_SERVER_TASK_SIMPLE[2]
-        self._run_and_upload_classification(lr, task_id, n_missing_vals,
-                                            n_test_obs, '62501')
-
+        lr = LogisticRegression(solver="lbfgs", max_iter=1000)
+        task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
+        n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"]
+        n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"]
+        self._run_and_upload_classification(lr, task_id, n_missing_vals, n_test_obs, "62501")
+
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_and_upload_linear_regression(self):
         lr = LinearRegression()
-        task_id = self.TEST_SERVER_TASK_REGRESSION[0]
-        n_missing_vals = self.TEST_SERVER_TASK_REGRESSION[1]
-        n_test_obs = self.TEST_SERVER_TASK_REGRESSION[2]
-        self._run_and_upload_regression(lr, task_id, n_missing_vals,
-                                        n_test_obs, '62501')
+        task_id = self.TEST_SERVER_TASK_REGRESSION["task_id"]
 
-    def test_run_and_upload_pipeline_dummy_pipeline(self):
+        task_meta_data = self.TEST_SERVER_TASK_REGRESSION["task_meta_data"]
+        _task_id = check_task_existence(**task_meta_data)
+        if _task_id is not None:
+            task_id = _task_id
+        else:
+            new_task = openml.tasks.create_task(**task_meta_data)
+            # publishes the new task
+            try:
+                new_task = new_task.publish()
+                task_id = new_task.task_id
+            except OpenMLServerException as e:
+                if e.code == 614:  # Task already exists
+                    # the exception message contains the task_id that was matched in the format
+                    # 'Task already exists. - matched id(s): [xxxx]'
+                    task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0]
+                else:
+                    raise Exception(repr(e))
+            # mark to remove the uploaded task
+            TestBase._mark_entity_for_removal("task", task_id)
+            TestBase.logger.info(f"collected from test_run_functions: {task_id}")
 
-        pipeline1 = Pipeline(steps=[('scaler',
-                                     StandardScaler(with_mean=False)),
-                                    ('dummy',
-                                     DummyClassifier(strategy='prior'))])
-        task_id = self.TEST_SERVER_TASK_SIMPLE[0]
-        n_missing_vals = self.TEST_SERVER_TASK_SIMPLE[1]
-        n_test_obs = self.TEST_SERVER_TASK_SIMPLE[2]
-        self._run_and_upload_classification(pipeline1, task_id, n_missing_vals,
-                                            n_test_obs, '62501')
-
-    @unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
-                     reason="columntransformer introduction in 0.20.0")
+        n_missing_vals = self.TEST_SERVER_TASK_REGRESSION["n_missing_vals"]
+        n_test_obs = self.TEST_SERVER_TASK_REGRESSION["n_test_obs"]
+        self._run_and_upload_regression(lr, task_id, n_missing_vals, n_test_obs, "62501")
+
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
+    def test_run_and_upload_pipeline_dummy_pipeline(self):
+        pipeline1 = Pipeline(
+            steps=[
+                ("scaler", StandardScaler(with_mean=False)),
+                ("dummy", DummyClassifier(strategy="prior")),
+            ],
+        )
+        task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
+        n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"]
+        n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"]
+        self._run_and_upload_classification(pipeline1, task_id, n_missing_vals, n_test_obs, "62501")
+
+    @pytest.mark.sklearn()
+    @unittest.skipIf(
+        Version(sklearn.__version__) < Version("0.20"),
+        reason="columntransformer introduction in 0.20.0",
+    )
+    @pytest.mark.test_server()
     def test_run_and_upload_column_transformer_pipeline(self):
         import sklearn.compose
         import sklearn.impute
@@ -519,94 +694,168 @@ def test_run_and_upload_column_transformer_pipeline(self):
         def get_ct_cf(nominal_indices, numeric_indices):
             inner = sklearn.compose.ColumnTransformer(
                 transformers=[
-                    ('numeric', sklearn.preprocessing.StandardScaler(),
-                     nominal_indices),
-                    ('nominal', sklearn.preprocessing.OneHotEncoder(
-                        handle_unknown='ignore'), numeric_indices)],
-                remainder='passthrough')
+                    (
+                        "numeric",
+                        make_pipeline(
+                            SimpleImputer(strategy="mean"),
+                            sklearn.preprocessing.StandardScaler(),
+                        ),
+                        numeric_indices,
+                    ),
+                    (
+                        "nominal",
+                        make_pipeline(
+                            CustomImputer(strategy="most_frequent"),
+                            sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore"),
+                        ),
+                        nominal_indices,
+                    ),
+                ],
+                remainder="passthrough",
+            )
             return sklearn.pipeline.Pipeline(
                 steps=[
-                    ('imputer', sklearn.impute.SimpleImputer(
-                        strategy='constant', fill_value=-1)),
-                    ('transformer', inner),
-                    ('classifier', sklearn.tree.DecisionTreeClassifier())
-                ]
+                    ("transformer", inner),
+                    ("classifier", sklearn.tree.DecisionTreeClassifier()),
+                ],
             )
 
         sentinel = self._get_sentinel()
         self._run_and_upload_classification(
-            get_ct_cf(self.TEST_SERVER_TASK_SIMPLE[3],
-                      self.TEST_SERVER_TASK_SIMPLE[4]),
-            self.TEST_SERVER_TASK_SIMPLE[0], self.TEST_SERVER_TASK_SIMPLE[1],
-            self.TEST_SERVER_TASK_SIMPLE[2], '62501', sentinel=sentinel)
+            get_ct_cf(
+                self.TEST_SERVER_TASK_SIMPLE["nominal_indices"],
+                self.TEST_SERVER_TASK_SIMPLE["numeric_indices"],
+            ),
+            self.TEST_SERVER_TASK_SIMPLE["task_id"],
+            self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"],
+            self.TEST_SERVER_TASK_SIMPLE["n_test_obs"],
+            "62501",
+            sentinel=sentinel,
+        )
         # Due to #602, it is important to test this model on two tasks
         # with different column specifications
         self._run_and_upload_classification(
-            get_ct_cf(self.TEST_SERVER_TASK_MISSING_VALS[3],
-                      self.TEST_SERVER_TASK_MISSING_VALS[4]),
-            self.TEST_SERVER_TASK_MISSING_VALS[0],
-            self.TEST_SERVER_TASK_MISSING_VALS[1],
-            self.TEST_SERVER_TASK_MISSING_VALS[2],
-            '62501', sentinel=sentinel)
-
-    def test_run_and_upload_decision_tree_pipeline(self):
-        pipeline2 = Pipeline(steps=[('Imputer', SimpleImputer(strategy='median')),
-                                    ('VarianceThreshold', VarianceThreshold()),
-                                    ('Estimator', RandomizedSearchCV(
-                                        DecisionTreeClassifier(),
-                                        {'min_samples_split':
-                                         [2 ** x for x in range(1, 8)],
-                                         'min_samples_leaf':
-                                         [2 ** x for x in range(0, 7)]},
-                                        cv=3, n_iter=10))])
-        task_id = self.TEST_SERVER_TASK_MISSING_VALS[0]
-        n_missing_vals = self.TEST_SERVER_TASK_MISSING_VALS[1]
-        n_test_obs = self.TEST_SERVER_TASK_MISSING_VALS[2]
-        self._run_and_upload_classification(pipeline2, task_id, n_missing_vals,
-                                            n_test_obs, '62501')
+            get_ct_cf(
+                self.TEST_SERVER_TASK_MISSING_VALS["nominal_indices"],
+                self.TEST_SERVER_TASK_MISSING_VALS["numeric_indices"],
+            ),
+            self.TEST_SERVER_TASK_MISSING_VALS["task_id"],
+            self.TEST_SERVER_TASK_MISSING_VALS["n_missing_vals"],
+            self.TEST_SERVER_TASK_MISSING_VALS["n_test_obs"],
+            "62501",
+            sentinel=sentinel,
+        )
 
+    @pytest.mark.sklearn()
+    @unittest.skip("https://github.com/openml/OpenML/issues/1180")
+    @unittest.skipIf(
+        Version(sklearn.__version__) < Version("0.20"),
+        reason="columntransformer introduction in 0.20.0",
+    )
+    @mock.patch("warnings.warn")
+    def test_run_and_upload_knn_pipeline(self, warnings_mock):
+        cat_imp = make_pipeline(
+            SimpleImputer(strategy="most_frequent"),
+            OneHotEncoder(handle_unknown="ignore"),
+        )
+        cont_imp = make_pipeline(CustomImputer(), StandardScaler())
+        from sklearn.compose import ColumnTransformer
+        from sklearn.neighbors import KNeighborsClassifier
+
+        ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
+        pipeline2 = Pipeline(
+            steps=[
+                ("Imputer", ct),
+                ("VarianceThreshold", VarianceThreshold()),
+                (
+                    "Estimator",
+                    RandomizedSearchCV(
+                        KNeighborsClassifier(),
+                        {"n_neighbors": list(range(2, 10))},
+                        cv=3,
+                        n_iter=10,
+                    ),
+                ),
+            ],
+        )
+
+        task_id = self.TEST_SERVER_TASK_MISSING_VALS["task_id"]
+        n_missing_vals = self.TEST_SERVER_TASK_MISSING_VALS["n_missing_vals"]
+        n_test_obs = self.TEST_SERVER_TASK_MISSING_VALS["n_test_obs"]
+        self._run_and_upload_classification(pipeline2, task_id, n_missing_vals, n_test_obs, "62501")
+        # The warning raised is:
+        # "The total space of parameters 8 is smaller than n_iter=10.
+        # Running 8 iterations. For exhaustive searches, use GridSearchCV."
+        # It is raised three times because we once run the model to upload something and then run
+        # it again twice to compare that the predictions are reproducible.
+        warning_msg = (
+            "The total space of parameters 8 is smaller than n_iter=10. "
+            "Running 8 iterations. For exhaustive searches, use GridSearchCV."
+        )
+        call_count = 0
+        for _warnings in warnings_mock.call_args_list:
+            if _warnings[0][0] == warning_msg:
+                call_count += 1
+        assert call_count == 3
+
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_and_upload_gridsearch(self):
-        gridsearch = GridSearchCV(BaggingClassifier(base_estimator=SVC()),
-                                  {"base_estimator__C": [0.01, 0.1, 10],
-                                   "base_estimator__gamma": [0.01, 0.1, 10]})
-        task_id = self.TEST_SERVER_TASK_SIMPLE[0]
-        n_missing_vals = self.TEST_SERVER_TASK_SIMPLE[1]
-        n_test_obs = self.TEST_SERVER_TASK_SIMPLE[2]
+        estimator_name = (
+            "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+        )
+        gridsearch = GridSearchCV(
+            BaggingClassifier(**{estimator_name: SVC()}),
+            {f"{estimator_name}__C": [0.01, 0.1, 10], f"{estimator_name}__gamma": [0.01, 0.1, 10]},
+            cv=3,
+        )
+        task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
+        n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"]
+        n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"]
         run = self._run_and_upload_classification(
             clf=gridsearch,
             task_id=task_id,
             n_missing_vals=n_missing_vals,
             n_test_obs=n_test_obs,
-            flow_expected_rsv='62501',
+            flow_expected_rsv="62501",
         )
-        self.assertEqual(len(run.trace.trace_iterations), 9)
+        assert len(run.trace.trace_iterations) == 9
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_and_upload_randomsearch(self):
         randomsearch = RandomizedSearchCV(
             RandomForestClassifier(n_estimators=5),
-            {"max_depth": [3, None],
-             "max_features": [1, 2, 3, 4],
-             "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
-             "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-             "bootstrap": [True, False],
-             "criterion": ["gini", "entropy"]},
+            {
+                "max_depth": [3, None],
+                "max_features": [1, 2, 3, 4],
+                "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
+                "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+                "bootstrap": [True, False],
+                "criterion": ["gini", "entropy"],
+            },
             cv=StratifiedKFold(n_splits=2, shuffle=True),
-            n_iter=5)
+            n_iter=5,
+        )
         # The random states for the RandomizedSearchCV is set after the
         # random state of the RandomForestClassifier is set, therefore,
         # it has a different value than the other examples before
-        task_id = self.TEST_SERVER_TASK_SIMPLE[0]
-        n_missing_vals = self.TEST_SERVER_TASK_SIMPLE[1]
-        n_test_obs = self.TEST_SERVER_TASK_SIMPLE[2]
+        task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
+        n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"]
+        n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"]
         run = self._run_and_upload_classification(
             clf=randomsearch,
             task_id=task_id,
             n_missing_vals=n_missing_vals,
             n_test_obs=n_test_obs,
-            flow_expected_rsv='12172',
+            flow_expected_rsv="12172",
         )
-        self.assertEqual(len(run.trace.trace_iterations), 5)
+        assert len(run.trace.trace_iterations) == 5
+        trace = openml.runs.get_run_trace(run.run_id)
+        assert len(trace.trace_iterations) == 5
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_and_upload_maskedarrays(self):
         # This testcase is important for 2 reasons:
         # 1) it verifies the correct handling of masked arrays (not all
@@ -614,24 +863,27 @@ def test_run_and_upload_maskedarrays(self):
         # 2) it verifies the correct handling of a 2-layered grid search
         gridsearch = GridSearchCV(
             RandomForestClassifier(n_estimators=5),
-            [
-                {'max_features': [2, 4]},
-                {'min_samples_leaf': [1, 10]}
-            ],
-            cv=StratifiedKFold(n_splits=2, shuffle=True)
+            [{"max_features": [2, 4]}, {"min_samples_leaf": [1, 10]}],
+            cv=StratifiedKFold(n_splits=2, shuffle=True),
         )
         # The random states for the GridSearchCV is set after the
         # random state of the RandomForestClassifier is set, therefore,
         # it has a different value than the other examples before
-        task_id = self.TEST_SERVER_TASK_SIMPLE[0]
-        n_missing_vals = self.TEST_SERVER_TASK_SIMPLE[1]
-        n_test_obs = self.TEST_SERVER_TASK_SIMPLE[2]
-        self._run_and_upload_classification(gridsearch, task_id,
-                                            n_missing_vals, n_test_obs,
-                                            '12172')
+        task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
+        n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"]
+        n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"]
+        self._run_and_upload_classification(
+            gridsearch,
+            task_id,
+            n_missing_vals,
+            n_test_obs,
+            "12172",
+        )
 
     ##########################################################################
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_learning_curve_task_1(self):
         task_id = 801  # diabates dataset
         num_test_instances = 6144  # for learning curve
@@ -640,15 +892,23 @@ def test_learning_curve_task_1(self):
         num_folds = 10
         num_samples = 8
 
-        pipeline1 = Pipeline(steps=[('scaler',
-                                     StandardScaler(with_mean=False)),
-                                    ('dummy',
-                                     DummyClassifier(strategy='prior'))])
-        run = self._perform_run(task_id, num_test_instances, num_missing_vals,
-                                pipeline1, flow_expected_rsv='62501')
-        self._check_sample_evaluations(run.sample_evaluations, num_repeats,
-                                       num_folds, num_samples)
+        pipeline1 = Pipeline(
+            steps=[
+                ("scaler", StandardScaler(with_mean=False)),
+                ("dummy", DummyClassifier(strategy="prior")),
+            ],
+        )
+        run = self._perform_run(
+            task_id,
+            num_test_instances,
+            num_missing_vals,
+            pipeline1,
+            flow_expected_rsv="62501",
+        )
+        self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples)
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_learning_curve_task_2(self):
         task_id = 801  # diabates dataset
         num_test_instances = 6144  # for learning curve
@@ -657,138 +917,190 @@ def test_learning_curve_task_2(self):
         num_folds = 10
         num_samples = 8
 
-        pipeline2 = Pipeline(steps=[('Imputer', SimpleImputer(strategy='median')),
-                                    ('VarianceThreshold', VarianceThreshold()),
-                                    ('Estimator', RandomizedSearchCV(
-                                        DecisionTreeClassifier(),
-                                        {'min_samples_split':
-                                         [2 ** x for x in range(1, 8)],
-                                         'min_samples_leaf':
-                                         [2 ** x for x in range(0, 7)]},
-                                        cv=3, n_iter=10))])
-        run = self._perform_run(task_id, num_test_instances, num_missing_vals,
-                                pipeline2, flow_expected_rsv='62501')
-        self._check_sample_evaluations(run.sample_evaluations, num_repeats,
-                                       num_folds, num_samples)
-
+        pipeline2 = Pipeline(
+            steps=[
+                ("Imputer", SimpleImputer(strategy="median")),
+                ("VarianceThreshold", VarianceThreshold()),
+                (
+                    "Estimator",
+                    RandomizedSearchCV(
+                        DecisionTreeClassifier(),
+                        {
+                            "min_samples_split": [2**x for x in range(1, 8)],
+                            "min_samples_leaf": [2**x for x in range(7)],
+                        },
+                        cv=3,
+                        n_iter=10,
+                    ),
+                ),
+            ],
+        )
+        run = self._perform_run(
+            task_id,
+            num_test_instances,
+            num_missing_vals,
+            pipeline2,
+            flow_expected_rsv="62501",
+        )
+        self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples)
+
+    @pytest.mark.sklearn()
+    @unittest.skipIf(
+        Version(sklearn.__version__) < Version("0.21"),
+        reason="Pipelines don't support indexing (used for the assert check)",
+    )
+    @pytest.mark.test_server()
     def test_initialize_cv_from_run(self):
-        randomsearch = RandomizedSearchCV(
-            RandomForestClassifier(n_estimators=5),
-            {"max_depth": [3, None],
-             "max_features": [1, 2, 3, 4],
-             "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
-             "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-             "bootstrap": [True, False],
-             "criterion": ["gini", "entropy"]},
-            cv=StratifiedKFold(n_splits=2, shuffle=True),
-            n_iter=2)
+        randomsearch = Pipeline(
+            [
+                ("enc", OneHotEncoder(handle_unknown="ignore")),
+                (
+                    "rs",
+                    RandomizedSearchCV(
+                        RandomForestClassifier(n_estimators=5),
+                        {
+                            "max_depth": [3, None],
+                            "max_features": [1, 2, 3, 4],
+                            "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
+                            "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+                            "bootstrap": [True, False],
+                            "criterion": ["gini", "entropy"],
+                        },
+                        cv=StratifiedKFold(n_splits=2, shuffle=True),
+                        n_iter=2,
+                    ),
+                ),
+            ],
+        )
 
-        task = openml.tasks.get_task(11)
+        task = openml.tasks.get_task(11)  # kr-vs-kp; holdout
         run = openml.runs.run_model_on_task(
             model=randomsearch,
             task=task,
-            avoid_duplicate_runs=False,
             seed=1,
         )
         run_ = run.publish()
-        TestBase._mark_entity_for_removal('run', run.run_id)
-        TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id))
+        TestBase._mark_entity_for_removal("run", run.run_id)
+        TestBase.logger.info(f"collected from test_run_functions: {run.run_id}")
         run = openml.runs.get_run(run_.run_id)
 
         modelR = openml.runs.initialize_model_from_run(run_id=run.run_id)
         modelS = openml.setups.initialize_model(setup_id=run.setup_id)
 
-        self.assertEqual(modelS.cv.random_state, 62501)
-        self.assertEqual(modelR.cv.random_state, 62501)
+        assert modelS[-1].cv.random_state == 62501
+        assert modelR[-1].cv.random_state == 62501
 
     def _test_local_evaluations(self, run):
-
         # compare with the scores in user defined measures
         accuracy_scores_provided = []
-        for rep in run.fold_evaluations['predictive_accuracy'].keys():
-            for fold in run.fold_evaluations['predictive_accuracy'][rep].\
-                    keys():
+        for rep in run.fold_evaluations["predictive_accuracy"]:
+            for fold in run.fold_evaluations["predictive_accuracy"][rep]:
                 accuracy_scores_provided.append(
-                    run.fold_evaluations['predictive_accuracy'][rep][fold])
+                    run.fold_evaluations["predictive_accuracy"][rep][fold],
+                )
         accuracy_scores = run.get_metric_fn(sklearn.metrics.accuracy_score)
-        np.testing.assert_array_almost_equal(accuracy_scores_provided,
-                                             accuracy_scores)
+        np.testing.assert_array_almost_equal(accuracy_scores_provided, accuracy_scores)
 
         # also check if we can obtain some other scores:
-        tests = [(sklearn.metrics.cohen_kappa_score, {'weights': None}),
-                 (sklearn.metrics.roc_auc_score, {}),
-                 (sklearn.metrics.average_precision_score, {}),
-                 (sklearn.metrics.jaccard_similarity_score, {}),
-                 (sklearn.metrics.precision_score, {'average': 'macro'}),
-                 (sklearn.metrics.brier_score_loss, {})]
-        for test_idx, test in enumerate(tests):
+        tests = [
+            (sklearn.metrics.cohen_kappa_score, {"weights": None}),
+            (sklearn.metrics.roc_auc_score, {}),
+            (sklearn.metrics.average_precision_score, {}),
+            (sklearn.metrics.precision_score, {"average": "macro"}),
+            (sklearn.metrics.brier_score_loss, {}),
+        ]
+        if Version(sklearn.__version__) < Version("0.23"):
+            tests.append((sklearn.metrics.jaccard_similarity_score, {}))
+        else:
+            tests.append((sklearn.metrics.jaccard_score, {}))
+        for _test_idx, test in enumerate(tests):
             alt_scores = run.get_metric_fn(
                 sklearn_fn=test[0],
                 kwargs=test[1],
             )
-            self.assertEqual(len(alt_scores), 10)
+            assert len(alt_scores) == 10
             for idx in range(len(alt_scores)):
-                self.assertGreaterEqual(alt_scores[idx], 0)
-                self.assertLessEqual(alt_scores[idx], 1)
+                assert alt_scores[idx] >= 0
+                assert alt_scores[idx] <= 1
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_local_run_swapped_parameter_order_model(self):
+        clf = DecisionTreeClassifier()
+        australian_task = 595  # Australian; crossvalidation
+        task = openml.tasks.get_task(australian_task)
 
-        # construct sci-kit learn classifier
-        clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
-                              ('estimator', RandomForestClassifier())])
-
-        # download task
-        task = openml.tasks.get_task(7)
-
-        # invoke OpenML run
+        # task and clf are purposely in the old order
         run = openml.runs.run_model_on_task(
-            task, clf,
-            avoid_duplicate_runs=False,
+            task,
+            clf,
             upload_flow=False,
         )
 
         self._test_local_evaluations(run)
 
+    @pytest.mark.sklearn()
+    @pytest.mark.skip("https://github.com/openml/openml-python/issues/1586")
+    @unittest.skipIf(
+        Version(sklearn.__version__) < Version("0.20"),
+        reason="SimpleImputer doesn't handle mixed type DataFrame as input",
+    )
+    @pytest.mark.test_server()
     def test_local_run_swapped_parameter_order_flow(self):
-
         # construct sci-kit learn classifier
-        clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
-                              ('estimator', RandomForestClassifier())])
+        clf = Pipeline(
+            steps=[
+                ("imputer", SimpleImputer(strategy="most_frequent")),
+                ("encoder", OneHotEncoder(handle_unknown="ignore")),
+                ("estimator", RandomForestClassifier(n_estimators=10)),
+            ],
+        )
 
         flow = self.extension.model_to_flow(clf)
         # download task
-        task = openml.tasks.get_task(7)
+        task = openml.tasks.get_task(7)  # kr-vs-kp; crossvalidation
 
         # invoke OpenML run
         run = openml.runs.run_flow_on_task(
-            task, flow,
-            avoid_duplicate_runs=False,
+            task,
+            flow,
             upload_flow=False,
         )
 
         self._test_local_evaluations(run)
 
+    @pytest.mark.skip(reason="https://github.com/openml/openml-python/issues/1586")
+    @pytest.mark.sklearn()
+    @unittest.skipIf(
+        Version(sklearn.__version__) < Version("0.20"),
+        reason="SimpleImputer doesn't handle mixed type DataFrame as input",
+    )
+    @pytest.mark.test_server()
     def test_local_run_metric_score(self):
-
         # construct sci-kit learn classifier
-        clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
-                              ('estimator', RandomForestClassifier())])
+        clf = Pipeline(
+            steps=[
+                ("imputer", SimpleImputer(strategy="most_frequent")),
+                ("encoder", OneHotEncoder(handle_unknown="ignore")),
+                ("estimator", RandomForestClassifier(n_estimators=10)),
+            ],
+        )
 
         # download task
-        task = openml.tasks.get_task(7)
+        task = openml.tasks.get_task(7)  # kr-vs-kp; crossvalidation
 
         # invoke OpenML run
         run = openml.runs.run_model_on_task(
             model=clf,
             task=task,
-            avoid_duplicate_runs=False,
             upload_flow=False,
         )
 
         self._test_local_evaluations(run)
 
+    @pytest.mark.production_server()
     def test_online_run_metric_score(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         # important to use binary classification task,
         # due to assertions
@@ -796,20 +1108,54 @@ def test_online_run_metric_score(self):
 
         self._test_local_evaluations(run)
 
+    @pytest.mark.sklearn()
+    @unittest.skipIf(
+        Version(sklearn.__version__) < Version("0.20"),
+        reason="SimpleImputer doesn't handle mixed type DataFrame as input",
+    )
+    @pytest.mark.test_server()
     def test_initialize_model_from_run(self):
-        clf = sklearn.pipeline.Pipeline(steps=[
-            ('Imputer', SimpleImputer(strategy='median')),
-            ('VarianceThreshold', VarianceThreshold(threshold=0.05)),
-            ('Estimator', GaussianNB())])
-        task = openml.tasks.get_task(11)
+        clf = sklearn.pipeline.Pipeline(
+            steps=[
+                ("Imputer", SimpleImputer(strategy="most_frequent")),
+                ("VarianceThreshold", VarianceThreshold(threshold=0.05)),
+                ("Estimator", GaussianNB()),
+            ],
+        )
+        task_meta_data = {
+            "task_type": TaskType.SUPERVISED_CLASSIFICATION,
+            "dataset_id": 128,  # iris
+            "estimation_procedure_id": 1,
+            "target_name": "class",
+        }
+        _task_id = check_task_existence(**task_meta_data)
+        if _task_id is not None:
+            task_id = _task_id
+        else:
+            new_task = openml.tasks.create_task(**task_meta_data)
+            # publishes the new task
+            try:
+                new_task = new_task.publish()
+                task_id = new_task.task_id
+            except OpenMLServerException as e:
+                if e.code == 614:  # Task already exists
+                    # the exception message contains the task_id that was matched in the format
+                    # 'Task already exists. - matched id(s): [xxxx]'
+                    task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0]
+                else:
+                    raise Exception(repr(e))
+            # mark to remove the uploaded task
+            TestBase._mark_entity_for_removal("task", task_id)
+            TestBase.logger.info(f"collected from test_run_functions: {task_id}")
+
+        task = openml.tasks.get_task(task_id)
         run = openml.runs.run_model_on_task(
             model=clf,
             task=task,
-            avoid_duplicate_runs=False,
         )
         run_ = run.publish()
-        TestBase._mark_entity_for_removal('run', run_.run_id)
-        TestBase.logger.info("collected from test_run_functions: {}".format(run_.run_id))
+        TestBase._mark_entity_for_removal("run", run_.run_id)
+        TestBase.logger.info(f"collected from test_run_functions: {run_.run_id}")
         run = openml.runs.get_run(run_.run_id)
 
         modelR = openml.runs.initialize_model_from_run(run_id=run.run_id)
@@ -821,79 +1167,38 @@ def test_initialize_model_from_run(self):
         openml.flows.assert_flows_equal(flowR, flowL)
         openml.flows.assert_flows_equal(flowS, flowL)
 
-        self.assertEqual(flowS.components['Imputer'].
-                         parameters['strategy'], '"median"')
-        self.assertEqual(flowS.components['VarianceThreshold'].
-                         parameters['threshold'], '0.05')
-
-    def test_get_run_trace(self):
-        # get_run_trace is already tested implicitly in test_run_and_publish
-        # this test is a bit additional.
-        num_iterations = 10
-        num_folds = 1
-        task_id = 119
-
-        task = openml.tasks.get_task(task_id)
-
-        # IMPORTANT! Do not sentinel this flow. is faster if we don't wait
-        # on openml server
-        clf = RandomizedSearchCV(RandomForestClassifier(random_state=42,
-                                                        n_estimators=5),
-
-                                 {"max_depth": [3, None],
-                                  "max_features": [1, 2, 3, 4],
-                                  "bootstrap": [True, False],
-                                  "criterion": ["gini", "entropy"]},
-                                 num_iterations, random_state=42, cv=3)
-
-        # [SPEED] make unit test faster by exploiting run information
-        # from the past
-        try:
-            # in case the run did not exists yet
-            run = openml.runs.run_model_on_task(
-                model=clf,
-                task=task,
-                avoid_duplicate_runs=True,
-            )
-
-            self.assertEqual(
-                len(run.trace.trace_iterations),
-                num_iterations * num_folds,
-            )
-            run = run.publish()
-            TestBase._mark_entity_for_removal('run', run.run_id)
-            TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id))
-            self._wait_for_processed_run(run.run_id, 200)
-            run_id = run.run_id
-        except openml.exceptions.OpenMLRunsExistError as e:
-            # The only error we expect, should fail otherwise.
-            run_ids = [int(run_id) for run_id in e.run_ids]
-            self.assertGreater(len(run_ids), 0)
-            run_id = random.choice(list(run_ids))
-
-        # now the actual unit test ...
-        run_trace = openml.runs.get_run_trace(run_id)
-        self.assertEqual(len(run_trace.trace_iterations), num_iterations * num_folds)
+        assert flowS.components["Imputer"].parameters["strategy"] == '"most_frequent"'
+        assert flowS.components["VarianceThreshold"].parameters["threshold"] == "0.05"
 
+    @pytest.mark.sklearn()
+    @unittest.skipIf(
+        Version(sklearn.__version__) < Version("0.20"),
+        reason="SimpleImputer doesn't handle mixed type DataFrame as input",
+    )
+    @pytest.mark.test_server()
     def test__run_exists(self):
         # would be better to not sentinel these clfs,
         # so we do not have to perform the actual runs
         # and can just check their status on line
         rs = 1
         clfs = [
-            sklearn.pipeline.Pipeline(steps=[
-                ('Imputer', SimpleImputer(strategy='mean')),
-                ('VarianceThreshold', VarianceThreshold(threshold=0.05)),
-                ('Estimator', DecisionTreeClassifier(max_depth=4))
-            ]),
-            sklearn.pipeline.Pipeline(steps=[
-                ('Imputer', SimpleImputer(strategy='most_frequent')),
-                ('VarianceThreshold', VarianceThreshold(threshold=0.1)),
-                ('Estimator', DecisionTreeClassifier(max_depth=4))]
-            )
+            sklearn.pipeline.Pipeline(
+                steps=[
+                    ("Imputer", SimpleImputer(strategy="mean")),
+                    ("VarianceThreshold", VarianceThreshold(threshold=0.05)),
+                    ("Estimator", DecisionTreeClassifier(max_depth=4)),
+                ],
+            ),
+            sklearn.pipeline.Pipeline(
+                steps=[
+                    ("Imputer", SimpleImputer(strategy="most_frequent")),
+                    ("VarianceThreshold", VarianceThreshold(threshold=0.1)),
+                    ("Estimator", DecisionTreeClassifier(max_depth=4)),
+                ],
+            ),
         ]
 
-        task = openml.tasks.get_task(115)
+        task = openml.tasks.get_task(115)  # diabetes; crossvalidation
 
         for clf in clfs:
             try:
@@ -904,48 +1209,53 @@ def test__run_exists(self):
                     task=task,
                     seed=rs,
                     avoid_duplicate_runs=True,
-                    upload_flow=True
+                    upload_flow=True,
                 )
                 run.publish()
-                TestBase._mark_entity_for_removal('run', run.run_id)
-                TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id))
+                TestBase._mark_entity_for_removal("run", run.run_id)
+                TestBase.logger.info(f"collected from test_run_functions: {run.run_id}")
             except openml.exceptions.PyOpenMLError:
                 # run already existed. Great.
                 pass
 
             flow = self.extension.model_to_flow(clf)
             flow_exists = openml.flows.flow_exists(flow.name, flow.external_version)
-            self.assertGreater(flow_exists, 0)
+            assert flow_exists > 0, "Server says flow from run does not exist."
             # Do NOT use get_flow reinitialization, this potentially sets
             # hyperparameter values wrong. Rather use the local model.
             downloaded_flow = openml.flows.get_flow(flow_exists)
             downloaded_flow.model = clf
             setup_exists = openml.setups.setup_exists(downloaded_flow)
-            self.assertGreater(setup_exists, 0)
+            assert setup_exists > 0, "Server says setup of run does not exist."
             run_ids = run_exists(task.task_id, setup_exists)
-            self.assertTrue(run_ids, msg=(run_ids, clf))
+            assert run_ids, (run_ids, clf)
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_with_illegal_flow_id(self):
         # check the case where the user adds an illegal flow id to a
-        # non-existing flow
-        task = openml.tasks.get_task(115)
+        # non-existing flow
+        task = openml.tasks.get_task(115)  # diabetes; crossvalidation
         clf = DecisionTreeClassifier()
         flow = self.extension.model_to_flow(clf)
         flow, _ = self._add_sentinel_to_flow_name(flow, None)
         flow.flow_id = -1
-        expected_message_regex = ("Flow does not exist on the server, "
-                                  "but 'flow.flow_id' is not None.")
-        with self.assertRaisesRegex(openml.exceptions.PyOpenMLError, expected_message_regex):
+        expected_message_regex = (
+            r"Flow does not exist on the server, but 'flow.flow_id' is not None."
+        )
+        with pytest.raises(openml.exceptions.PyOpenMLError, match=expected_message_regex):
             openml.runs.run_flow_on_task(
                 task=task,
                 flow=flow,
                 avoid_duplicate_runs=True,
             )
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_with_illegal_flow_id_after_load(self):
         # Same as `test_run_with_illegal_flow_id`, but test this error is also
         # caught if the run is stored to and loaded from disk first.
-        task = openml.tasks.get_task(115)
+        task = openml.tasks.get_task(115)  # diabetes; crossvalidation
         clf = DecisionTreeClassifier()
         flow = self.extension.model_to_flow(clf)
         flow, _ = self._add_sentinel_to_flow_name(flow, None)
@@ -953,62 +1263,63 @@ def test_run_with_illegal_flow_id_after_load(self):
         run = openml.runs.run_flow_on_task(
             task=task,
             flow=flow,
-            avoid_duplicate_runs=False,
-            upload_flow=False
+            upload_flow=False,
         )
 
         cache_path = os.path.join(
             self.workdir,
-            'runs',
+            "runs",
             str(random.getrandbits(128)),
         )
         run.to_filesystem(cache_path)
         loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path)
 
-        expected_message_regex = ("Flow does not exist on the server, "
-                                  "but 'flow.flow_id' is not None.")
-        with self.assertRaisesRegex(openml.exceptions.PyOpenMLError, expected_message_regex):
+        expected_message_regex = (
+            r"Flow does not exist on the server, but 'flow.flow_id' is not None."
+        )
+        with pytest.raises(openml.exceptions.PyOpenMLError, match=expected_message_regex):
             loaded_run.publish()
-            TestBase._mark_entity_for_removal('run', loaded_run.run_id)
-            TestBase.logger.info("collected from test_run_functions: {}".format(loaded_run.run_id))
+            TestBase._mark_entity_for_removal("run", loaded_run.run_id)
+            TestBase.logger.info(f"collected from test_run_functions: {loaded_run.run_id}")
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_with_illegal_flow_id_1(self):
         # Check the case where the user adds an illegal flow id to an existing
         # flow. Comes to a different value error than the previous test
-        task = openml.tasks.get_task(115)
+        task = openml.tasks.get_task(115)  # diabetes; crossvalidation
         clf = DecisionTreeClassifier()
         flow_orig = self.extension.model_to_flow(clf)
         try:
             flow_orig.publish()  # ensures flow exist on server
-            TestBase._mark_entity_for_removal('flow', (flow_orig.flow_id, flow_orig.name))
-            TestBase.logger.info("collected from test_run_functions: {}".format(flow_orig.flow_id))
+            TestBase._mark_entity_for_removal("flow", flow_orig.flow_id, flow_orig.name)
+            TestBase.logger.info(f"collected from test_run_functions: {flow_orig.flow_id}")
         except openml.exceptions.OpenMLServerException:
             # flow already exists
             pass
         flow_new = self.extension.model_to_flow(clf)
 
         flow_new.flow_id = -1
-        expected_message_regex = (
-            "Local flow_id does not match server flow_id: "
-            "'-1' vs '[0-9]+'"
-        )
-        with self.assertRaisesRegex(openml.exceptions.PyOpenMLError, expected_message_regex):
+        expected_message_regex = "Local flow_id does not match server flow_id: '-1' vs '[0-9]+'"
+        with pytest.raises(openml.exceptions.PyOpenMLError, match=expected_message_regex):
             openml.runs.run_flow_on_task(
                 task=task,
                 flow=flow_new,
                 avoid_duplicate_runs=True,
             )
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_with_illegal_flow_id_1_after_load(self):
         # Same as `test_run_with_illegal_flow_id_1`, but test this error is
         # also caught if the run is stored to and loaded from disk first.
-        task = openml.tasks.get_task(115)
+        task = openml.tasks.get_task(115)  # diabetes; crossvalidation
         clf = DecisionTreeClassifier()
         flow_orig = self.extension.model_to_flow(clf)
         try:
             flow_orig.publish()  # ensures flow exist on server
-            TestBase._mark_entity_for_removal('flow', (flow_orig.flow_id, flow_orig.name))
-            TestBase.logger.info("collected from test_run_functions: {}".format(flow_orig.flow_id))
+            TestBase._mark_entity_for_removal("flow", flow_orig.flow_id, flow_orig.name)
+            TestBase.logger.info(f"collected from test_run_functions: {flow_orig.flow_id}")
         except openml.exceptions.OpenMLServerException:
             # flow already exists
             pass
@@ -1018,39 +1329,42 @@ def test_run_with_illegal_flow_id_1_after_load(self):
         run = openml.runs.run_flow_on_task(
             task=task,
             flow=flow_new,
-            avoid_duplicate_runs=False,
-            upload_flow=False
+            upload_flow=False,
         )
 
         cache_path = os.path.join(
             self.workdir,
-            'runs',
+            "runs",
             str(random.getrandbits(128)),
         )
         run.to_filesystem(cache_path)
         loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path)
 
-        expected_message_regex = (
-            "Local flow_id does not match server flow_id: "
-            "'-1' vs '[0-9]+'"
-        )
+        expected_message_regex = "Local flow_id does not match server flow_id: '-1' vs '[0-9]+'"
         self.assertRaisesRegex(
             openml.exceptions.PyOpenMLError,
             expected_message_regex,
-            loaded_run.publish
+            loaded_run.publish,
         )
 
+    @pytest.mark.sklearn()
+    @unittest.skipIf(
+        Version(sklearn.__version__) < Version("0.20"),
+        reason="OneHotEncoder cannot handle mixed type DataFrame as input",
+    )
+    @pytest.mark.test_server()
     def test__run_task_get_arffcontent(self):
-        task = openml.tasks.get_task(7)
+        task = openml.tasks.get_task(7)  # kr-vs-kp; crossvalidation
         num_instances = 3196
         num_folds = 10
         num_repeats = 1
+        loss = "log" if Version(sklearn.__version__) < Version("1.3") else "log_loss"
 
-        flow = unittest.mock.Mock()
-        flow.name = 'dummy'
-        clf = SGDClassifier(loss='log', random_state=1)
+        clf = make_pipeline(
+            OneHotEncoder(handle_unknown="ignore"),
+            SGDClassifier(loss=loss, random_state=1),
+        )
         res = openml.runs.functions._run_task_get_arffcontent(
-            flow=flow,
             extension=self.extension,
             model=clf,
             task=task,
@@ -1058,204 +1372,280 @@ def test__run_task_get_arffcontent(self):
         )
         arff_datacontent, trace, fold_evaluations, _ = res
         # predictions
-        self.assertIsInstance(arff_datacontent, list)
+        assert isinstance(arff_datacontent, list)
         # trace. SGD does not produce any
-        self.assertIsInstance(trace, type(None))
+        assert isinstance(trace, type(None))
 
-        task_type = TaskTypeEnum.SUPERVISED_CLASSIFICATION
-        self._check_fold_timing_evaluations(fold_evaluations, num_repeats, num_folds,
-                                            task_type=task_type)
+        task_type = TaskType.SUPERVISED_CLASSIFICATION
+        self._check_fold_timing_evaluations(
+            fold_evaluations=fold_evaluations,
+            num_repeats=num_repeats,
+            num_folds=num_folds,
+            task_type=task_type,
+        )
 
         # 10 times 10 fold CV of 150 samples
-        self.assertEqual(len(arff_datacontent), num_instances * num_repeats)
+        assert len(arff_datacontent) == num_instances * num_repeats
         for arff_line in arff_datacontent:
             # check number columns
-            self.assertEqual(len(arff_line), 8)
+            assert len(arff_line) == 8
             # check repeat
-            self.assertGreaterEqual(arff_line[0], 0)
-            self.assertLessEqual(arff_line[0], num_repeats - 1)
+            assert arff_line[0] >= 0
+            assert arff_line[0] <= num_repeats - 1
             # check fold
-            self.assertGreaterEqual(arff_line[1], 0)
-            self.assertLessEqual(arff_line[1], num_folds - 1)
+            assert arff_line[1] >= 0
+            assert arff_line[1] <= num_folds - 1
             # check row id
-            self.assertGreaterEqual(arff_line[2], 0)
-            self.assertLessEqual(arff_line[2], num_instances - 1)
+            assert arff_line[2] >= 0
+            assert arff_line[2] <= num_instances - 1
+            # check prediction and ground truth columns
+            assert arff_line[4] in ["won", "nowin"]
+            assert arff_line[5] in ["won", "nowin"]
             # check confidences
-            self.assertAlmostEqual(sum(arff_line[4:6]), 1.0)
-            self.assertIn(arff_line[6], ['won', 'nowin'])
-            self.assertIn(arff_line[7], ['won', 'nowin'])
+            self.assertAlmostEqual(sum(arff_line[6:]), 1.0)
 
     def test__create_trace_from_arff(self):
-        with open(self.static_cache_dir + '/misc/trace.arff',
-                  'r') as arff_file:
+        with open(self.static_cache_dir / "misc" / "trace.arff") as arff_file:
             trace_arff = arff.load(arff_file)
         OpenMLRunTrace.trace_from_arff(trace_arff)
 
+    @pytest.mark.production_server()
     def test_get_run(self):
         # this run is not available on test
-        openml.config.server = self.production_server
+        self.use_production_server()
         run = openml.runs.get_run(473351)
-        self.assertEqual(run.dataset_id, 357)
-        self.assertEqual(run.evaluations['f_measure'], 0.841225)
-        for i, value in [(0, 0.840918),
-                         (1, 0.839458),
-                         (2, 0.839613),
-                         (3, 0.842571),
-                         (4, 0.839567),
-                         (5, 0.840922),
-                         (6, 0.840985),
-                         (7, 0.847129),
-                         (8, 0.84218),
-                         (9, 0.844014)]:
-            self.assertEqual(run.fold_evaluations['f_measure'][0][i], value)
-        assert ('weka' in run.tags)
-        assert ('weka_3.7.12' in run.tags)
+        assert run.dataset_id == 357
+        assert run.evaluations["f_measure"] == 0.841225
+        for i, value in [
+            (0, 0.840918),
+            (1, 0.839458),
+            (2, 0.839613),
+            (3, 0.842571),
+            (4, 0.839567),
+            (5, 0.840922),
+            (6, 0.840985),
+            (7, 0.847129),
+            (8, 0.84218),
+            (9, 0.844014),
+        ]:
+            assert run.fold_evaluations["f_measure"][0][i] == value
+        assert "weka" in run.tags
+        assert "weka_3.7.12" in run.tags
+        assert run.predictions_url.endswith(
+            "/data/download/1667125/weka_generated_predictions4575715871712251329.arff"
+        )
 
     def _check_run(self, run):
-        self.assertIsInstance(run, dict)
-        self.assertEqual(len(run), 7)
-
+        # This tests that the API returns eight entries for each run
+        # Check out https://openml.org/api/v1/xml/run/list/flow/1154
+        # Seven are run_id, task_id, task_type_id, setup_id, flow_id, uploader, upload_time;
+        # the eighth is presumably error_message (TODO confirm). run_details exists, too, but
+        # is not used so far. We need to update this check once it is used!
+        assert isinstance(run, dict)
+        assert len(run) == 8, str(run)
+
+    @pytest.mark.production_server()
     def test_get_runs_list(self):
         # TODO: comes from live, no such lists on test
-        openml.config.server = self.production_server
-        runs = openml.runs.list_runs(id=[2], show_errors=True)
-        self.assertEqual(len(runs), 1)
-        for rid in runs:
-            self._check_run(runs[rid])
+        self.use_production_server()
+        runs = openml.runs.list_runs(id=[2], display_errors=True)
+        assert len(runs) == 1
+        for run in runs.to_dict(orient="index").values():
+            self._check_run(run)
 
+    @pytest.mark.test_server()
     def test_list_runs_empty(self):
         runs = openml.runs.list_runs(task=[0])
-        if len(runs) > 0:
-            raise ValueError('UnitTest Outdated, got somehow results')
-
-        self.assertIsInstance(runs, dict)
-
-    def test_list_runs_output_format(self):
-        runs = openml.runs.list_runs(size=1000, output_format='dataframe')
-        self.assertIsInstance(runs, pd.DataFrame)
+        assert runs.empty
 
+    @pytest.mark.production_server()
     def test_get_runs_list_by_task(self):
         # TODO: comes from live, no such lists on test
-        openml.config.server = self.production_server
+        self.use_production_server()
         task_ids = [20]
         runs = openml.runs.list_runs(task=task_ids)
-        self.assertGreaterEqual(len(runs), 590)
-        for rid in runs:
-            self.assertIn(runs[rid]['task_id'], task_ids)
-            self._check_run(runs[rid])
+        assert len(runs) >= 590
+        for run in runs.to_dict(orient="index").values():
+            assert run["task_id"] in task_ids
+            self._check_run(run)
         num_runs = len(runs)
 
         task_ids.append(21)
         runs = openml.runs.list_runs(task=task_ids)
-        self.assertGreaterEqual(len(runs), num_runs + 1)
-        for rid in runs:
-            self.assertIn(runs[rid]['task_id'], task_ids)
-            self._check_run(runs[rid])
+        assert len(runs) >= num_runs + 1
+        for run in runs.to_dict(orient="index").values():
+            assert run["task_id"] in task_ids
+            self._check_run(run)
 
+    @pytest.mark.production_server()
     def test_get_runs_list_by_uploader(self):
         # TODO: comes from live, no such lists on test
-        openml.config.server = self.production_server
+        self.use_production_server()
         # 29 is Dominik Kirchhoff
         uploader_ids = [29]
 
         runs = openml.runs.list_runs(uploader=uploader_ids)
-        self.assertGreaterEqual(len(runs), 2)
-        for rid in runs:
-            self.assertIn(runs[rid]['uploader'], uploader_ids)
-            self._check_run(runs[rid])
+        assert len(runs) >= 2
+        for run in runs.to_dict(orient="index").values():
+            assert run["uploader"] in uploader_ids
+            self._check_run(run)
         num_runs = len(runs)
 
         uploader_ids.append(274)
 
         runs = openml.runs.list_runs(uploader=uploader_ids)
-        self.assertGreaterEqual(len(runs), num_runs + 1)
-        for rid in runs:
-            self.assertIn(runs[rid]['uploader'], uploader_ids)
-            self._check_run(runs[rid])
+        assert len(runs) >= num_runs + 1
+        for run in runs.to_dict(orient="index").values():
+            assert run["uploader"] in uploader_ids
+            self._check_run(run)
 
+    @pytest.mark.production_server()
     def test_get_runs_list_by_flow(self):
         # TODO: comes from live, no such lists on test
-        openml.config.server = self.production_server
+        self.use_production_server()
         flow_ids = [1154]
         runs = openml.runs.list_runs(flow=flow_ids)
-        self.assertGreaterEqual(len(runs), 1)
-        for rid in runs:
-            self.assertIn(runs[rid]['flow_id'], flow_ids)
-            self._check_run(runs[rid])
+        assert len(runs) >= 1
+        for run in runs.to_dict(orient="index").values():
+            assert run["flow_id"] in flow_ids
+            self._check_run(run)
         num_runs = len(runs)
 
         flow_ids.append(1069)
         runs = openml.runs.list_runs(flow=flow_ids)
-        self.assertGreaterEqual(len(runs), num_runs + 1)
-        for rid in runs:
-            self.assertIn(runs[rid]['flow_id'], flow_ids)
-            self._check_run(runs[rid])
+        assert len(runs) >= num_runs + 1
+        for run in runs.to_dict(orient="index").values():
+            assert run["flow_id"] in flow_ids
+            self._check_run(run)
 
+    @pytest.mark.production_server()
     def test_get_runs_pagination(self):
         # TODO: comes from live, no such lists on test
-        openml.config.server = self.production_server
+        self.use_production_server()
         uploader_ids = [1]
         size = 10
         max = 100
         for i in range(0, max, size):
-            runs = openml.runs.list_runs(offset=i, size=size,
-                                         uploader=uploader_ids)
-            self.assertGreaterEqual(size, len(runs))
-            for rid in runs:
-                self.assertIn(runs[rid]["uploader"], uploader_ids)
+            runs = openml.runs.list_runs(offset=i, size=size, uploader=uploader_ids)
+            assert size >= len(runs)
+            for run in runs.to_dict(orient="index").values():
+                assert run["uploader"] in uploader_ids
 
+    @pytest.mark.production_server()
     def test_get_runs_list_by_filters(self):
         # TODO: comes from live, no such lists on test
-        openml.config.server = self.production_server
+        self.use_production_server()
         ids = [505212, 6100]
         tasks = [2974, 339]
         uploaders_1 = [1, 2]
         uploaders_2 = [29, 274]
         flows = [74, 1718]
 
-        '''
+        """
         Since the results are taken by batch size, the function does not
         throw an OpenMLServerError anymore. Instead it throws a
         TimeOutException. For the moment commented out.
-        '''
+        """
         # self.assertRaises(openml.exceptions.OpenMLServerError,
         # openml.runs.list_runs)
 
         runs = openml.runs.list_runs(id=ids)
-        self.assertEqual(len(runs), 2)
+        assert len(runs) == 2
 
         runs = openml.runs.list_runs(task=tasks)
-        self.assertGreaterEqual(len(runs), 2)
+        assert len(runs) >= 2
 
         runs = openml.runs.list_runs(uploader=uploaders_2)
-        self.assertGreaterEqual(len(runs), 10)
+        assert len(runs) >= 10
 
         runs = openml.runs.list_runs(flow=flows)
-        self.assertGreaterEqual(len(runs), 100)
+        assert len(runs) >= 100
 
-        runs = openml.runs.list_runs(id=ids, task=tasks, uploader=uploaders_1)
+        runs = openml.runs.list_runs(
+            id=ids,
+            task=tasks,
+            uploader=uploaders_1,
+        )
+        assert len(runs) == 2
 
-    @unittest.skip("API currently broken: https://github.com/openml/OpenML/issues/948")
+    @pytest.mark.production_server()
+    @pytest.mark.xfail(reason="failures_issue_1544", strict=False)
     def test_get_runs_list_by_tag(self):
-        # TODO: comes from live, no such lists on test
-        openml.config.server = self.production_server
-        runs = openml.runs.list_runs(tag='curves')
-        self.assertGreaterEqual(len(runs), 1)
-
-    def test_run_on_dataset_with_missing_labels(self):
+        # We don't have tagged runs on the test server
+        self.use_production_server()
+        # Don't remove the size restriction: this query is too expensive without it
+        runs = openml.runs.list_runs(tag="curves", size=2)
+        assert len(runs) >= 1
+
+    @pytest.mark.sklearn()
+    @unittest.skipIf(
+        Version(sklearn.__version__) < Version("0.20"),
+        reason="columntransformer introduction in 0.20.0",
+    )
+    @pytest.mark.test_server()
+    def test_run_on_dataset_with_missing_labels_dataframe(self):
         # Check that _run_task_get_arffcontent works when one of the class
         # labels only declared in the arff file, but is not present in the
         # actual data
+        task = openml.tasks.get_task(2)  # anneal; crossvalidation
 
-        flow = unittest.mock.Mock()
-        flow.name = 'dummy'
-        task = openml.tasks.get_task(2)
+        from sklearn.compose import ColumnTransformer
 
-        model = Pipeline(steps=[('Imputer', SimpleImputer(strategy='median')),
-                                ('Estimator', DecisionTreeClassifier())])
+        cat_imp = make_pipeline(
+            SimpleImputer(strategy="most_frequent"),
+            OneHotEncoder(handle_unknown="ignore"),
+        )
+        cont_imp = make_pipeline(CustomImputer(), StandardScaler())
+        ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
+        model = Pipeline(
+            steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())],
+        )  # build a sklearn classifier
+
+        data_content, _, _, _ = _run_task_get_arffcontent(
+            model=model,
+            task=task,
+            extension=self.extension,
+            add_local_measures=True,
+        )
+        # 2 folds, 5 repeats; keep in mind that this task comes from the test
+        # server, the task on the live server is different
+        assert len(data_content) == 4490
+        for row in data_content:
+            # repeat, fold, sample, row_id, prediction, correct label and 6 confidences
+            assert len(row) == 12
+
+    @pytest.mark.sklearn()
+    @unittest.skipIf(
+        Version(sklearn.__version__) < Version("0.20"),
+        reason="columntransformer introduction in 0.20.0",
+    )
+    @pytest.mark.test_server()
+    def test_run_on_dataset_with_missing_labels_array(self):
+        # Check that _run_task_get_arffcontent works when one of the class
+        # labels only declared in the arff file, but is not present in the
+        # actual data
+        task = openml.tasks.get_task(2)  # anneal; crossvalidation
+        # task_id=2 on test server has 38 columns with 6 numeric columns
+        cont_idx = [3, 4, 8, 32, 33, 34]
+        cat_idx = list(set(np.arange(38)) - set(cont_idx))
+        cont = np.array([False] * 38)
+        cat = np.array([False] * 38)
+        cont[cont_idx] = True
+        cat[cat_idx] = True
+
+        from sklearn.compose import ColumnTransformer
+
+        cat_imp = make_pipeline(
+            SimpleImputer(strategy="most_frequent"),
+            OneHotEncoder(handle_unknown="ignore"),
+        )
+        cont_imp = make_pipeline(CustomImputer(), StandardScaler())
+        ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
+        model = Pipeline(
+            steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())],
+        )  # build a sklearn classifier
 
         data_content, _, _, _ = _run_task_get_arffcontent(
-            flow=flow,
             model=model,
             task=task,
             extension=self.extension,
@@ -1263,36 +1653,385 @@ def test_run_on_dataset_with_missing_labels(self):
         )
         # 2 folds, 5 repeats; keep in mind that this task comes from the test
         # server, the task on the live server is different
-        self.assertEqual(len(data_content), 4490)
+        assert len(data_content) == 4490
         for row in data_content:
             # repeat, fold, row_id, 6 confidences, prediction and correct label
-            self.assertEqual(len(row), 12)
+            assert len(row) == 12
 
+    @pytest.mark.test_server()
     def test_get_cached_run(self):
-        openml.config.cache_directory = self.static_cache_dir
+        openml.config.set_root_cache_directory(self.static_cache_dir)
         openml.runs.functions._get_cached_run(1)
 
     def test_get_uncached_run(self):
-        openml.config.cache_directory = self.static_cache_dir
-        with self.assertRaises(openml.exceptions.OpenMLCacheException):
+        openml.config.set_root_cache_directory(self.static_cache_dir)
+        with pytest.raises(openml.exceptions.OpenMLCacheException):
             openml.runs.functions._get_cached_run(10)
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_flow_on_task_downloaded_flow(self):
         model = sklearn.ensemble.RandomForestClassifier(n_estimators=33)
         flow = self.extension.model_to_flow(model)
         flow.publish(raise_error_if_exists=False)
-        TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
-        TestBase.logger.info("collected from test_run_functions: {}".format(flow.flow_id))
+        TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
+        TestBase.logger.info(f"collected from test_run_functions: {flow.flow_id}")
 
         downloaded_flow = openml.flows.get_flow(flow.flow_id)
-        task = openml.tasks.get_task(119)  # diabetes
+        task = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE["task_id"])
         run = openml.runs.run_flow_on_task(
             flow=downloaded_flow,
             task=task,
-            avoid_duplicate_runs=False,
             upload_flow=False,
         )
 
         run.publish()
-        TestBase._mark_entity_for_removal('run', run.run_id)
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], run.run_id))
+        TestBase._mark_entity_for_removal("run", run.run_id)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {run.run_id}")
+
+    @pytest.mark.production_server()
+    def test_format_prediction_non_supervised(self):
+        """`format_prediction` must raise NotImplementedError for a non-supervised task."""
+        # non-supervised tasks don't exist on the test server
+        self.use_production_server()
+        clustering_task = openml.tasks.get_task(126033, download_data=False)
+        dummy_args = [0, 0, 0, 0, 0]  # placeholders for the remaining positional arguments
+        expected_message = r"Formatting for <class '[\w.]+'> is not supported."
+        with pytest.raises(NotImplementedError, match=expected_message):
+            format_prediction(clustering_task, *dummy_args)
+
+    @pytest.mark.test_server()
+    def test_format_prediction_classification_no_probabilities(self):
+        """A classification task without `proba` must raise a ValueError."""
+        simple_task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
+        task = openml.tasks.get_task(simple_task_id, download_data=False)
+        dummy_args = [0, 0, 0, 0, 0]  # placeholders for the remaining positional arguments
+        expected_message = "`proba` is required for classification task"
+        with pytest.raises(ValueError, match=expected_message):
+            format_prediction(task, *dummy_args, proba=None)
+
+    @pytest.mark.test_server()
+    def test_format_prediction_classification_incomplete_probabilities(self):
+        """Omitting the probability of one class label must raise a ValueError."""
+        simple_task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
+        task = openml.tasks.get_task(simple_task_id, download_data=False)
+        dummy_args = [0, 0, 0, 0, 0]  # placeholders for the remaining positional arguments
+        # every class label except the first one gets a probability
+        partial_proba = dict.fromkeys(task.class_labels[1:], 0.2)
+        with pytest.raises(ValueError, match="Each class should have a predicted probability"):
+            format_prediction(task, *dummy_args, proba=partial_proba)
+
+    @pytest.mark.test_server()
+    def test_format_prediction_task_without_classlabels_set(self):
+        """A classification task whose class labels are unset must raise a ValueError."""
+        simple_task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
+        task = openml.tasks.get_task(simple_task_id, download_data=False)
+        task.class_labels = None  # simulate a task that carries no class labels
+        dummy_args = [0, 0, 0, 0, 0]  # placeholders for the remaining positional arguments
+        expected_message = "The classification task must have class labels set"
+        with pytest.raises(ValueError, match=expected_message):
+            format_prediction(task, *dummy_args, proba={})
+
+    @pytest.mark.test_server()
+    def test_format_prediction_task_learning_curve_sample_not_set(self):
+        """`sample=None` must be rejected for task 801 (diabetes; crossvalidation)."""
+        task = openml.tasks.get_task(801, download_data=False)
+        uniform_proba = dict.fromkeys(task.class_labels, 0.2)
+        with pytest.raises(ValueError, match="`sample` can not be none for LearningCurveTask"):
+            format_prediction(task, *([0] * 5), sample=None, proba=uniform_proba)
+
+    @pytest.mark.test_server()
+    def test_format_prediction_task_regression(self):
+        """`format_prediction` on a regression task returns five zeros for all-zero input."""
+        task_meta_data = self.TEST_SERVER_TASK_REGRESSION["task_meta_data"]
+        # reuse the task if it already exists on the server, otherwise create it
+        task_id = check_task_existence(**task_meta_data)
+        if task_id is None:
+            new_task = openml.tasks.create_task(**task_meta_data)
+            # publishes the new task
+            try:
+                new_task = new_task.publish()
+                task_id = new_task.task_id
+            except OpenMLServerException as e:
+                if e.code != 614:  # 614 means "Task already exists"
+                    # chain explicitly so that the server error is reported as the cause
+                    raise Exception(repr(e)) from e
+                # the exception message contains the task_id that was matched in the format
+                # 'Task already exists. - matched id(s): [xxxx]'
+                task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0]
+            # mark to remove the uploaded task
+            TestBase._mark_entity_for_removal("task", task_id)
+            TestBase.logger.info(f"collected from test_run_functions: {task_id}")
+
+        regression = openml.tasks.get_task(task_id, download_data=False)
+        ignored_input = [0] * 5
+        res = format_prediction(regression, *ignored_input)
+        self.assertListEqual(res, [0] * 5)
+
+
+    @unittest.skipIf(
+        Version(sklearn.__version__) < Version("0.20"),
+        reason="SimpleImputer doesn't handle mixed type DataFrame as input",
+    )
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
+    def test_delete_run(self):
+        """Publish a run, check that a duplicate run is detected, then delete the run."""
+        rs = np.random.randint(1, 2**31 - 1)
+        # random suffix in the step name, presumably to make the flow unique per test run
+        clf = sklearn.pipeline.Pipeline(
+            steps=[
+                (f"test_server_imputer_{rs}", SimpleImputer()),
+                ("estimator", DecisionTreeClassifier()),
+            ],
+        )
+        task = openml.tasks.get_task(32)  # diabetes; crossvalidation
+
+        run = openml.runs.run_model_on_task(model=clf, task=task, seed=rs)
+        run.publish()
+        # register the cleanup before any assertion that may fail, so that the
+        # published run is still marked for removal if a later check breaks
+        TestBase._mark_entity_for_removal("run", run.run_id)
+        TestBase.logger.info(f"collected from test_run_functions: {run.run_id}")
+
+        with pytest.raises(openml.exceptions.OpenMLRunsExistError):
+            openml.runs.run_model_on_task(model=clf, task=task, seed=rs, avoid_duplicate_runs=True)
+
+        assert delete_run(run.run_id)
+
+    @pytest.mark.skip(reason="run id is in problematic state on test server due to PR#1454")
+    @unittest.skipIf(
+        Version(sklearn.__version__) < Version("0.20"),
+        reason="SimpleImputer doesn't handle mixed type DataFrame as input",
+    )
+    @pytest.mark.sklearn()
+    def test_initialize_model_from_run_nonstrict(self):
+        # We cannot guarantee that a run with an older version exists on the server, so we use a
+        # run known to exist, even though it may not need non-strict handling. This exercises the
+        # OpenML code path but not the model initialization, which we do not guarantee anyhow.
+        _ = openml.runs.initialize_model_from_run(run_id=1, strict_version=False)
+
+
+@mock.patch.object(requests.Session, "delete")
+def test_delete_run_not_owned(mock_delete, test_files_directory, test_api_key):
+    """Deleting a run that was uploaded by another user must raise OpenMLNotAuthorizedError."""
+    run_id = 40_000
+    response_file = test_files_directory / "mock_responses" / "runs" / "run_delete_not_owned.xml"
+    mock_delete.return_value = create_request_response(
+        status_code=412,
+        content_filepath=response_file,
+    )
+    expected_message = "The run can not be deleted because it was not uploaded by you."
+    with pytest.raises(OpenMLNotAuthorizedError, match=expected_message):
+        openml.runs.delete_run(run_id)
+
+    # the mocked DELETE call must have targeted this run and carried the API key
+    request = mock_delete.call_args
+    assert f"{openml.config.TEST_SERVER_URL}/api/v1/xml/run/{run_id}" == request.args[0]
+    assert test_api_key == request.kwargs.get("params", {}).get("api_key")
+
+
+@mock.patch.object(requests.Session, "delete")
+def test_delete_run_success(mock_delete, test_files_directory, test_api_key):
+    """A mocked HTTP 200 delete response makes `delete_run` return a truthy value."""
+    run_id = 10591880
+    response_file = test_files_directory / "mock_responses" / "runs" / "run_delete_successful.xml"
+    mock_delete.return_value = create_request_response(
+        status_code=200,
+        content_filepath=response_file,
+    )
+    assert openml.runs.delete_run(run_id)
+
+    request = mock_delete.call_args
+    assert f"{openml.config.TEST_SERVER_URL}/api/v1/xml/run/{run_id}" == request.args[0]
+    assert test_api_key == request.kwargs.get("params", {}).get("api_key")
+
+
+@mock.patch.object(requests.Session, "delete")
+def test_delete_unknown_run(mock_delete, test_files_directory, test_api_key):
+    """Deleting a run id that the server does not know must raise OpenMLServerException."""
+    run_id = 9_999_999
+    response_file = test_files_directory / "mock_responses" / "runs" / "run_delete_not_exist.xml"
+    mock_delete.return_value = create_request_response(
+        status_code=412,
+        content_filepath=response_file,
+    )
+    with pytest.raises(OpenMLServerException, match="Run does not exist"):
+        openml.runs.delete_run(run_id)
+
+    # the mocked DELETE call must have targeted this run and carried the API key
+    request = mock_delete.call_args
+    expected_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/run/{run_id}"
+    assert expected_url == request.args[0]
+    assert test_api_key == request.kwargs.get("params", {}).get("api_key")
+
+
+@pytest.mark.sklearn()
+@unittest.skipIf(
+    Version(sklearn.__version__) < Version("0.21"),
+    reason="couldn't perform local tests successfully w/o bloating RAM",
+)
+@unittest.skipIf(
+    Version(sklearn.__version__) >= Version("1.8"),
+    reason="predictions differ significantly",
+)
+@mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs")
+@pytest.mark.test_server()
+def test__run_task_get_arffcontent_2(parallel_mock):
+    """Tests if a run executed in parallel is collated correctly."""
+    task = openml.tasks.get_task(7)  # Supervised Classification on kr-vs-kp
+    x, _ = task.get_X_and_y()
+    num_instances = x.shape[0]
+    line_length = 6 + len(task.class_labels)
+    loss = "log" if Version(sklearn.__version__) < Version("1.3") else "log_loss"
+    clf = sklearn.pipeline.Pipeline(
+        [
+            (
+                "cat_handling",
+                ColumnTransformer(
+                    transformers=[
+                        (
+                            "cat",
+                            OneHotEncoder(handle_unknown="ignore"),
+                            x.select_dtypes(include=["object", "category"]).columns,
+                        )
+                    ],
+                    remainder="passthrough",
+                ),
+            ),
+            ("clf", SGDClassifier(loss=loss, random_state=1)),
+        ]
+    )
+    n_jobs = 2
+    backend = "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
+    from openml_sklearn import SklearnExtension
+    extension = SklearnExtension()
+    with parallel_backend(backend, n_jobs=n_jobs):
+        res = openml.runs.functions._run_task_get_arffcontent(
+            extension=extension,
+            model=clf,
+            task=task,
+            add_local_measures=True,
+            n_jobs=n_jobs,
+        )
+    # `_prevent_optimize_n_jobs` is executed within `_run_model_on_fold`, so the call count of
+    # its mock indicates whether `_run_model_on_fold` ran in this process; mocking it does not
+    # affect the rest of the pipeline. When joblib successfully distributes the work to newly
+    # spawned workers, the mock is not applied there, so the call count must remain 0 while
+    # the checks on the actual results below must still hold. If joblib is unable to
+    # distribute the work, the folds run in this process, the mock is called, and the
+    # call-count assertion fails.
+    assert parallel_mock.call_count == 0
+    assert isinstance(res[0], list)
+    assert len(res[0]) == num_instances
+    assert len(res[0][0]) == line_length
+    assert len(res[2]) == 7
+    assert len(res[3]) == 7
+    expected_scores = [
+        0.9625,
+        0.953125,
+        0.965625,
+        0.9125,
+        0.98125,
+        0.975,
+        0.9247648902821317,
+        0.9404388714733543,
+        0.9780564263322884,
+        0.9623824451410659,
+    ]
+    scores = list(res[2]["predictive_accuracy"][0].values())
+    np.testing.assert_array_almost_equal(
+        scores,
+        expected_scores,
+        decimal=2,
+        err_msg="Observed performance scores deviate from expected ones.",
+    )
+
+
+@pytest.mark.sklearn()
+@unittest.skipIf(
+    Version(sklearn.__version__) < Version("0.21"),
+    reason="couldn't perform local tests successfully w/o bloating RAM",
+)
+@mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs")
+@pytest.mark.parametrize(
+    ("n_jobs", "backend", "call_count"),
+    [
+        # `None` picks the backend based on joblib version (loky or multiprocessing) and
+        # spawns multiple processes if n_jobs != 1, which means the mock is not applied.
+        (2, None, 0),
+        (-1, None, 0),
+        (1, None, 10),  # with n_jobs=1 the mock *is* applied, since there is no new subprocess
+        (1, "sequential", 10),
+        (1, "threading", 10),
+        (-1, "threading", 10),  # the threading backend does preserve mocks even with parallelizing
+    ]
+)
+@pytest.mark.test_server()
+def test_joblib_backends(parallel_mock, n_jobs, backend, call_count):
+    """Tests evaluation of a run using various joblib backends and n_jobs."""
+    if backend is None:
+        backend = (
+            "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
+        )
+
+    task = openml.tasks.get_task(7)  # Supervised Classification on kr-vs-kp
+    x, _ = task.get_X_and_y()
+    num_instances = x.shape[0]
+    line_length = 6 + len(task.class_labels)
+
+    clf = sklearn.model_selection.RandomizedSearchCV(
+        estimator=sklearn.pipeline.Pipeline(
+            [
+                (
+                    "cat_handling",
+                    ColumnTransformer(
+                        transformers=[
+                            (
+                                "cat",
+                                OneHotEncoder(handle_unknown="ignore"),
+                                x.select_dtypes(include=["object", "category"]).columns,
+                            )
+                        ],
+                        remainder="passthrough",
+                    ),
+                ),
+                ("clf", sklearn.ensemble.RandomForestClassifier(n_estimators=5)),
+            ]
+        ),
+        param_distributions={
+            "clf__max_depth": [3, None],
+            "clf__max_features": [1, 2, 3, 4],
+            "clf__min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
+            "clf__min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+            "clf__bootstrap": [True, False],
+            "clf__criterion": ["gini", "entropy"],
+        },
+        random_state=1,
+        cv=sklearn.model_selection.StratifiedKFold(
+            n_splits=2,
+            shuffle=True,
+            random_state=1,
+        ),
+        n_iter=5,
+        n_jobs=n_jobs,
+    )
+    from openml_sklearn import SklearnExtension
+
+    extension = SklearnExtension()
+    with parallel_backend(backend, n_jobs=n_jobs):
+        res = openml.runs.functions._run_task_get_arffcontent(
+            extension=extension,
+            model=clf,
+            task=task,
+            add_local_measures=True,
+            n_jobs=n_jobs,
+        )
+    assert isinstance(res[0], list)
+    assert len(res[0]) == num_instances
+    assert len(res[0][0]) == line_length
+    # usercpu_time_millis_* not recorded when n_jobs > 1
+    # *_time_millis_* not recorded when n_jobs = -1
+    assert len(res[2]["predictive_accuracy"][0]) == 10
+    assert len(res[3]["predictive_accuracy"][0]) == 10
+    assert parallel_mock.call_count == call_count
\ No newline at end of file
diff --git a/tests/test_runs/test_trace.py b/tests/test_runs/test_trace.py
index 29f3a1554..bdf9de42d 100644
--- a/tests/test_runs/test_trace.py
+++ b/tests/test_runs/test_trace.py
@@ -1,3 +1,8 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
+import pytest
+
 from openml.runs import OpenMLRunTrace, OpenMLTraceIteration
 from openml.testing import TestBase
 
@@ -12,7 +17,7 @@ def test_get_selected_iteration(self):
                         repeat=i,
                         fold=j,
                         iteration=5,
-                        setup_string='parameter_%d%d%d' % (i, j, k),
+                        setup_string="parameter_%d%d%d" % (i, j, k),
                         evaluation=1.0 * i + 0.1 * j + 0.01 * k,
                         selected=(i == j and i == k and i == 2),
                         parameters=None,
@@ -21,67 +26,53 @@ def test_get_selected_iteration(self):
 
         trace = OpenMLRunTrace(-1, trace_iterations=trace_iterations)
         # This next one should simply not fail
-        self.assertEqual(trace.get_selected_iteration(2, 2), 2)
-        with self.assertRaisesRegex(
-            ValueError,
-                'Could not find the selected iteration for rep/fold 3/3',
+        assert trace.get_selected_iteration(2, 2) == 2
+        with pytest.raises(
+            ValueError, match="Could not find the selected iteration for rep/fold 3/3"
         ):
-
             trace.get_selected_iteration(3, 3)
 
     def test_initialization(self):
-        """Check all different ways to fail the initialization """
-        with self.assertRaisesRegex(
-            ValueError,
-            'Trace content not available.',
-        ):
-            OpenMLRunTrace.generate(attributes='foo', content=None)
-        with self.assertRaisesRegex(
-            ValueError,
-            'Trace attributes not available.',
-        ):
-            OpenMLRunTrace.generate(attributes=None, content='foo')
-        with self.assertRaisesRegex(
-            ValueError,
-            'Trace content is empty.'
-        ):
-            OpenMLRunTrace.generate(attributes='foo', content=[])
-        with self.assertRaisesRegex(
-            ValueError,
-            'Trace_attributes and trace_content not compatible:'
-        ):
-            OpenMLRunTrace.generate(attributes=['abc'], content=[[1, 2]])
+        """Check all different ways to fail the initialization"""
+        with pytest.raises(ValueError, match="Trace content not available."):
+            OpenMLRunTrace.generate(attributes="foo", content=None)
+        with pytest.raises(ValueError, match="Trace attributes not available."):
+            OpenMLRunTrace.generate(attributes=None, content="foo")
+        with pytest.raises(ValueError, match="Trace content is empty."):
+            OpenMLRunTrace.generate(attributes="foo", content=[])
+        with pytest.raises(ValueError, match="Trace_attributes and trace_content not compatible:"):
+            OpenMLRunTrace.generate(attributes=["abc"], content=[[1, 2]])
 
     def test_duplicate_name(self):
         # Test that the user does not pass a parameter which has the same name
         # as one of the required trace attributes
         trace_attributes = [
-            ('repeat', 'NUMERICAL'),
-            ('fold', 'NUMERICAL'),
-            ('iteration', 'NUMERICAL'),
-            ('evaluation', 'NUMERICAL'),
-            ('selected', ['true', 'false']),
-            ('repeat', 'NUMERICAL'),
+            ("repeat", "NUMERICAL"),
+            ("fold", "NUMERICAL"),
+            ("iteration", "NUMERICAL"),
+            ("evaluation", "NUMERICAL"),
+            ("selected", ["true", "false"]),
+            ("repeat", "NUMERICAL"),
         ]
-        trace_content = [[0, 0, 0, 0.5, 'true', 1], [0, 0, 0, 0.9, 'false', 2]]
-        with self.assertRaisesRegex(
+        trace_content = [[0, 0, 0, 0.5, "true", 1], [0, 0, 0, 0.9, "false", 2]]
+        with pytest.raises(
             ValueError,
-            'Either setup_string or parameters needs to be passed as argument.'
+            match="Either `setup_string` or `parameters` needs to be passed as argument.",
         ):
             OpenMLRunTrace.generate(trace_attributes, trace_content)
 
         trace_attributes = [
-            ('repeat', 'NUMERICAL'),
-            ('fold', 'NUMERICAL'),
-            ('iteration', 'NUMERICAL'),
-            ('evaluation', 'NUMERICAL'),
-            ('selected', ['true', 'false']),
-            ('sunshine', 'NUMERICAL'),
+            ("repeat", "NUMERICAL"),
+            ("fold", "NUMERICAL"),
+            ("iteration", "NUMERICAL"),
+            ("evaluation", "NUMERICAL"),
+            ("selected", ["true", "false"]),
+            ("sunshine", "NUMERICAL"),
         ]
-        trace_content = [[0, 0, 0, 0.5, 'true', 1], [0, 0, 0, 0.9, 'false', 2]]
-        with self.assertRaisesRegex(
+        trace_content = [[0, 0, 0, 0.5, "true", 1], [0, 0, 0, 0.9, "false", 2]]
+        with pytest.raises(
             ValueError,
-            'Encountered unknown attribute sunshine that does not start with '
-            'prefix parameter_'
+            match="Encountered unknown attribute sunshine that does not start with "
+            "prefix parameter_",
         ):
             OpenMLRunTrace.generate(trace_attributes, trace_content)
diff --git a/tests/test_setups/__init__.py b/tests/test_setups/__init__.py
index dc5287024..245c252db 100644
--- a/tests/test_setups/__init__.py
+++ b/tests/test_setups/__init__.py
@@ -1,3 +1,5 @@
+# License: BSD 3-Clause
+
 # Dummy to allow mock classes in the test files to have a version number for
 # their parent module
-__version__ = '0.1'
+__version__ = "0.1"
diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py
index 16e149544..30943ea70 100644
--- a/tests/test_setups/test_setup_functions.py
+++ b/tests/test_setups/test_setup_functions.py
@@ -1,17 +1,20 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
 import hashlib
 import time
 import unittest.mock
+import os
+import pandas as pd
+import pytest
+import sklearn.base
+import sklearn.naive_bayes
+import sklearn.tree
+from openml_sklearn import SklearnExtension
 
 import openml
 import openml.exceptions
-import openml.extensions.sklearn
 from openml.testing import TestBase
-from typing import Dict
-import pandas as pd
-
-import sklearn.tree
-import sklearn.naive_bayes
-import sklearn.base
 
 
 def get_sentinel():
@@ -19,83 +22,90 @@ def get_sentinel():
     # identified by its name and external version online. Having a unique
     #  name allows us to publish the same flow in each test run
     md5 = hashlib.md5()
-    md5.update(str(time.time()).encode('utf-8'))
+    md5.update(str(time.time()).encode("utf-8"))
     sentinel = md5.hexdigest()[:10]
-    sentinel = 'TEST%s' % sentinel
-    return sentinel
+    return f"TEST{sentinel}"
 
 
 class TestSetupFunctions(TestBase):
     _multiprocess_can_split_ = True
 
     def setUp(self):
-        self.extension = openml.extensions.sklearn.SklearnExtension()
+        self.extension = SklearnExtension()
         super().setUp()
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_nonexisting_setup_exists(self):
         # first publish a non-existing flow
         sentinel = get_sentinel()
         # because of the sentinel, we can not use flows that contain subflows
         dectree = sklearn.tree.DecisionTreeClassifier()
         flow = self.extension.model_to_flow(dectree)
-        flow.name = 'TEST%s%s' % (sentinel, flow.name)
+        flow.name = f"TEST{sentinel}{flow.name}"
         flow.publish()
-        TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], flow.flow_id))
+        TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
 
         # although the flow exists (created as of previous statement),
         # we can be sure there are no setups (yet) as it was just created
         # and hasn't been ran
         setup_id = openml.setups.setup_exists(flow)
-        self.assertFalse(setup_id)
+        assert not setup_id
 
     def _existing_setup_exists(self, classif):
-
         flow = self.extension.model_to_flow(classif)
-        flow.name = 'TEST%s%s' % (get_sentinel(), flow.name)
+        flow.name = f"TEST{get_sentinel()}{flow.name}"
         flow.publish()
-        TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], flow.flow_id))
+        TestBase._mark_entity_for_removal("flow", flow.flow_id, flow.name)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow.flow_id}")
 
         # although the flow exists, we can be sure there are no
         # setups (yet) as it hasn't been ran
         setup_id = openml.setups.setup_exists(flow)
-        self.assertFalse(setup_id)
+        assert not setup_id
         setup_id = openml.setups.setup_exists(flow)
-        self.assertFalse(setup_id)
+        assert not setup_id
 
         # now run the flow on an easy task:
-        task = openml.tasks.get_task(115)  # diabetes
+        task = openml.tasks.get_task(115)  # diabetes; crossvalidation
         run = openml.runs.run_flow_on_task(flow, task)
         # spoof flow id, otherwise the sentinel is ignored
         run.flow_id = flow.flow_id
         run.publish()
-        TestBase._mark_entity_for_removal('run', run.run_id)
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], run.run_id))
+        TestBase._mark_entity_for_removal("run", run.run_id)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {run.run_id}")
         # download the run, as it contains the right setup id
         run = openml.runs.get_run(run.run_id)
 
         # execute the function we are interested in
         setup_id = openml.setups.setup_exists(flow)
-        self.assertEqual(setup_id, run.setup_id)
+        assert setup_id == run.setup_id
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_existing_setup_exists_1(self):
         def side_effect(self):
             self.var_smoothing = 1e-9
             self.priors = None
+
         with unittest.mock.patch.object(
-                sklearn.naive_bayes.GaussianNB,
-                '__init__',
-                side_effect,
+            sklearn.naive_bayes.GaussianNB,
+            "__init__",
+            side_effect,
         ):
             # Check a flow with zero hyperparameters
             nb = sklearn.naive_bayes.GaussianNB()
             self._existing_setup_exists(nb)
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_exisiting_setup_exists_2(self):
         # Check a flow with one hyperparameter
         self._existing_setup_exists(sklearn.naive_bayes.GaussianNB())
 
+    @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_existing_setup_exists_3(self):
         # Check a flow with many hyperparameters
         self._existing_setup_exists(
@@ -105,13 +115,13 @@ def test_existing_setup_exists_3(self):
                 # Not setting the random state will make this flow fail as running it
                 # will add a random random_state.
                 random_state=1,
-            )
+            ),
         )
 
+    @pytest.mark.production_server()
     def test_get_setup(self):
+        self.use_production_server()
         # no setups in default test server
-        openml.config.server = 'https://www.openml.org/api/v1/xml/'
-
         # contains all special cases, 0 params, 1 param, n params.
         # Non scikitlearn flows.
         setups = [18, 19, 20, 118]
@@ -121,65 +131,61 @@ def test_get_setup(self):
             current = openml.setups.get_setup(setups[idx])
             assert current.flow_id > 0
             if num_params[idx] == 0:
-                self.assertIsNone(current.parameters)
+                assert current.parameters is None
             else:
-                self.assertEqual(len(current.parameters), num_params[idx])
+                assert len(current.parameters) == num_params[idx]
 
+    @pytest.mark.production_server()
     def test_setup_list_filter_flow(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         flow_id = 5873
 
         setups = openml.setups.list_setups(flow=flow_id)
 
-        self.assertGreater(len(setups), 0)  # TODO: please adjust 0
-        for setup_id in setups.keys():
-            self.assertEqual(setups[setup_id].flow_id, flow_id)
+        assert len(setups) >= 2
+        for setup_id in setups:
+            assert setups[setup_id].flow_id == flow_id
 
+    @pytest.mark.test_server()
     def test_list_setups_empty(self):
         setups = openml.setups.list_setups(setup=[0])
         if len(setups) > 0:
-            raise ValueError('UnitTest Outdated, got somehow results')
+            raise ValueError("UnitTest Outdated, got somehow results")
 
-        self.assertIsInstance(setups, dict)
+        assert isinstance(setups, dict)
 
+    @pytest.mark.production_server()
     def test_list_setups_output_format(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         flow_id = 6794
-        setups = openml.setups.list_setups(flow=flow_id, output_format='object', size=10)
-        self.assertIsInstance(setups, Dict)
-        self.assertIsInstance(setups[list(setups.keys())[0]],
-                              openml.setups.setup.OpenMLSetup)
-        self.assertEqual(len(setups), 10)
-
-        setups = openml.setups.list_setups(flow=flow_id, output_format='dataframe', size=10)
-        self.assertIsInstance(setups, pd.DataFrame)
-        self.assertEqual(len(setups), 10)
+        setups = openml.setups.list_setups(flow=flow_id, size=10)
+        assert isinstance(setups, dict)
+        assert isinstance(setups[next(iter(setups.keys()))], openml.setups.setup.OpenMLSetup)
+        assert len(setups) == 10
 
-        setups = openml.setups.list_setups(flow=flow_id, output_format='dict', size=10)
-        self.assertIsInstance(setups, Dict)
-        self.assertIsInstance(setups[list(setups.keys())[0]], Dict)
-        self.assertEqual(len(setups), 10)
+        setups = openml.setups.list_setups(flow=flow_id, size=10, output_format="dataframe")
+        assert isinstance(setups, pd.DataFrame)
+        assert len(setups) == 10
 
+    @pytest.mark.test_server()
     def test_setuplist_offset(self):
-        # TODO: remove after pull on live for better testing
-        # openml.config.server = self.production_server
-
         size = 10
         setups = openml.setups.list_setups(offset=0, size=size)
-        self.assertEqual(len(setups), size)
+        assert len(setups) == size
         setups2 = openml.setups.list_setups(offset=size, size=size)
-        self.assertEqual(len(setups2), size)
+        assert len(setups2) == size
 
         all = set(setups.keys()).union(setups2.keys())
 
-        self.assertEqual(len(all), size * 2)
+        assert len(all) == size * 2
 
+    @pytest.mark.test_server()
     def test_get_cached_setup(self):
-        openml.config.cache_directory = self.static_cache_dir
+        openml.config.set_root_cache_directory(self.static_cache_dir)
         openml.setups.functions._get_cached_setup(1)
 
     def test_get_uncached_setup(self):
-        openml.config.cache_directory = self.static_cache_dir
-        with self.assertRaises(openml.exceptions.OpenMLCacheException):
+        openml.config.set_root_cache_directory(self.static_cache_dir)
+        with pytest.raises(openml.exceptions.OpenMLCacheException):
             openml.setups.functions._get_cached_setup(10)
diff --git a/tests/test_extensions/test_sklearn_extension/__init__.py b/tests/test_study/__init__.py
similarity index 100%
rename from tests/test_extensions/test_sklearn_extension/__init__.py
rename to tests/test_study/__init__.py
diff --git a/tests/test_study/test_study_examples.py b/tests/test_study/test_study_examples.py
deleted file mode 100644
index 1d9c56d54..000000000
--- a/tests/test_study/test_study_examples.py
+++ /dev/null
@@ -1,58 +0,0 @@
-from openml.testing import TestBase, SimpleImputer
-
-
-class TestStudyFunctions(TestBase):
-    _multiprocess_can_split_ = True
-    """Test the example code of Bischl et al. (2018)"""
-
-    def test_Figure1a(self):
-        """Test listing in Figure 1a on a single task and the old OpenML100 study.
-
-        The original listing is pasted into the comment below because it the actual unit test
-        differs a bit, as for example it does not run for all tasks, but only a single one.
-
-        import openml
-        import sklearn.tree, sklearn.preprocessing
-        benchmark_suite = openml.study.get_study('OpenML-CC18','tasks') # obtain the benchmark suite
-        clf = sklearn.pipeline.Pipeline(steps=[('imputer',sklearn.preprocessing.Imputer()),  ('estimator',sklearn.tree.DecisionTreeClassifier())]) # build a sklearn classifier
-        for task_id in benchmark_suite.tasks:                          # iterate over all tasks
-            task = openml.tasks.get_task(task_id)                        # download the OpenML task
-            X, y = task.get_X_and_y()                                    # get the data (not used in this example)
-            openml.config.apikey = 'FILL_IN_OPENML_API_KEY'              # set the OpenML Api Key
-            run = openml.runs.run_model_on_task(task,clf)                # run classifier on splits (requires API key)
-            score = run.get_metric_fn(sklearn.metrics.accuracy_score) # print accuracy score
-            print('Data set: %s; Accuracy: %0.2f' % (task.get_dataset().name,score.mean()))
-            run.publish()                                                # publish the experiment on OpenML (optional)
-            print('URL for run: %s/run/%d' %(openml.config.server,run.run_id))
-        """  # noqa: E501
-        import openml
-        import sklearn.metrics
-        import sklearn.pipeline
-        import sklearn.preprocessing
-        import sklearn.tree
-
-        benchmark_suite = openml.study.get_study(
-            'OpenML100', 'tasks'
-        )  # obtain the benchmark suite
-        clf = sklearn.pipeline.Pipeline(
-            steps=[
-                ('imputer', SimpleImputer()),
-                ('estimator', sklearn.tree.DecisionTreeClassifier())
-            ]
-        )  # build a sklearn classifier
-        for task_id in benchmark_suite.tasks[:1]:  # iterate over all tasks
-            task = openml.tasks.get_task(task_id)  # download the OpenML task
-            X, y = task.get_X_and_y()  # get the data (not used in this example)
-            openml.config.apikey = openml.config.apikey  # set the OpenML Api Key
-            run = openml.runs.run_model_on_task(
-                clf, task, avoid_duplicate_runs=False
-            )  # run classifier on splits (requires API key)
-            score = run.get_metric_fn(
-                sklearn.metrics.accuracy_score
-            )  # print accuracy score
-            print('Data set: %s; Accuracy: %0.2f' % (task.get_dataset().name, score.mean()))
-            run.publish()  # publish the experiment on OpenML (optional)
-            TestBase._mark_entity_for_removal('run', run.run_id)
-            TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                                run.run_id))
-            print('URL for run: %s/run/%d' % (openml.config.server, run.run_id))
diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py
index 33ba0c452..7dc6b6d2a 100644
--- a/tests/test_study/test_study_functions.py
+++ b/tests/test_study/test_study_functions.py
@@ -1,214 +1,264 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
+import pytest
+import unittest
+
 import openml
 import openml.study
 from openml.testing import TestBase
-import pandas as pd
 
 
 class TestStudyFunctions(TestBase):
     _multiprocess_can_split_ = True
 
+    @pytest.mark.production_server()
+    @pytest.mark.xfail(reason="failures_issue_1544", strict=False)
     def test_get_study_old(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         study = openml.study.get_study(34)
-        self.assertEqual(len(study.data), 105)
-        self.assertEqual(len(study.tasks), 105)
-        self.assertEqual(len(study.flows), 27)
-        self.assertEqual(len(study.setups), 30)
-        self.assertIsNone(study.runs)
+        assert len(study.data) == 105
+        assert len(study.tasks) == 105
+        assert len(study.flows) == 27
+        assert len(study.setups) == 30
+        assert study.runs is None
 
+    @pytest.mark.production_server()
     def test_get_study_new(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         study = openml.study.get_study(123)
-        self.assertEqual(len(study.data), 299)
-        self.assertEqual(len(study.tasks), 299)
-        self.assertEqual(len(study.flows), 5)
-        self.assertEqual(len(study.setups), 1253)
-        self.assertEqual(len(study.runs), 1693)
+        assert len(study.data) == 299
+        assert len(study.tasks) == 299
+        assert len(study.flows) == 5
+        assert len(study.setups) == 1253
+        assert len(study.runs) == 1693
 
+    @pytest.mark.production_server()
     def test_get_openml100(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
-        study = openml.study.get_study('OpenML100', 'tasks')
-        self.assertIsInstance(study, openml.study.OpenMLBenchmarkSuite)
-        study_2 = openml.study.get_suite('OpenML100')
-        self.assertIsInstance(study_2, openml.study.OpenMLBenchmarkSuite)
-        self.assertEqual(study.id, study_2.id)
+        study = openml.study.get_study("OpenML100", "tasks")
+        assert isinstance(study, openml.study.OpenMLBenchmarkSuite)
+        study_2 = openml.study.get_suite("OpenML100")
+        assert isinstance(study_2, openml.study.OpenMLBenchmarkSuite)
+        assert study.study_id == study_2.study_id
 
+    @pytest.mark.production_server()
     def test_get_study_error(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
-        with self.assertRaisesRegex(
-            ValueError,
-            "Unexpected entity type 'task' reported by the server, expected 'run'",
+        with pytest.raises(
+            ValueError, match="Unexpected entity type 'task' reported by the server, expected 'run'"
         ):
             openml.study.get_study(99)
 
+    @pytest.mark.production_server()
     def test_get_suite(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
         study = openml.study.get_suite(99)
-        self.assertEqual(len(study.data), 72)
-        self.assertEqual(len(study.tasks), 72)
-        self.assertIsNone(study.flows)
-        self.assertIsNone(study.runs)
-        self.assertIsNone(study.setups)
+        assert len(study.data) == 72
+        assert len(study.tasks) == 72
+        assert study.flows is None
+        assert study.runs is None
+        assert study.setups is None
 
+    @pytest.mark.production_server()
     def test_get_suite_error(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
 
-        with self.assertRaisesRegex(
-            ValueError,
-            "Unexpected entity type 'run' reported by the server, expected 'task'",
+        with pytest.raises(
+            ValueError, match="Unexpected entity type 'run' reported by the server, expected 'task'"
         ):
             openml.study.get_suite(123)
 
+    @pytest.mark.test_server()
     def test_publish_benchmark_suite(self):
         fixture_alias = None
-        fixture_name = 'unit tested benchmark suite'
-        fixture_descr = 'bla'
+        fixture_name = "unit tested benchmark suite"
+        fixture_descr = "bla"
         fixture_task_ids = [1, 2, 3]
 
         study = openml.study.create_benchmark_suite(
             alias=fixture_alias,
             name=fixture_name,
             description=fixture_descr,
-            task_ids=fixture_task_ids
+            task_ids=fixture_task_ids,
         )
-        study_id = study.publish()
-        TestBase._mark_entity_for_removal('study', study_id)
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], study_id))
+        study.publish()
+        TestBase._mark_entity_for_removal("study", study.id)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {study.id}")
 
-        self.assertGreater(study_id, 0)
+        assert study.id > 0
 
         # verify main meta data
-        study_downloaded = openml.study.get_suite(study_id)
-        self.assertEqual(study_downloaded.alias, fixture_alias)
-        self.assertEqual(study_downloaded.name, fixture_name)
-        self.assertEqual(study_downloaded.description, fixture_descr)
-        self.assertEqual(study_downloaded.main_entity_type, 'task')
+        study_downloaded = openml.study.get_suite(study.id)
+        assert study_downloaded.alias == fixture_alias
+        assert study_downloaded.name == fixture_name
+        assert study_downloaded.description == fixture_descr
+        assert study_downloaded.main_entity_type == "task"
         # verify resources
-        self.assertIsNone(study_downloaded.flows)
-        self.assertIsNone(study_downloaded.setups)
-        self.assertIsNone(study_downloaded.runs)
-        self.assertGreater(len(study_downloaded.data), 0)
-        self.assertLessEqual(len(study_downloaded.data), len(fixture_task_ids))
+        assert study_downloaded.flows is None
+        assert study_downloaded.setups is None
+        assert study_downloaded.runs is None
+        assert len(study_downloaded.data) > 0
+        assert len(study_downloaded.data) <= len(fixture_task_ids)
         self.assertSetEqual(set(study_downloaded.tasks), set(fixture_task_ids))
 
         # attach more tasks
         tasks_additional = [4, 5, 6]
-        openml.study.attach_to_study(study_id, tasks_additional)
-        study_downloaded = openml.study.get_suite(study_id)
+        openml.study.attach_to_study(study.id, tasks_additional)
+        study_downloaded = openml.study.get_suite(study.id)
         # verify again
-        self.assertSetEqual(set(study_downloaded.tasks),
-                            set(fixture_task_ids + tasks_additional))
+        self.assertSetEqual(set(study_downloaded.tasks), set(fixture_task_ids + tasks_additional))
         # test detach function
-        openml.study.detach_from_study(study_id, fixture_task_ids)
-        study_downloaded = openml.study.get_suite(study_id)
-        self.assertSetEqual(set(study_downloaded.tasks),
-                            set(tasks_additional))
+        openml.study.detach_from_study(study.id, fixture_task_ids)
+        study_downloaded = openml.study.get_suite(study.id)
+        self.assertSetEqual(set(study_downloaded.tasks), set(tasks_additional))
 
         # test status update function
-        openml.study.update_suite_status(study_id, 'deactivated')
-        study_downloaded = openml.study.get_suite(study_id)
-        self.assertEqual(study_downloaded.status, 'deactivated')
+        openml.study.update_suite_status(study.id, "deactivated")
+        study_downloaded = openml.study.get_suite(study.id)
+        assert study_downloaded.status == "deactivated"
         # can't delete study, now it's not longer in preparation
 
+    def _test_publish_empty_study_is_allowed(self, explicit: bool):
+        runs: list[int] | None = [] if explicit else None
+        kind = "explicit" if explicit else "implicit"
+
+        study = openml.study.create_study(
+            name=f"empty-study-{kind}",
+            description=f"a study with no runs attached {kind}ly",
+            run_ids=runs,
+        )
+
+        study.publish()
+        TestBase._mark_entity_for_removal("study", study.id)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {study.id}")
+
+        assert study.id > 0
+        study_downloaded = openml.study.get_study(study.id)
+        assert study_downloaded.main_entity_type == "run"
+        assert study_downloaded.runs is None
+
+    @pytest.mark.test_server()
+    def test_publish_empty_study_explicit(self):
+        self._test_publish_empty_study_is_allowed(explicit=True)
+
+    @pytest.mark.test_server()
+    def test_publish_empty_study_implicit(self):
+        self._test_publish_empty_study_is_allowed(explicit=False)
+
+    @pytest.mark.flaky()
+    @pytest.mark.test_server()
     def test_publish_study(self):
         # get some random runs to attach
-        run_list = openml.runs.list_runs(size=10)
-        self.assertEqual(len(run_list), 10)
+        run_list = openml.evaluations.list_evaluations("predictive_accuracy", size=10)
+        assert len(run_list) == 10
 
         fixt_alias = None
-        fixt_name = 'unit tested study'
-        fixt_descr = 'bla'
-        fixt_flow_ids = set([run['flow_id'] for run in run_list.values()])
-        fixt_task_ids = set([run['task_id'] for run in run_list.values()])
-        fixt_setup_ids = set([run['setup_id']for run in run_list.values()])
+        fixt_name = "unit tested study"
+        fixt_descr = "bla"
+        fixt_flow_ids = {evaluation.flow_id for evaluation in run_list.values()}
+        fixt_task_ids = {evaluation.task_id for evaluation in run_list.values()}
+        fixt_setup_ids = {evaluation.setup_id for evaluation in run_list.values()}
 
         study = openml.study.create_study(
             alias=fixt_alias,
             benchmark_suite=None,
             name=fixt_name,
             description=fixt_descr,
-            run_ids=list(run_list.keys())
+            run_ids=list(run_list.keys()),
         )
-        study_id = study.publish()
-        # not tracking upload for delete since _delete_entity called end of function
-        # asserting return status from openml.study.delete_study()
-        self.assertGreater(study_id, 0)
-        study_downloaded = openml.study.get_study(study_id)
-        self.assertEqual(study_downloaded.alias, fixt_alias)
-        self.assertEqual(study_downloaded.name, fixt_name)
-        self.assertEqual(study_downloaded.description, fixt_descr)
-        self.assertEqual(study_downloaded.main_entity_type, 'run')
+        study.publish()
+        TestBase._mark_entity_for_removal("study", study.id)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {study.id}")
+        assert study.id > 0
+        study_downloaded = openml.study.get_study(study.id)
+        assert study_downloaded.alias == fixt_alias
+        assert study_downloaded.name == fixt_name
+        assert study_downloaded.description == fixt_descr
+        assert study_downloaded.main_entity_type == "run"
 
         self.assertSetEqual(set(study_downloaded.runs), set(run_list.keys()))
         self.assertSetEqual(set(study_downloaded.setups), set(fixt_setup_ids))
         self.assertSetEqual(set(study_downloaded.flows), set(fixt_flow_ids))
         self.assertSetEqual(set(study_downloaded.tasks), set(fixt_task_ids))
 
-        # attach more runs
-        run_list_additional = openml.runs.list_runs(size=10, offset=10)
-        openml.study.attach_to_study(study_id,
-                                     list(run_list_additional.keys()))
-        study_downloaded = openml.study.get_study(study_id)
+        # test whether the list run function also handles study data fine
+        run_ids = openml.runs.list_runs(study=study.id) # returns DF
+        self.assertSetEqual(set(run_ids["run_id"]), set(study_downloaded.runs))
+
+        # test whether the list evaluation function also handles study data fine
+        run_ids = openml.evaluations.list_evaluations( # returns a dict keyed by run id
+            "predictive_accuracy",
+            size=None,
+            study=study.id,
+            output_format="object", # making the default explicit
+        )
+        self.assertSetEqual(set(run_ids), set(study_downloaded.runs))
+
+        # attach more runs, since we fetch 11 here, at least one is non-overlapping
+        run_list_additional = openml.runs.list_runs(size=11, offset=10)
+        run_list_additional = set(run_list_additional["run_id"]) - set(run_ids)
+        openml.study.attach_to_study(study.id, list(run_list_additional))
+        study_downloaded = openml.study.get_study(study.id)
         # verify again
-        all_run_ids = set(run_list_additional.keys()) | set(run_list.keys())
+        all_run_ids = run_list_additional | set(run_list.keys())
         self.assertSetEqual(set(study_downloaded.runs), all_run_ids)
 
         # test detach function
-        openml.study.detach_from_study(study_id, list(run_list.keys()))
-        study_downloaded = openml.study.get_study(study_id)
-        self.assertSetEqual(set(study_downloaded.runs),
-                            set(run_list_additional.keys()))
+        openml.study.detach_from_study(study.id, list(run_list.keys()))
+        study_downloaded = openml.study.get_study(study.id)
+        self.assertSetEqual(set(study_downloaded.runs), run_list_additional)
 
         # test status update function
-        openml.study.update_study_status(study_id, 'deactivated')
-        study_downloaded = openml.study.get_study(study_id)
-        self.assertEqual(study_downloaded.status, 'deactivated')
+        openml.study.update_study_status(study.id, "deactivated")
+        study_downloaded = openml.study.get_study(study.id)
+        assert study_downloaded.status == "deactivated"
 
-        res = openml.study.delete_study(study_id)
-        self.assertTrue(res)
+        res = openml.study.delete_study(study.id)
+        assert res
 
+    @pytest.mark.test_server()
     def test_study_attach_illegal(self):
         run_list = openml.runs.list_runs(size=10)
-        self.assertEqual(len(run_list), 10)
+        assert len(run_list) == 10
         run_list_more = openml.runs.list_runs(size=20)
-        self.assertEqual(len(run_list_more), 20)
+        assert len(run_list_more) > 10  # a fresh db should have 15 evaluated runs
 
         study = openml.study.create_study(
             alias=None,
             benchmark_suite=None,
-            name='study with illegal runs',
-            description='none',
-            run_ids=list(run_list.keys())
+            name="study with illegal runs",
+            description="none",
+            run_ids=list(run_list["run_id"]),
         )
-        study_id = study.publish()
-        TestBase._mark_entity_for_removal('study', study_id)
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], study_id))
-        study_original = openml.study.get_study(study_id)
-
-        with self.assertRaisesRegex(openml.exceptions.OpenMLServerException,
-                                    'Problem attaching entities.'):
+        study.publish()
+        TestBase._mark_entity_for_removal("study", study.id)
+        TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {study.id}")
+        study_original = openml.study.get_study(study.id)
+
+        with pytest.raises(
+            openml.exceptions.OpenMLServerException,
+            match="Problem attaching entities.",
+        ):
             # run id does not exists
-            openml.study.attach_to_study(study_id, [0])
+            openml.study.attach_to_study(study.id, [0])
 
-        with self.assertRaisesRegex(openml.exceptions.OpenMLServerException,
-                                    'Problem attaching entities.'):
+        with pytest.raises(
+            openml.exceptions.OpenMLServerException,
+            match="Problem attaching entities.",
+        ):
             # some runs already attached
-            openml.study.attach_to_study(study_id, list(run_list_more.keys()))
-        study_downloaded = openml.study.get_study(study_id)
+            openml.study.attach_to_study(study.id, list(run_list_more["run_id"]))
+        study_downloaded = openml.study.get_study(study.id)
         self.assertListEqual(study_original.runs, study_downloaded.runs)
 
+    @unittest.skip("It is unclear when we can expect the test to pass or fail.")
     def test_study_list(self):
-        study_list = openml.study.list_studies(status='in_preparation')
-        # might fail if server is recently resetted
-        self.assertGreater(len(study_list), 2)
-
-    def test_study_list_output_format(self):
-        study_list = openml.study.list_studies(status='in_preparation',
-                                               output_format='dataframe')
-        self.assertIsInstance(study_list, pd.DataFrame)
-        self.assertGreater(len(study_list), 2)
+        study_list = openml.study.list_studies(status="in_preparation")
+        # might fail if server is recently reset
+        assert len(study_list) >= 2
diff --git a/tests/test_tasks/__init__.py b/tests/test_tasks/__init__.py
index e823eb2c7..26488a8cc 100644
--- a/tests/test_tasks/__init__.py
+++ b/tests/test_tasks/__init__.py
@@ -1,7 +1,9 @@
-from .test_task import OpenMLTaskTest
+# License: BSD 3-Clause
+
 from .test_supervised_task import OpenMLSupervisedTaskTest
+from .test_task import OpenMLTaskTest
 
 __all__ = [
-    'OpenMLTaskTest',
-    'OpenMLSupervisedTaskTest',
+    "OpenMLTaskTest",
+    "OpenMLSupervisedTaskTest",
 ]
diff --git a/tests/test_tasks/test_classification_task.py b/tests/test_tasks/test_classification_task.py
index e5b7c4415..65dcebc1d 100644
--- a/tests/test_tasks/test_classification_task.py
+++ b/tests/test_tasks/test_classification_task.py
@@ -1,40 +1,43 @@
-import numpy as np
+# License: BSD 3-Clause
+from __future__ import annotations
+
+import pandas as pd
+import pytest
+
+from openml.tasks import TaskType, get_task
 
-from openml.tasks import get_task
 from .test_supervised_task import OpenMLSupervisedTaskTest
 
 
 class OpenMLClassificationTaskTest(OpenMLSupervisedTaskTest):
-
     __test__ = True
 
     def setUp(self, n_levels: int = 1):
+        super().setUp()
+        self.task_id = 119  # diabetes
+        self.task_type = TaskType.SUPERVISED_CLASSIFICATION
+        self.estimation_procedure = 5
 
-        super(OpenMLClassificationTaskTest, self).setUp()
-        self.task_id = 119
-        self.task_type_id = 1
-        self.estimation_procedure = 1
-
-    def test_get_X_and_Y(self):
-
-        X, Y = super(OpenMLClassificationTaskTest, self).test_get_X_and_Y()
-        self.assertEqual((768, 8), X.shape)
-        self.assertIsInstance(X, np.ndarray)
-        self.assertEqual((768, ), Y.shape)
-        self.assertIsInstance(Y, np.ndarray)
-        self.assertEqual(Y.dtype, int)
-
+    @pytest.mark.test_server()
     def test_download_task(self):
+        task = super().test_download_task()
+        assert task.task_id == self.task_id
+        assert task.task_type_id == TaskType.SUPERVISED_CLASSIFICATION
+        assert task.dataset_id == 20
+        assert task.estimation_procedure_id == self.estimation_procedure
 
-        task = super(OpenMLClassificationTaskTest, self).test_download_task()
-        self.assertEqual(task.task_id, self.task_id)
-        self.assertEqual(task.task_type_id, 1)
-        self.assertEqual(task.dataset_id, 20)
-
+    @pytest.mark.test_server()
     def test_class_labels(self):
-
         task = get_task(self.task_id)
-        self.assertEqual(
-            task.class_labels,
-            ['tested_negative', 'tested_positive']
-        )
+        assert task.class_labels == ["tested_negative", "tested_positive"]
+
+
+@pytest.mark.test_server()
+def test_get_X_and_Y():
+    task = get_task(119)
+    X, Y = task.get_X_and_y()
+    assert X.shape == (768, 8)
+    assert isinstance(X, pd.DataFrame)
+    assert Y.shape == (768,)
+    assert isinstance(Y, pd.Series)
+    assert pd.api.types.is_categorical_dtype(Y)
diff --git a/tests/test_tasks/test_clustering_task.py b/tests/test_tasks/test_clustering_task.py
index 168b798d1..29f5663c4 100644
--- a/tests/test_tasks/test_clustering_task.py
+++ b/tests/test_tasks/test_clustering_task.py
@@ -1,34 +1,43 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
+import pytest
+
 import openml
+from openml.exceptions import OpenMLServerException
+from openml.tasks import TaskType
 from openml.testing import TestBase
+
 from .test_task import OpenMLTaskTest
-from openml.exceptions import OpenMLServerException
 
 
 class OpenMLClusteringTaskTest(OpenMLTaskTest):
-
     __test__ = True
 
     def setUp(self, n_levels: int = 1):
-
-        super(OpenMLClusteringTaskTest, self).setUp()
+        super().setUp()
         self.task_id = 146714
-        self.task_type_id = 5
+        self.task_type = TaskType.CLUSTERING
         self.estimation_procedure = 17
 
+    @pytest.mark.production_server()
     def test_get_dataset(self):
         # no clustering tasks on test server
-        openml.config.server = self.production_server
+        self.use_production_server()
         task = openml.tasks.get_task(self.task_id)
         task.get_dataset()
 
+    @pytest.mark.production_server()
+    @pytest.mark.test_server()
     def test_download_task(self):
         # no clustering tasks on test server
-        openml.config.server = self.production_server
-        task = super(OpenMLClusteringTaskTest, self).test_download_task()
-        self.assertEqual(task.task_id, self.task_id)
-        self.assertEqual(task.task_type_id, 5)
-        self.assertEqual(task.dataset_id, 36)
+        self.use_production_server()
+        task = super().test_download_task()
+        assert task.task_id == self.task_id
+        assert task.task_type_id == TaskType.CLUSTERING
+        assert task.dataset_id == 36
 
+    @pytest.mark.test_server()
     def test_upload_task(self):
         compatible_datasets = self._get_compatible_rand_dataset()
         for i in range(100):
@@ -36,14 +45,15 @@ def test_upload_task(self):
                 dataset_id = compatible_datasets[i % len(compatible_datasets)]
                 # Upload a clustering task without a ground truth.
                 task = openml.tasks.create_task(
-                    task_type_id=self.task_type_id,
+                    task_type=self.task_type,
                     dataset_id=dataset_id,
-                    estimation_procedure_id=self.estimation_procedure
+                    estimation_procedure_id=self.estimation_procedure,
+                )
+                task = task.publish()
+                TestBase._mark_entity_for_removal("task", task.id)
+                TestBase.logger.info(
+                    f"collected from {__file__.split('/')[-1]}: {task.id}",
                 )
-                task_id = task.publish()
-                TestBase._mark_entity_for_removal('task', task_id)
-                TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                                    task_id))
                 # success
                 break
             except OpenMLServerException as e:
@@ -56,5 +66,5 @@ def test_upload_task(self):
                     raise e
         else:
             raise ValueError(
-                'Could not create a valid task for task type ID {}'.format(self.task_type_id)
+                f"Could not create a valid task for task type ID {self.task_type}",
             )
diff --git a/tests/test_tasks/test_learning_curve_task.py b/tests/test_tasks/test_learning_curve_task.py
index 625252606..465d9c0be 100644
--- a/tests/test_tasks/test_learning_curve_task.py
+++ b/tests/test_tasks/test_learning_curve_task.py
@@ -1,40 +1,40 @@
-import numpy as np
+# License: BSD 3-Clause
+from __future__ import annotations
+
+import pandas as pd
+import pytest
+
+from openml.tasks import TaskType, get_task
 
-from openml.tasks import get_task
 from .test_supervised_task import OpenMLSupervisedTaskTest
 
 
 class OpenMLLearningCurveTaskTest(OpenMLSupervisedTaskTest):
-
     __test__ = True
 
     def setUp(self, n_levels: int = 1):
-
-        super(OpenMLLearningCurveTaskTest, self).setUp()
-        self.task_id = 801
-        self.task_type_id = 3
+        super().setUp()
+        self.task_id = 801  # diabetes
+        self.task_type = TaskType.LEARNING_CURVE
         self.estimation_procedure = 13
 
+    @pytest.mark.test_server()
     def test_get_X_and_Y(self):
-
-        X, Y = super(OpenMLLearningCurveTaskTest, self).test_get_X_and_Y()
-        self.assertEqual((768, 8), X.shape)
-        self.assertIsInstance(X, np.ndarray)
-        self.assertEqual((768, ), Y.shape)
-        self.assertIsInstance(Y, np.ndarray)
-        self.assertEqual(Y.dtype, int)
-
+        X, Y = super().test_get_X_and_Y()
+        assert X.shape == (768, 8)
+        assert isinstance(X, pd.DataFrame)
+        assert Y.shape == (768,)
+        assert isinstance(Y, pd.Series)
+        assert pd.api.types.is_categorical_dtype(Y)
+
+    @pytest.mark.test_server()
     def test_download_task(self):
+        task = super().test_download_task()
+        assert task.task_id == self.task_id
+        assert task.task_type_id == TaskType.LEARNING_CURVE
+        assert task.dataset_id == 20
 
-        task = super(OpenMLLearningCurveTaskTest, self).test_download_task()
-        self.assertEqual(task.task_id, self.task_id)
-        self.assertEqual(task.task_type_id, 3)
-        self.assertEqual(task.dataset_id, 20)
-
+    @pytest.mark.test_server()
     def test_class_labels(self):
-
         task = get_task(self.task_id)
-        self.assertEqual(
-            task.class_labels,
-            ['tested_negative', 'tested_positive']
-        )
+        assert task.class_labels == ["tested_negative", "tested_positive"]
diff --git a/tests/test_tasks/test_regression_task.py b/tests/test_tasks/test_regression_task.py
index 57ff964cd..26d7dc94b 100644
--- a/tests/test_tasks/test_regression_task.py
+++ b/tests/test_tasks/test_regression_task.py
@@ -1,31 +1,67 @@
-import numpy as np
+# License: BSD 3-Clause
+from __future__ import annotations
+
+import ast
+
+import pandas as pd
+import pytest
+
+import openml
+from openml.exceptions import OpenMLServerException
+from openml.tasks import TaskType
+from openml.testing import TestBase, check_task_existence
 
 from .test_supervised_task import OpenMLSupervisedTaskTest
 
 
 class OpenMLRegressionTaskTest(OpenMLSupervisedTaskTest):
-
     __test__ = True
 
     def setUp(self, n_levels: int = 1):
+        super().setUp()
+        self.estimation_procedure = 9
+        task_meta_data = {
+            "task_type": TaskType.SUPERVISED_REGRESSION,
+            "dataset_id": 105,  # wisconsin
+            "estimation_procedure_id": self.estimation_procedure, # non default value to test estimation procedure id
+            "target_name": "time",
+        }
+        _task_id = check_task_existence(**task_meta_data)
+        if _task_id is not None:
+            task_id = _task_id
+        else:
+            new_task = openml.tasks.create_task(**task_meta_data)
+            # publishes the new task
+            try:
+                new_task = new_task.publish()
+                task_id = new_task.task_id
+                # mark to remove the uploaded task
+                TestBase._mark_entity_for_removal("task", task_id)
+                TestBase.logger.info(f"collected from test_run_functions: {task_id}")
+            except OpenMLServerException as e:
+                if e.code == 614:  # Task already exists
+                    # the exception message contains the task_id that was matched in the format
+                    # 'Task already exists. - matched id(s): [xxxx]'
+                    task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0]
+                else:
+                    raise Exception(repr(e))
+        self.task_id = task_id
+        self.task_type = TaskType.SUPERVISED_REGRESSION
 
-        super(OpenMLRegressionTaskTest, self).setUp()
-        self.task_id = 625
-        self.task_type_id = 2
-        self.estimation_procedure = 7
 
+    @pytest.mark.test_server()
     def test_get_X_and_Y(self):
+        X, Y = super().test_get_X_and_Y()
+        assert X.shape == (194, 32)
+        assert isinstance(X, pd.DataFrame)
+        assert Y.shape == (194,)
+        assert isinstance(Y, pd.Series)
+        assert pd.api.types.is_numeric_dtype(Y)
 
-        X, Y = super(OpenMLRegressionTaskTest, self).test_get_X_and_Y()
-        self.assertEqual((194, 32), X.shape)
-        self.assertIsInstance(X, np.ndarray)
-        self.assertEqual((194,), Y.shape)
-        self.assertIsInstance(Y, np.ndarray)
-        self.assertEqual(Y.dtype, float)
-
+    @pytest.mark.test_server()
     def test_download_task(self):
-
-        task = super(OpenMLRegressionTaskTest, self).test_download_task()
-        self.assertEqual(task.task_id, self.task_id)
-        self.assertEqual(task.task_type_id, 2)
-        self.assertEqual(task.dataset_id, 105)
+        task = super().test_download_task()
+        assert task.task_id == self.task_id
+        assert task.task_type_id == TaskType.SUPERVISED_REGRESSION
+        assert task.dataset_id == 105
+        assert task.estimation_procedure_id == self.estimation_procedure
diff --git a/tests/test_tasks/test_split.py b/tests/test_tasks/test_split.py
index 763bb15f7..e3320ae80 100644
--- a/tests/test_tasks/test_split.py
+++ b/tests/test_tasks/test_split.py
@@ -1,5 +1,11 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
 import inspect
 import os
+import shutil
+import tempfile
+from pathlib import Path
 
 import numpy as np
 
@@ -14,73 +20,85 @@ class OpenMLSplitTest(TestBase):
     def setUp(self):
         __file__ = inspect.getfile(OpenMLSplitTest)
         self.directory = os.path.dirname(__file__)
-        # This is for dataset
-        self.arff_filename = os.path.join(
-            self.directory, "..", "files", "org", "openml", "test",
-            "tasks", "1882", "datasplits.arff"
+        source_arff = (
+            Path(self.directory).parent
+            / "files"
+            / "org"
+            / "openml"
+            / "test"
+            / "tasks"
+            / "1882"
+            / "datasplits.arff"
         )
-        self.pd_filename = self.arff_filename.replace(".arff", ".pkl.py3")
+        # Use a unique temp directory for each test to avoid race conditions
+        # when running tests in parallel (see issue #1641)
+        self._temp_dir = tempfile.TemporaryDirectory()
+        self.arff_filepath = Path(self._temp_dir.name) / "datasplits.arff"
+        shutil.copy(source_arff, self.arff_filepath)
+        self.pd_filename = self.arff_filepath.with_suffix(".pkl.py3")
 
     def tearDown(self):
+        # Clean up the entire temp directory
         try:
-            os.remove(self.pd_filename)
+            self._temp_dir.cleanup()
         except (OSError, FileNotFoundError):
-            #  Replaced bare except. Not sure why these exceptions are acceptable.
             pass
 
     def test_eq(self):
-        split = OpenMLSplit._from_arff_file(self.arff_filename)
-        self.assertEqual(split, split)
+        split = OpenMLSplit._from_arff_file(self.arff_filepath)
+        assert split == split
 
-        split2 = OpenMLSplit._from_arff_file(self.arff_filename)
+        split2 = OpenMLSplit._from_arff_file(self.arff_filepath)
         split2.name = "a"
-        self.assertNotEqual(split, split2)
+        assert split != split2
 
-        split2 = OpenMLSplit._from_arff_file(self.arff_filename)
+        split2 = OpenMLSplit._from_arff_file(self.arff_filepath)
         split2.description = "a"
-        self.assertNotEqual(split, split2)
+        assert split != split2
 
-        split2 = OpenMLSplit._from_arff_file(self.arff_filename)
-        split2.split[10] = dict()
-        self.assertNotEqual(split, split2)
+        split2 = OpenMLSplit._from_arff_file(self.arff_filepath)
+        split2.split[10] = {}
+        assert split != split2
 
-        split2 = OpenMLSplit._from_arff_file(self.arff_filename)
-        split2.split[0][10] = dict()
-        self.assertNotEqual(split, split2)
+        split2 = OpenMLSplit._from_arff_file(self.arff_filepath)
+        split2.split[0][10] = {}
+        assert split != split2
 
     def test_from_arff_file(self):
-        split = OpenMLSplit._from_arff_file(self.arff_filename)
-        self.assertIsInstance(split.split, dict)
-        self.assertIsInstance(split.split[0], dict)
-        self.assertIsInstance(split.split[0][0], dict)
-        self.assertIsInstance(split.split[0][0][0][0], np.ndarray)
-        self.assertIsInstance(split.split[0][0][0].train, np.ndarray)
-        self.assertIsInstance(split.split[0][0][0].train, np.ndarray)
-        self.assertIsInstance(split.split[0][0][0][1], np.ndarray)
-        self.assertIsInstance(split.split[0][0][0].test, np.ndarray)
-        self.assertIsInstance(split.split[0][0][0].test, np.ndarray)
+        split = OpenMLSplit._from_arff_file(self.arff_filepath)
+        assert isinstance(split.split, dict)
+        assert isinstance(split.split[0], dict)
+        assert isinstance(split.split[0][0], dict)
+        assert isinstance(split.split[0][0][0][0], np.ndarray)
+        assert isinstance(split.split[0][0][0].train, np.ndarray)
+        assert isinstance(split.split[0][0][0].train, np.ndarray)
+        assert isinstance(split.split[0][0][0][1], np.ndarray)
+        assert isinstance(split.split[0][0][0].test, np.ndarray)
+        assert isinstance(split.split[0][0][0].test, np.ndarray)
         for i in range(10):
             for j in range(10):
-                self.assertGreaterEqual(split.split[i][j][0].train.shape[0], 808)
-                self.assertGreaterEqual(split.split[i][j][0].test.shape[0], 89)
-                self.assertEqual(split.split[i][j][0].train.shape[0]
-                                 + split.split[i][j][0].test.shape[0],
-                                 898)
+                assert split.split[i][j][0].train.shape[0] >= 808
+                assert split.split[i][j][0].test.shape[0] >= 89
+                assert (
+                    split.split[i][j][0].train.shape[0] + split.split[i][j][0].test.shape[0] == 898
+                )
 
     def test_get_split(self):
-        split = OpenMLSplit._from_arff_file(self.arff_filename)
+        split = OpenMLSplit._from_arff_file(self.arff_filepath)
         train_split, test_split = split.get(fold=5, repeat=2)
-        self.assertEqual(train_split.shape[0], 808)
-        self.assertEqual(test_split.shape[0], 90)
+        assert train_split.shape[0] == 808
+        assert test_split.shape[0] == 90
         self.assertRaisesRegex(
             ValueError,
             "Repeat 10 not known",
             split.get,
-            10, 2,
+            10,
+            2,
         )
         self.assertRaisesRegex(
             ValueError,
             "Fold 10 not known",
             split.get,
-            2, 10,
+            2,
+            10,
         )
diff --git a/tests/test_tasks/test_supervised_task.py b/tests/test_tasks/test_supervised_task.py
index f7112b1cf..99df3cace 100644
--- a/tests/test_tasks/test_supervised_task.py
+++ b/tests/test_tasks/test_supervised_task.py
@@ -1,9 +1,13 @@
-from typing import Tuple
+# License: BSD 3-Clause
+from __future__ import annotations
+
 import unittest
 
-import numpy as np
+import pandas as pd
 
 from openml.tasks import get_task
+import pytest
+
 from .test_task import OpenMLTaskTest
 
 
@@ -18,18 +22,14 @@ class OpenMLSupervisedTaskTest(OpenMLTaskTest):
     @classmethod
     def setUpClass(cls):
         if cls is OpenMLSupervisedTaskTest:
-            raise unittest.SkipTest(
-                "Skip OpenMLSupervisedTaskTest tests,"
-                " it's a base class"
-            )
-        super(OpenMLSupervisedTaskTest, cls).setUpClass()
+            raise unittest.SkipTest("Skip OpenMLSupervisedTaskTest tests, it's a base class")
+        super().setUpClass()
 
     def setUp(self, n_levels: int = 1):
+        super().setUp()
 
-        super(OpenMLSupervisedTaskTest, self).setUp()
-
-    def test_get_X_and_Y(self) -> Tuple[np.ndarray, np.ndarray]:
-
+    @pytest.mark.test_server()
+    def test_get_X_and_Y(self) -> tuple[pd.DataFrame, pd.Series]:
         task = get_task(self.task_id)
         X, Y = task.get_X_and_y()
         return X, Y
diff --git a/tests/test_tasks/test_task.py b/tests/test_tasks/test_task.py
index 3066d9ce9..1d0df1210 100644
--- a/tests/test_tasks/test_task.py
+++ b/tests/test_tasks/test_task.py
@@ -1,17 +1,18 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
 import unittest
-from typing import List
 from random import randint, shuffle
 
-from openml.exceptions import OpenMLServerException
-from openml.testing import TestBase
+import pytest
+
 from openml.datasets import (
     get_dataset,
     list_datasets,
 )
-from openml.tasks import (
-    create_task,
-    get_task
-)
+from openml.exceptions import OpenMLServerException
+from openml.tasks import TaskType, create_task, get_task
+from openml.testing import TestBase
 
 
 class OpenMLTaskTest(TestBase):
@@ -25,22 +26,18 @@ class OpenMLTaskTest(TestBase):
     @classmethod
     def setUpClass(cls):
         if cls is OpenMLTaskTest:
-            raise unittest.SkipTest(
-                "Skip OpenMLTaskTest tests,"
-                " it's a base class"
-            )
-        super(OpenMLTaskTest, cls).setUpClass()
+            raise unittest.SkipTest("Skip OpenMLTaskTest tests, it's a base class")
+        super().setUpClass()
 
     def setUp(self, n_levels: int = 1):
+        super().setUp()
 
-        super(OpenMLTaskTest, self).setUp()
-
+    @pytest.mark.test_server()
     def test_download_task(self):
-
         return get_task(self.task_id)
 
+    @pytest.mark.test_server()
     def test_upload_task(self):
-
         # We don't know if the task in question already exists, so we try a few times. Checking
         # beforehand would not be an option because a concurrent unit test could potentially
         # create the same task and make this unit test fail (i.e. getting a dataset and creating
@@ -51,16 +48,17 @@ def test_upload_task(self):
                 dataset_id = compatible_datasets[i % len(compatible_datasets)]
                 # TODO consider implementing on the diff task types.
                 task = create_task(
-                    task_type_id=self.task_type_id,
+                    task_type=self.task_type,
                     dataset_id=dataset_id,
                     target_name=self._get_random_feature(dataset_id),
-                    estimation_procedure_id=self.estimation_procedure
+                    estimation_procedure_id=self.estimation_procedure,
                 )
 
-                task_id = task.publish()
-                TestBase._mark_entity_for_removal('task', task_id)
-                TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                                    task_id))
+                task.publish()
+                TestBase._mark_entity_for_removal("task", task.id)
+                TestBase.logger.info(
+                    f"collected from {__file__.split('/')[-1]}: {task.id}",
+                )
                 # success
                 break
             except OpenMLServerException as e:
@@ -73,34 +71,23 @@ def test_upload_task(self):
                     raise e
         else:
             raise ValueError(
-                'Could not create a valid task for task type ID {}'.format(self.task_type_id)
+                f"Could not create a valid task for task type ID {self.task_type}",
             )
 
-    def _get_compatible_rand_dataset(self) -> List:
-
-        compatible_datasets = []
-        active_datasets = list_datasets(status='active')
+    def _get_compatible_rand_dataset(self) -> list:
+        active_datasets = list_datasets(status="active")
 
         # depending on the task type, find either datasets
         # with only symbolic features or datasets with only
         # numerical features.
-        if self.task_type_id == 2:
-            # regression task
-            for dataset_id, dataset_info in active_datasets.items():
-                if 'NumberOfSymbolicFeatures' in dataset_info:
-                    if dataset_info['NumberOfSymbolicFeatures'] == 0:
-                        compatible_datasets.append(dataset_id)
-        elif self.task_type_id == 5:
-            # clustering task
-            compatible_datasets = list(active_datasets.keys())
+        if self.task_type == TaskType.SUPERVISED_REGRESSION:
+            compatible_datasets = active_datasets[active_datasets["NumberOfSymbolicFeatures"] == 0]
+        elif self.task_type == TaskType.CLUSTERING:
+            compatible_datasets = active_datasets
         else:
-            for dataset_id, dataset_info in active_datasets.items():
-                # extra checks because of:
-                # https://github.com/openml/OpenML/issues/959
-                if 'NumberOfNumericFeatures' in dataset_info:
-                    if dataset_info['NumberOfNumericFeatures'] == 0:
-                        compatible_datasets.append(dataset_id)
+            compatible_datasets = active_datasets[active_datasets["NumberOfNumericFeatures"] == 0]
 
+        compatible_datasets = list(compatible_datasets["did"])
         # in-place shuffling
         shuffle(compatible_datasets)
         return compatible_datasets
@@ -110,17 +97,16 @@ def _get_compatible_rand_dataset(self) -> List:
         # return compatible_datasets[random_dataset_pos]
 
     def _get_random_feature(self, dataset_id: int) -> str:
-
         random_dataset = get_dataset(dataset_id)
         # necessary loop to overcome string and date type
         # features.
         while True:
             random_feature_index = randint(0, len(random_dataset.features) - 1)
             random_feature = random_dataset.features[random_feature_index]
-            if self.task_type_id == 2:
-                if random_feature.data_type == 'numeric':
+            if self.task_type == TaskType.SUPERVISED_REGRESSION:
+                if random_feature.data_type == "numeric":
                     break
             else:
-                if random_feature.data_type == 'nominal':
+                if random_feature.data_type == "nominal":
                     break
         return random_feature.name
diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py
index f773752d5..df3c0a3b6 100644
--- a/tests/test_tasks/test_task_functions.py
+++ b/tests/test_tasks/test_task_functions.py
@@ -1,171 +1,196 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
 import os
+import unittest
+from typing import cast
 from unittest import mock
 
-from openml.testing import TestBase
-from openml import OpenMLSplit, OpenMLTask
-from openml.exceptions import OpenMLCacheException
-import openml
-import unittest
 import pandas as pd
+import pytest
+import requests
+
+import openml
+from openml import OpenMLSplit, OpenMLTask
+from openml.exceptions import OpenMLCacheException, OpenMLNotAuthorizedError, OpenMLServerException
+from openml.tasks import TaskType
+from openml.testing import TestBase, create_request_response
 
 
 class TestTask(TestBase):
     _multiprocess_can_split_ = True
 
     def setUp(self):
-        super(TestTask, self).setUp()
+        super().setUp()
 
     def tearDown(self):
-        super(TestTask, self).tearDown()
+        super().tearDown()
 
+    @pytest.mark.test_server()
     def test__get_cached_tasks(self):
-        openml.config.cache_directory = self.static_cache_dir
+        openml.config.set_root_cache_directory(self.static_cache_dir)
         tasks = openml.tasks.functions._get_cached_tasks()
-        self.assertIsInstance(tasks, dict)
-        self.assertEqual(len(tasks), 3)
-        self.assertIsInstance(list(tasks.values())[0], OpenMLTask)
+        assert isinstance(tasks, dict)
+        assert len(tasks) == 3
+        assert isinstance(next(iter(tasks.values())), OpenMLTask)
 
+    @pytest.mark.test_server()
     def test__get_cached_task(self):
-        openml.config.cache_directory = self.static_cache_dir
+        openml.config.set_root_cache_directory(self.static_cache_dir)
         task = openml.tasks.functions._get_cached_task(1)
-        self.assertIsInstance(task, OpenMLTask)
+        assert isinstance(task, OpenMLTask)
 
     def test__get_cached_task_not_cached(self):
-        openml.config.cache_directory = self.static_cache_dir
+        openml.config.set_root_cache_directory(self.static_cache_dir)
         self.assertRaisesRegex(
             OpenMLCacheException,
-            'Task file for tid 2 not cached',
+            "Task file for tid 2 not cached",
             openml.tasks.functions._get_cached_task,
             2,
         )
 
+    @pytest.mark.test_server()
     def test__get_estimation_procedure_list(self):
-        estimation_procedures = openml.tasks.functions.\
-            _get_estimation_procedure_list()
-        self.assertIsInstance(estimation_procedures, list)
-        self.assertIsInstance(estimation_procedures[0], dict)
-        self.assertEqual(estimation_procedures[0]['task_type_id'], 1)
+        estimation_procedures = openml.tasks.functions._get_estimation_procedure_list()
+        assert isinstance(estimation_procedures, list)
+        assert isinstance(estimation_procedures[0], dict)
+        assert estimation_procedures[0]["task_type_id"] == TaskType.SUPERVISED_CLASSIFICATION
 
+    @pytest.mark.production_server()
+    @pytest.mark.xfail(reason="failures_issue_1544", strict=False)
     def test_list_clustering_task(self):
+        self.use_production_server()
         # as shown by #383, clustering tasks can give list/dict casting problems
-        openml.config.server = self.production_server
-        openml.tasks.list_tasks(task_type_id=5, size=10)
+        openml.tasks.list_tasks(task_type=TaskType.CLUSTERING, size=10)
         # the expected outcome is that it doesn't crash. No assertions.
 
     def _check_task(self, task):
-        self.assertEqual(type(task), dict)
-        self.assertGreaterEqual(len(task), 2)
-        self.assertIn('did', task)
-        self.assertIsInstance(task['did'], int)
-        self.assertIn('status', task)
-        self.assertIsInstance(task['status'], str)
-        self.assertIn(task['status'],
-                      ['in_preparation', 'active', 'deactivated'])
-
+        assert isinstance(task, dict)
+        assert len(task) >= 2
+        assert "did" in task
+        assert isinstance(task["did"], int)
+        assert "status" in task
+        assert isinstance(task["status"], str)
+        assert task["status"] in ["in_preparation", "active", "deactivated"]
+
+    @pytest.mark.test_server()
     def test_list_tasks_by_type(self):
-        num_curves_tasks = 200  # number is flexible, check server if fails
-        ttid = 3
-        tasks = openml.tasks.list_tasks(task_type_id=ttid)
-        self.assertGreaterEqual(len(tasks), num_curves_tasks)
-        for tid in tasks:
-            self.assertEqual(ttid, tasks[tid]["ttid"])
-            self._check_task(tasks[tid])
-
-    def test_list_tasks_output_format(self):
-        ttid = 3
-        tasks = openml.tasks.list_tasks(task_type_id=ttid, output_format='dataframe')
-        self.assertIsInstance(tasks, pd.DataFrame)
-        self.assertGreater(len(tasks), 100)
-
+        num_curves_tasks = 198  # number is flexible, check server if fails
+        ttid = TaskType.LEARNING_CURVE
+        tasks = openml.tasks.list_tasks(task_type=ttid)
+        assert len(tasks) >= num_curves_tasks
+        for task in tasks.to_dict(orient="index").values():
+            assert ttid == task["ttid"]
+            self._check_task(task)
+
+    @pytest.mark.test_server()
+    def test_list_tasks_length(self):
+        ttid = TaskType.LEARNING_CURVE
+        tasks = openml.tasks.list_tasks(task_type=ttid)
+        assert len(tasks) > 100
+
+    @pytest.mark.test_server()
     def test_list_tasks_empty(self):
-        tasks = openml.tasks.list_tasks(tag='NoOneWillEverUseThisTag')
-        if len(tasks) > 0:
-            raise ValueError('UnitTest Outdated, got somehow results (tag is used, please adapt)')
-
-        self.assertIsInstance(tasks, dict)
+        tasks = openml.tasks.list_tasks(tag="NoOneWillEverUseThisTag")
+        assert tasks.empty
 
-    @unittest.skip("Server will currently incorrectly return only 99 tasks."
-                   "See https://github.com/openml/OpenML/issues/980")
+    @pytest.mark.test_server()
     def test_list_tasks_by_tag(self):
-        num_basic_tasks = 100  # number is flexible, check server if fails
-        tasks = openml.tasks.list_tasks(tag='OpenML100')
-        self.assertGreaterEqual(len(tasks), num_basic_tasks)
-        for tid in tasks:
-            self._check_task(tasks[tid])
-
+        # Server starts with 99 active tasks with the tag, and one 'in_preparation',
+        # so depending on the processing of the last dataset, there may be 99 or 100 matches.
+        num_basic_tasks = 99
+        tasks = openml.tasks.list_tasks(tag="OpenML100")
+        assert len(tasks) >= num_basic_tasks
+        for task in tasks.to_dict(orient="index").values():
+            self._check_task(task)
+
+    @pytest.mark.test_server()
     def test_list_tasks(self):
         tasks = openml.tasks.list_tasks()
-        self.assertGreaterEqual(len(tasks), 900)
-        for tid in tasks:
-            self._check_task(tasks[tid])
+        assert len(tasks) >= 900
+        for task in tasks.to_dict(orient="index").values():
+            self._check_task(task)
 
+    @pytest.mark.test_server()
     def test_list_tasks_paginate(self):
         size = 10
         max = 100
         for i in range(0, max, size):
             tasks = openml.tasks.list_tasks(offset=i, size=size)
-            self.assertGreaterEqual(size, len(tasks))
-            for tid in tasks:
-                self._check_task(tasks[tid])
+            assert size >= len(tasks)
+            for task in tasks.to_dict(orient="index").values():
+                self._check_task(task)
 
+    @pytest.mark.test_server()
     def test_list_tasks_per_type_paginate(self):
-        size = 10
+        size = 40
         max = 100
-        task_types = 4
-        for j in range(1, task_types):
+        task_types = [
+            TaskType.SUPERVISED_CLASSIFICATION,
+            TaskType.SUPERVISED_REGRESSION,
+            TaskType.LEARNING_CURVE,
+        ]
+        for j in task_types:
             for i in range(0, max, size):
-                tasks = openml.tasks.list_tasks(task_type_id=j, offset=i, size=size)
-                self.assertGreaterEqual(size, len(tasks))
-                for tid in tasks:
-                    self.assertEqual(j, tasks[tid]["ttid"])
-                    self._check_task(tasks[tid])
+                tasks = openml.tasks.list_tasks(task_type=j, offset=i, size=size)
+                assert size >= len(tasks)
+                for task in tasks.to_dict(orient="index").values():
+                    assert j == task["ttid"]
+                    self._check_task(task)
 
+    @pytest.mark.test_server()
     def test__get_task(self):
-        openml.config.cache_directory = self.static_cache_dir
+        openml.config.set_root_cache_directory(self.static_cache_dir)
         openml.tasks.get_task(1882)
 
-    @unittest.skip("Please await outcome of discussion: https://github.com/openml/OpenML/issues/776")  # noqa: E501
+    @unittest.skip(
+        "Please await outcome of discussion: https://github.com/openml/OpenML/issues/776",
+    )
+    @pytest.mark.production_server()
     def test__get_task_live(self):
+        self.use_production_server()
         # Test the following task as it used to throw an Unicode Error.
         # https://github.com/openml/openml-python/issues/378
-        openml.config.server = self.production_server
         openml.tasks.get_task(34536)
 
+    @pytest.mark.test_server()
     def test_get_task(self):
-        task = openml.tasks.get_task(1)
-        self.assertIsInstance(task, OpenMLTask)
-        self.assertTrue(os.path.exists(os.path.join(
-            self.workdir, 'org', 'openml', 'test', "tasks", "1", "task.xml",
-        )))
-        self.assertTrue(os.path.exists(os.path.join(
-            self.workdir, 'org', 'openml', 'test', "tasks", "1", "datasplits.arff"
-        )))
-        self.assertTrue(os.path.exists(os.path.join(
-            self.workdir, 'org', 'openml', 'test', "datasets", "1", "dataset.arff"
-        )))
+        task = openml.tasks.get_task(1, download_data=True)  # anneal; crossvalidation
+        assert isinstance(task, OpenMLTask)
+        assert os.path.exists(
+            os.path.join(openml.config.get_cache_directory(), "tasks", "1", "task.xml")
+        )
+        assert not os.path.exists(
+            os.path.join(openml.config.get_cache_directory(), "tasks", "1", "datasplits.arff")
+        )
+        assert os.path.exists(
+            os.path.join(openml.config.get_cache_directory(), "datasets", "1", "dataset_1.pq")
+        )
 
+    @pytest.mark.test_server()
     def test_get_task_lazy(self):
-        task = openml.tasks.get_task(2, download_data=False)
-        self.assertIsInstance(task, OpenMLTask)
-        self.assertTrue(os.path.exists(os.path.join(
-            self.workdir, 'org', 'openml', 'test', "tasks", "2", "task.xml",
-        )))
-        self.assertEqual(task.class_labels, ['1', '2', '3', '4', '5', 'U'])
-
-        self.assertFalse(os.path.exists(os.path.join(
-            self.workdir, 'org', 'openml', 'test', "tasks", "2", "datasplits.arff"
-        )))
+        task = openml.tasks.get_task(2, download_data=False)  # anneal; crossvalidation
+        assert isinstance(task, OpenMLTask)
+        assert os.path.exists(
+            os.path.join(openml.config.get_cache_directory(), "tasks", "2", "task.xml")
+        )
+        assert task.class_labels == ["1", "2", "3", "4", "5", "U"]
+
+        assert not os.path.exists(
+            os.path.join(openml.config.get_cache_directory(), "tasks", "2", "datasplits.arff")
+        )
         # Since the download_data=False is propagated to get_dataset
-        self.assertFalse(os.path.exists(os.path.join(
-            self.workdir, 'org', 'openml', 'test', "datasets", "2", "dataset.arff"
-        )))
+        assert not os.path.exists(
+            os.path.join(openml.config.get_cache_directory(), "datasets", "2", "dataset.arff")
+        )
 
         task.download_split()
-        self.assertTrue(os.path.exists(os.path.join(
-            self.workdir, 'org', 'openml', 'test', "tasks", "2", "datasplits.arff"
-        )))
+        assert os.path.exists(
+            os.path.join(openml.config.get_cache_directory(), "tasks", "2", "datasplits.arff")
+        )
 
-    @mock.patch('openml.tasks.functions.get_dataset')
+    @mock.patch("openml.tasks.functions.get_dataset")
+    @pytest.mark.test_server()
     def test_removal_upon_download_failure(self, get_dataset):
         class WeirdException(Exception):
             pass
@@ -177,21 +202,21 @@ def assert_and_raise(*args, **kwargs):
 
         get_dataset.side_effect = assert_and_raise
         try:
-            openml.tasks.get_task(1)
+            openml.tasks.get_task(1)  # anneal; crossvalidation
         except WeirdException:
             pass
         # Now the file should no longer exist
-        self.assertFalse(os.path.exists(
-            os.path.join(os.getcwd(), "tasks", "1", "tasks.xml")
-        ))
+        assert not os.path.exists(os.path.join(os.getcwd(), "tasks", "1", "tasks.xml"))
 
+    @pytest.mark.test_server()
     def test_get_task_with_cache(self):
-        openml.config.cache_directory = self.static_cache_dir
+        openml.config.set_root_cache_directory(self.static_cache_dir)
         task = openml.tasks.get_task(1)
-        self.assertIsInstance(task, OpenMLTask)
+        assert isinstance(task, OpenMLTask)
 
+    @pytest.mark.production_server()
     def test_get_task_different_types(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         # Regression task
         openml.tasks.functions.get_task(5001)
         # Learning curve
@@ -199,19 +224,94 @@ def test_get_task_different_types(self):
         # Issue 538, get_task failing with clustering task.
         openml.tasks.functions.get_task(126033)
 
+    @pytest.mark.test_server()
     def test_download_split(self):
-        task = openml.tasks.get_task(1)
+        task = openml.tasks.get_task(1)  # anneal; crossvalidation
         split = task.download_split()
-        self.assertEqual(type(split), OpenMLSplit)
-        self.assertTrue(os.path.exists(os.path.join(
-            self.workdir, 'org', 'openml', 'test', "tasks", "1", "datasplits.arff"
-        )))
+        assert isinstance(split, OpenMLSplit)
+        assert os.path.exists(
+            os.path.join(openml.config.get_cache_directory(), "tasks", "1", "datasplits.arff")
+        )
 
     def test_deletion_of_cache_dir(self):
         # Simple removal
         tid_cache_dir = openml.utils._create_cache_directory_for_id(
-            'tasks', 1,
+            "tasks",
+            1,
         )
-        self.assertTrue(os.path.exists(tid_cache_dir))
-        openml.utils._remove_cache_dir_for_id('tasks', tid_cache_dir)
-        self.assertFalse(os.path.exists(tid_cache_dir))
+        assert os.path.exists(tid_cache_dir)
+        openml.utils._remove_cache_dir_for_id("tasks", tid_cache_dir)
+        assert not os.path.exists(tid_cache_dir)
+
+
+@mock.patch.object(requests.Session, "delete")
+def test_delete_task_not_owned(mock_delete, test_files_directory, test_api_key):
+    content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_not_owned.xml"
+    mock_delete.return_value = create_request_response(
+        status_code=412,
+        content_filepath=content_file,
+    )
+
+    with pytest.raises(
+        OpenMLNotAuthorizedError,
+        match="The task can not be deleted because it was not uploaded by you.",
+    ):
+        openml.tasks.delete_task(1)
+
+    task_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/task/1"
+    assert task_url == mock_delete.call_args.args[0]
+    assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
+
+
+@mock.patch.object(requests.Session, "delete")
+def test_delete_task_with_run(mock_delete, test_files_directory, test_api_key):
+    content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_has_runs.xml"
+    mock_delete.return_value = create_request_response(
+        status_code=412,
+        content_filepath=content_file,
+    )
+
+    with pytest.raises(
+        OpenMLNotAuthorizedError,
+        match="The task can not be deleted because it still has associated entities:",
+    ):
+        openml.tasks.delete_task(3496)
+
+    task_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/task/3496"
+    assert task_url == mock_delete.call_args.args[0]
+    assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
+
+
+@mock.patch.object(requests.Session, "delete")
+def test_delete_success(mock_delete, test_files_directory, test_api_key):
+    content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_successful.xml"
+    mock_delete.return_value = create_request_response(
+        status_code=200,
+        content_filepath=content_file,
+    )
+
+    success = openml.tasks.delete_task(361323)
+    assert success
+
+    task_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/task/361323"
+    assert task_url == mock_delete.call_args.args[0]
+    assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
+
+
+@mock.patch.object(requests.Session, "delete")
+def test_delete_unknown_task(mock_delete, test_files_directory, test_api_key):
+    content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_not_exist.xml"
+    mock_delete.return_value = create_request_response(
+        status_code=412,
+        content_filepath=content_file,
+    )
+
+    with pytest.raises(
+        OpenMLServerException,
+        match="Task does not exist",
+    ):
+        openml.tasks.delete_task(9_999_999)
+
+    task_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/task/9999999"
+    assert task_url == mock_delete.call_args.args[0]
+    assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
diff --git a/tests/test_tasks/test_task_methods.py b/tests/test_tasks/test_task_methods.py
index 4a0789414..9316d0876 100644
--- a/tests/test_tasks/test_task_methods.py
+++ b/tests/test_tasks/test_task_methods.py
@@ -1,45 +1,62 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
 from time import time
 
 import openml
 from openml.testing import TestBase
+import pytest
 
 
 # Common methods between tasks
 class OpenMLTaskMethodsTest(TestBase):
-
     def setUp(self):
-        super(OpenMLTaskMethodsTest, self).setUp()
+        super().setUp()
 
     def tearDown(self):
-        super(OpenMLTaskMethodsTest, self).tearDown()
+        super().tearDown()
 
+    @pytest.mark.test_server()
     def test_tagging(self):
-        task = openml.tasks.get_task(1)
-        tag = "testing_tag_{}_{}".format(self.id(), time())
-        task_list = openml.tasks.list_tasks(tag=tag)
-        self.assertEqual(len(task_list), 0)
+        task = openml.tasks.get_task(1)  # anneal; crossvalidation
+        # tags can be at most 64 alphanumeric (+ underscore) chars
+        unique_indicator = str(time()).replace(".", "")
+        tag = f"test_tag_OpenMLTaskMethodsTest_{unique_indicator}"
+        tasks = openml.tasks.list_tasks(tag=tag)
+        assert len(tasks) == 0
         task.push_tag(tag)
-        task_list = openml.tasks.list_tasks(tag=tag)
-        self.assertEqual(len(task_list), 1)
-        self.assertIn(1, task_list)
+        tasks = openml.tasks.list_tasks(tag=tag)
+        assert len(tasks) == 1
+        assert 1 in tasks["tid"]
         task.remove_tag(tag)
-        task_list = openml.tasks.list_tasks(tag=tag)
-        self.assertEqual(len(task_list), 0)
+        tasks = openml.tasks.list_tasks(tag=tag)
+        assert len(tasks) == 0
 
+    @pytest.mark.test_server()
     def test_get_train_and_test_split_indices(self):
-        openml.config.cache_directory = self.static_cache_dir
+        openml.config.set_root_cache_directory(self.static_cache_dir)
         task = openml.tasks.get_task(1882)
         train_indices, test_indices = task.get_train_test_split_indices(0, 0)
-        self.assertEqual(16, train_indices[0])
-        self.assertEqual(395, train_indices[-1])
-        self.assertEqual(412, test_indices[0])
-        self.assertEqual(364, test_indices[-1])
+        assert train_indices[0] == 16
+        assert train_indices[-1] == 395
+        assert test_indices[0] == 412
+        assert test_indices[-1] == 364
         train_indices, test_indices = task.get_train_test_split_indices(2, 2)
-        self.assertEqual(237, train_indices[0])
-        self.assertEqual(681, train_indices[-1])
-        self.assertEqual(583, test_indices[0])
-        self.assertEqual(24, test_indices[-1])
-        self.assertRaisesRegexp(ValueError, "Fold 10 not known",
-                                task.get_train_test_split_indices, 10, 0)
-        self.assertRaisesRegexp(ValueError, "Repeat 10 not known",
-                                task.get_train_test_split_indices, 0, 10)
+        assert train_indices[0] == 237
+        assert train_indices[-1] == 681
+        assert test_indices[0] == 583
+        assert test_indices[-1] == 24
+        self.assertRaisesRegex(
+            ValueError,
+            "Fold 10 not known",
+            task.get_train_test_split_indices,
+            10,
+            0,
+        )
+        self.assertRaisesRegex(
+            ValueError,
+            "Repeat 10 not known",
+            task.get_train_test_split_indices,
+            0,
+            10,
+        )
diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py
index d8ecca92a..75f24ebf0 100644
--- a/tests/test_utils/test_utils.py
+++ b/tests/test_utils/test_utils.py
@@ -1,90 +1,181 @@
-from openml.testing import TestBase
-import numpy as np
+from __future__ import annotations
+
+import os
+import unittest.mock
+import pytest
 import openml
-import sys
+from openml.testing import _check_dataset
+
 
-if sys.version_info[0] >= 3:
-    from unittest import mock
-else:
-    import mock
+@pytest.fixture()
+def min_number_tasks_on_test_server() -> int:
+    """After a reset at least 1068 tasks are on the test server"""
+    return 1068
 
 
-class OpenMLTaskTest(TestBase):
-    _multiprocess_can_split_ = True
-    _batch_size = 25
+@pytest.fixture()
+def min_number_datasets_on_test_server() -> int:
+    """After a reset at least 127 datasets are on the test server"""
+    return 127
 
-    def mocked_perform_api_call(call, request_method):
-        # TODO: JvR: Why is this not a staticmethod?
-        url = openml.config.server + '/' + call
-        return openml._api_calls._read_url(url, request_method=request_method)
 
-    def test_list_all(self):
-        openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks)
+@pytest.fixture()
+def min_number_flows_on_test_server() -> int:
+    """After a reset at least 15 flows are on the test server"""
+    return 15
 
-    @mock.patch('openml._api_calls._perform_api_call',
-                side_effect=mocked_perform_api_call)
-    def test_list_all_few_results_available(self, _perform_api_call):
-        # we want to make sure that the number of api calls is only 1.
-        # Although we have multiple versions of the iris dataset, there is only
-        # one with this name/version combination
 
-        datasets = openml.datasets.list_datasets(size=1000,
-                                                 data_name='iris',
-                                                 data_version=1)
-        self.assertEqual(len(datasets), 1)
-        self.assertEqual(_perform_api_call.call_count, 1)
+@pytest.fixture()
+def min_number_setups_on_test_server() -> int:
+    """After a reset at least 20 setups are on the test server (note: 50 is returned)"""
+    return 50
 
-    def test_list_all_for_datasets(self):
-        required_size = 127  # default test server reset value
-        datasets = openml.datasets.list_datasets(batch_size=self._batch_size, size=required_size)
 
-        self.assertEqual(len(datasets), required_size)
-        for did in datasets:
-            self._check_dataset(datasets[did])
+@pytest.fixture()
+def min_number_runs_on_test_server() -> int:
+    """After a reset at least 21 runs are on the test server"""
+    return 15
 
-    def test_list_datasets_with_high_size_parameter(self):
-        datasets_a = openml.datasets.list_datasets()
-        datasets_b = openml.datasets.list_datasets(size=np.inf)
 
-        # note that in the meantime the number of datasets could have increased
-        # due to tests that run in parallel.
-        # instead of equality of size of list, checking if a valid subset
-        a = set(datasets_a.keys())
-        b = set(datasets_b.keys())
-        self.assertTrue(b.issubset(a))
+@pytest.fixture()
+def min_number_evaluations_on_test_server() -> int:
+    """After a reset at least 8 evaluations are on the test server"""
+    return 8
 
-    def test_list_all_for_tasks(self):
-        required_size = 1068  # default test server reset value
-        tasks = openml.tasks.list_tasks(batch_size=self._batch_size, size=required_size)
 
-        self.assertEqual(len(tasks), required_size)
+def _mocked_perform_api_call(call, request_method):
+    url = openml.config.server + "/" + call
+    return openml._api_calls._download_text_file(url)
 
-    def test_list_all_for_flows(self):
-        required_size = 15  # default test server reset value
-        flows = openml.flows.list_flows(batch_size=self._batch_size, size=required_size)
 
-        self.assertEqual(len(flows), required_size)
+@pytest.mark.test_server()
+def test_list_all():
+    openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks)
 
-    def test_list_all_for_setups(self):
-        required_size = 50
-        # TODO apparently list_setups function does not support kwargs
-        setups = openml.setups.list_setups(size=required_size)
 
-        # might not be on test server after reset, please rerun test at least once if fails
-        self.assertEqual(len(setups), required_size)
+@pytest.mark.test_server()
+def test_list_all_for_tasks(min_number_tasks_on_test_server):
+    tasks = openml.tasks.list_tasks(size=min_number_tasks_on_test_server)
+    assert min_number_tasks_on_test_server == len(tasks)
 
-    def test_list_all_for_runs(self):
-        required_size = 48
-        runs = openml.runs.list_runs(batch_size=self._batch_size, size=required_size)
 
-        # might not be on test server after reset, please rerun test at least once if fails
-        self.assertEqual(len(runs), required_size)
+@pytest.mark.test_server()
+def test_list_all_with_multiple_batches(min_number_tasks_on_test_server):
+    # By setting the batch size one lower than the minimum we guarantee at least two
+    # batches and at the same time do as few batches (roundtrips) as possible.
+    batch_size = min_number_tasks_on_test_server - 1
+    batches = openml.utils._list_all(
+        listing_call=openml.tasks.functions._list_tasks,
+        batch_size=batch_size,
+    )
+    assert len(batches) >= 2
+    assert min_number_tasks_on_test_server <= sum(len(batch) for batch in batches)
 
-    def test_list_all_for_evaluations(self):
-        required_size = 57
-        # TODO apparently list_evaluations function does not support kwargs
-        evaluations = openml.evaluations.list_evaluations(function='predictive_accuracy',
-                                                          size=required_size)
 
-        # might not be on test server after reset, please rerun test at least once if fails
-        self.assertEqual(len(evaluations), required_size)
+@pytest.mark.test_server()
+def test_list_all_for_datasets(min_number_datasets_on_test_server):
+    datasets = openml.datasets.list_datasets(
+        size=min_number_datasets_on_test_server,
+    )
+
+    assert min_number_datasets_on_test_server == len(datasets)
+    for dataset in datasets.to_dict(orient="index").values():
+        _check_dataset(dataset)
+
+
+@pytest.mark.test_server()
+def test_list_all_for_flows(min_number_flows_on_test_server):
+    flows = openml.flows.list_flows(size=min_number_flows_on_test_server)
+    assert min_number_flows_on_test_server == len(flows)
+
+
+@pytest.mark.flaky()  # Other tests might need to upload runs first
+@pytest.mark.test_server()
+def test_list_all_for_setups(min_number_setups_on_test_server):
+    # TODO apparently list_setups function does not support kwargs
+    setups = openml.setups.list_setups(size=min_number_setups_on_test_server)
+    assert min_number_setups_on_test_server == len(setups)
+
+
+@pytest.mark.flaky()  # Other tests might need to upload runs first
+@pytest.mark.test_server()
+def test_list_all_for_runs(min_number_runs_on_test_server):
+    runs = openml.runs.list_runs(size=min_number_runs_on_test_server)
+    assert min_number_runs_on_test_server == len(runs)
+
+
+@pytest.mark.flaky()  # Other tests might need to upload runs first
+@pytest.mark.test_server()
+def test_list_all_for_evaluations(min_number_evaluations_on_test_server):
+    # TODO apparently list_evaluations function does not support kwargs
+    evaluations = openml.evaluations.list_evaluations(
+        function="predictive_accuracy",
+        size=min_number_evaluations_on_test_server,
+    )
+    assert min_number_evaluations_on_test_server == len(evaluations)
+
+
+@unittest.mock.patch("openml._api_calls._perform_api_call", side_effect=_mocked_perform_api_call)
+@pytest.mark.test_server()
+def test_list_all_few_results_available(_perform_api_call):
+    datasets = openml.datasets.list_datasets(size=1000, data_name="iris", data_version=1)
+    assert len(datasets) == 1, "only one iris dataset version 1 should be present"
+    assert _perform_api_call.call_count == 1, "expect just one call to get one dataset"
+
+
+@unittest.skipIf(os.name == "nt", "https://github.com/openml/openml-python/issues/1033")
+@unittest.mock.patch("openml.config.get_cache_directory")
+def test__create_cache_directory(config_mock, tmp_path):
+    config_mock.return_value = tmp_path
+    openml.utils._create_cache_directory("abc")
+    assert (tmp_path / "abc").exists()
+
+    subdir = tmp_path / "def"
+    subdir.mkdir()
+    subdir.chmod(0o444)
+    config_mock.return_value = subdir
+    with pytest.raises(
+        openml.exceptions.OpenMLCacheException,
+        match="Cannot create cache directory",
+    ):
+        openml.utils._create_cache_directory("ghi")
+
+
+@pytest.mark.test_server()
+def test_correct_test_server_download_state():
+    """This test verifies that the test server downloads the data from the correct source.
+
+    If this test fails, it is highly likely that the test server is not configured correctly.
+    Usually, this means that the test server is serving data from the task with the same ID from the production server.
+    That is, it serves parquet files wrongly associated with the test server's task.
+    """
+    task = openml.tasks.get_task(119)
+    dataset = task.get_dataset()
+    assert len(dataset.features) == dataset.get_data()[0].shape[1]
+
+@unittest.mock.patch("openml.config.get_cache_directory")
+def test_get_cache_size(config_mock,tmp_path):
+    """
+    Test that the OpenML cache size utility correctly reports the cache directory
+    size before and after a file is written into it.
+
+    This test uses a temporary directory (tmp_path) as the cache location by
+    patching the configuration via config_mock. It verifies two conditions:
+    an empty cache (size 0) and a cache holding one nested 100-byte file.
+
+    Parameters
+    ----------
+    config_mock : unittest.mock.Mock
+         A mock that overrides the configured cache directory to point to tmp_path.
+    tmp_path : pathlib.Path
+         A pytest-provided temporary directory used as an isolated cache location.
+    """
+    
+    config_mock.return_value = tmp_path
+    cache_size = openml.utils.get_cache_size()
+    assert cache_size == 0
+    sub_dir = tmp_path / "subdir"
+    sub_dir.mkdir()
+    (sub_dir / "nested_file.txt").write_bytes(b"b" * 100)
+    
+    assert openml.utils.get_cache_size() == 100