diff --git a/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md
similarity index 100%
rename from ISSUE_TEMPLATE.md
rename to .github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md
diff --git a/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
similarity index 95%
rename from PULL_REQUEST_TEMPLATE.md
rename to .github/PULL_REQUEST_TEMPLATE.md
index 5584e6438..89ad09697 100644
--- a/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -5,7 +5,7 @@ the contribution guidelines: https://github.com/openml/openml-python/blob/main/C
 Please make sure that:
 
 * the title of the pull request is descriptive
-* this pull requests is against the `develop` branch
+* this pull requests is against the `main` branch
 * for any new functionality, consider adding a relevant example
 * add unit tests for new functionalities
     * collect files uploaded to test server using _mark_entity_for_removal()
diff --git a/.github/workflows/dist.yaml b/.github/workflows/dist.yaml
index b81651cea..ecf6f0a7f 100644
--- a/.github/workflows/dist.yaml
+++ b/.github/workflows/dist.yaml
@@ -23,11 +23,11 @@ jobs:
   dist:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v6
     - name: Setup Python
       uses: actions/setup-python@v5
       with:
-        python-version: 3.8
+        python-version: "3.10"
     - name: Build dist
       run: |
         pip install build
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
index b583b6423..1a5a36a87 100644
--- a/.github/workflows/docs.yaml
+++ b/.github/workflows/docs.yaml
@@ -22,13 +22,13 @@ jobs:
   build-and-deploy:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
         with:
           fetch-depth: 0
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
-          python-version: 3.8
+          python-version: "3.10"
       - name: Install dependencies
         run: |
           pip install -e .[docs,examples]
diff --git a/.github/workflows/release_docker.yaml b/.github/workflows/release_docker.yaml
index fc629a4e4..fcea357e4 100644
--- a/.github/workflows/release_docker.yaml
+++ b/.github/workflows/release_docker.yaml
@@ -34,7 +34,7 @@ jobs:
           password: ${{ secrets.DOCKERHUB_TOKEN }}
 
       - name: Check out the repo
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
 
       - name: Extract metadata (tags, labels) for Docker Hub
         id: meta_dockerhub
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 31cdff602..dc0995fc6 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -1,3 +1,4 @@
+---
 name: Tests
 
 on:
@@ -21,72 +22,87 @@ concurrency:
 
 jobs:
   test:
-    name: (${{ matrix.os }}, Py${{ matrix.python-version }}, sk${{ matrix.scikit-learn }}, sk-only:${{ matrix.sklearn-only }})
+    name: (${{ matrix.os }},Py${{ matrix.python-version }},sk${{ matrix.scikit-learn }}${{ matrix.pandas-version != '' && format(',pd:{0}', matrix.pandas-version) || '' }},sk-only:${{ matrix.sklearn-only }})
     runs-on: ${{ matrix.os }}
+
     strategy:
+      fail-fast: false
       matrix:
-        python-version: ["3.9"]
-        scikit-learn: ["1.0.*", "1.1.*", "1.2.*", "1.3.*", "1.4.*", "1.5.*"]
+        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+        scikit-learn: ["1.3.*", "1.4.*", "1.5.*", "1.6.*", "1.7.*"]
         os: [ubuntu-latest]
         sklearn-only: ["true"]
+
+        exclude:
+          # (python, sklearn) combinations for which there is no PyPI release
+          # scikit-learn 1.3
+          - python-version: "3.13"
+            scikit-learn: "1.3.*"
+          - python-version: "3.14"
+            scikit-learn: "1.3.*"
+          # scikit-learn 1.4
+          - python-version: "3.13"
+            scikit-learn: "1.4.*"
+          - python-version: "3.14"
+            scikit-learn: "1.4.*"
+          # scikit-learn 1.5
+          - python-version: "3.14"
+            scikit-learn: "1.5.*"
+          # scikit-learn 1.6
+          - python-version: "3.14"
+            scikit-learn: "1.6.*"
+          # scikit-learn 1.7 is installed with pandas 3
+          - python-version: "3.10"
+            scikit-learn: "1.7.*"
+
+
         include:
+          # Full test run on ubuntu, 3.14
           - os: ubuntu-latest
-            python-version: "3.8"  # no scikit-learn 0.23 release for Python 3.9
-            scikit-learn: "0.23.1"
-            sklearn-only: "true"
-          # scikit-learn 0.24 relies on scipy defaults, so we need to fix the version
-          # c.f. https://github.com/openml/openml-python/pull/1267
-          - os: ubuntu-latest
-            python-version: "3.9"
-            scikit-learn: "0.24"
-            scipy: "1.10.0"
-            sklearn-only: "true"
-          # Do a Windows and Ubuntu test for _all_ openml functionality
-          # I am not sure why these are on 3.8 and older scikit-learn
+            python-version: "3.14"
+            scikit-learn: "1.7.*"
+            sklearn-only: "false"
+
+          # Full test run on Windows
           - os: windows-latest
-            python-version: "3.8"
-            scikit-learn: 0.24.*
-            scipy: "1.10.0"
-            sklearn-only: 'false'
-          # Include a code cov version
+            python-version: "3.12"
+            scikit-learn: "1.5.*"
+            sklearn-only: "false"
+
+          # Coverage run
           - os: ubuntu-latest
+            python-version: "3.12"
+            scikit-learn: "1.5.*"
+            sklearn-only: "false"
             code-cov: true
-            python-version: "3.8"
-            scikit-learn: 0.23.1
-            sklearn-only: 'false'
-      fail-fast:  false
 
     steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v6
       with:
         fetch-depth: 2
+
     - name: Setup Python ${{ matrix.python-version }}
-      if: matrix.os != 'windows-latest'  # windows-latest only uses preinstalled Python (3.9.13)
       uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
-    - name: Install test dependencies
+
+    - name: Install test dependencies, scikit-learn, and pandas
+      shell: bash
       run: |
         python -m pip install --upgrade pip
-        pip install -e .[test]
-    - name: Install scikit-learn ${{ matrix.scikit-learn }}
-      run: |
-        pip install scikit-learn==${{ matrix.scikit-learn }}
-    - name: Install numpy for Python 3.8
-      # Python 3.8 & scikit-learn<0.24 requires numpy<=1.23.5
-      if: ${{ matrix.python-version == '3.8' && matrix.scikit-learn == '0.23.1' }}
-      run: |
-        pip install numpy==1.23.5
-    - name: "Install NumPy 1.x and SciPy <1.11 for scikit-learn < 1.4"
-      if: ${{ contains(fromJSON('["1.0.*", "1.1.*", "1.2.*", "1.3.*"]'), matrix.scikit-learn) }}
-      run: |
-        # scipy has a change to the 'mode' behavior which breaks scikit-learn < 1.4
-        # numpy 2.0 has several breaking changes
-        pip install "numpy<2.0" "scipy<1.11"
-    - name: Install scipy ${{ matrix.scipy }}
-      if: ${{ matrix.scipy }}
-      run: |
-        pip install scipy==${{ matrix.scipy }}
+        pip install -e .[test] scikit-learn==${{ matrix.scikit-learn }}
+
+        # scikit-learn 1.7+ requires pandas 3.x, earlier versions use pandas 2.x
+        version="${{ matrix.scikit-learn }}"
+        major=$(echo "$version" | cut -d. -f1)
+        minor=$(echo "$version" | cut -d. -f2)
+
+        if [[ "$major" -gt 1 ]] || { [[ "$major" -eq 1 ]] && [[ "$minor" -ge 7 ]]; }; then
+          pip install "pandas==3.*"
+        else
+          pip install "pandas==2.*"
+        fi
+
     - name: Store repository status
       id: status-before
       if: matrix.os != 'windows-latest'
@@ -94,28 +110,95 @@ jobs:
         git_status=$(git status --porcelain -b)
         echo "BEFORE=$git_status" >> $GITHUB_ENV
         echo "Repository status before tests: $git_status"
+
+    - name: Clone Services
+      if: matrix.os == 'ubuntu-latest'
+      id: clone-services
+      run: |
+        git clone --depth 1 https://github.com/openml/services.git
+
+    - name: Start Docker Services
+      id: start-services
+      if: matrix.os == 'ubuntu-latest'
+      working-directory: ./services
+      run: |
+        chmod -R a+rw ./data
+        chmod -R a+rw ./logs
+        docker compose --profile rest-api --profile minio --profile evaluation-engine up -d
+
+        echo "Waiting for PHP API to boot..."
+        timeout 60s bash -c 'until [ "$(docker inspect -f {{.State.Health.Status}} openml-php-rest-api)" == "healthy" ]; do sleep 5; done'
+
+        echo "Final Verification: Gateway Connectivity..."
+        curl -sSfL http://localhost:8000/api/v1/xml/data/1 | head -n 15
+
+        docker container ls
+
     - name: Show installed dependencies
       run: python -m pip list
+
     - name: Run tests on Ubuntu Test
       if: matrix.os == 'ubuntu-latest'
+      env:
+        OPENML_TEST_SERVER_ADMIN_KEY: ${{ secrets.OPENML_TEST_SERVER_ADMIN_KEY }}
+        OPENML_USE_LOCAL_SERVICES: "true"
       run: |
-        if [ ${{ matrix.code-cov }} ]; then codecov='--cov=openml --long  --cov-report=xml'; fi
-        # Most of the time, running only the scikit-learn tests is sufficient
-        if [ ${{ matrix.sklearn-only }} = 'true' ]; then marks='sklearn and not production'; else marks='not production'; fi
-        echo pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
+        if [ "${{ matrix.code-cov }}" = "true" ]; then
+          codecov="--cov=openml --long --cov-report=xml"
+        fi
+
+        if [ "${{ matrix.sklearn-only }}" = "true" ]; then
+          marks="sklearn and not production_server"
+        else
+          marks="not production_server"
+        fi
+
         pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
+
     - name: Run tests on Ubuntu Production
       if: matrix.os == 'ubuntu-latest'
+      env:
+        OPENML_TEST_SERVER_ADMIN_KEY: ${{ secrets.OPENML_TEST_SERVER_ADMIN_KEY }}
+        OPENML_USE_LOCAL_SERVICES: "true"
       run: |
-        if [ ${{ matrix.code-cov }} ]; then codecov='--cov=openml --long  --cov-report=xml'; fi
-        # Most of the time, running only the scikit-learn tests is sufficient
-        if [ ${{ matrix.sklearn-only }} = 'true' ]; then marks='sklearn and production'; else marks='production'; fi
-        echo pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
+        if [ "${{ matrix.code-cov }}" = "true" ]; then
+          codecov="--cov=openml --long --cov-report=xml"
+        fi
+
+        if [ "${{ matrix.sklearn-only }}" = "true" ]; then
+          marks="sklearn and production_server"
+        else
+          marks="production_server"
+        fi
+
         pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
+
     - name: Run tests on Windows
       if: matrix.os == 'windows-latest'
+      env:
+        OPENML_TEST_SERVER_ADMIN_KEY: ${{ secrets.OPENML_TEST_SERVER_ADMIN_KEY }}
       run: |  # we need a separate step because of the bash-specific if-statement in the previous one.
-        pytest -n 4 --durations=20 --dist load -sv --reruns 5 --reruns-delay 1
+        pytest -n 4 --durations=20 --dist load -sv --reruns 5 --reruns-delay 1 -m "not test_server"
+
+    - name: Upload coverage
+      if: matrix.code-cov && always()
+      uses: codecov/codecov-action@v4
+      with:
+        files: coverage.xml
+        token: ${{ secrets.CODECOV_TOKEN }}
+        fail_ci_if_error: true
+        verbose: true
+
+    - name: Dump server logs
+      if: always() && steps.start-services.outcome == 'success'
+      run: |
+        docker logs openml-php-rest-api -t
+
+    - name: Cleanup Docker setup
+      if: always() && steps.clone-services.outcome == 'success'
+      run: |
+        sudo rm -rf services
+
     - name: Check for files left behind by test
       if: matrix.os != 'windows-latest' && always()
       run: |
@@ -127,11 +210,30 @@ jobs:
             echo "Not all generated files have been deleted!"
             exit 1
         fi
-    - name: Upload coverage
-      if: matrix.code-cov && always()
-      uses: codecov/codecov-action@v4
-      with:
-        files: coverage.xml
-        token: ${{ secrets.CODECOV_TOKEN }}
-        fail_ci_if_error: true
-        verbose: true
+
+  dummy_windows_py_sk024:
+    name: (windows-latest, Py, sk0.24.*, sk-only:false)
+    runs-on: ubuntu-latest
+    steps:
+      - name: Dummy step
+        run: |
+          echo "This is a temporary dummy job."
+          echo "Always succeeds."
+
+  dummy_windows_py_sk023:
+    name: (ubuntu-latest, Py3.8, sk0.23.1, sk-only:false)
+    runs-on: ubuntu-latest
+    steps:
+      - name: Dummy step
+        run: |
+          echo "This is a temporary dummy job."
+          echo "Always succeeds."
+
+  dummy_docker:
+    name: docker
+    runs-on: ubuntu-latest
+    steps:
+      - name: Dummy step
+        run: |
+          echo "This is a temporary dummy docker job."
+          echo "Always succeeds."
diff --git a/.gitignore b/.gitignore
index 132070bf3..d512c0ee6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -88,6 +88,8 @@ target/
 .idea
 *.swp
 .vscode
+.cursorignore
+.cursorindexingignore
 
 # MYPY
 .mypy_cache
@@ -96,4 +98,17 @@ dmypy.sock
 
 # Tests
 .pytest_cache
-.venv
\ No newline at end of file
+
+# Virtual environments
+oenv/
+venv/
+.env/
+.venv
+.venv/
+
+# Python cache
+__pycache__/
+*.pyc
+
+# Ruff
+.ruff-cache/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 95e2a5239..0987bad90 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,7 +7,7 @@ files: |
   )/.*\.py$
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.7.3
+    rev: v0.14.10
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix, --no-cache]
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 35ab30b4a..d194525ef 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -44,7 +44,7 @@ To contribute to the openml-python package, follow these steps:
 
 0. Determine how you want to contribute (see above).
 1. Set up your local development environment.
-   1. Fork and clone the `openml-python` repository. Then, create a new branch from the ``develop`` branch. If you are new to `git`, see our [detailed documentation](#basic-git-workflow), or rely on your favorite IDE.   
+   1. Fork and clone the `openml-python` repository. Then, create a new branch from the ``main`` branch. If you are new to `git`, see our [detailed documentation](#basic-git-workflow), or rely on your favorite IDE.   
    2. [Install the local dependencies](#install-local-dependencies) to run the tests for your contribution.
    3. [Test your installation](#testing-your-installation) to ensure everything is set up correctly.
 4. Implement your contribution. If contributing to the documentation, see [here](#contributing-to-the-documentation).
@@ -91,14 +91,25 @@ pytest tests/test_datasets/test_dataset.py::OpenMLDatasetTest
 pytest tests/test_datasets/test_dataset.py::OpenMLDatasetTest::test_get_data
 ```
 
-To test your new contribution, add [unit tests](https://github.com/openml/openml-python/tree/develop/tests), and, if needed, [examples](https://github.com/openml/openml-python/tree/develop/examples) for any new functionality being introduced. Some notes on unit tests and examples:
+To test your new contribution, add [unit tests](https://github.com/openml/openml-python/tree/main/tests), and, if needed, [examples](https://github.com/openml/openml-python/tree/main/examples) for any new functionality being introduced. Some notes on unit tests and examples:
 * If a unit test contains an upload to the test server, please ensure that it is followed by a file collection for deletion, to prevent the test server from bulking up. For example, `TestBase._mark_entity_for_removal('data', dataset.dataset_id)`, `TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))`.
 * Please ensure that the example is run on the test server by beginning with the call to `openml.config.start_using_configuration_for_example()`, which is done by default for tests derived from `TestBase`.
 * Add the `@pytest.mark.sklearn` marker to your unit tests if they have a dependency on scikit-learn.
 
+#### Running Tests That Require Admin Privileges
+
+Some tests require admin privileges on the test server and will be automatically skipped unless you provide an admin API key. For regular contributors, the tests will skip gracefully. For core contributors who need to run these tests locally, you can set up the key by exporting the variable as below before running the tests:
+
+```bash
+# For windows
+$env:OPENML_TEST_SERVER_ADMIN_KEY = "admin-key"
+# For linux/mac
+export OPENML_TEST_SERVER_ADMIN_KEY="admin-key"
+```
+
 ### Pull Request Checklist
 
-You can go to the `openml-python` GitHub repository to create the pull request by [comparing the branch](https://github.com/openml/openml-python/compare) from your fork with the `develop` branch of the `openml-python` repository. When creating a pull request, make sure to follow the comments and structured provided by the template on GitHub.
+You can go to the `openml-python` GitHub repository to create the pull request by [comparing the branch](https://github.com/openml/openml-python/compare) from your fork with the `main` branch of the `openml-python` repository. When creating a pull request, make sure to follow the comments and structured provided by the template on GitHub.
 
 **An incomplete contribution** -- where you expect to do more work before
 receiving a full review -- should be submitted as a `draft`. These may be useful
@@ -116,7 +127,7 @@ in the PR description.
 
 The preferred workflow for contributing to openml-python is to
 fork the [main repository](https://github.com/openml/openml-python) on
-GitHub, clone, check out the branch `develop`, and develop on a new branch
+GitHub, clone, check out the branch `main`, and develop on a new branch
 branch. Steps:
 
 0. Make sure you have git installed, and a GitHub account.
@@ -137,7 +148,7 @@ local disk:
 3. Switch to the ``develop`` branch:
 
    ```bash
-   git checkout develop
+   git checkout main
    ```
 
 3. Create a ``feature`` branch to hold your development changes:
@@ -146,7 +157,7 @@ local disk:
    git checkout -b feature/my-feature
    ```
 
-   Always use a ``feature`` branch. It's good practice to never work on the ``main`` or ``develop`` branch! 
+   Always use a ``feature`` branch. It's good practice to never work on the ``main`` branch! 
    To make the nature of your pull request easily visible, please prepend the name of the branch with the type of changes you want to merge, such as ``feature`` if it contains a new feature, ``fix`` for a bugfix, ``doc`` for documentation and ``maint`` for other maintenance on the package.
 
 4. Develop the feature on your feature branch. Add changed files using ``git add`` and then ``git commit`` files:
diff --git a/README.md b/README.md
index 081bf7923..974c9fa53 100644
--- a/README.md
+++ b/README.md
@@ -15,12 +15,12 @@
 ## The Python API for a World of Data and More :dizzy:
 
 [![Latest Release](https://img.shields.io/github/v/release/openml/openml-python)](https://github.com/openml/openml-python/releases)
-[![Python Versions](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue)](https://pypi.org/project/openml/)
+[![Python Versions](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12%20%7C%203.13%20%7C%203.14-blue)](https://pypi.org/project/openml/)
 [![Downloads](https://static.pepy.tech/badge/openml)](https://pepy.tech/project/openml)
 [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause)
 <!-- Add green badges for CI and precommit -->
 
-[Installation](https://openml.github.io/openml-python/main/#how-to-get-openml-for-python) | [Documentation](https://openml.github.io/openml-python) | [Contribution guidelines](https://github.com/openml/openml-python/blob/develop/CONTRIBUTING.md)
+[Installation](https://openml.github.io/openml-python/main/#how-to-get-openml-for-python) | [Documentation](https://openml.github.io/openml-python) | [Contribution guidelines](https://github.com/openml/openml-python/blob/main/CONTRIBUTING.md)
 </div>
 
 OpenML-Python provides an easy-to-use and straightforward Python interface for [OpenML](http://openml.org), an online platform for open science collaboration in machine learning.
@@ -60,7 +60,7 @@ for task_id in suite.tasks:
 
 ## :magic_wand: Installation
 
-OpenML-Python is supported on Python 3.8 - 3.13 and is available on Linux, MacOS, and Windows.
+OpenML-Python is supported on Python 3.10 - 3.14 and is available on Linux, MacOS, and Windows.
 
 You can install OpenML-Python with:
 
@@ -89,3 +89,14 @@ Bibtex entry:
   url     = {http://jmlr.org/papers/v22/19-920.html}
 }
 ```
+## :handshake: Contributing
+
+We welcome contributions from both new and experienced developers!
+
+If you would like to contribute to OpenML-Python, please read our  
+[Contribution Guidelines](https://github.com/openml/openml-python/blob/main/CONTRIBUTING.md).
+
+If you are new to open-source development, a great way to get started is by
+looking at issues labeled **"good first issue"** in our GitHub issue tracker.
+These tasks are beginner-friendly and help you understand the project structure,
+development workflow, and how to submit a pull request.
diff --git a/docs/developer_setup.md b/docs/developer_setup.md
new file mode 100644
index 000000000..55a73fef9
--- /dev/null
+++ b/docs/developer_setup.md
@@ -0,0 +1,210 @@
+# OpenML Local Development Environment Setup
+
+This guide outlines the standard procedures for setting up a local development environment for the OpenML ecosystem. It covers the configuration of the backend servers (API v1 and API v2) and the Python Client SDK.
+
+OpenML currently has two backend architecture:
+
+* **API v1**: The PHP-based server currently serving production traffic.
+* **API v2**: The Python-based server (FastAPI) currently under active development.
+
+> Note on Migration: API v1 is projected to remain operational through at least 2026. API v2 is the target architecture for future development.
+
+## 1. API v1 Setup (PHP Backend)
+
+This section details the deployment of the legacy PHP backend.
+
+### Prerequisites
+
+* **Docker**: Docker Desktop (Ensure the daemon is running).
+* **Version Control**: Git.
+
+### Installation Steps
+
+#### 1. Clone the Repository
+
+Retrieve the OpenML services source code:
+
+```bash
+git clone https://github.com/openml/services
+cd services
+```
+
+#### 2. Configure File Permissions
+
+To ensure the containerized PHP service can write to the local filesystem, initialize the data directory permissions.
+
+From the repository root:
+
+```bash
+chown -R www-data:www-data data/php
+```
+
+If the `www-data` user does not exist on the host system, grant full permissions as a fallback:
+
+```bash
+chmod -R 777 data/php
+```
+
+#### 3. Launch Services
+
+Initialize the container stack:
+
+```bash
+docker compose --profile all up -d
+```
+
+#### Warning: Container Conflicts
+
+If API v2 (Python backend) containers are present on the system, name conflicts may occur. To resolve this, stop and remove existing containers before launching API v1:
+
+```bash
+docker compose --profile all down
+docker compose --profile all up -d
+```
+
+#### 4. Verification
+
+Validate the deployment by accessing the flow endpoint. A successful response will return structured JSON data.
+
+* **Endpoint**: http://localhost:8080/api/v1/json/flow/181
+
+### Client Configuration
+
+To direct the `openml-python` client to the local API v1 instance, modify the configuration as shown below. The API key corresponds to the default key located in `services/config/php/.env`.
+
+```python
+import openml
+from openml_sklearn.extension import SklearnExtension
+from sklearn.neighbors import KNeighborsClassifier
+
+# Configure client to use local Docker instance
+openml.config.server = "http://localhost:8080/api/v1/xml"
+openml.config.apikey = "AD000000000000000000000000000000"
+
+# Test flow publication
+clf = KNeighborsClassifier(n_neighbors=3)
+extension = SklearnExtension()
+knn_flow = extension.model_to_flow(clf)
+
+knn_flow.publish()
+```
+
+## 2. API v2 Setup (Python Backend)
+
+This section details the deployment of the FastAPI backend.
+
+### Prerequisites
+
+* **Docker**: Docker Desktop (Ensure the daemon is running).
+* **Version Control**: Git.
+
+### Installation Steps
+
+#### 1. Clone the Repository
+
+Retrieve the API v2 source code:
+
+```bash
+git clone https://github.com/openml/server-api
+cd server-api
+```
+
+#### 2. Launch Services
+
+Build and start the container stack:
+
+```bash
+docker compose --profile all up
+```
+
+#### 3. Verification
+
+Validate the deployment using the following endpoints:
+
+* **Task Endpoint**: http://localhost:8001/tasks/31
+* **Swagger UI (Documentation)**: http://localhost:8001/docs
+
+## 3. Python SDK (`openml-python`) Setup
+
+This section outlines the environment setup for contributing to the OpenML Python client.
+
+### Installation Steps
+
+#### 1. Clone the Repository
+
+```bash
+git clone https://github.com/openml/openml-python
+cd openml-python
+```
+
+#### 2. Environment Initialization
+
+Create an isolated virtual environment (example using Conda):
+
+```bash
+conda create -n openml-python-dev python=3.12
+conda activate openml-python-dev
+```
+
+#### 3. Install Dependencies
+
+Install the package in editable mode, including development and documentation dependencies:
+
+```bash
+python -m pip install -e ".[dev,docs]"
+```
+
+#### 4. Configure Quality Gates
+
+Install pre-commit hooks to enforce coding standards:
+
+```bash
+pre-commit install
+pre-commit run --all-files
+```
+
+## 4. Testing Guidelines
+
+The OpenML Python SDK utilizes `pytest` markers to categorize tests based on dependencies and execution context.
+
+| Marker            | Description                                                                 |
+|-------------------|-----------------------------------------------------------------------------|
+| `sklearn`          | Tests requiring `scikit-learn`. Skipped if the library is missing.          |
+| `production_server`| Tests that interact with the live OpenML server (real API calls).         |
+| `test_server`     | Tests requiring the OpenML test server environment.                       |
+
+### Execution Examples
+
+Run the full test suite:
+
+```bash
+pytest
+```
+
+Run a specific subset (e.g., `scikit-learn` tests):
+
+```bash
+pytest -m sklearn
+```
+
+Exclude production tests (local only):
+
+```bash
+pytest -m "not production_server"
+```
+
+### Admin Privilege Tests
+
+Certain tests require administrative privileges on the test server. These are skipped automatically unless an admin API key is provided via environment variables.
+
+#### Windows (PowerShell):
+
+```shell
+$env:OPENML_TEST_SERVER_ADMIN_KEY = "admin-key"
+```
+
+#### Linux/macOS:
+
+```bash
+export OPENML_TEST_SERVER_ADMIN_KEY="admin-key"
+```
diff --git a/examples/Advanced/fetch_evaluations_tutorial.py b/examples/Advanced/fetch_evaluations_tutorial.py
index 1b759423b..97b8d1bef 100644
--- a/examples/Advanced/fetch_evaluations_tutorial.py
+++ b/examples/Advanced/fetch_evaluations_tutorial.py
@@ -75,7 +75,7 @@
 
 def plot_cdf(values, metric="predictive_accuracy"):
     max_val = max(values)
-    n, bins, patches = plt.hist(values, density=True, histtype="step", cumulative=True, linewidth=3)
+    _, _, patches = plt.hist(values, density=True, histtype="step", cumulative=True, linewidth=3)
     patches[0].set_xy(patches[0].get_xy()[:-1])
     plt.xlim(max(0, min(values) - 0.1), 1)
     plt.title("CDF")
@@ -116,7 +116,7 @@ def plot_flow_compare(evaluations, top_n=10, metric="predictive_accuracy"):
     for i in range(len(flow_ids)):
         flow_values = evaluations[evaluations.flow_id == flow_ids[i]].value
         df = pd.concat([df, flow_values], ignore_index=True, axis=1)
-    fig, axs = plt.subplots()
+    _, axs = plt.subplots()
     df.boxplot()
     axs.set_title("Boxplot comparing " + metric + " for different flows")
     axs.set_ylabel(metric)
@@ -178,4 +178,4 @@ def plot_flow_compare(evaluations, top_n=10, metric="predictive_accuracy"):
     function="predictive_accuracy", flows=[6767], size=100, parameters_in_separate_columns=True
 )
 
-print(evals_setups.head(10))
\ No newline at end of file
+print(evals_setups.head(10))
diff --git a/examples/Advanced/suites_tutorial.py b/examples/Advanced/suites_tutorial.py
index 7ca42079d..8459510ef 100644
--- a/examples/Advanced/suites_tutorial.py
+++ b/examples/Advanced/suites_tutorial.py
@@ -72,7 +72,7 @@
 
 # %%
 all_tasks = list(openml.tasks.list_tasks()["tid"])
-task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20))
+task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20))  # noqa: NPY002
 
 # The study needs a machine-readable and unique alias. To obtain this,
 # we simply generate a random uuid.
diff --git a/examples/Basics/introduction_tutorial.py b/examples/Basics/introduction_tutorial.py
index c864772f5..2ba2d0ef1 100644
--- a/examples/Basics/introduction_tutorial.py
+++ b/examples/Basics/introduction_tutorial.py
@@ -12,7 +12,7 @@
 # For certain functionality, such as uploading tasks or datasets, users have to
 # sign up. Only accessing the data on OpenML does not require an account!
 #
-# If you don’t have an account yet, sign up now.
+# If you don't have an account yet, sign up now.
 # You will receive an API key, which will authenticate you to the server
 # and allow you to download and upload datasets, tasks, runs and flows.
 #
@@ -52,4 +52,4 @@
 # %%
 import openml
 
-openml.config.set_root_cache_directory("YOURDIR")
\ No newline at end of file
+openml.config.set_root_cache_directory("YOURDIR")
diff --git a/examples/Basics/simple_flows_and_runs_tutorial.py b/examples/Basics/simple_flows_and_runs_tutorial.py
index 41eed9234..eb42c7d02 100644
--- a/examples/Basics/simple_flows_and_runs_tutorial.py
+++ b/examples/Basics/simple_flows_and_runs_tutorial.py
@@ -85,7 +85,7 @@
 # Format the predictions for OpenML
 predictions = []
 for test_index, y_true_i, y_pred_i, y_pred_proba_i in zip(
-    test_indices, y_test, y_pred, y_pred_proba
+    test_indices, y_test, y_pred, y_pred_proba, strict=False
 ):
     predictions.append(
         openml.runs.functions.format_prediction(
@@ -95,7 +95,7 @@
             index=test_index,
             prediction=y_pred_i,
             truth=y_true_i,
-            proba=dict(zip(task.class_labels, y_pred_proba_i)),
+            proba=dict(zip(task.class_labels, y_pred_proba_i, strict=False)),
         )
     )
 
diff --git a/examples/_external_or_deprecated/2015_neurips_feurer_example.py b/examples/_external_or_deprecated/2015_neurips_feurer_example.py
index ae59c9ced..2dfc4bb97 100644
--- a/examples/_external_or_deprecated/2015_neurips_feurer_example.py
+++ b/examples/_external_or_deprecated/2015_neurips_feurer_example.py
@@ -13,12 +13,10 @@
 | Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter
 | In *Advances in Neural Information Processing Systems 28*, 2015
 | Available at https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf
-"""  # noqa F401
+"""
 
 # License: BSD 3-Clause
 
-import pandas as pd
-
 import openml
 
 ####################################################################################################
@@ -68,7 +66,7 @@
 
 task_ids = []
 for did in dataset_ids:
-    tasks_ = list(tasks.query("did == {}".format(did)).tid)
+    tasks_ = list(tasks.query(f"did == {did}").tid)
     if len(tasks_) >= 1:  # if there are multiple task, take the one with lowest ID (oldest).
         task_id = min(tasks_)
     else:
diff --git a/examples/_external_or_deprecated/2018_ida_strang_example.py b/examples/_external_or_deprecated/2018_ida_strang_example.py
index 8b225125b..0e180badf 100644
--- a/examples/_external_or_deprecated/2018_ida_strang_example.py
+++ b/examples/_external_or_deprecated/2018_ida_strang_example.py
@@ -17,8 +17,8 @@
 # License: BSD 3-Clause
 
 import matplotlib.pyplot as plt
+
 import openml
-import pandas as pd
 
 ##############################################################################
 # A basic step for each data-mining or machine learning task is to determine
@@ -86,10 +86,9 @@
 def determine_class(val_lin, val_nonlin):
     if val_lin < val_nonlin:
         return class_values[0]
-    elif val_nonlin < val_lin:
+    if val_nonlin < val_lin:
         return class_values[1]
-    else:
-        return class_values[2]
+    return class_values[2]
 
 
 evaluations["class"] = evaluations.apply(
diff --git a/examples/_external_or_deprecated/2018_kdd_rijn_example.py b/examples/_external_or_deprecated/2018_kdd_rijn_example.py
index 6522013e3..957281616 100644
--- a/examples/_external_or_deprecated/2018_kdd_rijn_example.py
+++ b/examples/_external_or_deprecated/2018_kdd_rijn_example.py
@@ -32,16 +32,17 @@
 
 import sys
 
-if sys.platform == "win32":  # noqa
+if sys.platform == "win32":
     print(
         "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems"
     )
-    exit()
+    sys.exit()
 
 # DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline
 print("This example is deprecated, remove the `if False` in this code to use it manually.")
 if False:
     import json
+
     import fanova
     import matplotlib.pyplot as plt
     import pandas as pd
@@ -49,7 +50,6 @@
 
     import openml
 
-
     ##############################################################################
     # With the advent of automated machine learning, automated hyperparameter
     # optimization methods are by now routinely used in data mining. However, this
@@ -80,7 +80,7 @@
     # important when it is put on a log-scale. All these simplifications can be
     # addressed by defining a ConfigSpace. For a more elaborated example that uses
     # this, please see:
-    # https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401
+    # https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py
 
     suite = openml.study.get_suite("OpenML100")
     flow_id = 7707
@@ -97,8 +97,7 @@
         if limit_nr_tasks is not None and idx >= limit_nr_tasks:
             continue
         print(
-            "Starting with task %d (%d/%d)"
-            % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks)
+            f"Starting with task {task_id} ({idx + 1}/{len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks})"
         )
         # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop)
         evals = openml.evaluations.list_evaluations_setups(
@@ -121,13 +120,13 @@
                 [
                     dict(
                         **{name: json.loads(value) for name, value in setup["parameters"].items()},
-                        **{performance_column: setup[performance_column]}
+                        **{performance_column: setup[performance_column]},
                     )
                     for _, setup in evals.iterrows()
                 ]
             )
         except json.decoder.JSONDecodeError as e:
-            print("Task %d error: %s" % (task_id, e))
+            print(f"Task {task_id} error: {e}")
             continue
         # apply our filters, to have only the setups that comply to the hyperparameters we want
         for filter_key, filter_value in parameter_filters.items():
@@ -156,19 +155,21 @@
             Y=setups_evals[performance_column].to_numpy(),
             n_trees=n_trees,
         )
-        for idx, pname in enumerate(parameter_names):
+        for idx, pname in enumerate(parameter_names):  # noqa: PLW2901
             try:
                 fanova_results.append(
                     {
                         "hyperparameter": pname.split(".")[-1],
-                        "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"],
+                        "fanova": evaluator.quantify_importance([idx])[(idx,)][
+                            "individual importance"
+                        ],
                     }
                 )
             except RuntimeError as e:
                 # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant
                 # for all configurations (there is no variance). We will skip these tasks (like the authors did in the
                 # paper).
-                print("Task %d error: %s" % (task_id, e))
+                print(f"Task {task_id} error: {e}")
                 continue
 
     # transform ``fanova_results`` from a list of dicts into a DataFrame
diff --git a/examples/_external_or_deprecated/2018_neurips_perrone_example.py b/examples/_external_or_deprecated/2018_neurips_perrone_example.py
index 0d72846ac..8a3c36994 100644
--- a/examples/_external_or_deprecated/2018_neurips_perrone_example.py
+++ b/examples/_external_or_deprecated/2018_neurips_perrone_example.py
@@ -27,16 +27,17 @@
 
 # License: BSD 3-Clause
 
-import openml
 import numpy as np
 import pandas as pd
 from matplotlib import pyplot as plt
-from sklearn.pipeline import Pipeline
-from sklearn.impute import SimpleImputer
 from sklearn.compose import ColumnTransformer
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.impute import SimpleImputer
 from sklearn.metrics import mean_squared_error
+from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import OneHotEncoder
-from sklearn.ensemble import RandomForestRegressor
+
+import openml
 
 flow_type = "svm"  # this example will use the smaller svm flow evaluations
 ############################################################################
@@ -44,7 +45,7 @@
 # a tabular format that can be used to build models.
 
 
-def fetch_evaluations(run_full=False, flow_type="svm", metric="area_under_roc_curve"):
+def fetch_evaluations(run_full=False, flow_type="svm", metric="area_under_roc_curve"):  # noqa: FBT002
     """
     Fetch a list of evaluations based on the flows and tasks used in the experiments.
 
@@ -101,7 +102,10 @@ def fetch_evaluations(run_full=False, flow_type="svm", metric="area_under_roc_cu
 
 
 def create_table_from_evaluations(
-    eval_df, flow_type="svm", run_count=np.iinfo(np.int64).max, task_ids=None
+    eval_df,
+    flow_type="svm",
+    run_count=np.iinfo(np.int64).max,  # noqa: B008
+    task_ids=None,
 ):
     """
     Create a tabular data with its ground truth from a dataframe of evaluations.
@@ -206,7 +210,7 @@ def list_categorical_attributes(flow_type="svm"):
 model.fit(X, y)
 y_pred = model.predict(X)
 
-print("Training RMSE : {:.5}".format(mean_squared_error(y, y_pred)))
+print(f"Training RMSE : {mean_squared_error(y, y_pred):.5}")
 
 
 #############################################################################
@@ -231,9 +235,9 @@ def random_sample_configurations(num_samples=100):
     X = pd.DataFrame(np.nan, index=range(num_samples), columns=colnames)
     for i in range(len(colnames)):
         if len(ranges[i]) == 2:
-            col_val = np.random.uniform(low=ranges[i][0], high=ranges[i][1], size=num_samples)
+            col_val = np.random.uniform(low=ranges[i][0], high=ranges[i][1], size=num_samples)  # noqa: NPY002
         else:
-            col_val = np.random.choice(ranges[i], size=num_samples)
+            col_val = np.random.choice(ranges[i], size=num_samples)  # noqa: NPY002
         X.iloc[:, i] = col_val
     return X
 
diff --git a/examples/_external_or_deprecated/benchmark_with_optunahub.py b/examples/_external_or_deprecated/benchmark_with_optunahub.py
index ece3e7c40..38114bc44 100644
--- a/examples/_external_or_deprecated/benchmark_with_optunahub.py
+++ b/examples/_external_or_deprecated/benchmark_with_optunahub.py
@@ -100,7 +100,7 @@ def objective(trial: optuna.Trial) -> Pipeline:
             run.publish()
 
             logger.log(1, f"Run was uploaded to - {run.openml_url}")
-        except Exception as e:
+        except Exception as e:  # noqa: BLE001
             logger.log(1, f"Could not publish run - {e}")
     else:
         logger.log(
diff --git a/examples/_external_or_deprecated/fetch_runtimes_tutorial.py b/examples/_external_or_deprecated/fetch_runtimes_tutorial.py
index b2a3f1d2a..c8f85adc5 100644
--- a/examples/_external_or_deprecated/fetch_runtimes_tutorial.py
+++ b/examples/_external_or_deprecated/fetch_runtimes_tutorial.py
@@ -39,17 +39,16 @@
 #
 # * (Case 5) Running models that do not release the Python Global Interpreter Lock (GIL)
 
-import openml
 import numpy as np
-from matplotlib import pyplot as plt
 from joblib.parallel import parallel_backend
-
-from sklearn.naive_bayes import GaussianNB
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.neural_network import MLPClassifier
+from matplotlib import pyplot as plt
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
+from sklearn.naive_bayes import GaussianNB
+from sklearn.neural_network import MLPClassifier
+from sklearn.tree import DecisionTreeClassifier
 
+import openml
 
 # %% [markdown]
 # # Preparing tasks and scikit-learn models
@@ -63,12 +62,7 @@
 # Viewing associated data
 n_repeats, n_folds, n_samples = task.get_split_dimensions()
 print(
-    "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
-        task_id,
-        n_repeats,
-        n_folds,
-        n_samples,
-    )
+    f"Task {task_id}: number of repeats: {n_repeats}, number of folds: {n_folds}, number of samples {n_samples}."
 )
 
 
@@ -101,7 +95,7 @@ def print_compare_runtimes(measures):
 measures = run1.fold_evaluations
 
 print("The timing and performance metrics available: ")
-for key in measures.keys():
+for key in measures:
     print(key)
 print()
 
@@ -206,7 +200,6 @@ def print_compare_runtimes(measures):
 # included in the `wall_clock_time_millis_training` measure recorded.
 
 # %%
-from sklearn.model_selection import GridSearchCV
 
 clf = RandomForestClassifier(n_estimators=10, n_jobs=2)
 
@@ -284,22 +277,18 @@ def print_compare_runtimes(measures):
 
 # %%
 
+
 def extract_refit_time(run, repeat, fold):
-    refit_time = (
+    return (
         run.fold_evaluations["wall_clock_time_millis"][repeat][fold]
         - run.fold_evaluations["wall_clock_time_millis_training"][repeat][fold]
         - run.fold_evaluations["wall_clock_time_millis_testing"][repeat][fold]
     )
-    return refit_time
 
 
 for repeat in range(n_repeats):
     for fold in range(n_folds):
-        print(
-            "Repeat #{}-Fold #{}: {:.4f}".format(
-                repeat, fold, extract_refit_time(run4, repeat, fold)
-            )
-        )
+        print(f"Repeat #{repeat}-Fold #{fold}: {extract_refit_time(run4, repeat, fold):.4f}")
 
 # %% [markdown]
 # Along with the GridSearchCV already used above, we demonstrate how such
diff --git a/examples/_external_or_deprecated/flow_id_tutorial.py b/examples/_external_or_deprecated/flow_id_tutorial.py
index e813655fc..19190cf0b 100644
--- a/examples/_external_or_deprecated/flow_id_tutorial.py
+++ b/examples/_external_or_deprecated/flow_id_tutorial.py
@@ -9,7 +9,6 @@
 
 import openml
 
-
 # %% [markdown]
 # .. warning::
 #    .. include:: ../../test_server_usage_warning.txt
@@ -48,7 +47,7 @@
 # %% [markdown]
 # ## 2. Obtaining a flow given its name
 # The schema of a flow is given in XSD (
-# [here](https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.implementation.upload.xsd)).  # noqa E501
+# [here](https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.implementation.upload.xsd)).
 # Only two fields are required, a unique name, and an external version. While it should be pretty
 # obvious why we need a name, the need for the additional external version information might not
 # be immediately clear. However, this information is very important as it allows to have multiple
diff --git a/examples/_external_or_deprecated/flows_and_runs_tutorial.py b/examples/_external_or_deprecated/flows_and_runs_tutorial.py
index 2d1bcb864..71d6960bd 100644
--- a/examples/_external_or_deprecated/flows_and_runs_tutorial.py
+++ b/examples/_external_or_deprecated/flows_and_runs_tutorial.py
@@ -3,8 +3,7 @@
 # This tutorial covers how to train/run a model and how to upload the results.
 
 # %%
-import openml
-from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree
+from sklearn import compose, ensemble, impute, neighbors, pipeline, preprocessing, tree
 
 import openml
 
diff --git a/examples/_external_or_deprecated/plot_svm_hyperparameters_tutorial.py b/examples/_external_or_deprecated/plot_svm_hyperparameters_tutorial.py
index faced588b..7bb72db5a 100644
--- a/examples/_external_or_deprecated/plot_svm_hyperparameters_tutorial.py
+++ b/examples/_external_or_deprecated/plot_svm_hyperparameters_tutorial.py
@@ -2,9 +2,10 @@
 # # Plotting hyperparameter surfaces
 
 # %%
-import openml
 import numpy as np
 
+import openml
+
 # %% [markdown]
 # # First step - obtaining the data
 # First, we need to choose an SVM flow, for example 8353, and a task. Finding the IDs of them are
diff --git a/examples/_external_or_deprecated/run_setup_tutorial.py b/examples/_external_or_deprecated/run_setup_tutorial.py
index 55d25d291..25591bb58 100644
--- a/examples/_external_or_deprecated/run_setup_tutorial.py
+++ b/examples/_external_or_deprecated/run_setup_tutorial.py
@@ -23,15 +23,15 @@
 # %%
 
 import numpy as np
-import openml
-from openml.extensions.sklearn import cat, cont
-
-from sklearn.pipeline import make_pipeline, Pipeline
 from sklearn.compose import ColumnTransformer
-from sklearn.impute import SimpleImputer
-from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
-from sklearn.ensemble import RandomForestClassifier
 from sklearn.decomposition import TruncatedSVD
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.preprocessing import OneHotEncoder
+
+import openml
+from openml.extensions.sklearn import cat, cont
 
 # %% [markdown]
 # .. warning::
diff --git a/examples/_external_or_deprecated/upload_amlb_flows_and_runs.py b/examples/_external_or_deprecated/upload_amlb_flows_and_runs.py
index 15ec0e1fb..b43926d4e 100644
--- a/examples/_external_or_deprecated/upload_amlb_flows_and_runs.py
+++ b/examples/_external_or_deprecated/upload_amlb_flows_and_runs.py
@@ -14,10 +14,10 @@
 
 # %%
 from collections import OrderedDict
+
 import numpy as np
 
 import openml
-from openml import OpenMLClassificationTask
 from openml.runs.functions import format_prediction
 
 # %% [markdown]
@@ -43,17 +43,17 @@
 # version of the package/script is used. Use tags so users can find your flow easily.
 
 # %%
-general = dict(
-    name="automlbenchmark_autosklearn",
-    description=(
+general = {
+    "name": "automlbenchmark_autosklearn",
+    "description": (
         "Auto-sklearn as set up by the AutoML Benchmark"
         "Source: https://github.com/openml/automlbenchmark/releases/tag/v0.9"
     ),
-    external_version="amlb==0.9",
-    language="English",
-    tags=["amlb", "benchmark", "study_218"],
-    dependencies="amlb==0.9",
-)
+    "external_version": "amlb==0.9",
+    "language": "English",
+    "tags": ["amlb", "benchmark", "study_218"],
+    "dependencies": "amlb==0.9",
+}
 
 # %% [markdown]
 # Next we define the flow hyperparameters. We define their name and default value in `parameters`,
@@ -62,14 +62,14 @@
 # The use of ordered dicts is required.
 
 # %%
-flow_hyperparameters = dict(
-    parameters=OrderedDict(time="240", memory="32", cores="8"),
-    parameters_meta_info=OrderedDict(
+flow_hyperparameters = {
+    "parameters": OrderedDict(time="240", memory="32", cores="8"),
+    "parameters_meta_info": OrderedDict(
         cores=OrderedDict(description="number of available cores", data_type="int"),
         memory=OrderedDict(description="memory in gigabytes", data_type="int"),
         time=OrderedDict(description="time in minutes", data_type="int"),
     ),
-)
+}
 
 # %% [markdown]
 # It is possible to build a flow which uses other flows.
@@ -89,11 +89,11 @@
 
 # %%
 autosklearn_flow = openml.flows.get_flow(9313)  # auto-sklearn 0.5.1
-subflow = dict(
-    components=OrderedDict(automl_tool=autosklearn_flow),
+subflow = {
+    "components": OrderedDict(automl_tool=autosklearn_flow),
     # If you do not want to reference a subflow, you can use the following:
     # components=OrderedDict(),
-)
+}
 
 # %% [markdown]
 # With all parameters of the flow defined, we can now initialize the OpenMLFlow and publish.
@@ -172,19 +172,19 @@
 ]
 
 # random class probabilities (Iris has 150 samples and 3 classes):
-r = np.random.rand(150 * n_repeats, 3)
+r = np.random.rand(150 * n_repeats, 3)  # noqa: NPY002
 # scale the random values so that the probabilities of each sample sum to 1:
 y_proba = r / r.sum(axis=1).reshape(-1, 1)
 y_pred = y_proba.argmax(axis=1)
 
-class_map = dict(zip(range(3), task.class_labels))
+class_map = dict(zip(range(3), task.class_labels, strict=False))
 _, y_true = task.get_X_and_y()
 y_true = [class_map[y] for y in y_true]
 
 # We format the predictions with the utility function `format_prediction`.
 # It will organize the relevant data in the expected format/order.
 predictions = []
-for where, y, yp, proba in zip(all_test_indices, y_true, y_pred, y_proba):
+for where, y, yp, proba in zip(all_test_indices, y_true, y_pred, y_proba, strict=False):
     repeat, fold, index = where
 
     prediction = format_prediction(
@@ -194,7 +194,7 @@
         index=index,
         prediction=class_map[yp],
         truth=y,
-        proba={c: pb for (c, pb) in zip(task.class_labels, proba)},
+        proba=dict(zip(task.class_labels, proba, strict=False)),
     )
     predictions.append(prediction)
 
@@ -203,7 +203,7 @@
 # We use the argument setup_string because the used flow was a script.
 
 # %%
-benchmark_command = f"python3 runbenchmark.py auto-sklearn medium -m aws -t 119"
+benchmark_command = "python3 runbenchmark.py auto-sklearn medium -m aws -t 119"
 my_run = openml.runs.OpenMLRun(
     task_id=task_id,
     flow_id=flow_id,
diff --git a/mkdocs.yml b/mkdocs.yml
index 92ba3c851..419cc249e 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -65,6 +65,7 @@ nav:
   - Advanced User Guide: details.md
   - API: reference/
   - Contributing: contributing.md
+  - Developer Setup: developer_setup.md
 
 markdown_extensions:
   - pymdownx.highlight:
@@ -127,7 +128,6 @@ plugins:
             docstring_options:
               ignore_init_summary: true
               trim_doctest_flags: true
-              returns_multiple_items: false
             show_docstring_attributes: true
             show_docstring_description: true
             show_root_heading: true
@@ -138,7 +138,7 @@ plugins:
             merge_init_into_class: true
             show_symbol_type_heading: true
             show_symbol_type_toc: true
-            docstring_style: google
+            docstring_style: numpy
             inherited_members: true
             show_if_no_docstring: false
             show_bases: true
diff --git a/openml/__init__.py b/openml/__init__.py
index c49505eb9..9a457c146 100644
--- a/openml/__init__.py
+++ b/openml/__init__.py
@@ -18,9 +18,11 @@
 # License: BSD 3-Clause
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 from . import (
     _api_calls,
-    config,
+    _config as _config_module,
     datasets,
     evaluations,
     exceptions,
@@ -49,6 +51,11 @@
     OpenMLTask,
 )
 
+if TYPE_CHECKING:
+    from ._config import OpenMLConfigManager
+
+config: OpenMLConfigManager = _config_module.__config
+
 
 def populate_cache(
     task_ids: list[int] | None = None,
@@ -91,33 +98,33 @@ def populate_cache(
 
 
 __all__ = [
-    "OpenMLDataset",
+    "OpenMLBenchmarkSuite",
+    "OpenMLClassificationTask",
+    "OpenMLClusteringTask",
     "OpenMLDataFeature",
-    "OpenMLRun",
-    "OpenMLSplit",
+    "OpenMLDataset",
     "OpenMLEvaluation",
-    "OpenMLSetup",
-    "OpenMLParameter",
-    "OpenMLTask",
-    "OpenMLSupervisedTask",
-    "OpenMLClusteringTask",
+    "OpenMLFlow",
     "OpenMLLearningCurveTask",
+    "OpenMLParameter",
     "OpenMLRegressionTask",
-    "OpenMLClassificationTask",
-    "OpenMLFlow",
+    "OpenMLRun",
+    "OpenMLSetup",
+    "OpenMLSplit",
     "OpenMLStudy",
-    "OpenMLBenchmarkSuite",
+    "OpenMLSupervisedTask",
+    "OpenMLTask",
+    "__version__",
+    "_api_calls",
+    "config",
     "datasets",
     "evaluations",
     "exceptions",
     "extensions",
-    "config",
-    "runs",
     "flows",
-    "tasks",
+    "runs",
     "setups",
     "study",
+    "tasks",
     "utils",
-    "_api_calls",
-    "__version__",
 ]
diff --git a/openml/_api_calls.py b/openml/_api_calls.py
index 81296b3da..179c814e7 100644
--- a/openml/_api_calls.py
+++ b/openml/_api_calls.py
@@ -12,7 +12,7 @@
 import xml
 import zipfile
 from pathlib import Path
-from typing import Dict, Tuple, Union
+from typing import cast
 
 import minio
 import requests
@@ -20,11 +20,12 @@
 import xmltodict
 from urllib3 import ProxyManager
 
-from . import config
+import openml
+
 from .__version__ import __version__
 from .exceptions import (
+    OpenMLAuthenticationError,
     OpenMLHashException,
-    OpenMLNotAuthorizedError,
     OpenMLServerError,
     OpenMLServerException,
     OpenMLServerNoResult,
@@ -33,8 +34,8 @@
 
 _HEADERS = {"user-agent": f"openml-python/{__version__}"}
 
-DATA_TYPE = Dict[str, Union[str, int]]
-FILE_ELEMENTS_TYPE = Dict[str, Union[str, Tuple[str, str]]]
+DATA_TYPE = dict[str, str | int]
+FILE_ELEMENTS_TYPE = dict[str, str | tuple[str, str]]
 DATABASE_CONNECTION_ERRCODE = 107
 
 API_TOKEN_HELP_LINK = "https://openml.github.io/openml-python/latest/examples/Basics/introduction_tutorial/#authentication"  # noqa: S105
@@ -71,7 +72,7 @@ def resolve_env_proxies(url: str) -> str | None:
 
 
 def _create_url_from_endpoint(endpoint: str) -> str:
-    url = config.server
+    url = cast("str", openml.config.server)
     if not url.endswith("/"):
         url += "/"
     url += endpoint
@@ -133,7 +134,7 @@ def _perform_api_call(
 def _download_minio_file(
     source: str,
     destination: str | Path,
-    exists_ok: bool = True,  # noqa: FBT001, FBT002
+    exists_ok: bool = True,  # noqa: FBT002
     proxy: str | None = "auto",
 ) -> None:
     """Download file ``source`` from a MinIO Bucket and store it at ``destination``.
@@ -172,7 +173,7 @@ def _download_minio_file(
             bucket_name=bucket,
             object_name=object_name,
             file_path=str(destination),
-            progress=ProgressBar() if config.show_progress else None,
+            progress=ProgressBar() if openml.config.show_progress else None,
             request_headers=_HEADERS,
         )
         if destination.is_file() and destination.suffix == ".zip":
@@ -239,7 +240,7 @@ def _download_text_file(
     source: str,
     output_path: str | Path | None = None,
     md5_checksum: str | None = None,
-    exists_ok: bool = True,  # noqa: FBT001, FBT002
+    exists_ok: bool = True,  # noqa: FBT002
     encoding: str = "utf8",
 ) -> str | None:
     """Download the text file at `source` and store it in `output_path`.
@@ -301,7 +302,8 @@ def _file_id_to_url(file_id: int, filename: str | None = None) -> str:
     Presents the URL how to download a given file id
     filename is optional
     """
-    openml_url = config.server.split("/api/")
+    openml_server = cast("str", openml.config.server)
+    openml_url = openml_server.split("/api/")
     url = openml_url[0] + f"/data/download/{file_id!s}"
     if filename is not None:
         url += "/" + filename
@@ -317,7 +319,7 @@ def _read_url_files(
     and sending file_elements as files
     """
     data = {} if data is None else data
-    data["api_key"] = config.apikey
+    data["api_key"] = openml.config.apikey
     if file_elements is None:
         file_elements = {}
     # Using requests.post sets header 'Accept-encoding' automatically to
@@ -337,8 +339,8 @@ def __read_url(
     md5_checksum: str | None = None,
 ) -> requests.Response:
     data = {} if data is None else data
-    if config.apikey:
-        data["api_key"] = config.apikey
+    if openml.config.apikey:
+        data["api_key"] = openml.config.apikey
     return _send_request(
         request_method=request_method,
         url=url,
@@ -363,10 +365,10 @@ def _send_request(  # noqa: C901, PLR0912
     files: FILE_ELEMENTS_TYPE | None = None,
     md5_checksum: str | None = None,
 ) -> requests.Response:
-    n_retries = max(1, config.connection_n_retries)
+    n_retries = max(1, openml.config.connection_n_retries)
 
     response: requests.Response | None = None
-    delay_method = _human_delay if config.retry_policy == "human" else _robot_delay
+    delay_method = _human_delay if openml.config.retry_policy == "human" else _robot_delay
 
     # Error to raise in case of retrying too often. Will be set to the last observed exception.
     retry_raise_e: Exception | None = None
@@ -516,11 +518,7 @@ def __parse_server_exception(
         400,  # run/42 delete
         460,  # task/42 delete
     ]:
-        msg = (
-            f"The API call {url} requires authentication via an API key.\nPlease configure "
-            "OpenML-Python to use your API as described in this example:"
-            "\nhttps://openml.github.io/openml-python/latest/examples/Basics/introduction_tutorial/#authentication"
-        )
-        return OpenMLNotAuthorizedError(message=msg)
+        msg = f"The API call {url} requires authentication via an API key."
+        return OpenMLAuthenticationError(message=msg)
 
     return OpenMLServerException(code=code, message=full_message, url=url)
diff --git a/openml/_config.py b/openml/_config.py
new file mode 100644
index 000000000..a7034b9b4
--- /dev/null
+++ b/openml/_config.py
@@ -0,0 +1,459 @@
+"""Store module level information like the API key, cache directory and the server"""
+
+# License: BSD 3-Clause
+from __future__ import annotations
+
+import configparser
+import logging
+import logging.handlers
+import os
+import platform
+import shutil
+import warnings
+from collections.abc import Iterator
+from contextlib import contextmanager
+from dataclasses import dataclass, field, fields, replace
+from io import StringIO
+from pathlib import Path
+from typing import Any, ClassVar, Literal, cast
+from urllib.parse import urlparse
+
+logger = logging.getLogger(__name__)
+openml_logger = logging.getLogger("openml")
+
+
+def _resolve_default_cache_dir() -> Path:
+    user_defined_cache_dir = os.environ.get("OPENML_CACHE_DIR")
+    if user_defined_cache_dir is not None:
+        return Path(user_defined_cache_dir)
+
+    if platform.system().lower() != "linux":
+        return Path("~", ".openml").expanduser()
+
+    xdg_cache_home = os.environ.get("XDG_CACHE_HOME")
+    if xdg_cache_home is None:
+        return Path("~", ".cache", "openml").expanduser()
+
+    cache_dir = Path(xdg_cache_home) / "openml"
+    if cache_dir.exists():
+        return cache_dir
+
+    heuristic_dir_for_backwards_compat = Path(xdg_cache_home) / "org" / "openml"
+    if not heuristic_dir_for_backwards_compat.exists():
+        return cache_dir
+
+    root_dir_to_delete = Path(xdg_cache_home) / "org"
+    openml_logger.warning(
+        "An old cache directory was found at '%s'. This directory is no longer used by "
+        "OpenML-Python. To silence this warning you would need to delete the old cache "
+        "directory. The cached files will then be located in '%s'.",
+        root_dir_to_delete,
+        cache_dir,
+    )
+    return Path(xdg_cache_home)
+
+
+@dataclass
+class OpenMLConfig:
+    """Dataclass storing the OpenML configuration."""
+
+    apikey: str | None = ""
+    server: str = "https://www.openml.org/api/v1/xml"
+    cachedir: Path = field(default_factory=_resolve_default_cache_dir)
+    avoid_duplicate_runs: bool = False
+    retry_policy: Literal["human", "robot"] = "human"
+    connection_n_retries: int = 5
+    show_progress: bool = False
+
+    def __setattr__(self, name: str, value: Any) -> None:
+        if name == "apikey" and not isinstance(value, (type(None), str)):
+            raise TypeError("apikey must be a string or None")
+
+        super().__setattr__(name, value)
+
+
+class OpenMLConfigManager:
+    """The OpenMLConfigManager manages the configuration of the openml-python package."""
+
+    def __init__(self) -> None:
+        self.console_handler: logging.StreamHandler | None = None
+        self.file_handler: logging.handlers.RotatingFileHandler | None = None
+
+        self.OPENML_CACHE_DIR_ENV_VAR = "OPENML_CACHE_DIR"
+        self.OPENML_SKIP_PARQUET_ENV_VAR = "OPENML_SKIP_PARQUET"
+        self._TEST_SERVER_NORMAL_USER_KEY = "normaluser"
+        self.OPENML_TEST_SERVER_ADMIN_KEY_ENV_VAR = "OPENML_TEST_SERVER_ADMIN_KEY"
+        self.TEST_SERVER_URL = "https://test.openml.org"
+
+        self._config: OpenMLConfig = OpenMLConfig()
+        # for legacy test `test_non_writable_home`
+        self._defaults: dict[str, Any] = OpenMLConfig().__dict__.copy()
+        self._root_cache_directory: Path = self._config.cachedir
+
+        self.logger = logger
+        self.openml_logger = openml_logger
+
+        self._examples = ConfigurationForExamples(self)
+
+        self._setup()
+
+    def __getattr__(self, name: str) -> Any:
+        if hasattr(self._config, name):
+            return getattr(self._config, name)
+        raise AttributeError(f"{type(self).__name__!r} object has no attribute {name!r}")
+
+    _FIELDS: ClassVar[set[str]] = {f.name for f in fields(OpenMLConfig)}
+
+    def __setattr__(self, name: str, value: Any) -> None:
+        # during __init__ before _config exists
+        if name in {
+            "_config",
+            "_root_cache_directory",
+            "console_handler",
+            "file_handler",
+            "logger",
+            "openml_logger",
+            "_examples",
+            "OPENML_CACHE_DIR_ENV_VAR",
+            "OPENML_SKIP_PARQUET_ENV_VAR",
+            "_TEST_SERVER_NORMAL_USER_KEY",
+        }:
+            return object.__setattr__(self, name, value)
+
+        if name in self._FIELDS:
+            # write into dataclass, not manager (prevents shadowing)
+            if name == "cachedir":
+                object.__setattr__(self, "_root_cache_directory", Path(value))
+            object.__setattr__(self, "_config", replace(self._config, **{name: value}))
+            return None
+
+        object.__setattr__(self, name, value)
+        return None
+
+    def _create_log_handlers(self, create_file_handler: bool = True) -> None:  # noqa: FBT002
+        if self.console_handler is not None or self.file_handler is not None:
+            self.logger.debug("Requested to create log handlers, but they are already created.")
+            return
+
+        message_format = "[%(levelname)s] [%(asctime)s:%(name)s] %(message)s"
+        output_formatter = logging.Formatter(message_format, datefmt="%H:%M:%S")
+
+        self.console_handler = logging.StreamHandler()
+        self.console_handler.setFormatter(output_formatter)
+
+        if create_file_handler:
+            one_mb = 2**20
+            log_path = self._root_cache_directory / "openml_python.log"
+            self.file_handler = logging.handlers.RotatingFileHandler(
+                log_path,
+                maxBytes=one_mb,
+                backupCount=1,
+                delay=True,
+            )
+            self.file_handler.setFormatter(output_formatter)
+
+    def _convert_log_levels(self, log_level: int) -> tuple[int, int]:
+        openml_to_python = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG}
+        python_to_openml = {
+            logging.DEBUG: 2,
+            logging.INFO: 1,
+            logging.WARNING: 0,
+            logging.CRITICAL: 0,
+            logging.ERROR: 0,
+        }
+        openml_level = python_to_openml.get(log_level, log_level)
+        python_level = openml_to_python.get(log_level, log_level)
+        return openml_level, python_level
+
+    def _set_level_register_and_store(self, handler: logging.Handler, log_level: int) -> None:
+        _oml_level, py_level = self._convert_log_levels(log_level)
+        handler.setLevel(py_level)
+
+        if self.openml_logger.level > py_level or self.openml_logger.level == logging.NOTSET:
+            self.openml_logger.setLevel(py_level)
+
+        if handler not in self.openml_logger.handlers:
+            self.openml_logger.addHandler(handler)
+
+    def set_console_log_level(self, console_output_level: int) -> None:
+        """Set the log level for console output."""
+        assert self.console_handler is not None
+        self._set_level_register_and_store(self.console_handler, console_output_level)
+
+    def set_file_log_level(self, file_output_level: int) -> None:
+        """Set the log level for file output."""
+        assert self.file_handler is not None
+        self._set_level_register_and_store(self.file_handler, file_output_level)
+
+    def get_server_base_url(self) -> str:
+        """Get the base URL of the OpenML server (i.e., without /api)."""
+        domain, _ = self._config.server.split("/api", maxsplit=1)
+        return domain.replace("api", "www")
+
+    def set_retry_policy(
+        self, value: Literal["human", "robot"], n_retries: int | None = None
+    ) -> None:
+        """Set the retry policy for server connections."""
+        default_retries_by_policy = {"human": 5, "robot": 50}
+
+        if value not in default_retries_by_policy:
+            raise ValueError(
+                f"Detected retry_policy '{value}' but must be one of "
+                f"{list(default_retries_by_policy.keys())}",
+            )
+        if n_retries is not None and not isinstance(n_retries, int):
+            raise TypeError(
+                f"`n_retries` must be of type `int` or `None` but is `{type(n_retries)}`."
+            )
+
+        if isinstance(n_retries, int) and n_retries < 1:
+            raise ValueError(f"`n_retries` is '{n_retries}' but must be positive.")
+
+        self._config = replace(
+            self._config,
+            retry_policy=value,
+            connection_n_retries=(
+                default_retries_by_policy[value] if n_retries is None else n_retries
+            ),
+        )
+
+    def _handle_xdg_config_home_backwards_compatibility(self, xdg_home: str) -> Path:
+        config_dir = Path(xdg_home) / "openml"
+
+        backwards_compat_config_file = Path(xdg_home) / "config"
+        if not backwards_compat_config_file.exists():
+            return config_dir
+
+        try:
+            self._parse_config(backwards_compat_config_file)
+        except Exception:  # noqa: BLE001
+            return config_dir
+
+        correct_config_location = config_dir / "config"
+        try:
+            shutil.copy(backwards_compat_config_file, correct_config_location)
+            self.openml_logger.warning(
+                "An openml configuration file was found at the old location "
+                f"at {backwards_compat_config_file}. We have copied it to the new "
+                f"location at {correct_config_location}. "
+                "\nTo silence this warning please verify that the configuration file "
+                f"at {correct_config_location} is correct and delete the file at "
+                f"{backwards_compat_config_file}."
+            )
+            return config_dir
+        except Exception as e:  # noqa: BLE001
+            self.openml_logger.warning(
+                "While attempting to perform a backwards compatible fix, we "
+                f"failed to copy the openml config file at "
+                f"{backwards_compat_config_file}' to {correct_config_location}"
+                f"\n{type(e)}: {e}",
+                "\n\nTo silence this warning, please copy the file "
+                "to the new location and delete the old file at "
+                f"{backwards_compat_config_file}.",
+            )
+            return backwards_compat_config_file
+
+    def determine_config_file_path(self) -> Path:
+        """Determine the path to the openml configuration file."""
+        if platform.system().lower() == "linux":
+            xdg_home = os.environ.get("XDG_CONFIG_HOME")
+            if xdg_home is not None:
+                config_dir = self._handle_xdg_config_home_backwards_compatibility(xdg_home)
+            else:
+                config_dir = Path("~", ".config", "openml")
+        else:
+            config_dir = Path("~") / ".openml"
+
+        config_dir = Path(config_dir).expanduser().resolve()
+        return config_dir / "config"
+
+    def _parse_config(self, config_file: str | Path) -> dict[str, Any]:
+        config_file = Path(config_file)
+        config = configparser.RawConfigParser(defaults=OpenMLConfig().__dict__)  # type: ignore
+
+        config_file_ = StringIO()
+        config_file_.write("[FAKE_SECTION]\n")
+        try:
+            with config_file.open("r") as fh:
+                for line in fh:
+                    config_file_.write(line)
+        except FileNotFoundError:
+            self.logger.info(
+                "No config file found at %s, using default configuration.", config_file
+            )
+        except OSError as e:
+            self.logger.info("Error opening file %s: %s", config_file, e.args[0])
+        config_file_.seek(0)
+        config.read_file(config_file_)
+        configuration = dict(config.items("FAKE_SECTION"))
+        for boolean_field in ["avoid_duplicate_runs", "show_progress"]:
+            if isinstance(config["FAKE_SECTION"][boolean_field], str):
+                configuration[boolean_field] = config["FAKE_SECTION"].getboolean(boolean_field)  # type: ignore
+        return configuration  # type: ignore
+
+    def start_using_configuration_for_example(self) -> None:
+        """Sets the configuration to connect to the test server with valid apikey."""
+        return self._examples.start_using_configuration_for_example()
+
+    def stop_using_configuration_for_example(self) -> None:
+        """Store the configuration as it was before `start_use_example_configuration`."""
+        return self._examples.stop_using_configuration_for_example()
+
+    def _setup(self, config: dict[str, Any] | None = None) -> None:
+        config_file = self.determine_config_file_path()
+        config_dir = config_file.parent
+
+        try:
+            if not config_dir.exists():
+                config_dir.mkdir(exist_ok=True, parents=True)
+        except PermissionError:
+            self.openml_logger.warning(
+                f"No permission to create OpenML directory at {config_dir}!"
+                " This can result in OpenML-Python not working properly."
+            )
+
+        if config is None:
+            config = self._parse_config(config_file)
+
+        self._config = replace(
+            self._config,
+            apikey=config["apikey"],
+            server=config["server"],
+            show_progress=config["show_progress"],
+            avoid_duplicate_runs=config["avoid_duplicate_runs"],
+            retry_policy=config["retry_policy"],
+            connection_n_retries=int(config["connection_n_retries"]),
+        )
+
+        user_defined_cache_dir = os.environ.get(self.OPENML_CACHE_DIR_ENV_VAR)
+        if user_defined_cache_dir is not None:
+            short_cache_dir = Path(user_defined_cache_dir)
+        else:
+            short_cache_dir = Path(config["cachedir"])
+
+        self._root_cache_directory = short_cache_dir.expanduser().resolve()
+        self._config = replace(self._config, cachedir=self._root_cache_directory)
+
+        try:
+            cache_exists = self._root_cache_directory.exists()
+            if not cache_exists:
+                self._root_cache_directory.mkdir(exist_ok=True, parents=True)
+            self._create_log_handlers()
+        except PermissionError:
+            self.openml_logger.warning(
+                f"No permission to create OpenML directory at {self._root_cache_directory}!"
+                " This can result in OpenML-Python not working properly."
+            )
+            self._create_log_handlers(create_file_handler=False)
+
+    def set_field_in_config_file(self, field: str, value: Any) -> None:
+        """Set a field in the configuration file."""
+        if not hasattr(OpenMLConfig(), field):
+            raise ValueError(
+                f"Field '{field}' is not valid and must be one of "
+                f"'{OpenMLConfig().__dict__.keys()}'."
+            )
+
+        self._config = replace(self._config, **{field: value})
+        config_file = self.determine_config_file_path()
+        existing = self._parse_config(config_file)
+        with config_file.open("w") as fh:
+            for f in OpenMLConfig().__dict__:
+                v = value if f == field else existing.get(f)
+                if v is not None:
+                    fh.write(f"{f} = {v}\n")
+
+    def get_config_as_dict(self) -> dict[str, Any]:
+        """Get the current configuration as a dictionary."""
+        return self._config.__dict__.copy()
+
+    def get_cache_directory(self) -> str:
+        """Get the cache directory for the current server."""
+        url_suffix = urlparse(self._config.server).netloc
+        url_parts = url_suffix.replace(":", "_").split(".")[::-1]
+        reversed_url_suffix = os.sep.join(url_parts)  # noqa: PTH118
+        return os.path.join(self._root_cache_directory, reversed_url_suffix)  # noqa: PTH118
+
+    def set_root_cache_directory(self, root_cache_directory: str | Path) -> None:
+        """Set the root cache directory."""
+        self._root_cache_directory = Path(root_cache_directory)
+        self._config = replace(self._config, cachedir=self._root_cache_directory)
+
+    @contextmanager
+    def overwrite_config_context(self, config: dict[str, Any]) -> Iterator[dict[str, Any]]:
+        """Overwrite the current configuration within a context manager."""
+        existing_config = self.get_config_as_dict()
+        merged_config = {**existing_config, **config}
+
+        self._setup(merged_config)
+        yield merged_config
+        self._setup(existing_config)
+
+
+class ConfigurationForExamples:
+    """Allows easy switching to and from a test configuration, used for examples."""
+
+    _last_used_server = None
+    _last_used_key = None
+    _start_last_called = False
+
+    def __init__(self, manager: OpenMLConfigManager):
+        self._manager = manager
+        self._test_apikey = manager._TEST_SERVER_NORMAL_USER_KEY
+        self._test_server = f"{manager.TEST_SERVER_URL}/api/v1/xml"
+
+    def start_using_configuration_for_example(self) -> None:
+        """Sets the configuration to connect to the test server with valid apikey.
+
+        To configuration as was before this call is stored, and can be recovered
+        by using the `stop_use_example_configuration` method.
+        """
+        if (
+            self._start_last_called
+            and self._manager._config.server == self._test_server
+            and self._manager._config.apikey == self._test_apikey
+        ):
+            # Method is called more than once in a row without modifying the server or apikey.
+            # We don't want to save the current test configuration as a last used configuration.
+            return
+
+        self._last_used_server = self._manager._config.server
+        self._last_used_key = self._manager._config.apikey
+        type(self)._start_last_called = True
+
+        # Test server key for examples
+        self._manager._config = replace(
+            self._manager._config,
+            server=self._test_server,
+            apikey=self._test_apikey,
+        )
+        warnings.warn(
+            f"Switching to the test server {self._test_server} to not upload results to "
+            "the live server. Using the test server may result in reduced performance of the "
+            "API!",
+            stacklevel=2,
+        )
+
+    def stop_using_configuration_for_example(self) -> None:
+        """Return to configuration as it was before `start_use_example_configuration`."""
+        if not type(self)._start_last_called:
+            # We don't want to allow this because it will (likely) result in the `server` and
+            # `apikey` variables being set to None.
+            raise RuntimeError(
+                "`stop_use_example_configuration` called without a saved config."
+                "`start_use_example_configuration` must be called first.",
+            )
+
+        self._manager._config = replace(
+            self._manager._config,
+            server=cast("str", self._last_used_server),
+            apikey=cast("str", self._last_used_key),
+        )
+        type(self)._start_last_called = False
+
+
+__config = OpenMLConfigManager()
+
+
+def __getattr__(name: str) -> Any:
+    return getattr(__config, name)
diff --git a/openml/base.py b/openml/base.py
index fbfb9dfc8..ddee71196 100644
--- a/openml/base.py
+++ b/openml/base.py
@@ -4,12 +4,11 @@
 import re
 import webbrowser
 from abc import ABC, abstractmethod
-from typing import Iterable, Sequence
+from collections.abc import Iterable, Sequence
 
 import xmltodict
 
 import openml._api_calls
-import openml.config
 
 from .utils import _get_rest_api_type_alias, _tag_openml_base
 
diff --git a/openml/cli.py b/openml/cli.py
index d0a46e498..838f774d1 100644
--- a/openml/cli.py
+++ b/openml/cli.py
@@ -5,11 +5,13 @@
 import argparse
 import string
 import sys
+from collections.abc import Callable
+from dataclasses import fields
 from pathlib import Path
-from typing import Callable
 from urllib.parse import urlparse
 
-from openml import config
+import openml
+from openml.__version__ import __version__
 
 
 def is_hex(string_: str) -> bool:
@@ -58,17 +60,17 @@ def wait_until_valid_input(
 
 
 def print_configuration() -> None:
-    file = config.determine_config_file_path()
+    file = openml.config.determine_config_file_path()
     header = f"File '{file}' contains (or defaults to):"
     print(header)
 
-    max_key_length = max(map(len, config.get_config_as_dict()))
-    for field, value in config.get_config_as_dict().items():
+    max_key_length = max(map(len, openml.config.get_config_as_dict()))
+    for field, value in openml.config.get_config_as_dict().items():
         print(f"{field.ljust(max_key_length)}: {value}")
 
 
 def verbose_set(field: str, value: str) -> None:
-    config.set_field_in_config_file(field, value)
+    openml.config.set_field_in_config_file(field, value)
     print(f"{field} set to '{value}'.")
 
 
@@ -81,7 +83,7 @@ def check_apikey(apikey: str) -> str:
         return ""
 
     instructions = (
-        f"Your current API key is set to: '{config.apikey}'. "
+        f"Your current API key is set to: '{openml.config.apikey}'. "
         "You can get an API key at https://new.openml.org. "
         "You must create an account if you don't have one yet:\n"
         "  1. Log in with the account.\n"
@@ -101,15 +103,15 @@ def check_apikey(apikey: str) -> str:
 
 def configure_server(value: str) -> None:
     def check_server(server: str) -> str:
-        is_shorthand = server in ["test", "production"]
+        is_shorthand = server in ["test", "production_server"]
         if is_shorthand or looks_like_url(server):
             return ""
-        return "Must be 'test', 'production' or a url."
+        return "Must be 'test', 'production_server' or a url."
 
     def replace_shorthand(server: str) -> str:
         if server == "test":
-            return "https://test.openml.org/api/v1/xml"
-        if server == "production":
+            return f"{openml.config.TEST_SERVER_URL}/api/v1/xml"
+        if server == "production_server":
             return "https://www.openml.org/api/v1/xml"
         return server
 
@@ -118,7 +120,7 @@ def replace_shorthand(server: str) -> str:
         value=value,
         check_with_message=check_server,
         intro_message="Specify which server you wish to connect to.",
-        input_message="Specify a url or use 'test' or 'production' as a shorthand: ",
+        input_message="Specify a url or use 'test' or 'production_server' as a shorthand: ",
         sanitize=replace_shorthand,
     )
 
@@ -331,6 +333,13 @@ def main() -> None:
     subroutines = {"configure": configure}
 
     parser = argparse.ArgumentParser()
+    # Add a global --version flag to display installed version and exit
+    parser.add_argument(
+        "--version",
+        action="version",
+        version=f"%(prog)s {__version__}",
+        help="Show the OpenML version and exit",
+    )
     subparsers = parser.add_subparsers(dest="subroutine")
 
     parser_configure = subparsers.add_parser(
@@ -339,7 +348,9 @@ def main() -> None:
         "'https://openml.github.io/openml-python/main/usage.html#configuration'.",
     )
 
-    configurable_fields = [f for f in config._defaults if f not in ["max_retries"]]
+    configurable_fields = [
+        f.name for f in fields(openml._config.OpenMLConfig) if f.name not in ["max_retries"]
+    ]
 
     parser_configure.add_argument(
         "field",
diff --git a/openml/config.py b/openml/config.py
deleted file mode 100644
index 3dde45bdd..000000000
--- a/openml/config.py
+++ /dev/null
@@ -1,523 +0,0 @@
-"""Store module level information like the API key, cache directory and the server"""
-
-# License: BSD 3-Clause
-from __future__ import annotations
-
-import configparser
-import logging
-import logging.handlers
-import os
-import platform
-import shutil
-import warnings
-from contextlib import contextmanager
-from io import StringIO
-from pathlib import Path
-from typing import Any, Iterator, cast
-from typing_extensions import Literal, TypedDict
-from urllib.parse import urlparse
-
-logger = logging.getLogger(__name__)
-openml_logger = logging.getLogger("openml")
-console_handler: logging.StreamHandler | None = None
-file_handler: logging.handlers.RotatingFileHandler | None = None
-
-OPENML_CACHE_DIR_ENV_VAR = "OPENML_CACHE_DIR"
-OPENML_SKIP_PARQUET_ENV_VAR = "OPENML_SKIP_PARQUET"
-
-
-class _Config(TypedDict):
-    apikey: str
-    server: str
-    cachedir: Path
-    avoid_duplicate_runs: bool
-    retry_policy: Literal["human", "robot"]
-    connection_n_retries: int
-    show_progress: bool
-
-
-def _create_log_handlers(create_file_handler: bool = True) -> None:  # noqa: FBT001, FBT002
-    """Creates but does not attach the log handlers."""
-    global console_handler, file_handler  # noqa: PLW0603
-    if console_handler is not None or file_handler is not None:
-        logger.debug("Requested to create log handlers, but they are already created.")
-        return
-
-    message_format = "[%(levelname)s] [%(asctime)s:%(name)s] %(message)s"
-    output_formatter = logging.Formatter(message_format, datefmt="%H:%M:%S")
-
-    console_handler = logging.StreamHandler()
-    console_handler.setFormatter(output_formatter)
-
-    if create_file_handler:
-        one_mb = 2**20
-        log_path = _root_cache_directory / "openml_python.log"
-        file_handler = logging.handlers.RotatingFileHandler(
-            log_path,
-            maxBytes=one_mb,
-            backupCount=1,
-            delay=True,
-        )
-        file_handler.setFormatter(output_formatter)
-
-
-def _convert_log_levels(log_level: int) -> tuple[int, int]:
-    """Converts a log level that's either defined by OpenML/Python to both specifications."""
-    # OpenML verbosity level don't match Python values directly:
-    openml_to_python = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG}
-    python_to_openml = {
-        logging.DEBUG: 2,
-        logging.INFO: 1,
-        logging.WARNING: 0,
-        logging.CRITICAL: 0,
-        logging.ERROR: 0,
-    }
-    # Because the dictionaries share no keys, we use `get` to convert as necessary:
-    openml_level = python_to_openml.get(log_level, log_level)
-    python_level = openml_to_python.get(log_level, log_level)
-    return openml_level, python_level
-
-
-def _set_level_register_and_store(handler: logging.Handler, log_level: int) -> None:
-    """Set handler log level, register it if needed, save setting to config file if specified."""
-    _oml_level, py_level = _convert_log_levels(log_level)
-    handler.setLevel(py_level)
-
-    if openml_logger.level > py_level or openml_logger.level == logging.NOTSET:
-        openml_logger.setLevel(py_level)
-
-    if handler not in openml_logger.handlers:
-        openml_logger.addHandler(handler)
-
-
-def set_console_log_level(console_output_level: int) -> None:
-    """Set console output to the desired level and register it with openml logger if needed."""
-    global console_handler  # noqa: PLW0602
-    assert console_handler is not None
-    _set_level_register_and_store(console_handler, console_output_level)
-
-
-def set_file_log_level(file_output_level: int) -> None:
-    """Set file output to the desired level and register it with openml logger if needed."""
-    global file_handler  # noqa: PLW0602
-    assert file_handler is not None
-    _set_level_register_and_store(file_handler, file_output_level)
-
-
-# Default values (see also https://github.com/openml/OpenML/wiki/Client-API-Standards)
-_user_path = Path("~").expanduser().absolute()
-
-
-def _resolve_default_cache_dir() -> Path:
-    user_defined_cache_dir = os.environ.get(OPENML_CACHE_DIR_ENV_VAR)
-    if user_defined_cache_dir is not None:
-        return Path(user_defined_cache_dir)
-
-    if platform.system().lower() != "linux":
-        return _user_path / ".openml"
-
-    xdg_cache_home = os.environ.get("XDG_CACHE_HOME")
-    if xdg_cache_home is None:
-        return Path("~", ".cache", "openml")
-
-    # This is the proper XDG_CACHE_HOME directory, but
-    # we unfortunately had a problem where we used XDG_CACHE_HOME/org,
-    # we check heuristically if this old directory still exists and issue
-    # a warning if it does. There's too much data to move to do this for the user.
-
-    # The new cache directory exists
-    cache_dir = Path(xdg_cache_home) / "openml"
-    if cache_dir.exists():
-        return cache_dir
-
-    # The old cache directory *does not* exist
-    heuristic_dir_for_backwards_compat = Path(xdg_cache_home) / "org" / "openml"
-    if not heuristic_dir_for_backwards_compat.exists():
-        return cache_dir
-
-    root_dir_to_delete = Path(xdg_cache_home) / "org"
-    openml_logger.warning(
-        "An old cache directory was found at '%s'. This directory is no longer used by "
-        "OpenML-Python. To silence this warning you would need to delete the old cache "
-        "directory. The cached files will then be located in '%s'.",
-        root_dir_to_delete,
-        cache_dir,
-    )
-    return Path(xdg_cache_home)
-
-
-_defaults: _Config = {
-    "apikey": "",
-    "server": "https://www.openml.org/api/v1/xml",
-    "cachedir": _resolve_default_cache_dir(),
-    "avoid_duplicate_runs": False,
-    "retry_policy": "human",
-    "connection_n_retries": 5,
-    "show_progress": False,
-}
-
-# Default values are actually added here in the _setup() function which is
-# called at the end of this module
-server = _defaults["server"]
-
-
-def get_server_base_url() -> str:
-    """Return the base URL of the currently configured server.
-
-    Turns ``"https://api.openml.org/api/v1/xml"`` in ``"https://www.openml.org/"``
-    and ``"https://test.openml.org/api/v1/xml"`` in ``"https://test.openml.org/"``
-
-    Returns
-    -------
-    str
-    """
-    domain, path = server.split("/api", maxsplit=1)
-    return domain.replace("api", "www")
-
-
-apikey: str = _defaults["apikey"]
-show_progress: bool = _defaults["show_progress"]
-# The current cache directory (without the server name)
-_root_cache_directory: Path = Path(_defaults["cachedir"])
-avoid_duplicate_runs = _defaults["avoid_duplicate_runs"]
-
-retry_policy: Literal["human", "robot"] = _defaults["retry_policy"]
-connection_n_retries: int = _defaults["connection_n_retries"]
-
-
-def set_retry_policy(value: Literal["human", "robot"], n_retries: int | None = None) -> None:
-    global retry_policy  # noqa: PLW0603
-    global connection_n_retries  # noqa: PLW0603
-    default_retries_by_policy = {"human": 5, "robot": 50}
-
-    if value not in default_retries_by_policy:
-        raise ValueError(
-            f"Detected retry_policy '{value}' but must be one of "
-            f"{list(default_retries_by_policy.keys())}",
-        )
-    if n_retries is not None and not isinstance(n_retries, int):
-        raise TypeError(f"`n_retries` must be of type `int` or `None` but is `{type(n_retries)}`.")
-
-    if isinstance(n_retries, int) and n_retries < 1:
-        raise ValueError(f"`n_retries` is '{n_retries}' but must be positive.")
-
-    retry_policy = value
-    connection_n_retries = default_retries_by_policy[value] if n_retries is None else n_retries
-
-
-class ConfigurationForExamples:
-    """Allows easy switching to and from a test configuration, used for examples."""
-
-    _last_used_server = None
-    _last_used_key = None
-    _start_last_called = False
-    _test_server = "https://test.openml.org/api/v1/xml"
-    _test_apikey = "c0c42819af31e706efe1f4b88c23c6c1"
-
-    @classmethod
-    def start_using_configuration_for_example(cls) -> None:
-        """Sets the configuration to connect to the test server with valid apikey.
-
-        To configuration as was before this call is stored, and can be recovered
-        by using the `stop_use_example_configuration` method.
-        """
-        global server  # noqa: PLW0603
-        global apikey  # noqa: PLW0603
-
-        if cls._start_last_called and server == cls._test_server and apikey == cls._test_apikey:
-            # Method is called more than once in a row without modifying the server or apikey.
-            # We don't want to save the current test configuration as a last used configuration.
-            return
-
-        cls._last_used_server = server
-        cls._last_used_key = apikey
-        cls._start_last_called = True
-
-        # Test server key for examples
-        server = cls._test_server
-        apikey = cls._test_apikey
-        warnings.warn(
-            f"Switching to the test server {server} to not upload results to the live server. "
-            "Using the test server may result in reduced performance of the API!",
-            stacklevel=2,
-        )
-
-    @classmethod
-    def stop_using_configuration_for_example(cls) -> None:
-        """Return to configuration as it was before `start_use_example_configuration`."""
-        if not cls._start_last_called:
-            # We don't want to allow this because it will (likely) result in the `server` and
-            # `apikey` variables being set to None.
-            raise RuntimeError(
-                "`stop_use_example_configuration` called without a saved config."
-                "`start_use_example_configuration` must be called first.",
-            )
-
-        global server  # noqa: PLW0603
-        global apikey  # noqa: PLW0603
-
-        server = cast(str, cls._last_used_server)
-        apikey = cast(str, cls._last_used_key)
-        cls._start_last_called = False
-
-
-def _handle_xdg_config_home_backwards_compatibility(
-    xdg_home: str,
-) -> Path:
-    # NOTE(eddiebergman): A previous bug results in the config
-    # file being located at `${XDG_CONFIG_HOME}/config` instead
-    # of `${XDG_CONFIG_HOME}/openml/config`. As to maintain backwards
-    # compatibility, where users may already may have had a configuration,
-    # we copy it over an issue a warning until it's deleted.
-    # As a heurisitic to ensure that it's "our" config file, we try parse it first.
-    config_dir = Path(xdg_home) / "openml"
-
-    backwards_compat_config_file = Path(xdg_home) / "config"
-    if not backwards_compat_config_file.exists():
-        return config_dir
-
-    # If it errors, that's a good sign it's not ours and we can
-    # safely ignore it, jumping out of this block. This is a heurisitc
-    try:
-        _parse_config(backwards_compat_config_file)
-    except Exception:  # noqa: BLE001
-        return config_dir
-
-    # Looks like it's ours, lets try copy it to the correct place
-    correct_config_location = config_dir / "config"
-    try:
-        # We copy and return the new copied location
-        shutil.copy(backwards_compat_config_file, correct_config_location)
-        openml_logger.warning(
-            "An openml configuration file was found at the old location "
-            f"at {backwards_compat_config_file}. We have copied it to the new "
-            f"location at {correct_config_location}. "
-            "\nTo silence this warning please verify that the configuration file "
-            f"at {correct_config_location} is correct and delete the file at "
-            f"{backwards_compat_config_file}."
-        )
-        return config_dir
-    except Exception as e:  # noqa: BLE001
-        # We failed to copy and its ours, return the old one.
-        openml_logger.warning(
-            "While attempting to perform a backwards compatible fix, we "
-            f"failed to copy the openml config file at "
-            f"{backwards_compat_config_file}' to {correct_config_location}"
-            f"\n{type(e)}: {e}",
-            "\n\nTo silence this warning, please copy the file "
-            "to the new location and delete the old file at "
-            f"{backwards_compat_config_file}.",
-        )
-        return backwards_compat_config_file
-
-
-def determine_config_file_path() -> Path:
-    if platform.system().lower() == "linux":
-        xdg_home = os.environ.get("XDG_CONFIG_HOME")
-        if xdg_home is not None:
-            config_dir = _handle_xdg_config_home_backwards_compatibility(xdg_home)
-        else:
-            config_dir = Path("~", ".config", "openml")
-    else:
-        config_dir = Path("~") / ".openml"
-
-    # Still use os.path.expanduser to trigger the mock in the unit test
-    config_dir = Path(config_dir).expanduser().resolve()
-    return config_dir / "config"
-
-
-def _setup(config: _Config | None = None) -> None:
-    """Setup openml package. Called on first import.
-
-    Reads the config file and sets up apikey, server, cache appropriately.
-    key and server can be set by the user simply using
-    openml.config.apikey = THEIRKEY
-    openml.config.server = SOMESERVER
-    We could also make it a property but that's less clear.
-    """
-    global apikey  # noqa: PLW0603
-    global server  # noqa: PLW0603
-    global _root_cache_directory  # noqa: PLW0603
-    global avoid_duplicate_runs  # noqa: PLW0603
-    global show_progress  # noqa: PLW0603
-
-    config_file = determine_config_file_path()
-    config_dir = config_file.parent
-
-    # read config file, create directory for config file
-    try:
-        if not config_dir.exists():
-            config_dir.mkdir(exist_ok=True, parents=True)
-    except PermissionError:
-        openml_logger.warning(
-            f"No permission to create OpenML directory at {config_dir}!"
-            " This can result in OpenML-Python not working properly."
-        )
-
-    if config is None:
-        config = _parse_config(config_file)
-
-    avoid_duplicate_runs = config["avoid_duplicate_runs"]
-    apikey = config["apikey"]
-    server = config["server"]
-    show_progress = config["show_progress"]
-    n_retries = int(config["connection_n_retries"])
-
-    set_retry_policy(config["retry_policy"], n_retries)
-
-    user_defined_cache_dir = os.environ.get(OPENML_CACHE_DIR_ENV_VAR)
-    if user_defined_cache_dir is not None:
-        short_cache_dir = Path(user_defined_cache_dir)
-    else:
-        short_cache_dir = Path(config["cachedir"])
-    _root_cache_directory = short_cache_dir.expanduser().resolve()
-
-    try:
-        cache_exists = _root_cache_directory.exists()
-        # create the cache subdirectory
-        if not cache_exists:
-            _root_cache_directory.mkdir(exist_ok=True, parents=True)
-        _create_log_handlers()
-    except PermissionError:
-        openml_logger.warning(
-            f"No permission to create OpenML directory at {_root_cache_directory}!"
-            " This can result in OpenML-Python not working properly."
-        )
-        _create_log_handlers(create_file_handler=False)
-
-
-def set_field_in_config_file(field: str, value: Any) -> None:
-    """Overwrites the `field` in the configuration file with the new `value`."""
-    if field not in _defaults:
-        raise ValueError(f"Field '{field}' is not valid and must be one of '{_defaults.keys()}'.")
-
-    # TODO(eddiebergman): This use of globals has gone too far
-    globals()[field] = value
-    config_file = determine_config_file_path()
-    config = _parse_config(config_file)
-    with config_file.open("w") as fh:
-        for f in _defaults:
-            # We can't blindly set all values based on globals() because when the user
-            # sets it through config.FIELD it should not be stored to file.
-            # There doesn't seem to be a way to avoid writing defaults to file with configparser,
-            # because it is impossible to distinguish from an explicitly set value that matches
-            # the default value, to one that was set to its default because it was omitted.
-            value = globals()[f] if f == field else config.get(f)  # type: ignore
-            if value is not None:
-                fh.write(f"{f} = {value}\n")
-
-
-def _parse_config(config_file: str | Path) -> _Config:
-    """Parse the config file, set up defaults."""
-    config_file = Path(config_file)
-    config = configparser.RawConfigParser(defaults=_defaults)  # type: ignore
-
-    # The ConfigParser requires a [SECTION_HEADER], which we do not expect in our config file.
-    # Cheat the ConfigParser module by adding a fake section header
-    config_file_ = StringIO()
-    config_file_.write("[FAKE_SECTION]\n")
-    try:
-        with config_file.open("r") as fh:
-            for line in fh:
-                config_file_.write(line)
-    except FileNotFoundError:
-        logger.info("No config file found at %s, using default configuration.", config_file)
-    except OSError as e:
-        logger.info("Error opening file %s: %s", config_file, e.args[0])
-    config_file_.seek(0)
-    config.read_file(config_file_)
-    configuration = dict(config.items("FAKE_SECTION"))
-    for boolean_field in ["avoid_duplicate_runs", "show_progress"]:
-        if isinstance(config["FAKE_SECTION"][boolean_field], str):
-            configuration[boolean_field] = config["FAKE_SECTION"].getboolean(boolean_field)  # type: ignore
-    return configuration  # type: ignore
-
-
-def get_config_as_dict() -> _Config:
-    return {
-        "apikey": apikey,
-        "server": server,
-        "cachedir": _root_cache_directory,
-        "avoid_duplicate_runs": avoid_duplicate_runs,
-        "connection_n_retries": connection_n_retries,
-        "retry_policy": retry_policy,
-        "show_progress": show_progress,
-    }
-
-
-# NOTE: For backwards compatibility, we keep the `str`
-def get_cache_directory() -> str:
-    """Get the current cache directory.
-
-    This gets the cache directory for the current server relative
-    to the root cache directory that can be set via
-    ``set_root_cache_directory()``. The cache directory is the
-    ``root_cache_directory`` with additional information on which
-    subdirectory to use based on the server name. By default it is
-    ``root_cache_directory / org / openml / www`` for the standard
-    OpenML.org server and is defined as
-    ``root_cache_directory / top-level domain / second-level domain /
-    hostname``
-    ```
-
-    Returns
-    -------
-    cachedir : string
-        The current cache directory.
-
-    """
-    url_suffix = urlparse(server).netloc
-    reversed_url_suffix = os.sep.join(url_suffix.split(".")[::-1])  # noqa: PTH118
-    return os.path.join(_root_cache_directory, reversed_url_suffix)  # noqa: PTH118
-
-
-def set_root_cache_directory(root_cache_directory: str | Path) -> None:
-    """Set module-wide base cache directory.
-
-    Sets the root cache directory, wherin the cache directories are
-    created to store content from different OpenML servers. For example,
-    by default, cached data for the standard OpenML.org server is stored
-    at ``root_cache_directory / org / openml / www``, and the general
-    pattern is ``root_cache_directory / top-level domain / second-level
-    domain / hostname``.
-
-    Parameters
-    ----------
-    root_cache_directory : string
-         Path to use as cache directory.
-
-    See Also
-    --------
-    get_cache_directory
-    """
-    global _root_cache_directory  # noqa: PLW0603
-    _root_cache_directory = Path(root_cache_directory)
-
-
-start_using_configuration_for_example = (
-    ConfigurationForExamples.start_using_configuration_for_example
-)
-stop_using_configuration_for_example = ConfigurationForExamples.stop_using_configuration_for_example
-
-
-@contextmanager
-def overwrite_config_context(config: dict[str, Any]) -> Iterator[_Config]:
-    """A context manager to temporarily override variables in the configuration."""
-    existing_config = get_config_as_dict()
-    merged_config = {**existing_config, **config}
-
-    _setup(merged_config)  # type: ignore
-    yield merged_config  # type: ignore
-
-    _setup(existing_config)
-
-
-__all__ = [
-    "get_cache_directory",
-    "set_root_cache_directory",
-    "start_using_configuration_for_example",
-    "stop_using_configuration_for_example",
-    "get_config_as_dict",
-]
-
-_setup()
diff --git a/openml/datasets/__init__.py b/openml/datasets/__init__.py
index 480dd9576..eb0932652 100644
--- a/openml/datasets/__init__.py
+++ b/openml/datasets/__init__.py
@@ -17,17 +17,17 @@
 )
 
 __all__ = [
+    "OpenMLDataFeature",
+    "OpenMLDataset",
     "attributes_arff_from_df",
     "check_datasets_active",
     "create_dataset",
+    "delete_dataset",
+    "edit_dataset",
+    "fork_dataset",
     "get_dataset",
     "get_datasets",
     "list_datasets",
-    "OpenMLDataset",
-    "OpenMLDataFeature",
-    "status_update",
     "list_qualities",
-    "edit_dataset",
-    "fork_dataset",
-    "delete_dataset",
+    "status_update",
 ]
diff --git a/openml/datasets/data_feature.py b/openml/datasets/data_feature.py
index 218b0066d..0598763b0 100644
--- a/openml/datasets/data_feature.py
+++ b/openml/datasets/data_feature.py
@@ -1,13 +1,14 @@
 # License: BSD 3-Clause
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, ClassVar, Sequence
+from collections.abc import Sequence
+from typing import TYPE_CHECKING, Any, ClassVar
 
 if TYPE_CHECKING:
     from IPython.lib import pretty
 
 
-class OpenMLDataFeature:
+class OpenMLDataFeature:  # noqa: PLW1641
     """
     Data Feature (a.k.a. Attribute) object.
 
@@ -51,8 +52,7 @@ def __init__(  # noqa: PLR0913
         if data_type == "nominal":
             if nominal_values is None:
                 raise TypeError(
-                    "Dataset features require attribute `nominal_values` for nominal "
-                    "feature type.",
+                    "Dataset features require attribute `nominal_values` for nominal feature type.",
                 )
 
             if not isinstance(nominal_values, list):
@@ -75,10 +75,10 @@ def __init__(  # noqa: PLR0913
         self.ontologies = ontologies
 
     def __repr__(self) -> str:
-        return "[%d - %s (%s)]" % (self.index, self.name, self.data_type)
+        return f"[{self.index} - {self.name} ({self.data_type})]"
 
     def __eq__(self, other: Any) -> bool:
         return isinstance(other, OpenMLDataFeature) and self.__dict__ == other.__dict__
 
-    def _repr_pretty_(self, pp: pretty.PrettyPrinter, cycle: bool) -> None:  # noqa: FBT001, ARG002
+    def _repr_pretty_(self, pp: pretty.PrettyPrinter, cycle: bool) -> None:  # noqa: ARG002
         pp.text(str(self))
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
index fa83d2b8a..59d6205ba 100644
--- a/openml/datasets/dataset.py
+++ b/openml/datasets/dataset.py
@@ -7,9 +7,9 @@
 import pickle
 import re
 import warnings
+from collections.abc import Iterable, Sequence
 from pathlib import Path
-from typing import Any, Iterable, Sequence
-from typing_extensions import Literal
+from typing import Any, Literal
 
 import arff
 import numpy as np
@@ -17,8 +17,8 @@
 import scipy.sparse
 import xmltodict
 
+import openml
 from openml.base import OpenMLBase
-from openml.config import OPENML_SKIP_PARQUET_ENV_VAR
 
 from .data_feature import OpenMLDataFeature
 
@@ -41,7 +41,7 @@ def _ensure_dataframe(
     raise TypeError(f"Data type {type(data)} not supported.")
 
 
-class OpenMLDataset(OpenMLBase):
+class OpenMLDataset(OpenMLBase):  # noqa: PLW1641
     """Dataset object.
 
     Allows fetching and uploading datasets to OpenML.
@@ -375,7 +375,9 @@ def _download_data(self) -> None:
         # import required here to avoid circular import.
         from .functions import _get_dataset_arff, _get_dataset_parquet
 
-        skip_parquet = os.environ.get(OPENML_SKIP_PARQUET_ENV_VAR, "false").casefold() == "true"
+        skip_parquet = (
+            os.environ.get(openml.config.OPENML_SKIP_PARQUET_ENV_VAR, "false").casefold() == "true"
+        )
         if self._parquet_url is not None and not skip_parquet:
             parquet_file = _get_dataset_parquet(self)
             self.parquet_file = None if parquet_file is None else str(parquet_file)
@@ -420,7 +422,11 @@ def _get_arff(self, format: str) -> dict:  # noqa: A002
             file_size = filepath.stat().st_size
             if file_size > MB_120:
                 raise NotImplementedError(
-                    f"File {filename} too big for {file_size}-bit system ({bits} bytes).",
+                    f"File '{filename}' ({file_size / 1e6:.1f} MB)"
+                    f"exceeds the maximum supported size of 120 MB. "
+                    f"This limitation applies to {bits}-bit systems. "
+                    f"Large dataset handling is currently not fully supported. "
+                    f"Please consider using a smaller dataset"
                 )
 
         if format.lower() == "arff":
@@ -484,7 +490,7 @@ def _parse_data_from_arff(  # noqa: C901, PLR0912, PLR0915
                 try:
                     # checks if the strings which should be the class labels
                     # can be encoded into integers
-                    pd.factorize(type_)[0]
+                    pd.factorize(np.array(type_))[0]
                 except ValueError as e:
                     raise ValueError(
                         "Categorical data needs to be numeric when using sparse ARFF."
@@ -719,8 +725,8 @@ def valid_category(cat: Any) -> bool:
     def get_data(  # noqa: C901
         self,
         target: list[str] | str | None = None,
-        include_row_id: bool = False,  # noqa: FBT001, FBT002
-        include_ignore_attribute: bool = False,  # noqa: FBT001, FBT002
+        include_row_id: bool = False,  # noqa: FBT002
+        include_ignore_attribute: bool = False,  # noqa: FBT002
     ) -> tuple[pd.DataFrame, pd.Series | None, list[bool], list[str]]:
         """Returns dataset content as dataframes.
 
@@ -766,8 +772,8 @@ def get_data(  # noqa: C901
             logger.info(f"Going to remove the following attributes: {to_exclude}")
             keep = np.array([column not in to_exclude for column in attribute_names])
             data = data.drop(columns=to_exclude)
-            categorical_mask = [cat for cat, k in zip(categorical_mask, keep) if k]
-            attribute_names = [att for att, k in zip(attribute_names, keep) if k]
+            categorical_mask = [cat for cat, k in zip(categorical_mask, keep, strict=False) if k]
+            attribute_names = [att for att, k in zip(attribute_names, keep, strict=False) if k]
 
         if target is None:
             return data, None, categorical_mask, attribute_names
@@ -780,7 +786,12 @@ def get_data(  # noqa: C901
         # All the assumptions below for the target are dependant on the number of targets being 1
         n_targets = len(target_names)
         if n_targets > 1:
-            raise NotImplementedError(f"Number of targets {n_targets} not implemented.")
+            raise NotImplementedError(
+                f"Multi-target prediction is not yet supported."
+                f"Found {n_targets} target columns: {target_names}. "
+                f"Currently, only single-target datasets are supported. "
+                f"Please select a single target column."
+            )
 
         target_name = target_names[0]
         x = data.drop(columns=[target_name])
@@ -863,8 +874,8 @@ def get_features_by_type(  # noqa: C901
         self,
         data_type: str,
         exclude: list[str] | None = None,
-        exclude_ignore_attribute: bool = True,  # noqa: FBT002, FBT001
-        exclude_row_id_attribute: bool = True,  # noqa: FBT002, FBT001
+        exclude_ignore_attribute: bool = True,  # noqa: FBT002
+        exclude_row_id_attribute: bool = True,  # noqa: FBT002
     ) -> list[int]:
         """
         Return indices of features of a given type, e.g. all nominal features.
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
index ac5466a44..432938520 100644
--- a/openml/datasets/functions.py
+++ b/openml/datasets/functions.py
@@ -9,8 +9,7 @@
 from functools import partial
 from pathlib import Path
 from pyexpat import ExpatError
-from typing import TYPE_CHECKING, Any
-from typing_extensions import Literal
+from typing import TYPE_CHECKING, Any, Literal
 
 import arff
 import minio.error
@@ -20,9 +19,9 @@
 import xmltodict
 from scipy.sparse import coo_matrix
 
+import openml
 import openml._api_calls
 import openml.utils
-from openml.config import OPENML_SKIP_PARQUET_ENV_VAR
 from openml.exceptions import (
     OpenMLHashException,
     OpenMLPrivateDatasetError,
@@ -259,7 +258,7 @@ def _validated_data_attributes(
 
 def check_datasets_active(
     dataset_ids: list[int],
-    raise_error_if_not_exist: bool = True,  # noqa: FBT001, FBT002
+    raise_error_if_not_exist: bool = True,  # noqa: FBT002
 ) -> dict[int, bool]:
     """
     Check if the dataset ids provided are active.
@@ -293,7 +292,7 @@ def check_datasets_active(
 def _name_to_id(
     dataset_name: str,
     version: int | None = None,
-    error_if_multiple: bool = False,  # noqa: FBT001, FBT002
+    error_if_multiple: bool = False,  # noqa: FBT002
 ) -> int:
     """Attempt to find the dataset id of the dataset with the given name.
 
@@ -341,8 +340,8 @@ def _name_to_id(
 
 def get_datasets(
     dataset_ids: list[str | int],
-    download_data: bool = False,  # noqa: FBT001, FBT002
-    download_qualities: bool = False,  # noqa: FBT001, FBT002
+    download_data: bool = False,  # noqa: FBT002
+    download_qualities: bool = False,  # noqa: FBT002
 ) -> list[OpenMLDataset]:
     """Download datasets.
 
@@ -377,14 +376,14 @@ def get_datasets(
 @openml.utils.thread_safe_if_oslo_installed
 def get_dataset(  # noqa: C901, PLR0912
     dataset_id: int | str,
-    download_data: bool = False,  # noqa: FBT002, FBT001
+    download_data: bool = False,  # noqa: FBT002
     version: int | None = None,
-    error_if_multiple: bool = False,  # noqa: FBT002, FBT001
+    error_if_multiple: bool = False,  # noqa: FBT002
     cache_format: Literal["pickle", "feather"] = "pickle",
-    download_qualities: bool = False,  # noqa: FBT002, FBT001
-    download_features_meta_data: bool = False,  # noqa: FBT002, FBT001
-    download_all_files: bool = False,  # noqa: FBT002, FBT001
-    force_refresh_cache: bool = False,  # noqa: FBT001, FBT002
+    download_qualities: bool = False,  # noqa: FBT002
+    download_features_meta_data: bool = False,  # noqa: FBT002
+    download_all_files: bool = False,  # noqa: FBT002
+    force_refresh_cache: bool = False,  # noqa: FBT002
 ) -> OpenMLDataset:
     """Download the OpenML dataset representation, optionally also download actual data file.
 
@@ -493,7 +492,9 @@ def get_dataset(  # noqa: C901, PLR0912
             qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id)
 
         parquet_file = None
-        skip_parquet = os.environ.get(OPENML_SKIP_PARQUET_ENV_VAR, "false").casefold() == "true"
+        skip_parquet = (
+            os.environ.get(openml.config.OPENML_SKIP_PARQUET_ENV_VAR, "false").casefold() == "true"
+        )
         download_parquet = "oml:parquet_url" in description and not skip_parquet
         if download_parquet and (download_data or download_all_files):
             try:
@@ -1116,7 +1117,7 @@ def _get_dataset_description(did_cache_dir: Path, dataset_id: int) -> dict[str,
 def _get_dataset_parquet(
     description: dict | OpenMLDataset,
     cache_directory: Path | None = None,
-    download_all_files: bool = False,  # noqa: FBT001, FBT002
+    download_all_files: bool = False,  # noqa: FBT002
 ) -> Path | None:
     """Return the path to the local parquet file of the dataset. If is not cached, it is downloaded.
 
@@ -1418,7 +1419,7 @@ def _get_online_dataset_arff(dataset_id: int) -> str | None:
     str or None
         A string representation of an ARFF file. Or None if file already exists.
     """
-    dataset_xml = openml._api_calls._perform_api_call("data/%d" % dataset_id, "get")
+    dataset_xml = openml._api_calls._perform_api_call(f"data/{dataset_id}", "get")
     # build a dict from the xml.
     # use the url from the dataset description and return the ARFF string
     return openml._api_calls._download_text_file(
@@ -1439,7 +1440,7 @@ def _get_online_dataset_format(dataset_id: int) -> str:
     str
         Dataset format.
     """
-    dataset_xml = openml._api_calls._perform_api_call("data/%d" % dataset_id, "get")
+    dataset_xml = openml._api_calls._perform_api_call(f"data/{dataset_id}", "get")
     # build a dict from the xml and get the format from the dataset description
     return xmltodict.parse(dataset_xml)["oml:data_set_description"]["oml:format"].lower()  # type: ignore
 
diff --git a/openml/evaluations/__init__.py b/openml/evaluations/__init__.py
index dbff47037..b56d0c2d5 100644
--- a/openml/evaluations/__init__.py
+++ b/openml/evaluations/__init__.py
@@ -5,7 +5,7 @@
 
 __all__ = [
     "OpenMLEvaluation",
-    "list_evaluations",
     "list_evaluation_measures",
+    "list_evaluations",
     "list_evaluations_setups",
 ]
diff --git a/openml/evaluations/evaluation.py b/openml/evaluations/evaluation.py
index 6d69d377e..87df8454a 100644
--- a/openml/evaluations/evaluation.py
+++ b/openml/evaluations/evaluation.py
@@ -1,15 +1,15 @@
 # License: BSD 3-Clause
 from __future__ import annotations
 
-import openml.config
+from dataclasses import asdict, dataclass
+
 import openml.datasets
 import openml.flows
 import openml.runs
 import openml.tasks
 
 
-# TODO(eddiebergman): A lot of this class is automatically
-# handled by a dataclass
+@dataclass
 class OpenMLEvaluation:
     """
     Contains all meta-information about a run / evaluation combination,
@@ -48,55 +48,23 @@ class OpenMLEvaluation:
         (e.g., in case of precision, auroc, recall)
     """
 
-    def __init__(  # noqa: PLR0913
-        self,
-        run_id: int,
-        task_id: int,
-        setup_id: int,
-        flow_id: int,
-        flow_name: str,
-        data_id: int,
-        data_name: str,
-        function: str,
-        upload_time: str,
-        uploader: int,
-        uploader_name: str,
-        value: float | None,
-        values: list[float] | None,
-        array_data: str | None = None,
-    ):
-        self.run_id = run_id
-        self.task_id = task_id
-        self.setup_id = setup_id
-        self.flow_id = flow_id
-        self.flow_name = flow_name
-        self.data_id = data_id
-        self.data_name = data_name
-        self.function = function
-        self.upload_time = upload_time
-        self.uploader = uploader
-        self.uploader_name = uploader_name
-        self.value = value
-        self.values = values
-        self.array_data = array_data
+    run_id: int
+    task_id: int
+    setup_id: int
+    flow_id: int
+    flow_name: str
+    data_id: int
+    data_name: str
+    function: str
+    upload_time: str
+    uploader: int
+    uploader_name: str
+    value: float | None
+    values: list[float] | None
+    array_data: str | None = None
 
     def _to_dict(self) -> dict:
-        return {
-            "run_id": self.run_id,
-            "task_id": self.task_id,
-            "setup_id": self.setup_id,
-            "flow_id": self.flow_id,
-            "flow_name": self.flow_name,
-            "data_id": self.data_id,
-            "data_name": self.data_name,
-            "function": self.function,
-            "upload_time": self.upload_time,
-            "uploader": self.uploader,
-            "uploader_name": self.uploader_name,
-            "value": self.value,
-            "values": self.values,
-            "array_data": self.array_data,
-        }
+        return asdict(self)
 
     def __repr__(self) -> str:
         header = "OpenML Evaluation"
@@ -119,11 +87,12 @@ def __repr__(self) -> str:
         }
 
         order = [
-            "Uploader Date",
+            "Upload Date",
             "Run ID",
             "OpenML Run URL",
             "Task ID",
-            "OpenML Task URL" "Flow ID",
+            "OpenML Task URL",
+            "Flow ID",
             "OpenML Flow URL",
             "Setup ID",
             "Data ID",
diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py
index 7747294d7..61c95a480 100644
--- a/openml/evaluations/functions.py
+++ b/openml/evaluations/functions.py
@@ -5,8 +5,8 @@
 import json
 from functools import partial
 from itertools import chain
-from typing import Any
-from typing_extensions import Literal, overload
+from typing import Any, Literal
+from typing_extensions import overload
 
 import numpy as np
 import pandas as pd
@@ -228,11 +228,12 @@ def __list_evaluations(api_call: str) -> list[OpenMLEvaluation]:
     # Minimalistic check if the XML is useful
     if "oml:evaluations" not in evals_dict:
         raise ValueError(
-            "Error in return XML, does not contain " f'"oml:evaluations": {evals_dict!s}',
+            f'Error in return XML, does not contain "oml:evaluations": {evals_dict!s}',
         )
 
-    assert isinstance(evals_dict["oml:evaluations"]["oml:evaluation"], list), type(
-        evals_dict["oml:evaluations"],
+    assert isinstance(evals_dict["oml:evaluations"]["oml:evaluation"], list), (
+        "Expected 'oml:evaluation' to be a list, but got"
+        f"{type(evals_dict['oml:evaluations']['oml:evaluation']).__name__}. "
     )
 
     uploader_ids = list(
@@ -339,7 +340,7 @@ def list_evaluations_setups(
     tag: str | None = None,
     per_fold: bool | None = None,
     sort_order: str | None = None,
-    parameters_in_separate_columns: bool = False,  # noqa: FBT001, FBT002
+    parameters_in_separate_columns: bool = False,  # noqa: FBT002
 ) -> pd.DataFrame:
     """List all run-evaluation pairs matching all of the given filters
     and their hyperparameter settings.
diff --git a/openml/exceptions.py b/openml/exceptions.py
index fe63b8a58..1c1343ff3 100644
--- a/openml/exceptions.py
+++ b/openml/exceptions.py
@@ -63,5 +63,28 @@ class OpenMLNotAuthorizedError(OpenMLServerError):
     """Indicates an authenticated user is not authorized to execute the requested action."""
 
 
+class OpenMLAuthenticationError(OpenMLServerError):
+    """Exception raised when API authentication fails.
+
+    This typically occurs when:
+    - No API key is configured
+    - The API key is invalid or expired
+    - The API key format is incorrect
+
+    This is different from authorization (OpenMLNotAuthorizedError), which occurs
+    when a valid API key lacks permissions for the requested operation.
+    """
+
+    def __init__(self, message: str):
+        help_text = (
+            "\n\nTo fix this:\n"
+            "1. Get your API key from https://www.openml.org/\n"
+            "   (you'll need to register for a free account if you don't have one)\n"
+            "2. Configure your API key by following the authentication guide:\n"
+            "   https://openml.github.io/openml-python/latest/examples/Basics/introduction_tutorial/#authentication"
+        )
+        super().__init__(message + help_text)
+
+
 class ObjectNotPublishedError(PyOpenMLError):
     """Indicates an object has not been published yet."""
diff --git a/openml/extensions/__init__.py b/openml/extensions/__init__.py
index b49865e0e..979986182 100644
--- a/openml/extensions/__init__.py
+++ b/openml/extensions/__init__.py
@@ -1,16 +1,15 @@
 # License: BSD 3-Clause
 
-from typing import List, Type  # noqa: F401
 
 from .extension_interface import Extension
 from .functions import get_extension_by_flow, get_extension_by_model, register_extension
 
-extensions = []  # type: List[Type[Extension]]
+extensions: list[type[Extension]] = []
 
 
 __all__ = [
     "Extension",
-    "register_extension",
-    "get_extension_by_model",
     "get_extension_by_flow",
+    "get_extension_by_model",
+    "register_extension",
 ]
diff --git a/openml/extensions/extension_interface.py b/openml/extensions/extension_interface.py
index 2a336eb52..e391d109a 100644
--- a/openml/extensions/extension_interface.py
+++ b/openml/extensions/extension_interface.py
@@ -63,8 +63,8 @@ def can_handle_model(cls, model: Any) -> bool:
     def flow_to_model(
         self,
         flow: OpenMLFlow,
-        initialize_with_defaults: bool = False,  # noqa: FBT001, FBT002
-        strict_version: bool = True,  # noqa: FBT002, FBT001
+        initialize_with_defaults: bool = False,  # noqa: FBT002
+        strict_version: bool = True,  # noqa: FBT002
     ) -> Any:
         """Instantiate a model from the flow representation.
 
diff --git a/openml/extensions/functions.py b/openml/extensions/functions.py
index 7a944c997..44df5ec69 100644
--- a/openml/extensions/functions.py
+++ b/openml/extensions/functions.py
@@ -1,6 +1,7 @@
 # License: BSD 3-Clause
 from __future__ import annotations
 
+import importlib.util
 from typing import TYPE_CHECKING, Any
 
 # Need to implement the following by its full path because otherwise it won't be possible to
@@ -16,8 +17,9 @@
 SKLEARN_HINT = (
     "But it looks related to scikit-learn. "
     "Please install the OpenML scikit-learn extension (openml-sklearn) and try again. "
+    "You can use `pip install openml-sklearn` for installation."
     "For more information, see "
-    "https://github.com/openml/openml-sklearn?tab=readme-ov-file#installation"
+    "https://docs.openml.org/python/extensions/"
 )
 
 
@@ -40,7 +42,7 @@ def register_extension(extension: type[Extension]) -> None:
 
 def get_extension_by_flow(
     flow: OpenMLFlow,
-    raise_if_no_extension: bool = False,  # noqa: FBT001, FBT002
+    raise_if_no_extension: bool = False,  # noqa: FBT002
 ) -> Extension | None:
     """Get an extension which can handle the given flow.
 
@@ -58,6 +60,10 @@ def get_extension_by_flow(
     -------
     Extension or None
     """
+    # import openml_sklearn to register SklearnExtension
+    if importlib.util.find_spec("openml_sklearn"):
+        import openml_sklearn  # noqa: F401
+
     candidates = []
     for extension_class in openml.extensions.extensions:
         if extension_class.can_handle_flow(flow):
@@ -85,7 +91,7 @@ def get_extension_by_flow(
 
 def get_extension_by_model(
     model: Any,
-    raise_if_no_extension: bool = False,  # noqa: FBT001, FBT002
+    raise_if_no_extension: bool = False,  # noqa: FBT002
 ) -> Extension | None:
     """Get an extension which can handle the given flow.
 
@@ -103,6 +109,10 @@ def get_extension_by_model(
     -------
     Extension or None
     """
+    # import openml_sklearn to register SklearnExtension
+    if importlib.util.find_spec("openml_sklearn"):
+        import openml_sklearn  # noqa: F401
+
     candidates = []
     for extension_class in openml.extensions.extensions:
         if extension_class.can_handle_model(model):
diff --git a/openml/flows/__init__.py b/openml/flows/__init__.py
index ce32fec7d..d455249de 100644
--- a/openml/flows/__init__.py
+++ b/openml/flows/__init__.py
@@ -12,10 +12,10 @@
 
 __all__ = [
     "OpenMLFlow",
-    "get_flow",
-    "list_flows",
-    "get_flow_id",
-    "flow_exists",
     "assert_flows_equal",
     "delete_flow",
+    "flow_exists",
+    "get_flow",
+    "get_flow_id",
+    "list_flows",
 ]
diff --git a/openml/flows/flow.py b/openml/flows/flow.py
index 02d24e78b..7dd84fdee 100644
--- a/openml/flows/flow.py
+++ b/openml/flows/flow.py
@@ -3,8 +3,9 @@
 
 import logging
 from collections import OrderedDict
+from collections.abc import Hashable, Sequence
 from pathlib import Path
-from typing import Any, Hashable, Sequence, cast
+from typing import Any, cast
 
 import xmltodict
 
@@ -169,7 +170,7 @@ def extension(self) -> Extension:
         """The extension of the flow (e.g., sklearn)."""
         if self._extension is None:
             self._extension = cast(
-                Extension, get_extension_by_flow(self, raise_if_no_extension=True)
+                "Extension", get_extension_by_flow(self, raise_if_no_extension=True)
             )
 
         return self._extension
@@ -408,7 +409,7 @@ def _parse_publish_response(self, xml_response: dict) -> None:
         """Parse the id from the xml_response and assign it to self."""
         self.flow_id = int(xml_response["oml:upload_flow"]["oml:id"])
 
-    def publish(self, raise_error_if_exists: bool = False) -> OpenMLFlow:  # noqa: FBT001, FBT002
+    def publish(self, raise_error_if_exists: bool = False) -> OpenMLFlow:  # noqa: FBT002
         """Publish this flow to OpenML server.
 
         Raises a PyOpenMLError if the flow exists on the server, but
@@ -435,7 +436,7 @@ def publish(self, raise_error_if_exists: bool = False) -> OpenMLFlow:  # noqa: F
         if not flow_id:
             if self.flow_id:
                 raise openml.exceptions.PyOpenMLError(
-                    "Flow does not exist on the server, " "but 'flow.flow_id' is not None.",
+                    "Flow does not exist on the server, but 'flow.flow_id' is not None.",
                 )
             super().publish()
             assert self.flow_id is not None  # for mypy
@@ -445,7 +446,7 @@ def publish(self, raise_error_if_exists: bool = False) -> OpenMLFlow:  # noqa: F
             raise openml.exceptions.PyOpenMLError(error_message)
         elif self.flow_id is not None and self.flow_id != flow_id:
             raise openml.exceptions.PyOpenMLError(
-                "Local flow_id does not match server flow_id: " f"'{self.flow_id}' vs '{flow_id}'",
+                f"Local flow_id does not match server flow_id: '{self.flow_id}' vs '{flow_id}'",
             )
 
         flow = openml.flows.functions.get_flow(flow_id)
@@ -517,7 +518,7 @@ def get_subflow(self, structure: list[str]) -> OpenMLFlow:
         sub_identifier = structure[0]
         if sub_identifier not in self.components:
             raise ValueError(
-                f"Flow {self.name} does not contain component with " f"identifier {sub_identifier}",
+                f"Flow {self.name} does not contain component with identifier {sub_identifier}",
             )
         if len(structure) == 1:
             return self.components[sub_identifier]  # type: ignore
diff --git a/openml/flows/functions.py b/openml/flows/functions.py
index 9906958e5..0a2058890 100644
--- a/openml/flows/functions.py
+++ b/openml/flows/functions.py
@@ -5,7 +5,7 @@
 import re
 from collections import OrderedDict
 from functools import partial
-from typing import Any, Dict
+from typing import Any
 
 import dateutil.parser
 import pandas as pd
@@ -31,7 +31,7 @@ def _get_cached_flows() -> OrderedDict:
     flows = OrderedDict()  # type: 'OrderedDict[int, OpenMLFlow]'
 
     flow_cache_dir = openml.utils._create_cache_directory(FLOWS_CACHE_DIR_NAME)
-    directory_content = os.listdir(flow_cache_dir)
+    directory_content = os.listdir(flow_cache_dir)  # noqa: PTH208
     directory_content.sort()
     # Find all flow ids for which we have downloaded
     # the flow description
@@ -66,28 +66,64 @@ def _get_cached_flow(fid: int) -> OpenMLFlow:
             return _create_flow_from_xml(fh.read())
     except OSError as e:
         openml.utils._remove_cache_dir_for_id(FLOWS_CACHE_DIR_NAME, fid_cache_dir)
-        raise OpenMLCacheException("Flow file for fid %d not cached" % fid) from e
+        raise OpenMLCacheException(f"Flow file for fid {fid} not cached") from e
 
 
 @openml.utils.thread_safe_if_oslo_installed
-def get_flow(flow_id: int, reinstantiate: bool = False, strict_version: bool = True) -> OpenMLFlow:  # noqa: FBT001, FBT002
-    """Download the OpenML flow for a given flow ID.
+def get_flow(flow_id: int, reinstantiate: bool = False, strict_version: bool = True) -> OpenMLFlow:  # noqa: FBT002
+    """Fetch an OpenMLFlow by its server-assigned ID.
+
+    Queries the OpenML REST API for the flow metadata and returns an
+    :class:`OpenMLFlow` instance. If the flow is already cached locally,
+    the cached copy is returned. Optionally the flow can be re-instantiated
+    into a concrete model instance using the registered extension.
 
     Parameters
     ----------
     flow_id : int
         The OpenML flow id.
-
-    reinstantiate: bool
-        Whether to reinstantiate the flow to a model instance.
-
-    strict_version : bool, default=True
-        Whether to fail if version requirements are not fulfilled.
+    reinstantiate : bool, optional (default=False)
+        If True, convert the flow description into a concrete model instance
+        using the flow's extension (e.g., sklearn). If conversion fails and
+        ``strict_version`` is True, an exception will be raised.
+    strict_version : bool, optional (default=True)
+        When ``reinstantiate`` is True, whether to enforce exact version
+        requirements for the extension/model. If False, a new flow may
+        be returned when versions differ.
 
     Returns
     -------
-    flow : OpenMLFlow
-        the flow
+    OpenMLFlow
+        The flow object with metadata; ``model`` may be populated when
+        ``reinstantiate=True``.
+
+    Raises
+    ------
+    OpenMLCacheException
+        When cached flow files are corrupted or cannot be read.
+    OpenMLServerException
+        When the REST API call fails.
+
+    Side Effects
+    ------------
+    - Writes to ``openml.config.cache_directory/flows/{flow_id}/flow.xml``
+      when the flow is downloaded from the server.
+
+    Preconditions
+    -------------
+    - Network access to the OpenML server is required unless the flow is cached.
+    - For private flows, ``openml.config.apikey`` must be set.
+
+    Notes
+    -----
+    Results are cached to speed up subsequent calls. When ``reinstantiate`` is
+    True and version mismatches occur, a new flow may be returned to reflect
+    the converted model (only when ``strict_version`` is False).
+
+    Examples
+    --------
+    >>> import openml
+    >>> flow = openml.flows.get_flow(5)  # doctest: +SKIP
     """
     flow_id = int(flow_id)
     flow = _get_flow_description(flow_id)
@@ -124,7 +160,7 @@ def _get_flow_description(flow_id: int) -> OpenMLFlow:
         xml_file = (
             openml.utils._create_cache_directory_for_id(FLOWS_CACHE_DIR_NAME, flow_id) / "flow.xml"
         )
-        flow_xml = openml._api_calls._perform_api_call("flow/%d" % flow_id, request_method="get")
+        flow_xml = openml._api_calls._perform_api_call(f"flow/{flow_id}", request_method="get")
 
         with xml_file.open("w", encoding="utf8") as fh:
             fh.write(flow_xml)
@@ -138,32 +174,47 @@ def list_flows(
     tag: str | None = None,
     uploader: str | None = None,
 ) -> pd.DataFrame:
-    """
-    Return a list of all flows which are on OpenML.
-    (Supports large amount of results)
+    """List flows available on the OpenML server.
+
+    This function supports paging and filtering and returns a pandas
+    DataFrame with one row per flow and columns for id, name, version,
+    external_version, full_name and uploader.
 
     Parameters
     ----------
     offset : int, optional
-        the number of flows to skip, starting from the first
+        Number of flows to skip, starting from the first (for paging).
     size : int, optional
-        the maximum number of flows to return
+        Maximum number of flows to return.
     tag : str, optional
-        the tag to include
-    kwargs: dict, optional
-        Legal filter operators: uploader.
+        Only return flows having this tag.
+    uploader : str, optional
+        Only return flows uploaded by this user.
 
     Returns
     -------
-    flows : dataframe
-            Each row maps to a dataset
-            Each column contains the following information:
-            - flow id
-            - full name
-            - name
-            - version
-            - external version
-            - uploader
+    pandas.DataFrame
+        Rows correspond to flows. Columns include ``id``, ``full_name``,
+        ``name``, ``version``, ``external_version``, and ``uploader``.
+
+    Raises
+    ------
+    OpenMLServerException
+        When the API call fails.
+
+    Side Effects
+    ------------
+    - None: results are fetched and returned; Read-only operation.
+
+    Preconditions
+    -------------
+    - Network access is required to list flows unless cached mechanisms are
+      used by the underlying API helper.
+
+    Examples
+    --------
+    >>> import openml
+    >>> flows = openml.flows.list_flows(size=100)  # doctest: +SKIP
     """
     listing_call = partial(_list_flows, tag=tag, uploader=uploader)
     batches = openml.utils._list_all(listing_call, offset=offset, limit=size)
@@ -206,25 +257,35 @@ def _list_flows(limit: int, offset: int, **kwargs: Any) -> pd.DataFrame:
 
 
 def flow_exists(name: str, external_version: str) -> int | bool:
-    """Retrieves the flow id.
+    """Check whether a flow (name + external_version) exists on the server.
 
-    A flow is uniquely identified by name + external_version.
+    The OpenML server defines uniqueness of flows by the pair
+    ``(name, external_version)``. This helper queries the server and
+    returns the corresponding flow id when present.
 
     Parameters
     ----------
-    name : string
-        Name of the flow
-    external_version : string
+    name : str
+        Flow name (e.g., ``sklearn.tree._classes.DecisionTreeClassifier(1)``).
+    external_version : str
         Version information associated with flow.
 
     Returns
     -------
-    flow_exist : int or bool
-        flow id iff exists, False otherwise
-
-    Notes
-    -----
-    see https://www.openml.org/api_docs/#!/flow/get_flow_exists_name_version
+    int or bool
+        The flow id if the flow exists on the server, otherwise ``False``.
+
+    Raises
+    ------
+    ValueError
+        If ``name`` or ``external_version`` are empty or not strings.
+    OpenMLServerException
+        When the API request fails.
+
+    Examples
+    --------
+    >>> import openml
+    >>> openml.flows.flow_exists("weka.JRip", "Weka_3.9.0_10153")  # doctest: +SKIP
     """
     if not (isinstance(name, str) and len(name) > 0):
         raise ValueError("Argument 'name' should be a non-empty string")
@@ -245,37 +306,60 @@ def flow_exists(name: str, external_version: str) -> int | bool:
 def get_flow_id(
     model: Any | None = None,
     name: str | None = None,
-    exact_version: bool = True,  # noqa: FBT001, FBT002
+    exact_version: bool = True,  # noqa: FBT002
 ) -> int | bool | list[int]:
-    """Retrieves the flow id for a model or a flow name.
+    """Retrieve flow id(s) for a model instance or a flow name.
 
-    Provide either a model or a name to this function. Depending on the input, it does
+    Provide either a concrete ``model`` (which will be converted to a flow by
+    the appropriate extension) or a flow ``name``. Behavior depends on
+    ``exact_version``:
 
-    * ``model`` and ``exact_version == True``: This helper function first queries for the necessary
-      extension. Second, it uses that extension to convert the model into a flow. Third, it
-      executes ``flow_exists`` to potentially obtain the flow id the flow is published to the
-      server.
-    * ``model`` and ``exact_version == False``: This helper function first queries for the
-      necessary extension. Second, it uses that extension to convert the model into a flow. Third
-      it calls ``list_flows`` and filters the returned values based on the flow name.
-    * ``name``: Ignores ``exact_version`` and calls ``list_flows``, then filters the returned
-      values based on the flow name.
+    - ``model`` + ``exact_version=True``: convert ``model`` to a flow and call
+        :func:`flow_exists` to get a single flow id (or False).
+    - ``model`` + ``exact_version=False``: convert ``model`` to a flow and
+        return all server flow ids with the same flow name.
+    - ``name``: ignore ``exact_version`` and return all server flow ids that
+        match ``name``.
 
     Parameters
     ----------
-    model : object
-        Any model. Must provide either ``model`` or ``name``.
-    name : str
-        Name of the flow. Must provide either ``model`` or ``name``.
-    exact_version : bool
-        Whether to return the flow id of the exact version or all flow ids where the name
-        of the flow matches. This is only taken into account for a model where a version number
-        is available (requires ``model`` to be set).
+    model : object, optional
+            A model instance that can be handled by a registered extension. Either
+            ``model`` or ``name`` must be provided.
+    name : str, optional
+            Flow name to query for. Either ``model`` or ``name`` must be provided.
+    exact_version : bool, optional (default=True)
+            When True and ``model`` is provided, only return the id for the exact
+            external version. When False, return a list of matching ids.
 
     Returns
     -------
-    int or bool, List
-        flow id iff exists, ``False`` otherwise, List if ``exact_version is False``
+    int or bool or list[int]
+            If ``exact_version`` is True: the flow id if found, otherwise ``False``.
+            If ``exact_version`` is False: a list of matching flow ids (may be empty).
+
+    Raises
+    ------
+    ValueError
+            If neither ``model`` nor ``name`` is provided, or if both are provided.
+    OpenMLServerException
+            If underlying API calls fail.
+
+    Side Effects
+    ------------
+    - May call server APIs (``flow/exists``, ``flow/list``) and therefore
+        depends on network access and API keys for private flows.
+
+    Examples
+    --------
+    >>> import openml
+    >>> # Lookup by flow name
+    >>> openml.flows.get_flow_id(name="weka.JRip")  # doctest: +SKIP
+    >>> # Lookup by model instance (requires a registered extension)
+    >>> import sklearn
+    >>> import openml_sklearn
+    >>> clf = sklearn.tree.DecisionTreeClassifier()
+    >>> openml.flows.get_flow_id(model=clf)  # doctest: +SKIP
     """
     if model is not None and name is not None:
         raise ValueError("Must provide either argument `model` or argument `name`, but not both.")
@@ -364,9 +448,9 @@ def assert_flows_equal(  # noqa: C901, PLR0912, PLR0913, PLR0915
     flow1: OpenMLFlow,
     flow2: OpenMLFlow,
     ignore_parameter_values_on_older_children: str | None = None,
-    ignore_parameter_values: bool = False,  # noqa: FBT001, FBT002
-    ignore_custom_name_if_none: bool = False,  # noqa:  FBT001, FBT002
-    check_description: bool = True,  # noqa:  FBT001, FBT002
+    ignore_parameter_values: bool = False,  # noqa: FBT002
+    ignore_custom_name_if_none: bool = False,  # noqa: FBT002
+    check_description: bool = True,  # noqa: FBT002
 ) -> None:
     """Check equality of two flows.
 
@@ -391,6 +475,21 @@ def assert_flows_equal(  # noqa: C901, PLR0912, PLR0913, PLR0915
 
     check_description : bool
         Whether to ignore matching of flow descriptions.
+
+    Raises
+    ------
+    TypeError
+        When either argument is not an :class:`OpenMLFlow`.
+    ValueError
+        When a relevant mismatch is found between the two flows.
+
+    Examples
+    --------
+    >>> import openml
+    >>> f1 = openml.flows.get_flow(5)  # doctest: +SKIP
+    >>> f2 = openml.flows.get_flow(5)  # doctest: +SKIP
+    >>> openml.flows.assert_flows_equal(f1, f2)  # doctest: +SKIP
+    >>> # If flows differ, a ValueError is raised
     """
     if not isinstance(flow1, OpenMLFlow):
         raise TypeError(f"Argument 1 must be of type OpenMLFlow, but is {type(flow1)}")
@@ -417,7 +516,7 @@ def assert_flows_equal(  # noqa: C901, PLR0912, PLR0913, PLR0915
         attr1 = getattr(flow1, key, None)
         attr2 = getattr(flow2, key, None)
         if key == "components":
-            if not (isinstance(attr1, Dict) and isinstance(attr2, Dict)):
+            if not (isinstance(attr1, dict) and isinstance(attr2, dict)):
                 raise TypeError("Cannot compare components because they are not dictionary.")
 
             for name in set(attr1.keys()).union(attr2.keys()):
@@ -456,9 +555,9 @@ def assert_flows_equal(  # noqa: C901, PLR0912, PLR0913, PLR0915
                         )
 
                 if ignore_parameter_values_on_older_children:
-                    assert (
-                        flow1.upload_date is not None
-                    ), "Flow1 has no upload date that allows us to compare age of children."
+                    assert flow1.upload_date is not None, (
+                        "Flow1 has no upload date that allows us to compare age of children."
+                    )
                     upload_date_current_flow = dateutil.parser.parse(flow1.upload_date)
                     upload_date_parent_flow = dateutil.parser.parse(
                         ignore_parameter_values_on_older_children,
@@ -493,8 +592,8 @@ def assert_flows_equal(  # noqa: C901, PLR0912, PLR0913, PLR0915
                 # iterating over the parameter's meta info list
                 for param in params1:
                     if (
-                        isinstance(flow1.parameters_meta_info[param], Dict)
-                        and isinstance(flow2.parameters_meta_info[param], Dict)
+                        isinstance(flow1.parameters_meta_info[param], dict)
+                        and isinstance(flow2.parameters_meta_info[param], dict)
                         and "data_type" in flow1.parameters_meta_info[param]
                         and "data_type" in flow2.parameters_meta_info[param]
                     ):
@@ -550,5 +649,20 @@ def delete_flow(flow_id: int) -> bool:
     -------
     bool
         True if the deletion was successful. False otherwise.
+
+    Raises
+    ------
+    OpenMLServerException
+        If the server-side deletion fails due to permissions or other errors.
+
+    Side Effects
+    ------------
+    - Removes the flow from the OpenML server (if permitted).
+
+    Examples
+    --------
+    >>> import openml
+    >>> # Deletes flow 23 if you are the uploader and it's not linked to runs
+    >>> openml.flows.delete_flow(23)  # doctest: +SKIP
     """
     return openml.utils._delete_entity("flow", flow_id)
diff --git a/openml/runs/__init__.py b/openml/runs/__init__.py
index 6d3dca504..2f068a2e6 100644
--- a/openml/runs/__init__.py
+++ b/openml/runs/__init__.py
@@ -19,14 +19,14 @@
     "OpenMLRun",
     "OpenMLRunTrace",
     "OpenMLTraceIteration",
-    "run_model_on_task",
-    "run_flow_on_task",
+    "delete_run",
     "get_run",
-    "list_runs",
-    "get_runs",
     "get_run_trace",
-    "run_exists",
+    "get_runs",
     "initialize_model_from_run",
     "initialize_model_from_trace",
-    "delete_run",
+    "list_runs",
+    "run_exists",
+    "run_flow_on_task",
+    "run_model_on_task",
 ]
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
index 666b75c37..d87bd3e18 100644
--- a/openml/runs/functions.py
+++ b/openml/runs/functions.py
@@ -18,7 +18,6 @@
 import openml
 import openml._api_calls
 import openml.utils
-from openml import config
 from openml.exceptions import (
     OpenMLCacheException,
     OpenMLRunsExistError,
@@ -45,7 +44,7 @@
 
 # Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles
 if TYPE_CHECKING:
-    from openml.config import _Config
+    from openml._config import _Config
     from openml.extensions.extension_interface import Extension
 
 # get_dict is in run.py to avoid circular imports
@@ -62,9 +61,9 @@ def run_model_on_task(  # noqa: PLR0913
     avoid_duplicate_runs: bool | None = None,
     flow_tags: list[str] | None = None,
     seed: int | None = None,
-    add_local_measures: bool = True,  # noqa: FBT001, FBT002
-    upload_flow: bool = False,  # noqa: FBT001, FBT002
-    return_flow: bool = False,  # noqa: FBT001, FBT002
+    add_local_measures: bool = True,  # noqa: FBT002
+    upload_flow: bool = False,  # noqa: FBT002
+    return_flow: bool = False,  # noqa: FBT002
     n_jobs: int | None = None,
 ) -> OpenMLRun | tuple[OpenMLRun, OpenMLFlow]:
     """Run the model on the dataset defined by the task.
@@ -107,7 +106,7 @@ def run_model_on_task(  # noqa: PLR0913
     """
     if avoid_duplicate_runs is None:
         avoid_duplicate_runs = openml.config.avoid_duplicate_runs
-    if avoid_duplicate_runs and not config.apikey:
+    if avoid_duplicate_runs and not openml.config.apikey:
         warnings.warn(
             "avoid_duplicate_runs is set to True, but no API key is set. "
             "Please set your API key in the OpenML configuration file, see"
@@ -181,8 +180,8 @@ def run_flow_on_task(  # noqa: C901, PLR0912, PLR0915, PLR0913
     avoid_duplicate_runs: bool | None = None,
     flow_tags: list[str] | None = None,
     seed: int | None = None,
-    add_local_measures: bool = True,  # noqa: FBT001, FBT002
-    upload_flow: bool = False,  # noqa: FBT001, FBT002
+    add_local_measures: bool = True,  # noqa: FBT002
+    upload_flow: bool = False,  # noqa: FBT002
     n_jobs: int | None = None,
 ) -> OpenMLRun:
     """Run the model provided by the flow on the dataset defined by task.
@@ -336,7 +335,7 @@ def run_flow_on_task(  # noqa: C901, PLR0912, PLR0915, PLR0913
         message = f"Executed Task {task.task_id} with Flow id:{run.flow_id}"
     else:
         message = f"Executed Task {task.task_id} on local Flow with name {flow.name}."
-    config.logger.info(message)
+    openml.config.logger.info(message)
 
     return run
 
@@ -353,7 +352,7 @@ def get_run_trace(run_id: int) -> OpenMLRunTrace:
     -------
     openml.runs.OpenMLTrace
     """
-    trace_xml = openml._api_calls._perform_api_call("run/trace/%d" % run_id, "get")
+    trace_xml = openml._api_calls._perform_api_call(f"run/trace/{run_id}", "get")
     return OpenMLRunTrace.trace_from_xml(trace_xml)
 
 
@@ -376,7 +375,8 @@ def initialize_model_from_run(run_id: int, *, strict_version: bool = True) -> An
     run = get_run(run_id)
     # TODO(eddiebergman): I imagine this is None if it's not published,
     # might need to raise an explicit error for that
-    assert run.setup_id is not None
+    if run.setup_id is None:
+        raise ValueError(f"Run {run_id} has no associated setup_id. Cannot initialize model.")
     return initialize_model(setup_id=run.setup_id, strict_version=strict_version)
 
 
@@ -416,7 +416,8 @@ def initialize_model_from_trace(
     run = get_run(run_id)
     # TODO(eddiebergman): I imagine this is None if it's not published,
     # might need to raise an explicit error for that
-    assert run.flow_id is not None
+    if run.flow_id is None:
+        raise ValueError(f"Run {run_id} has no associated flow_id. Cannot initialize model.")
 
     flow = get_flow(run.flow_id)
     run_trace = get_run_trace(run_id)
@@ -528,7 +529,7 @@ def _run_task_get_arffcontent(  # noqa: PLR0915, PLR0912, C901
 
     # The forked child process may not copy the configuration state of OpenML from the parent.
     # Current configuration setup needs to be copied and passed to the child processes.
-    _config = config.get_config_as_dict()
+    _config = openml.config.get_config_as_dict()
     # Execute runs in parallel
     # assuming the same number of tasks as workers (n_jobs), the total compute time for this
     # statement will be similar to the slowest run
@@ -576,8 +577,10 @@ def _calculate_local_measure(  # type: ignore
             _user_defined_measures_fold[openml_name] = sklearn_fn(_test_y, _pred_y)
 
         if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
-            assert test_y is not None
-            assert proba_y is not None
+            if test_y is None:
+                raise ValueError("test_y cannot be None for classification tasks.")
+            if proba_y is None:
+                raise ValueError("proba_y cannot be None for classification tasks.")
 
             for i, tst_idx in enumerate(test_indices):
                 if task.class_labels is not None:
@@ -608,7 +611,7 @@ def _calculate_local_measure(  # type: ignore
                         index=tst_idx,
                         prediction=prediction,
                         truth=truth,
-                        proba=dict(zip(task.class_labels, pred_prob)),
+                        proba=dict(zip(task.class_labels, pred_prob, strict=False)),
                     )
                 else:
                     raise ValueError("The task has no class labels")
@@ -622,7 +625,8 @@ def _calculate_local_measure(  # type: ignore
                 )
 
         elif isinstance(task, OpenMLRegressionTask):
-            assert test_y is not None
+            if test_y is None:
+                raise ValueError("test_y cannot be None for regression tasks.")
             for i, _ in enumerate(test_indices):
                 truth = test_y.iloc[i] if isinstance(test_y, pd.Series) else test_y[i]
                 arff_line = format_prediction(
@@ -733,7 +737,7 @@ def _run_task_get_arffcontent_parallel_helper(  # noqa: PLR0913
     """
     # Sets up the OpenML instantiated in the child process to match that of the parent's
     # if configuration=None, loads the default
-    config._setup(configuration)
+    openml.config._setup(configuration)
 
     train_indices, test_indices = task.get_train_test_split_indices(
         repeat=rep_no,
@@ -743,7 +747,8 @@ def _run_task_get_arffcontent_parallel_helper(  # noqa: PLR0913
 
     if isinstance(task, OpenMLSupervisedTask):
         x, y = task.get_X_and_y()
-        assert isinstance(y, (pd.Series, pd.DataFrame))
+        if not isinstance(y, (pd.Series, pd.DataFrame)):
+            raise TypeError(f"y must be a pandas Series or DataFrame, got {type(y).__name__}")
         train_x = x.iloc[train_indices]
         train_y = y.iloc[train_indices]
         test_x = x.iloc[test_indices]
@@ -755,9 +760,14 @@ def _run_task_get_arffcontent_parallel_helper(  # noqa: PLR0913
         test_x = None
         test_y = None
     else:
-        raise NotImplementedError(task.task_type)
+        raise NotImplementedError(
+            f"Task type '{task.task_type}' is not supported. "
+            f"Only OpenMLSupervisedTask and OpenMLClusteringTask are currently implemented. "
+            f"Task details: task_id={getattr(task, 'task_id', 'unknown')}, "
+            f"task_class={task.__class__.__name__}"
+        )
 
-    config.logger.info(
+    openml.config.logger.info(
         f"Going to run model {model!s} on "
         f"dataset {openml.datasets.get_dataset(task.dataset_id).name} "
         f"for repeat {rep_no} fold {fold_no} sample {sample_no}"
@@ -798,7 +808,7 @@ def get_runs(run_ids: list[int]) -> list[OpenMLRun]:
 
 
 @openml.utils.thread_safe_if_oslo_installed
-def get_run(run_id: int, ignore_cache: bool = False) -> OpenMLRun:  # noqa: FBT002, FBT001
+def get_run(run_id: int, ignore_cache: bool = False) -> OpenMLRun:  # noqa: FBT002
     """Gets run corresponding to run_id.
 
     Parameters
@@ -828,14 +838,14 @@ def get_run(run_id: int, ignore_cache: bool = False) -> OpenMLRun:  # noqa: FBT0
         raise OpenMLCacheException(message="dummy")
 
     except OpenMLCacheException:
-        run_xml = openml._api_calls._perform_api_call("run/%d" % run_id, "get")
+        run_xml = openml._api_calls._perform_api_call(f"run/{run_id}", "get")
         with run_file.open("w", encoding="utf8") as fh:
             fh.write(run_xml)
 
     return _create_run_from_xml(run_xml)
 
 
-def _create_run_from_xml(xml: str, from_server: bool = True) -> OpenMLRun:  # noqa: PLR0915, PLR0912, C901, FBT001, FBT002
+def _create_run_from_xml(xml: str, from_server: bool = True) -> OpenMLRun:  # noqa: PLR0915, PLR0912, C901, FBT002
     """Create a run object from xml returned from server.
 
     Parameters
@@ -977,18 +987,24 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None):  # type: ignore
                     evaluations[key] = value
 
     if "description" not in files and from_server is True:
-        raise ValueError("No description file for run %d in run description XML" % run_id)
+        raise ValueError(f"No description file for run {run_id} in run description XML")
 
     if "predictions" not in files and from_server is True:
         task = openml.tasks.get_task(task_id)
         if task.task_type_id == TaskType.SUBGROUP_DISCOVERY:
-            raise NotImplementedError("Subgroup discovery tasks are not yet supported.")
+            raise NotImplementedError(
+                f"Subgroup discovery tasks are not yet supported. "
+                f"Task ID: {task_id}. Please check the OpenML documentation"
+                f"for supported task types. "
+                f"Currently supported task types: Classification, Regression,"
+                f"Clustering, and Learning Curve."
+            )
 
         # JvR: actually, I am not sure whether this error should be raised.
         # a run can consist without predictions. But for now let's keep it
         # Matthias: yes, it should stay as long as we do not really handle
         # this stuff
-        raise ValueError("No prediction files for run %d in run description XML" % run_id)
+        raise ValueError(f"No prediction files for run {run_id} in run description XML")
 
     tags = openml.utils.extract_xml_tags("oml:tag", run)
 
@@ -1037,7 +1053,7 @@ def list_runs(  # noqa: PLR0913
     uploader: list | None = None,
     tag: str | None = None,
     study: int | None = None,
-    display_errors: bool = False,  # noqa: FBT001, FBT002
+    display_errors: bool = False,  # noqa: FBT002
     task_type: TaskType | int | None = None,
 ) -> pd.DataFrame:
     """
@@ -1171,7 +1187,7 @@ def _list_runs(  # noqa: PLR0913, C901
     if uploader is not None:
         api_call += f"/uploader/{','.join([str(int(i)) for i in uploader])}"
     if study is not None:
-        api_call += "/study/%d" % study
+        api_call += f"/study/{study}"
     if display_errors:
         api_call += "/show_errors/true"
     if tag is not None:
@@ -1202,7 +1218,11 @@ def __list_runs(api_call: str) -> pd.DataFrame:
             f'"http://openml.org/openml": {runs_dict}',
         )
 
-    assert isinstance(runs_dict["oml:runs"]["oml:run"], list), type(runs_dict["oml:runs"])
+    if not isinstance(runs_dict["oml:runs"]["oml:run"], list):
+        raise TypeError(
+            f"Expected runs_dict['oml:runs']['oml:run'] to be a list, "
+            f"got {type(runs_dict['oml:runs']['oml:run']).__name__}"
+        )
 
     runs = {
         int(r["oml:run_id"]): {
@@ -1282,7 +1302,12 @@ def format_prediction(  # noqa: PLR0913
     if isinstance(task, OpenMLRegressionTask):
         return [repeat, fold, index, prediction, truth]
 
-    raise NotImplementedError(f"Formatting for {type(task)} is not supported.")
+    raise NotImplementedError(
+        f"Formatting for {type(task)} is not supported."
+        f"Supported task types: OpenMLClassificationTask, OpenMLRegressionTask,"
+        f"and OpenMLLearningCurveTask. "
+        f"Please ensure your task is one of these types."
+    )
 
 
 def delete_run(run_id: int) -> bool:
diff --git a/openml/runs/run.py b/openml/runs/run.py
index 945264131..086e9c046 100644
--- a/openml/runs/run.py
+++ b/openml/runs/run.py
@@ -4,12 +4,11 @@
 import pickle
 import time
 from collections import OrderedDict
+from collections.abc import Callable, Sequence
 from pathlib import Path
 from typing import (
     TYPE_CHECKING,
     Any,
-    Callable,
-    Sequence,
 )
 
 import arff
@@ -280,7 +279,7 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]:
         ]
 
     @classmethod
-    def from_filesystem(cls, directory: str | Path, expect_model: bool = True) -> OpenMLRun:  # noqa: FBT001, FBT002
+    def from_filesystem(cls, directory: str | Path, expect_model: bool = True) -> OpenMLRun:  # noqa: FBT002
         """
         The inverse of the to_filesystem method. Instantiates an OpenMLRun
         object based on files stored on the file system.
@@ -347,7 +346,7 @@ def from_filesystem(cls, directory: str | Path, expect_model: bool = True) -> Op
     def to_filesystem(
         self,
         directory: str | Path,
-        store_model: bool = True,  # noqa: FBT001, FBT002
+        store_model: bool = True,  # noqa: FBT002
     ) -> None:
         """
         The inverse of the from_filesystem method. Serializes a run
@@ -365,7 +364,7 @@ def to_filesystem(
             model.
         """
         if self.data_content is None or self.model is None:
-            raise ValueError("Run should have been executed (and contain " "model / predictions)")
+            raise ValueError("Run should have been executed (and contain model / predictions)")
         directory = Path(directory)
         directory.mkdir(exist_ok=True, parents=True)
 
@@ -390,6 +389,57 @@ def to_filesystem(
         if self.trace is not None:
             self.trace._to_filesystem(directory)
 
+    def _get_arff_attributes_for_task(self, task: OpenMLTask) -> list[tuple[str, Any]]:
+        """Get ARFF attributes based on task type.
+
+        Parameters
+        ----------
+        task : OpenMLTask
+            The task for which to generate attributes.
+
+        Returns
+        -------
+        list[tuple[str, Any]]
+            List of attribute tuples (name, type).
+        """
+        instance_specifications = [
+            ("repeat", "NUMERIC"),
+            ("fold", "NUMERIC"),
+        ]
+
+        if isinstance(task, (OpenMLLearningCurveTask, OpenMLClassificationTask)):
+            instance_specifications.append(("sample", "NUMERIC"))
+
+        instance_specifications.append(("row_id", "NUMERIC"))
+
+        if isinstance(task, (OpenMLLearningCurveTask, OpenMLClassificationTask)):
+            class_labels = task.class_labels
+            if class_labels is None:
+                raise ValueError("The task has no class labels")
+
+            prediction_confidences = [
+                ("confidence." + class_labels[i], "NUMERIC") for i in range(len(class_labels))
+            ]
+            prediction_and_true = [("prediction", class_labels), ("correct", class_labels)]
+            return instance_specifications + prediction_and_true + prediction_confidences
+
+        if isinstance(task, OpenMLRegressionTask):
+            return [*instance_specifications, ("prediction", "NUMERIC"), ("truth", "NUMERIC")]
+
+        if isinstance(task, OpenMLClusteringTask):
+            return [*instance_specifications, ("cluster", "NUMERIC")]
+
+        supported_task_types = [
+            TaskType.SUPERVISED_CLASSIFICATION,
+            TaskType.SUPERVISED_REGRESSION,
+            TaskType.CLUSTERING,
+            TaskType.LEARNING_CURVE,
+        ]
+        raise NotImplementedError(
+            f"Task type {task.task_type!s} for task_id {getattr(task, 'task_id', None)!s} "
+            f"is not yet supported. Supported task types are: {supported_task_types!r}"
+        )
+
     def _generate_arff_dict(self) -> OrderedDict[str, Any]:
         """Generates the arff dictionary for uploading predictions to the
         server.
@@ -407,7 +457,8 @@ def _generate_arff_dict(self) -> OrderedDict[str, Any]:
         if self.data_content is None:
             raise ValueError("Run has not been executed.")
         if self.flow is None:
-            assert self.flow_id is not None, "Run has no associated flow id!"
+            if self.flow_id is None:
+                raise ValueError("Run has no associated flow id!")
             self.flow = get_flow(self.flow_id)
 
         if self.description_text is None:
@@ -418,69 +469,7 @@ def _generate_arff_dict(self) -> OrderedDict[str, Any]:
         arff_dict["data"] = self.data_content
         arff_dict["description"] = self.description_text
         arff_dict["relation"] = f"openml_task_{task.task_id}_predictions"
-
-        if isinstance(task, OpenMLLearningCurveTask):
-            class_labels = task.class_labels
-            instance_specifications = [
-                ("repeat", "NUMERIC"),
-                ("fold", "NUMERIC"),
-                ("sample", "NUMERIC"),
-                ("row_id", "NUMERIC"),
-            ]
-
-            arff_dict["attributes"] = instance_specifications
-            if class_labels is not None:
-                arff_dict["attributes"] = (
-                    arff_dict["attributes"]
-                    + [("prediction", class_labels), ("correct", class_labels)]
-                    + [
-                        ("confidence." + class_labels[i], "NUMERIC")
-                        for i in range(len(class_labels))
-                    ]
-                )
-            else:
-                raise ValueError("The task has no class labels")
-
-        elif isinstance(task, OpenMLClassificationTask):
-            class_labels = task.class_labels
-            instance_specifications = [
-                ("repeat", "NUMERIC"),
-                ("fold", "NUMERIC"),
-                ("sample", "NUMERIC"),  # Legacy
-                ("row_id", "NUMERIC"),
-            ]
-
-            arff_dict["attributes"] = instance_specifications
-            if class_labels is not None:
-                prediction_confidences = [
-                    ("confidence." + class_labels[i], "NUMERIC") for i in range(len(class_labels))
-                ]
-                prediction_and_true = [("prediction", class_labels), ("correct", class_labels)]
-                arff_dict["attributes"] = (
-                    arff_dict["attributes"] + prediction_and_true + prediction_confidences
-                )
-            else:
-                raise ValueError("The task has no class labels")
-
-        elif isinstance(task, OpenMLRegressionTask):
-            arff_dict["attributes"] = [
-                ("repeat", "NUMERIC"),
-                ("fold", "NUMERIC"),
-                ("row_id", "NUMERIC"),
-                ("prediction", "NUMERIC"),
-                ("truth", "NUMERIC"),
-            ]
-
-        elif isinstance(task, OpenMLClusteringTask):
-            arff_dict["attributes"] = [
-                ("repeat", "NUMERIC"),
-                ("fold", "NUMERIC"),
-                ("row_id", "NUMERIC"),
-                ("cluster", "NUMERIC"),
-            ]
-
-        else:
-            raise NotImplementedError(f"Task type {task.task_type!s} is not yet supported.")
+        arff_dict["attributes"] = self._get_arff_attributes_for_task(task)
 
         return arff_dict
 
@@ -517,7 +506,7 @@ def get_metric_fn(self, sklearn_fn: Callable, kwargs: dict | None = None) -> np.
             # TODO: make this a stream reader
         else:
             raise ValueError(
-                "Run should have been locally executed or " "contain outputfile reference.",
+                "Run should have been locally executed or contain outputfile reference.",
             )
 
         # Need to know more about the task to compute scores correctly
@@ -528,11 +517,11 @@ def get_metric_fn(self, sklearn_fn: Callable, kwargs: dict | None = None) -> np.
             task.task_type_id in [TaskType.SUPERVISED_CLASSIFICATION, TaskType.LEARNING_CURVE]
             and "correct" not in attribute_names
         ):
-            raise ValueError('Attribute "correct" should be set for ' "classification task runs")
+            raise ValueError('Attribute "correct" should be set for classification task runs')
         if task.task_type_id == TaskType.SUPERVISED_REGRESSION and "truth" not in attribute_names:
-            raise ValueError('Attribute "truth" should be set for ' "regression task runs")
+            raise ValueError('Attribute "truth" should be set for regression task runs')
         if task.task_type_id != TaskType.CLUSTERING and "prediction" not in attribute_names:
-            raise ValueError('Attribute "predict" should be set for ' "supervised task runs")
+            raise ValueError('Attribute "prediction" should be set for supervised task runs')
 
         def _attribute_list_to_dict(attribute_list):  # type: ignore
             # convenience function: Creates a mapping to map from the name of
@@ -566,7 +555,7 @@ def _attribute_list_to_dict(attribute_list):  # type: ignore
             pred = predictions_arff["attributes"][predicted_idx][1]
             corr = predictions_arff["attributes"][correct_idx][1]
             raise ValueError(
-                "Predicted and Correct do not have equal values:" f" {pred!s} Vs. {corr!s}",
+                f"Predicted and Correct do not have equal values: {pred!s} Vs. {corr!s}",
             )
 
         # TODO: these could be cached
@@ -602,7 +591,7 @@ def _attribute_list_to_dict(attribute_list):  # type: ignore
             values_correct[rep][fold][samp].append(correct)
 
         scores = []
-        for rep in values_predict:
+        for rep in values_predict:  # noqa: PLC0206
             for fold in values_predict[rep]:
                 last_sample = len(values_predict[rep][fold]) - 1
                 y_pred = values_predict[rep][fold][last_sample]
@@ -637,7 +626,10 @@ def _get_file_elements(self) -> dict:
 
         if self.parameter_settings is None:
             if self.flow is None:
-                assert self.flow_id is not None  # for mypy
+                if self.flow_id is None:
+                    raise ValueError(
+                        "Run has no associated flow_id and cannot obtain parameter values."
+                    )
                 self.flow = openml.flows.get_flow(self.flow_id)
             self.parameter_settings = self.flow.extension.obtain_parameter_values(
                 self.flow,
diff --git a/openml/runs/trace.py b/openml/runs/trace.py
index bc9e1b5d6..f76bd04e8 100644
--- a/openml/runs/trace.py
+++ b/openml/runs/trace.py
@@ -3,9 +3,10 @@
 
 import json
 from collections import OrderedDict
+from collections.abc import Iterator
 from dataclasses import dataclass
 from pathlib import Path
-from typing import IO, Any, Iterator
+from typing import IO, Any
 from typing_extensions import Self
 
 import arff
@@ -93,7 +94,8 @@ def get_parameters(self) -> dict[str, Any]:
                 for param, value in self.setup_string.items()
             }
 
-        assert self.parameters is not None
+        if self.parameters is None:
+            raise ValueError("Parameters must be set before calling get_parameters().")
         return {param[len(PREFIX) :]: value for param, value in self.parameters.items()}
 
 
@@ -149,9 +151,7 @@ def get_selected_iteration(self, fold: int, repeat: int) -> int:
         for r, f, i in self.trace_iterations:
             if r == repeat and f == fold and self.trace_iterations[(r, f, i)].selected is True:
                 return i
-        raise ValueError(
-            "Could not find the selected iteration for rep/fold %d/%d" % (repeat, fold),
-        )
+        raise ValueError(f"Could not find the selected iteration for rep/fold {repeat}/{fold}")
 
     @classmethod
     def generate(
@@ -185,8 +185,7 @@ def generate(
             raise ValueError("Trace content is empty.")
         if len(attributes) != len(content[0]):
             raise ValueError(
-                "Trace_attributes and trace_content not compatible:"
-                f" {attributes} vs {content[0]}",
+                f"Trace_attributes and trace_content not compatible: {attributes} vs {content[0]}",
             )
 
         return cls._trace_from_arff_struct(
@@ -492,13 +491,21 @@ def merge_traces(cls, traces: list[OpenMLRunTrace]) -> OpenMLRunTrace:
             for iteration in trace:
                 key = (iteration.repeat, iteration.fold, iteration.iteration)
 
-                assert iteration.parameters is not None
+                if iteration.parameters is None:
+                    raise ValueError(
+                        f"Iteration parameters cannot be None for repeat {iteration.repeat}, "
+                        f"fold {iteration.fold}, iteration {iteration.iteration}"
+                    )
                 param_keys = iteration.parameters.keys()
 
                 if previous_iteration is not None:
                     trace_itr = merged_trace[previous_iteration]
 
-                    assert trace_itr.parameters is not None
+                    if trace_itr.parameters is None:
+                        raise ValueError(
+                            f"Trace iteration parameters cannot be None "
+                            f"for iteration {previous_iteration}"
+                        )
                     trace_itr_keys = trace_itr.parameters.keys()
 
                     if list(param_keys) != list(trace_itr_keys):
diff --git a/openml/setups/__init__.py b/openml/setups/__init__.py
index dd38cb9b7..fa4072059 100644
--- a/openml/setups/__init__.py
+++ b/openml/setups/__init__.py
@@ -4,10 +4,10 @@
 from .setup import OpenMLParameter, OpenMLSetup
 
 __all__ = [
-    "OpenMLSetup",
     "OpenMLParameter",
+    "OpenMLSetup",
     "get_setup",
+    "initialize_model",
     "list_setups",
     "setup_exists",
-    "initialize_model",
 ]
diff --git a/openml/setups/functions.py b/openml/setups/functions.py
index 374911901..a24d3a456 100644
--- a/openml/setups/functions.py
+++ b/openml/setups/functions.py
@@ -2,11 +2,11 @@
 from __future__ import annotations
 
 from collections import OrderedDict
+from collections.abc import Iterable
 from functools import partial
 from itertools import chain
 from pathlib import Path
-from typing import Any, Iterable
-from typing_extensions import Literal
+from typing import Any, Literal
 
 import pandas as pd
 import xmltodict
@@ -14,7 +14,6 @@
 import openml
 import openml.exceptions
 import openml.utils
-from openml import config
 from openml.flows import OpenMLFlow, flow_exists
 
 from .setup import OpenMLParameter, OpenMLSetup
@@ -84,7 +83,7 @@ def _get_cached_setup(setup_id: int) -> OpenMLSetup:
     OpenMLCacheException
         If the setup file for the given setup ID is not cached.
     """
-    cache_dir = Path(config.get_cache_directory())
+    cache_dir = Path(openml.config.get_cache_directory())
     setup_cache_dir = cache_dir / "setups" / str(setup_id)
     try:
         setup_file = setup_cache_dir / "description.xml"
@@ -94,7 +93,7 @@ def _get_cached_setup(setup_id: int) -> OpenMLSetup:
 
     except OSError as e:
         raise openml.exceptions.OpenMLCacheException(
-            "Setup file for setup id %d not cached" % setup_id,
+            f"Setup file for setup id {setup_id} not cached",
         ) from e
 
 
@@ -112,7 +111,7 @@ def get_setup(setup_id: int) -> OpenMLSetup:
     -------
     OpenMLSetup (an initialized openml setup object)
     """
-    setup_dir = Path(config.get_cache_directory()) / "setups" / str(setup_id)
+    setup_dir = Path(openml.config.get_cache_directory()) / "setups" / str(setup_id)
     setup_dir.mkdir(exist_ok=True, parents=True)
 
     setup_file = setup_dir / "description.xml"
diff --git a/openml/setups/setup.py b/openml/setups/setup.py
index 0960ad4c1..0c3a3cb6b 100644
--- a/openml/setups/setup.py
+++ b/openml/setups/setup.py
@@ -1,12 +1,13 @@
 # License: BSD 3-Clause
 from __future__ import annotations
 
+from dataclasses import asdict, dataclass
 from typing import Any
 
-import openml.config
 import openml.flows
 
 
+@dataclass
 class OpenMLSetup:
     """Setup object (a.k.a. Configuration).
 
@@ -20,20 +21,20 @@ class OpenMLSetup:
         The setting of the parameters
     """
 
-    def __init__(self, setup_id: int, flow_id: int, parameters: dict[int, Any] | None):
-        if not isinstance(setup_id, int):
+    setup_id: int
+    flow_id: int
+    parameters: dict[int, Any] | None
+
+    def __post_init__(self) -> None:
+        if not isinstance(self.setup_id, int):
             raise ValueError("setup id should be int")
 
-        if not isinstance(flow_id, int):
+        if not isinstance(self.flow_id, int):
             raise ValueError("flow id should be int")
 
-        if parameters is not None and not isinstance(parameters, dict):
+        if self.parameters is not None and not isinstance(self.parameters, dict):
             raise ValueError("parameters should be dict")
 
-        self.setup_id = setup_id
-        self.flow_id = flow_id
-        self.parameters = parameters
-
     def _to_dict(self) -> dict[str, Any]:
         return {
             "setup_id": self.setup_id,
@@ -66,6 +67,7 @@ def __repr__(self) -> str:
         return header + body
 
 
+@dataclass
 class OpenMLParameter:
     """Parameter object (used in setup).
 
@@ -91,37 +93,24 @@ class OpenMLParameter:
         If the parameter was set, the value that it was set to.
     """
 
-    def __init__(  # noqa: PLR0913
-        self,
-        input_id: int,
-        flow_id: int,
-        flow_name: str,
-        full_name: str,
-        parameter_name: str,
-        data_type: str,
-        default_value: str,
-        value: str,
-    ):
-        self.id = input_id
-        self.flow_id = flow_id
-        self.flow_name = flow_name
-        self.full_name = full_name
-        self.parameter_name = parameter_name
-        self.data_type = data_type
-        self.default_value = default_value
-        self.value = value
+    input_id: int
+    flow_id: int
+    flow_name: str
+    full_name: str
+    parameter_name: str
+    data_type: str
+    default_value: str
+    value: str
+
+    def __post_init__(self) -> None:
+        # Map input_id to id for backward compatibility
+        self.id = self.input_id
 
     def _to_dict(self) -> dict[str, Any]:
-        return {
-            "id": self.id,
-            "flow_id": self.flow_id,
-            "flow_name": self.flow_name,
-            "full_name": self.full_name,
-            "parameter_name": self.parameter_name,
-            "data_type": self.data_type,
-            "default_value": self.default_value,
-            "value": self.value,
-        }
+        result = asdict(self)
+        # Replaces input_id with id for backward compatibility
+        result["id"] = result.pop("input_id")
+        return result
 
     def __repr__(self) -> str:
         header = "OpenML Parameter"
diff --git a/openml/study/__init__.py b/openml/study/__init__.py
index b7d77fec4..37a6d376a 100644
--- a/openml/study/__init__.py
+++ b/openml/study/__init__.py
@@ -19,8 +19,8 @@
 from .study import OpenMLBenchmarkSuite, OpenMLStudy
 
 __all__ = [
-    "OpenMLStudy",
     "OpenMLBenchmarkSuite",
+    "OpenMLStudy",
     "attach_to_study",
     "attach_to_suite",
     "create_benchmark_suite",
@@ -33,6 +33,6 @@
     "get_suite",
     "list_studies",
     "list_suites",
-    "update_suite_status",
     "update_study_status",
+    "update_suite_status",
 ]
diff --git a/openml/study/functions.py b/openml/study/functions.py
index 4e16879d7..7268ea97c 100644
--- a/openml/study/functions.py
+++ b/openml/study/functions.py
@@ -1,5 +1,4 @@
 # License: BSD 3-Clause
-# ruff: noqa: PLR0913
 from __future__ import annotations
 
 import warnings
@@ -10,7 +9,6 @@
 import xmltodict
 
 import openml._api_calls
-import openml.config
 import openml.utils
 from openml.study.study import OpenMLBenchmarkSuite, OpenMLStudy
 
@@ -422,7 +420,7 @@ def detach_from_study(study_id: int, run_ids: list[int]) -> int:
         new size of the study (in terms of explicitly linked entities)
     """
     # Interestingly, there's no need to tell the server about the entity type, it knows by itself
-    uri = "study/%d/detach" % study_id
+    uri = f"study/{study_id}/detach"
     post_variables = {"ids": ",".join(str(x) for x in run_ids)}  # type: openml._api_calls.DATA_TYPE
     result_xml = openml._api_calls._perform_api_call(
         call=uri,
diff --git a/openml/study/study.py b/openml/study/study.py
index 83bbf0497..803c6455b 100644
--- a/openml/study/study.py
+++ b/openml/study/study.py
@@ -2,10 +2,11 @@
 # TODO(eddiebergman): Begging for dataclassses to shorten this all
 from __future__ import annotations
 
-from typing import Any, Sequence
+from collections.abc import Sequence
+from typing import Any
 
+import openml
 from openml.base import OpenMLBase
-from openml.config import get_server_base_url
 
 
 class BaseStudy(OpenMLBase):
@@ -110,7 +111,7 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]:
             fields["ID"] = self.study_id
             fields["Study URL"] = self.openml_url
         if self.creator is not None:
-            fields["Creator"] = f"{get_server_base_url()}/u/{self.creator}"
+            fields["Creator"] = f"{openml.config.get_server_base_url()}/u/{self.creator}"
         if self.creation_date is not None:
             fields["Upload Time"] = self.creation_date.replace("T", " ")
         if self.data is not None:
@@ -175,11 +176,23 @@ def _to_dict(self) -> dict[str, dict]:
 
     def push_tag(self, tag: str) -> None:
         """Add a tag to the study."""
-        raise NotImplementedError("Tags for studies is not (yet) supported.")
+        raise NotImplementedError(
+            "Tag management for studies is not yet supported. "
+            "The OpenML Python SDK does not currently provide functionality"
+            "for adding tags to studies."
+            "For updates on this feature, please refer to the GitHub issues at: "
+            "https://github.com/openml/openml-python/issues"
+        )
 
     def remove_tag(self, tag: str) -> None:
         """Remove a tag from the study."""
-        raise NotImplementedError("Tags for studies is not (yet) supported.")
+        raise NotImplementedError(
+            "Tag management for studies is not yet supported. "
+            "The OpenML Python SDK does not currently provide functionality"
+            "for removing tags from studies. "
+            "For updates on this feature, please refer to the GitHub issues at: "
+            "https://github.com/openml/openml-python/issues"
+        )
 
 
 class OpenMLStudy(BaseStudy):
diff --git a/openml/tasks/__init__.py b/openml/tasks/__init__.py
index f6df3a8d4..34c994e3a 100644
--- a/openml/tasks/__init__.py
+++ b/openml/tasks/__init__.py
@@ -19,17 +19,17 @@
 )
 
 __all__ = [
-    "OpenMLTask",
-    "OpenMLSupervisedTask",
-    "OpenMLClusteringTask",
-    "OpenMLRegressionTask",
     "OpenMLClassificationTask",
+    "OpenMLClusteringTask",
     "OpenMLLearningCurveTask",
+    "OpenMLRegressionTask",
+    "OpenMLSplit",
+    "OpenMLSupervisedTask",
+    "OpenMLTask",
+    "TaskType",
     "create_task",
+    "delete_task",
     "get_task",
     "get_tasks",
     "list_tasks",
-    "OpenMLSplit",
-    "TaskType",
-    "delete_task",
 ]
diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py
index d2bf5e946..3fbc7adee 100644
--- a/openml/tasks/functions.py
+++ b/openml/tasks/functions.py
@@ -38,7 +38,7 @@ def _get_cached_tasks() -> dict[int, OpenMLTask]:
         OpenMLTask.
     """
     task_cache_dir = openml.utils._create_cache_directory(TASKS_CACHE_DIR_NAME)
-    directory_content = os.listdir(task_cache_dir)
+    directory_content = os.listdir(task_cache_dir)  # noqa: PTH208
     directory_content.sort()
 
     # Find all dataset ids for which we have downloaded the dataset
@@ -329,7 +329,7 @@ def __list_tasks(api_call: str) -> pd.DataFrame:  # noqa: C901, PLR0912
         except KeyError as e:
             if tid is not None:
                 warnings.warn(
-                    "Invalid xml for task %d: %s\nFrom %s" % (tid, e, task_),
+                    f"Invalid xml for task {tid}: {e}\nFrom {task_}",
                     RuntimeWarning,
                     stacklevel=2,
                 )
@@ -388,7 +388,7 @@ def get_tasks(
 @openml.utils.thread_safe_if_oslo_installed
 def get_task(
     task_id: int,
-    download_splits: bool = False,  # noqa: FBT001, FBT002
+    download_splits: bool = False,  # noqa: FBT002
     **get_dataset_kwargs: Any,
 ) -> OpenMLTask:
     """Download OpenML task for a given task ID.
@@ -415,8 +415,10 @@ def get_task(
     if not isinstance(task_id, int):
         raise TypeError(f"Task id should be integer, is {type(task_id)}")
 
-    tid_cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id)
-
+    task_cache_directory = openml.utils._create_cache_directory_for_id(
+        TASKS_CACHE_DIR_NAME, task_id
+    )
+    task_cache_directory_existed = task_cache_directory.exists()
     try:
         task = _get_task_description(task_id)
         dataset = get_dataset(task.dataset_id, **get_dataset_kwargs)
@@ -424,13 +426,17 @@ def get_task(
         # Including class labels as part of task meta data handles
         #   the case where data download was initially disabled
         if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
+            assert task.target_name is not None, (
+                "Supervised tasks must define a target feature before retrieving class labels."
+            )
             task.class_labels = dataset.retrieve_class_labels(task.target_name)
         # Clustering tasks do not have class labels
         # and do not offer download_split
         if download_splits and isinstance(task, OpenMLSupervisedTask):
             task.download_split()
     except Exception as e:
-        openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, tid_cache_dir)
+        if not task_cache_directory_existed:
+            openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, task_cache_directory)
         raise e
 
     return task
@@ -442,7 +448,7 @@ def _get_task_description(task_id: int) -> OpenMLTask:
     except OpenMLCacheException:
         _cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id)
         xml_file = _cache_dir / "task.xml"
-        task_xml = openml._api_calls._perform_api_call("task/%d" % task_id, "get")
+        task_xml = openml._api_calls._perform_api_call(f"task/{task_id}", "get")
 
         with xml_file.open("w", encoding="utf8") as fh:
             fh.write(task_xml)
@@ -526,7 +532,12 @@ def _create_task_from_xml(xml: str) -> OpenMLTask:
         TaskType.LEARNING_CURVE: OpenMLLearningCurveTask,
     }.get(task_type)
     if cls is None:
-        raise NotImplementedError(f"Task type {common_kwargs['task_type']} not supported.")
+        raise NotImplementedError(
+            f"Task type '{common_kwargs['task_type']}' is not supported. "
+            f"Supported task types: SUPERVISED_CLASSIFICATION,"
+            f"SUPERVISED_REGRESSION, CLUSTERING, LEARNING_CURVE."
+            f"Please check the OpenML documentation for available task types."
+        )
     return cls(**common_kwargs)  # type: ignore
 
 
@@ -582,9 +593,16 @@ def create_task(
     elif task_type == TaskType.SUPERVISED_REGRESSION:
         task_cls = OpenMLRegressionTask  # type: ignore
     else:
-        raise NotImplementedError(f"Task type {task_type:d} not supported.")
+        raise NotImplementedError(
+            f"Task type ID {task_type:d} is not supported. "
+            f"Supported task type IDs: {TaskType.SUPERVISED_CLASSIFICATION.value},"
+            f"{TaskType.SUPERVISED_REGRESSION.value}, "
+            f"{TaskType.CLUSTERING.value}, {TaskType.LEARNING_CURVE.value}. "
+            f"Please refer to the TaskType enum for valid task type identifiers."
+        )
 
     return task_cls(
+        task_id=None,
         task_type_id=task_type,
         task_type="None",  # TODO: refactor to get task type string from ID.
         data_set_id=dataset_id,
diff --git a/openml/tasks/split.py b/openml/tasks/split.py
index 4e781df35..464e41b2a 100644
--- a/openml/tasks/split.py
+++ b/openml/tasks/split.py
@@ -18,7 +18,7 @@ class Split(NamedTuple):
     test: np.ndarray
 
 
-class OpenMLSplit:
+class OpenMLSplit:  # noqa: PLW1641
     """OpenML Split object.
 
     This class manages train-test splits for a dataset across multiple
diff --git a/openml/tasks/task.py b/openml/tasks/task.py
index 395b52482..ab3cb3da4 100644
--- a/openml/tasks/task.py
+++ b/openml/tasks/task.py
@@ -1,17 +1,17 @@
 # License: BSD 3-Clause
-# TODO(eddbergman): Seems like a lot of the subclasses could just get away with setting
-# a `ClassVar` for whatever changes as their `__init__` defaults, less duplicated code.
 from __future__ import annotations
 
+import logging
 import warnings
 from abc import ABC
+from collections.abc import Sequence
 from enum import Enum
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, Sequence
+from typing import TYPE_CHECKING, Any, ClassVar
 from typing_extensions import TypedDict
 
+import arff
+
 import openml._api_calls
-import openml.config
 from openml import datasets
 from openml.base import OpenMLBase
 from openml.utils import _create_cache_directory_for_id
@@ -23,6 +23,9 @@
     import pandas as pd
 
 
+logger = logging.getLogger(__name__)
+
+
 # TODO(eddiebergman): Should use `auto()` but might be too late if these numbers are used
 # and stored on server.
 class TaskType(Enum):
@@ -70,31 +73,45 @@ class OpenMLTask(OpenMLBase):
         Refers to the URL of the data splits used for the OpenML task.
     """
 
+    DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1
+
     def __init__(  # noqa: PLR0913
         self,
         task_id: int | None,
         task_type_id: TaskType,
         task_type: str,
         data_set_id: int,
-        estimation_procedure_id: int = 1,
+        estimation_procedure_id: int | None = None,
         estimation_procedure_type: str | None = None,
         estimation_parameters: dict[str, str] | None = None,
         evaluation_measure: str | None = None,
         data_splits_url: str | None = None,
+        target_name: str | None = None,
     ):
         self.task_id = int(task_id) if task_id is not None else None
         self.task_type_id = task_type_id
         self.task_type = task_type
         self.dataset_id = int(data_set_id)
+        self.target_name = target_name
+        resolved_estimation_procedure_id = self._resolve_estimation_procedure_id(
+            estimation_procedure_id,
+        )
         self.evaluation_measure = evaluation_measure
         self.estimation_procedure: _EstimationProcedure = {
             "type": estimation_procedure_type,
             "parameters": estimation_parameters,
             "data_splits_url": data_splits_url,
         }
-        self.estimation_procedure_id = estimation_procedure_id
+        self.estimation_procedure_id = resolved_estimation_procedure_id
         self.split: OpenMLSplit | None = None
 
+    def _resolve_estimation_procedure_id(self, estimation_procedure_id: int | None) -> int:
+        return (
+            estimation_procedure_id
+            if estimation_procedure_id is not None
+            else self.DEFAULT_ESTIMATION_PROCEDURE_ID
+        )
+
     @classmethod
     def _entity_letter(cls) -> str:
         return "t"
@@ -128,7 +145,8 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]:
             if class_labels is not None:
                 fields["# of Classes"] = len(class_labels)
 
-            if hasattr(self, "cost_matrix"):
+            cost_matrix = getattr(self, "cost_matrix", None)
+            if cost_matrix is not None:
                 fields["Cost Matrix"] = "Available"
 
         # determines the order in which the information will be printed
@@ -164,18 +182,6 @@ def get_train_test_split_indices(
 
         return self.split.get(repeat=repeat, fold=fold, sample=sample)
 
-    def _download_split(self, cache_file: Path) -> None:
-        # TODO(eddiebergman): Not sure about this try to read and error approach
-        try:
-            with cache_file.open(encoding="utf8"):
-                pass
-        except OSError:
-            split_url = self.estimation_procedure["data_splits_url"]
-            openml._api_calls._download_text_file(
-                source=str(split_url),
-                output_path=str(cache_file),
-            )
-
     def download_split(self) -> OpenMLSplit:
         """Download the OpenML split for a given task."""
         # TODO(eddiebergman): Can this every be `None`?
@@ -185,9 +191,23 @@ def download_split(self) -> OpenMLSplit:
 
         try:
             split = OpenMLSplit._from_arff_file(cached_split_file)
-        except OSError:
+            logger.debug("Loaded file from cache: %s", str(cached_split_file))
+        except (OSError, arff.BadDataFormat):
+            logger.info("Failed to load file from cache: %s", str(cached_split_file))
+            if cached_split_file.exists():
+                logger.debug("Cleaning up old file")
+                cached_split_file.unlink()
             # Next, download and cache the associated split file
-            self._download_split(cached_split_file)
+            split_url = self.estimation_procedure["data_splits_url"]
+            openml._api_calls._download_text_file(
+                source=str(split_url),
+                output_path=str(cached_split_file),
+            )
+            if cached_split_file.exists():
+                logger.info("New file created of size %d", cached_split_file.stat().st_size)
+            else:
+                logger.info("Failed to create new file")
+
             split = OpenMLSplit._from_arff_file(cached_split_file)
 
         return split
@@ -249,13 +269,15 @@ class OpenMLSupervisedTask(OpenMLTask, ABC):
         Refers to the unique identifier of task.
     """
 
+    DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1
+
     def __init__(  # noqa: PLR0913
         self,
         task_type_id: TaskType,
         task_type: str,
         data_set_id: int,
         target_name: str,
-        estimation_procedure_id: int = 1,
+        estimation_procedure_id: int | None = None,
         estimation_procedure_type: str | None = None,
         estimation_parameters: dict[str, str] | None = None,
         evaluation_measure: str | None = None,
@@ -272,10 +294,9 @@ def __init__(  # noqa: PLR0913
             estimation_parameters=estimation_parameters,
             evaluation_measure=evaluation_measure,
             data_splits_url=data_splits_url,
+            target_name=target_name,
         )
 
-        self.target_name = target_name
-
     def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]:
         """Get data associated with the current task.
 
@@ -290,7 +311,12 @@ def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]:
             TaskType.SUPERVISED_REGRESSION,
             TaskType.LEARNING_CURVE,
         ):
-            raise NotImplementedError(self.task_type)
+            raise NotImplementedError(
+                f"Task type '{self.task_type}' is not implemented for get_X_and_y(). "
+                f"Supported types: SUPERVISED_CLASSIFICATION, SUPERVISED_REGRESSION,"
+                f"LEARNING_CURVE."
+                f"Task ID: {getattr(self, 'task_id', 'unknown')}. "
+            )
 
         X, y, _, _ = dataset.get_data(target=self.target_name)
         return X, y
@@ -325,6 +351,8 @@ class OpenMLClassificationTask(OpenMLSupervisedTask):
 
     Parameters
     ----------
+    task_id : Union[int, None]
+        ID of the Classification task (if it already exists on OpenML).
     task_type_id : TaskType
         ID of the Classification task type.
     task_type : str
@@ -333,7 +361,7 @@ class OpenMLClassificationTask(OpenMLSupervisedTask):
         ID of the OpenML dataset associated with the Classification task.
     target_name : str
         Name of the target variable.
-    estimation_procedure_id : int, default=None
+    estimation_procedure_id : int, default=1
         ID of the estimation procedure for the Classification task.
     estimation_procedure_type : str, default=None
         Type of the estimation procedure.
@@ -343,21 +371,21 @@ class OpenMLClassificationTask(OpenMLSupervisedTask):
         Name of the evaluation measure.
     data_splits_url : str, default=None
         URL of the data splits for the Classification task.
-    task_id : Union[int, None]
-        ID of the Classification task (if it already exists on OpenML).
     class_labels : List of str, default=None
         A list of class labels (for classification tasks).
     cost_matrix : array, default=None
         A cost matrix (for classification tasks).
     """
 
+    DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1
+
     def __init__(  # noqa: PLR0913
         self,
         task_type_id: TaskType,
         task_type: str,
         data_set_id: int,
         target_name: str,
-        estimation_procedure_id: int = 1,
+        estimation_procedure_id: int | None = None,
         estimation_procedure_type: str | None = None,
         estimation_parameters: dict[str, str] | None = None,
         evaluation_measure: str | None = None,
@@ -367,22 +395,21 @@ def __init__(  # noqa: PLR0913
         cost_matrix: np.ndarray | None = None,
     ):
         super().__init__(
-            task_id=task_id,
             task_type_id=task_type_id,
             task_type=task_type,
             data_set_id=data_set_id,
+            target_name=target_name,
             estimation_procedure_id=estimation_procedure_id,
             estimation_procedure_type=estimation_procedure_type,
             estimation_parameters=estimation_parameters,
             evaluation_measure=evaluation_measure,
-            target_name=target_name,
             data_splits_url=data_splits_url,
+            task_id=task_id,
         )
         self.class_labels = class_labels
         self.cost_matrix = cost_matrix
-
         if cost_matrix is not None:
-            raise NotImplementedError("Costmatrix")
+            raise NotImplementedError("Costmatrix functionality is not yet implemented.")
 
 
 class OpenMLRegressionTask(OpenMLSupervisedTask):
@@ -390,6 +417,8 @@ class OpenMLRegressionTask(OpenMLSupervisedTask):
 
     Parameters
     ----------
+    task_id : Union[int, None]
+        ID of the OpenML Regression task.
     task_type_id : TaskType
         Task type ID of the OpenML Regression task.
     task_type : str
@@ -398,7 +427,7 @@ class OpenMLRegressionTask(OpenMLSupervisedTask):
         ID of the OpenML dataset.
     target_name : str
         Name of the target feature used in the Regression task.
-    estimation_procedure_id : int, default=None
+    estimation_procedure_id : int, default=7
         ID of the OpenML estimation procedure.
     estimation_procedure_type : str, default=None
         Type of the OpenML estimation procedure.
@@ -406,37 +435,11 @@ class OpenMLRegressionTask(OpenMLSupervisedTask):
         Parameters used by the OpenML estimation procedure.
     data_splits_url : str, default=None
         URL of the OpenML data splits for the Regression task.
-    task_id : Union[int, None]
-        ID of the OpenML Regression task.
     evaluation_measure : str, default=None
         Evaluation measure used in the Regression task.
     """
 
-    def __init__(  # noqa: PLR0913
-        self,
-        task_type_id: TaskType,
-        task_type: str,
-        data_set_id: int,
-        target_name: str,
-        estimation_procedure_id: int = 7,
-        estimation_procedure_type: str | None = None,
-        estimation_parameters: dict[str, str] | None = None,
-        data_splits_url: str | None = None,
-        task_id: int | None = None,
-        evaluation_measure: str | None = None,
-    ):
-        super().__init__(
-            task_id=task_id,
-            task_type_id=task_type_id,
-            task_type=task_type,
-            data_set_id=data_set_id,
-            estimation_procedure_id=estimation_procedure_id,
-            estimation_procedure_type=estimation_procedure_type,
-            estimation_parameters=estimation_parameters,
-            evaluation_measure=evaluation_measure,
-            target_name=target_name,
-            data_splits_url=data_splits_url,
-        )
+    DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 7
 
 
 class OpenMLClusteringTask(OpenMLTask):
@@ -444,16 +447,16 @@ class OpenMLClusteringTask(OpenMLTask):
 
     Parameters
     ----------
+    task_id : Union[int, None]
+        ID of the OpenML clustering task.
     task_type_id : TaskType
         Task type ID of the OpenML clustering task.
     task_type : str
         Task type of the OpenML clustering task.
     data_set_id : int
         ID of the OpenML dataset used in clustering the task.
-    estimation_procedure_id : int, default=None
+    estimation_procedure_id : int, default=17
         ID of the OpenML estimation procedure.
-    task_id : Union[int, None]
-        ID of the OpenML clustering task.
     estimation_procedure_type : str, default=None
         Type of the OpenML estimation procedure used in the clustering task.
     estimation_parameters : dict, default=None
@@ -467,32 +470,7 @@ class OpenMLClusteringTask(OpenMLTask):
         feature set for the clustering task.
     """
 
-    def __init__(  # noqa: PLR0913
-        self,
-        task_type_id: TaskType,
-        task_type: str,
-        data_set_id: int,
-        estimation_procedure_id: int = 17,
-        task_id: int | None = None,
-        estimation_procedure_type: str | None = None,
-        estimation_parameters: dict[str, str] | None = None,
-        data_splits_url: str | None = None,
-        evaluation_measure: str | None = None,
-        target_name: str | None = None,
-    ):
-        super().__init__(
-            task_id=task_id,
-            task_type_id=task_type_id,
-            task_type=task_type,
-            data_set_id=data_set_id,
-            evaluation_measure=evaluation_measure,
-            estimation_procedure_id=estimation_procedure_id,
-            estimation_procedure_type=estimation_procedure_type,
-            estimation_parameters=estimation_parameters,
-            data_splits_url=data_splits_url,
-        )
-
-        self.target_name = target_name
+    DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 17
 
     def get_X(self) -> pd.DataFrame:
         """Get data associated with the current task.
@@ -528,6 +506,8 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask):
 
     Parameters
     ----------
+    task_id : Union[int, None]
+        ID of the Learning Curve task.
     task_type_id : TaskType
         ID of the Learning Curve task.
     task_type : str
@@ -536,7 +516,7 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask):
         ID of the dataset that this task is associated with.
     target_name : str
         Name of the target feature in the dataset.
-    estimation_procedure_id : int, default=None
+    estimation_procedure_id : int, default=13
         ID of the estimation procedure to use for evaluating models.
     estimation_procedure_type : str, default=None
         Type of the estimation procedure.
@@ -544,8 +524,6 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask):
         Additional parameters for the estimation procedure.
     data_splits_url : str, default=None
         URL of the file containing the data splits for Learning Curve task.
-    task_id : Union[int, None]
-        ID of the Learning Curve task.
     evaluation_measure : str, default=None
         Name of the evaluation measure to use for evaluating models.
     class_labels : list of str, default=None
@@ -554,32 +532,4 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask):
         Cost matrix for Learning Curve tasks.
     """
 
-    def __init__(  # noqa: PLR0913
-        self,
-        task_type_id: TaskType,
-        task_type: str,
-        data_set_id: int,
-        target_name: str,
-        estimation_procedure_id: int = 13,
-        estimation_procedure_type: str | None = None,
-        estimation_parameters: dict[str, str] | None = None,
-        data_splits_url: str | None = None,
-        task_id: int | None = None,
-        evaluation_measure: str | None = None,
-        class_labels: list[str] | None = None,
-        cost_matrix: np.ndarray | None = None,
-    ):
-        super().__init__(
-            task_id=task_id,
-            task_type_id=task_type_id,
-            task_type=task_type,
-            data_set_id=data_set_id,
-            estimation_procedure_id=estimation_procedure_id,
-            estimation_procedure_type=estimation_procedure_type,
-            estimation_parameters=estimation_parameters,
-            evaluation_measure=evaluation_measure,
-            target_name=target_name,
-            data_splits_url=data_splits_url,
-            class_labels=class_labels,
-            cost_matrix=cost_matrix,
-        )
+    DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 13
diff --git a/openml/testing.py b/openml/testing.py
index 2003bb1b9..9f694f9bf 100644
--- a/openml/testing.py
+++ b/openml/testing.py
@@ -47,9 +47,9 @@ class TestBase(unittest.TestCase):
         "user": [],
     }
     flow_name_tracker: ClassVar[list[str]] = []
-    test_server = "https://test.openml.org/api/v1/xml"
-    # amueller's read/write key that he will throw away later
-    apikey = "610344db6388d9ba34f6db45a3cf71de"
+    test_server = f"{openml.config.TEST_SERVER_URL}/api/v1/xml"
+    admin_key = os.environ.get(openml.config.OPENML_TEST_SERVER_ADMIN_KEY_ENV_VAR)
+    user_key = openml.config._TEST_SERVER_NORMAL_USER_KEY
 
     # creating logger for tracking files uploaded to test server
     logger = logging.getLogger("unit_tests_published_entities")
@@ -80,7 +80,7 @@ def setUp(self, n_levels: int = 1, tmpdir_suffix: str = "") -> None:
         for _ in range(n_levels):
             static_cache_dir = static_cache_dir.parent.absolute()
 
-        content = os.listdir(static_cache_dir)
+        content = os.listdir(static_cache_dir)  # noqa: PTH208
         if "files" in content:
             static_cache_dir = static_cache_dir / "files"
         else:
@@ -99,7 +99,7 @@ def setUp(self, n_levels: int = 1, tmpdir_suffix: str = "") -> None:
         os.chdir(self.workdir)
 
         self.cached = True
-        openml.config.apikey = TestBase.apikey
+        openml.config.apikey = TestBase.user_key
         self.production_server = "https://www.openml.org/api/v1/xml"
         openml.config.set_root_cache_directory(str(self.workdir))
 
@@ -166,7 +166,11 @@ def _delete_entity_from_tracker(cls, entity_type: str, entity: int) -> None:
                 delete_index = next(
                     i
                     for i, (id_, _) in enumerate(
-                        zip(TestBase.publish_tracker[entity_type], TestBase.flow_name_tracker),
+                        zip(
+                            TestBase.publish_tracker[entity_type],
+                            TestBase.flow_name_tracker,
+                            strict=False,
+                        ),
                     )
                     if id_ == entity
                 )
@@ -352,9 +356,9 @@ def create_request_response(
 
 
 __all__ = [
-    "TestBase",
-    "SimpleImputer",
     "CustomImputer",
+    "SimpleImputer",
+    "TestBase",
     "check_task_existence",
     "create_request_response",
 ]
diff --git a/openml/utils/__init__.py b/openml/utils/__init__.py
new file mode 100644
index 000000000..1e74a3684
--- /dev/null
+++ b/openml/utils/__init__.py
@@ -0,0 +1,39 @@
+"""Utilities module."""
+
+from openml.utils._openml import (
+    ProgressBar,
+    ReprMixin,
+    _create_cache_directory,
+    _create_cache_directory_for_id,
+    _create_lockfiles_dir,
+    _delete_entity,
+    _get_cache_dir_for_id,
+    _get_cache_dir_for_key,
+    _get_rest_api_type_alias,
+    _list_all,
+    _remove_cache_dir_for_id,
+    _tag_entity,
+    _tag_openml_base,
+    extract_xml_tags,
+    get_cache_size,
+    thread_safe_if_oslo_installed,
+)
+
+__all__ = [
+    "ProgressBar",
+    "ReprMixin",
+    "_create_cache_directory",
+    "_create_cache_directory_for_id",
+    "_create_lockfiles_dir",
+    "_delete_entity",
+    "_get_cache_dir_for_id",
+    "_get_cache_dir_for_key",
+    "_get_rest_api_type_alias",
+    "_list_all",
+    "_remove_cache_dir_for_id",
+    "_tag_entity",
+    "_tag_openml_base",
+    "extract_xml_tags",
+    "get_cache_size",
+    "thread_safe_if_oslo_installed",
+]
diff --git a/openml/utils.py b/openml/utils/_openml.py
similarity index 83%
rename from openml/utils.py
rename to openml/utils/_openml.py
index 7e72e7aee..2bf54690e 100644
--- a/openml/utils.py
+++ b/openml/utils/_openml.py
@@ -2,12 +2,21 @@
 from __future__ import annotations
 
 import contextlib
+import re
 import shutil
 import warnings
+from abc import ABC, abstractmethod
+from collections.abc import Callable, Iterable, Mapping, Sequence, Sized
 from functools import wraps
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Mapping, Sized, TypeVar, overload
-from typing_extensions import Literal, ParamSpec
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Literal,
+    TypeVar,
+    overload,
+)
+from typing_extensions import ParamSpec
 
 import numpy as np
 import xmltodict
@@ -18,8 +27,6 @@
 import openml._api_calls
 import openml.exceptions
 
-from . import config
-
 # Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles
 if TYPE_CHECKING:
     from openml.base import OpenMLBase
@@ -103,7 +110,7 @@ def _get_rest_api_type_alias(oml_object: OpenMLBase) -> str:
     return api_type_alias
 
 
-def _tag_openml_base(oml_object: OpenMLBase, tag: str, untag: bool = False) -> None:  # noqa: FBT001, FBT002
+def _tag_openml_base(oml_object: OpenMLBase, tag: str, untag: bool = False) -> None:  # noqa: FBT002
     api_type_alias = _get_rest_api_type_alias(oml_object)
     if oml_object.id is None:
         raise openml.exceptions.ObjectNotPublishedError(
@@ -198,7 +205,7 @@ def _delete_entity(entity_type: str, entity_id: int) -> bool:
     if entity_type not in legal_entities:
         raise ValueError(f"Can't delete a {entity_type}")
 
-    url_suffix = "%s/%d" % (entity_type, entity_id)
+    url_suffix = f"{entity_type}/{entity_id}"
     try:
         result_xml = openml._api_calls._perform_api_call(url_suffix, "delete")
         result = xmltodict.parse(result_xml)
@@ -328,7 +335,7 @@ def _list_all(  # noqa: C901
 
 
 def _get_cache_dir_for_key(key: str) -> Path:
-    return Path(config.get_cache_directory()) / key
+    return Path(openml.config.get_cache_directory()) / key
 
 
 def _create_cache_directory(key: str) -> Path:
@@ -344,7 +351,7 @@ def _create_cache_directory(key: str) -> Path:
     return cache_dir
 
 
-def _get_cache_dir_for_id(key: str, id_: int, create: bool = False) -> Path:  # noqa: FBT001, FBT002
+def _get_cache_dir_for_id(key: str, id_: int, create: bool = False) -> Path:  # noqa: FBT002
     cache_dir = _create_cache_directory(key) if create else _get_cache_dir_for_key(key)
     return Path(cache_dir) / str(id_)
 
@@ -427,8 +434,20 @@ def safe_func(*args: P.args, **kwargs: P.kwargs) -> R:
         return func
 
 
+def get_cache_size() -> int:
+    """Calculate the size of OpenML cache directory
+
+    Returns
+    -------
+    cache_size: int
+        Total size of cache in bytes
+    """
+    path = Path(openml.config.get_cache_directory())
+    return sum(f.stat().st_size for f in path.rglob("*") if f.is_file())
+
+
 def _create_lockfiles_dir() -> Path:
-    path = Path(config.get_cache_directory()) / "locks"
+    path = Path(openml.config.get_cache_directory()) / "locks"
     # TODO(eddiebergman): Not sure why this is allowed to error and ignore???
     with contextlib.suppress(OSError):
         path.mkdir(exist_ok=True, parents=True)
@@ -469,3 +488,57 @@ def update(self, length: int) -> None:
         self._progress_bar.update(length)
         if self._progress_bar.total <= self._progress_bar.n:
             self._progress_bar.close()
+
+
+class ReprMixin(ABC):
+    """A mixin class that provides a customizable string representation for OpenML objects.
+
+    This mixin standardizes the __repr__ output format across OpenML classes.
+    Classes inheriting from this mixin should implement the
+    _get_repr_body_fields method to specify which fields to display.
+    """
+
+    def __repr__(self) -> str:
+        body_fields = self._get_repr_body_fields()
+        return self._apply_repr_template(body_fields)
+
+    @abstractmethod
+    def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]:
+        """Collect all information to display in the __repr__ body.
+
+        Returns
+        -------
+        body_fields : List[Tuple[str, Union[str, int, List[str]]]]
+            A list of (name, value) pairs to display in the body of the __repr__.
+            E.g.: [('metric', 'accuracy'), ('dataset', 'iris')]
+            If value is a List of str, then each item of the list will appear in a separate row.
+        """
+        # Should be implemented in the base class.
+
+    def _apply_repr_template(
+        self,
+        body_fields: Iterable[tuple[str, str | int | list[str] | None]],
+    ) -> str:
+        """Generates the header and formats the body for string representation of the object.
+
+        Parameters
+        ----------
+        body_fields: List[Tuple[str, str]]
+           A list of (name, value) pairs to display in the body of the __repr__.
+        """
+        # We add spaces between capitals, e.g. ClassificationTask -> Classification Task
+        name_with_spaces = re.sub(
+            r"(\w)([A-Z])",
+            r"\1 \2",
+            self.__class__.__name__[len("OpenML") :],
+        )
+        header_text = f"OpenML {name_with_spaces}"
+        header = f"{header_text}\n{'=' * len(header_text)}\n"
+
+        _body_fields: list[tuple[str, str | int | list[str]]] = [
+            (k, "None" if v is None else v) for k, v in body_fields
+        ]
+        longest_field_name_length = max(len(name) for name, _ in _body_fields)
+        field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}"
+        body = "\n".join(field_line_format.format(name, value) for name, value in _body_fields)
+        return header + body
diff --git a/pyproject.toml b/pyproject.toml
index 2bf762b09..8c463968b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,7 +21,7 @@ dependencies = [
   "pyarrow",
   "tqdm",  # For MinIO download progress bars
 ]
-requires-python = ">=3.8"
+requires-python = ">=3.10,<3.15"    
 maintainers = [
   { name = "Pieter Gijsbers", email="p.gijsbers@tue.nl"},
   { name = "Lennart Purucker"},
@@ -50,12 +50,11 @@ classifiers = [
   "Operating System :: Unix",
   "Operating System :: MacOS",
   "Programming Language :: Python :: 3",
-  "Programming Language :: Python :: 3.8",
-  "Programming Language :: Python :: 3.9",
   "Programming Language :: Python :: 3.10",
   "Programming Language :: Python :: 3.11",
   "Programming Language :: Python :: 3.12",
   "Programming Language :: Python :: 3.13",
+  "Programming Language :: Python :: 3.14",
 ]
 license = { file = "LICENSE" }
 
@@ -127,6 +126,7 @@ version = {attr = "openml.__version__.__version__"}
 
 # https://docs.pytest.org/en/7.2.x/reference/reference.html#ini-options-ref
 [tool.pytest.ini_options]
+log_level="DEBUG"
 testpaths = ["tests"]
 minversion = "7.0"
 xfail_strict = true
@@ -134,15 +134,15 @@ filterwarnings=[
     "ignore:the matrix subclass:PendingDeprecationWarning"
 ]
 markers = [
-  "server: anything that connects to a server",
   "upload: anything that uploads to a server",
-  "production: any interaction with the production server",
+  "production_server: any interaction with the production server",
   "cache: anything that interacts with the (test) cache",
+  "test_server: tests that require the OpenML test server",
 ]
 
 # https://github.com/charliermarsh/ruff
 [tool.ruff]
-target-version = "py38"
+target-version = "py310"
 line-length = 100
 output-format = "grouped"
 src = ["openml", "tests", "examples"]
@@ -275,9 +275,11 @@ ignore = [
   "S101",    # Use of assert detected.
   "W292",    # No newline at end of file
   "PLC1901", # "" can be simplified to be falsey
-  "TCH003",  # Move stdlib import into TYPE_CHECKING
+  "TC003",  # Move stdlib import into TYPE_CHECKING
   "COM812",  # Trailing comma missing (handled by linter, ruff recommend disabling if using formatter)
   "N803",    # Argument should be lowercase (but we accept things like `X`)
+  "PLC0415", # Allow imports inside functions / non-top-level scope
+  "FBT001",  # Allow Boolean-typed positional argument in function definition
 
   # TODO(@eddibergman): These should be enabled
   "D100",    # Missing docstring in public module
@@ -308,7 +310,7 @@ force-wrap-aliases = true
 convention = "numpy"
 
 [tool.mypy]
-python_version = "3.8"
+python_version = "3.10"
 packages = ["openml", "tests"]
 
 show_error_codes = true
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 000000000..000969b80
--- /dev/null
+++ b/scripts/__init__.py
@@ -0,0 +1 @@
+"""Package for scripts and utilities."""
diff --git a/tests/conftest.py b/tests/conftest.py
index 40a801e86..1967f1fad 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -98,7 +98,7 @@ def delete_remote_files(tracker, flow_names) -> None:
     :return: None
     """
     openml.config.server = TestBase.test_server
-    openml.config.apikey = TestBase.apikey
+    openml.config.apikey = TestBase.user_key
 
     # reordering to delete sub flows at the end of flows
     # sub-flows have shorter names, hence, sorting by descending order of flow name length
@@ -251,7 +251,7 @@ def test_files_directory() -> Path:
 
 @pytest.fixture(scope="session")
 def test_api_key() -> str:
-    return "c0c42819af31e706efe1f4b88c23c6c1"
+    return TestBase.user_key
 
 
 @pytest.fixture(autouse=True, scope="function")
@@ -272,34 +272,42 @@ def as_robot() -> Iterator[None]:
 
 @pytest.fixture(autouse=True)
 def with_server(request):
-    if "production" in request.keywords:
+    if os.getenv("OPENML_USE_LOCAL_SERVICES") == "true":
+        openml.config.TEST_SERVER_URL = "http://localhost:8000"
+    if "production_server" in request.keywords:
         openml.config.server = "https://www.openml.org/api/v1/xml"
+        openml.config.apikey = None
         yield
         return
-    openml.config.server = "https://test.openml.org/api/v1/xml"
-    openml.config.apikey = "c0c42819af31e706efe1f4b88c23c6c1"
+    openml.config.server = f"{openml.config.TEST_SERVER_URL}/api/v1/xml"
+    openml.config.apikey = TestBase.user_key
     yield
 
 
 @pytest.fixture(autouse=True)
 def with_test_cache(test_files_directory, request):
+    # Skip this fixture for TestBase subclasses - they manage their own cache directory
+    # in setUp()/tearDown(). Having both mechanisms fight over the global config
+    # causes race conditions.
+    if request.instance is not None and isinstance(request.instance, TestBase):
+        yield
+        return
+
     if not test_files_directory.exists():
         raise ValueError(
             f"Cannot find test cache dir, expected it to be {test_files_directory!s}!",
         )
     _root_cache_directory = openml.config._root_cache_directory
-    tmp_cache = test_files_directory / request.node.name
+    tmp_cache = test_files_directory / request.node.nodeid.replace("/", ".").replace("::", ".")
     openml.config.set_root_cache_directory(tmp_cache)
     yield
     openml.config.set_root_cache_directory(_root_cache_directory)
     if tmp_cache.exists():
         shutil.rmtree(tmp_cache)
         
-        
 
 @pytest.fixture
 def static_cache_dir():
-    
     return Path(__file__).parent / "files" 
 
 @pytest.fixture
@@ -307,4 +315,4 @@ def workdir(tmp_path):
     original_cwd = Path.cwd()
     os.chdir(tmp_path)
     yield tmp_path
-    os.chdir(original_cwd)
+    os.chdir(original_cwd)
\ No newline at end of file
diff --git a/tests/files/localhost_8000 b/tests/files/localhost_8000
new file mode 120000
index 000000000..334c709ef
--- /dev/null
+++ b/tests/files/localhost_8000
@@ -0,0 +1 @@
+org/openml/test
\ No newline at end of file
diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py
index 86a4d3f57..c651845fb 100644
--- a/tests/test_datasets/test_dataset.py
+++ b/tests/test_datasets/test_dataset.py
@@ -18,7 +18,7 @@
 import pytest
 
 
-@pytest.mark.production()
+@pytest.mark.production_server()
 class OpenMLDatasetTest(TestBase):
     _multiprocess_can_split_ = True
 
@@ -102,21 +102,24 @@ def test_get_data_pandas(self):
         assert isinstance(data, pd.DataFrame)
         assert data.shape[1] == len(self.titanic.features)
         assert data.shape[0] == 1309
+        # Dynamically detect what this version of Pandas calls string columns.
+        str_dtype = data["name"].dtype.name
+
         col_dtype = {
             "pclass": "uint8",
             "survived": "category",
-            "name": "object",
+            "name": str_dtype,
             "sex": "category",
             "age": "float64",
             "sibsp": "uint8",
             "parch": "uint8",
-            "ticket": "object",
+            "ticket": str_dtype,
             "fare": "float64",
-            "cabin": "object",
+            "cabin": str_dtype,
             "embarked": "category",
-            "boat": "object",
+            "boat": str_dtype,
             "body": "float64",
-            "home.dest": "object",
+            "home.dest": str_dtype,
         }
         for col_name in data.columns:
             assert data[col_name].dtype.name == col_dtype[col_name]
@@ -278,6 +281,7 @@ def test_equality_comparison(self):
         self.assertNotEqual(self.titanic, "Wrong_object")
 
 
+@pytest.mark.test_server()
 def test_tagging():
     dataset = openml.datasets.get_dataset(125, download_data=False)
 
@@ -294,6 +298,7 @@ def test_tagging():
     datasets = openml.datasets.list_datasets(tag=tag)
     assert datasets.empty
 
+@pytest.mark.test_server()
 def test_get_feature_with_ontology_data_id_11():
     # test on car dataset, which has built-in ontology references
     dataset = openml.datasets.get_dataset(11)
@@ -302,6 +307,7 @@ def test_get_feature_with_ontology_data_id_11():
     assert len(dataset.features[2].ontologies) >= 1
     assert len(dataset.features[3].ontologies) >= 1   
 
+@pytest.mark.test_server()
 def test_add_remove_ontology_to_dataset():
     did = 1
     feature_index = 1
@@ -309,6 +315,7 @@ def test_add_remove_ontology_to_dataset():
     openml.datasets.functions.data_feature_add_ontology(did, feature_index, ontology)
     openml.datasets.functions.data_feature_remove_ontology(did, feature_index, ontology)    
 
+@pytest.mark.test_server()
 def test_add_same_ontology_multiple_features():
     did = 1
     ontology = "https://www.openml.org/unittest/" + str(time())
@@ -317,6 +324,7 @@ def test_add_same_ontology_multiple_features():
         openml.datasets.functions.data_feature_add_ontology(did, i, ontology)    
 
 
+@pytest.mark.test_server()
 def test_add_illegal_long_ontology():
     did = 1
     ontology = "http://www.google.com/" + ("a" * 257)
@@ -328,6 +336,7 @@ def test_add_illegal_long_ontology():
     
 
 
+@pytest.mark.test_server()
 def test_add_illegal_url_ontology():
     did = 1
     ontology = "not_a_url" + str(time())
@@ -338,7 +347,7 @@ def test_add_illegal_url_ontology():
         assert e.code == 1106
 
 
-@pytest.mark.production()
+@pytest.mark.production_server()
 class OpenMLDatasetTestSparse(TestBase):
     _multiprocess_can_split_ = True
 
@@ -351,7 +360,7 @@ def setUp(self):
     def test_get_sparse_dataset_dataframe_with_target(self):
         X, y, _, attribute_names = self.sparse_dataset.get_data(target="class")
         assert isinstance(X, pd.DataFrame)
-        assert isinstance(X.dtypes[0], pd.SparseDtype)
+        assert isinstance(X.dtypes.iloc[0], pd.SparseDtype)
         assert X.shape == (600, 20000)
 
         assert isinstance(y, pd.Series)
@@ -399,6 +408,7 @@ def test_get_sparse_categorical_data_id_395(self):
         assert len(feature.nominal_values) == 25
 
 
+@pytest.mark.test_server()
 def test__read_features(mocker, workdir, static_cache_dir):
     """Test we read the features from the xml if no cache pickle is available.
     This test also does some simple checks to verify that the features are read correctly
@@ -430,6 +440,7 @@ def test__read_features(mocker, workdir, static_cache_dir):
     assert pickle_mock.dump.call_count == 1
 
 
+@pytest.mark.test_server()
 def test__read_qualities(static_cache_dir, workdir, mocker):
     """Test we read the qualities from the xml if no cache pickle is available.
     This test also does some minor checks to ensure that the qualities are read correctly.
@@ -470,4 +481,4 @@ def test__check_qualities():
 
     qualities = [{"oml:name": "a", "oml:value": None}]
     qualities = openml.datasets.dataset._check_qualities(qualities)
-    assert qualities["a"] != qualities["a"]
\ No newline at end of file
+    assert qualities["a"] != qualities["a"]
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
index 4145b86ad..974fb36ef 100644
--- a/tests/test_datasets/test_dataset_functions.py
+++ b/tests/test_datasets/test_dataset_functions.py
@@ -107,6 +107,7 @@ def _check_datasets(self, datasets):
         for did in datasets:
             self._check_dataset(datasets[did])
 
+    @pytest.mark.test_server()
     def test_tag_untag_dataset(self):
         tag = "test_tag_%d" % random.randint(1, 1000000)
         all_tags = _tag_entity("data", 1, tag)
@@ -114,10 +115,12 @@ def test_tag_untag_dataset(self):
         all_tags = _tag_entity("data", 1, tag, untag=True)
         assert tag not in all_tags
 
+    @pytest.mark.test_server()
     def test_list_datasets_length(self):
         datasets = openml.datasets.list_datasets()
         assert len(datasets) >= 100
 
+    @pytest.mark.test_server()
     def test_list_datasets_paginate(self):
         size = 10
         max = 100
@@ -132,11 +135,12 @@ def test_list_datasets_paginate(self):
                 categories=["in_preparation", "active", "deactivated"],
             )
 
+    @pytest.mark.test_server()
     def test_list_datasets_empty(self):
         datasets = openml.datasets.list_datasets(tag="NoOneWouldUseThisTagAnyway")
         assert datasets.empty
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_check_datasets_active(self):
         # Have to test on live because there is no deactivated dataset on the test server.
         self.use_production_server()
@@ -155,6 +159,7 @@ def test_check_datasets_active(self):
         )
         openml.config.server = self.test_server
 
+    @pytest.mark.test_server()
     def test_illegal_character_tag(self):
         dataset = openml.datasets.get_dataset(1)
         tag = "illegal_tag&"
@@ -164,6 +169,7 @@ def test_illegal_character_tag(self):
         except openml.exceptions.OpenMLServerException as e:
             assert e.code == 477
 
+    @pytest.mark.test_server()
     def test_illegal_length_tag(self):
         dataset = openml.datasets.get_dataset(1)
         tag = "a" * 65
@@ -173,7 +179,7 @@ def test_illegal_length_tag(self):
         except openml.exceptions.OpenMLServerException as e:
             assert e.code == 477
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test__name_to_id_with_deactivated(self):
         """Check that an activated dataset is returned if an earlier deactivated one exists."""
         self.use_production_server()
@@ -181,19 +187,19 @@ def test__name_to_id_with_deactivated(self):
         assert openml.datasets.functions._name_to_id("anneal") == 2
         openml.config.server = self.test_server
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test__name_to_id_with_multiple_active(self):
         """With multiple active datasets, retrieve the least recent active."""
         self.use_production_server()
         assert openml.datasets.functions._name_to_id("iris") == 61
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test__name_to_id_with_version(self):
         """With multiple active datasets, retrieve the least recent active."""
         self.use_production_server()
         assert openml.datasets.functions._name_to_id("iris", version=3) == 969
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test__name_to_id_with_multiple_active_error(self):
         """With multiple active datasets, retrieve the least recent active."""
         self.use_production_server()
@@ -205,6 +211,7 @@ def test__name_to_id_with_multiple_active_error(self):
             error_if_multiple=True,
         )
 
+    @pytest.mark.test_server()
     def test__name_to_id_name_does_not_exist(self):
         """With multiple active datasets, retrieve the least recent active."""
         self.assertRaisesRegex(
@@ -214,6 +221,7 @@ def test__name_to_id_name_does_not_exist(self):
             dataset_name="does_not_exist",
         )
 
+    @pytest.mark.test_server()
     def test__name_to_id_version_does_not_exist(self):
         """With multiple active datasets, retrieve the least recent active."""
         self.assertRaisesRegex(
@@ -224,6 +232,7 @@ def test__name_to_id_version_does_not_exist(self):
             version=100000,
         )
 
+    @pytest.mark.test_server()
     def test_get_datasets_by_name(self):
         # did 1 and 2 on the test server:
         dids = ["anneal", "kr-vs-kp"]
@@ -231,6 +240,7 @@ def test_get_datasets_by_name(self):
         assert len(datasets) == 2
         _assert_datasets_retrieved_successfully([1, 2])
 
+    @pytest.mark.test_server()
     def test_get_datasets_by_mixed(self):
         # did 1 and 2 on the test server:
         dids = ["anneal", 2]
@@ -238,12 +248,14 @@ def test_get_datasets_by_mixed(self):
         assert len(datasets) == 2
         _assert_datasets_retrieved_successfully([1, 2])
 
+    @pytest.mark.test_server()
     def test_get_datasets(self):
         dids = [1, 2]
         datasets = openml.datasets.get_datasets(dids)
         assert len(datasets) == 2
         _assert_datasets_retrieved_successfully([1, 2])
 
+    @pytest.mark.test_server()
     def test_get_dataset_by_name(self):
         dataset = openml.datasets.get_dataset("anneal")
         assert type(dataset) == OpenMLDataset
@@ -262,6 +274,7 @@ def test_get_dataset_download_all_files(self):
         # test_get_dataset_lazy
         raise NotImplementedError
 
+    @pytest.mark.test_server()
     def test_get_dataset_uint8_dtype(self):
         dataset = openml.datasets.get_dataset(1)
         assert type(dataset) == OpenMLDataset
@@ -269,7 +282,7 @@ def test_get_dataset_uint8_dtype(self):
         df, _, _, _ = dataset.get_data()
         assert df["carbon"].dtype == "uint8"
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_get_dataset_cannot_access_private_data(self):
         # Issue324 Properly handle private datasets when trying to access them
         self.use_production_server()
@@ -280,6 +293,7 @@ def test_dataset_by_name_cannot_access_private_data(self):
         self.use_production_server()
         self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE")
 
+    @pytest.mark.test_server()
     def test_get_dataset_lazy_all_functions(self):
         """Test that all expected functionality is available without downloading the dataset."""
         dataset = openml.datasets.get_dataset(1)
@@ -309,24 +323,28 @@ def ensure_absence_of_real_data():
         assert classes == ["1", "2", "3", "4", "5", "U"]
         ensure_absence_of_real_data()
 
+    @pytest.mark.test_server()
     def test_get_dataset_sparse(self):
         dataset = openml.datasets.get_dataset(102)
         X, *_ = dataset.get_data()
         assert isinstance(X, pd.DataFrame)
         assert all(isinstance(col, pd.SparseDtype) for col in X.dtypes)
 
+    @pytest.mark.test_server()
     def test_download_rowid(self):
         # Smoke test which checks that the dataset has the row-id set correctly
         did = 44
         dataset = openml.datasets.get_dataset(did)
         assert dataset.row_id_attribute == "Counter"
 
+    @pytest.mark.test_server()
     def test__get_dataset_description(self):
         description = _get_dataset_description(self.workdir, 2)
         assert isinstance(description, dict)
         description_xml_path = os.path.join(self.workdir, "description.xml")
         assert os.path.exists(description_xml_path)
 
+    @pytest.mark.test_server()
     def test__getarff_path_dataset_arff(self):
         openml.config.set_root_cache_directory(self.static_cache_dir)
         description = _get_dataset_description(self.workdir, 2)
@@ -390,6 +408,7 @@ def test__download_minio_file_works_with_bucket_subdirectory(self):
 
 
     @mock.patch("openml._api_calls._download_minio_file")
+    @pytest.mark.test_server()
     def test__get_dataset_parquet_is_cached(self, patch):
         openml.config.set_root_cache_directory(self.static_cache_dir)
         patch.side_effect = RuntimeError(
@@ -430,18 +449,21 @@ def test__getarff_md5_issue(self):
 
         openml.config.connection_n_retries = n
 
+    @pytest.mark.test_server()
     def test__get_dataset_features(self):
         features_file = _get_dataset_features_file(self.workdir, 2)
         assert isinstance(features_file, Path)
         features_xml_path = self.workdir / "features.xml"
         assert features_xml_path.exists()
 
+    @pytest.mark.test_server()
     def test__get_dataset_qualities(self):
         qualities = _get_dataset_qualities_file(self.workdir, 2)
         assert isinstance(qualities, Path)
         qualities_xml_path = self.workdir / "qualities.xml"
         assert qualities_xml_path.exists()
 
+    @pytest.mark.test_server()
     def test_get_dataset_force_refresh_cache(self):
         did_cache_dir = _create_cache_directory_for_id(
             DATASETS_CACHE_DIR_NAME,
@@ -464,6 +486,7 @@ def test_get_dataset_force_refresh_cache(self):
             did_cache_dir,
         )
 
+    @pytest.mark.test_server()
     def test_get_dataset_force_refresh_cache_clean_start(self):
         did_cache_dir = _create_cache_directory_for_id(
             DATASETS_CACHE_DIR_NAME,
@@ -500,21 +523,16 @@ def test_deletion_of_cache_dir(self):
 
     # get_dataset_description is the only data guaranteed to be downloaded
     @mock.patch("openml.datasets.functions._get_dataset_description")
+    @pytest.mark.test_server()
     def test_deletion_of_cache_dir_faulty_download(self, patch):
         patch.side_effect = Exception("Boom!")
         self.assertRaisesRegex(Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1)
-        datasets_cache_dir = os.path.join(self.workdir, "org", "openml", "test", "datasets")
+        datasets_cache_dir = os.path.join(openml.config.get_cache_directory(), "datasets")
         assert len(os.listdir(datasets_cache_dir)) == 0
 
+    @pytest.mark.test_server()
     def test_publish_dataset(self):
-        # lazy loading not possible as we need the arff-file.
-        openml.datasets.get_dataset(3, download_data=True)
-        file_path = os.path.join(
-            openml.config.get_cache_directory(),
-            "datasets",
-            "3",
-            "dataset.arff",
-        )
+        arff_file_path = self.static_cache_dir / "org" / "openml" / "test" / "datasets" / "2" / "dataset.arff"
         dataset = OpenMLDataset(
             "anneal",
             "test",
@@ -522,7 +540,7 @@ def test_publish_dataset(self):
             version=1,
             licence="public",
             default_target_attribute="class",
-            data_file=file_path,
+            data_file=arff_file_path,
         )
         dataset.publish()
         TestBase._mark_entity_for_removal("data", dataset.dataset_id)
@@ -531,6 +549,7 @@ def test_publish_dataset(self):
         )
         assert isinstance(dataset.dataset_id, int)
 
+    @pytest.mark.test_server()
     def test__retrieve_class_labels(self):
         openml.config.set_root_cache_directory(self.static_cache_dir)
         labels = openml.datasets.get_dataset(2).retrieve_class_labels()
@@ -547,6 +566,7 @@ def test__retrieve_class_labels(self):
         labels = custom_ds.retrieve_class_labels(target_name=custom_ds.features[31].name)
         assert labels == ["COIL", "SHEET"]
 
+    @pytest.mark.test_server()
     def test_upload_dataset_with_url(self):
         dataset = OpenMLDataset(
             f"{self._get_sentinel()}-UploadTestWithURL",
@@ -572,7 +592,12 @@ def _assert_status_of_dataset(self, *, did: int, status: str):
         assert len(result) == 1
         assert result[did]["status"] == status
 
+    @pytest.mark.skipif(
+        not os.environ.get(openml.config.OPENML_TEST_SERVER_ADMIN_KEY_ENV_VAR),
+        reason="Test requires admin key. Set OPENML_TEST_SERVER_ADMIN_KEY environment variable.",
+    )
     @pytest.mark.flaky()
+    @pytest.mark.test_server()
     def test_data_status(self):
         dataset = OpenMLDataset(
             f"{self._get_sentinel()}-UploadTestWithURL",
@@ -586,9 +611,9 @@ def test_data_status(self):
         TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {dataset.id}")
         did = dataset.id
 
-        # admin key for test server (only adminds can activate datasets.
+        # admin key for test server (only admins can activate datasets.
         # all users can deactivate their own datasets)
-        openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3"
+        openml.config.apikey = TestBase.admin_key
 
         openml.datasets.status_update(did, "active")
         self._assert_status_of_dataset(did=did, status="active")
@@ -664,6 +689,7 @@ def test_attributes_arff_from_df_unknown_dtype(self):
             with pytest.raises(ValueError, match=err_msg):
                 attributes_arff_from_df(df)
 
+    @pytest.mark.test_server()
     def test_create_dataset_numpy(self):
         data = np.array([[1, 2, 3], [1.2, 2.5, 3.8], [2, 5, 8], [0, 1, 0]]).T
 
@@ -697,6 +723,7 @@ def test_create_dataset_numpy(self):
         ), "Uploaded arff does not match original one"
         assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset"
 
+    @pytest.mark.test_server()
     def test_create_dataset_list(self):
         data = [
             ["a", "sunny", 85.0, 85.0, "FALSE", "no"],
@@ -751,6 +778,7 @@ def test_create_dataset_list(self):
         ), "Uploaded ARFF does not match original one"
         assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset"
 
+    @pytest.mark.test_server()
     def test_create_dataset_sparse(self):
         # test the scipy.sparse.coo_matrix
         sparse_data = scipy.sparse.coo_matrix(
@@ -853,8 +881,9 @@ def test_create_invalid_dataset(self):
         param["data"] = data[0]
         self.assertRaises(ValueError, create_dataset, **param)
 
+    @pytest.mark.test_server()
     def test_get_online_dataset_arff(self):
-        dataset_id = 100  # Australian
+        dataset_id = 128  # iris -- one of the few datasets without parquet file
         # lazy loading not used as arff file is checked.
         dataset = openml.datasets.get_dataset(dataset_id, download_data=True)
         decoder = arff.ArffDecoder()
@@ -868,6 +897,7 @@ def test_get_online_dataset_arff(self):
             return_type=arff.DENSE if d_format == "arff" else arff.COO,
         ), "ARFF files are not equal"
 
+    @pytest.mark.test_server()
     def test_topic_api_error(self):
         # Check server exception when non-admin accessses apis
         self.assertRaisesRegex(
@@ -886,6 +916,7 @@ def test_topic_api_error(self):
             topic="business",
         )
 
+    @pytest.mark.test_server()
     def test_get_online_dataset_format(self):
         # Phoneme dataset
         dataset_id = 77
@@ -895,6 +926,7 @@ def test_get_online_dataset_format(self):
             dataset_id
         ), "The format of the ARFF files is different"
 
+    @pytest.mark.test_server()
     def test_create_dataset_pandas(self):
         data = [
             ["a", "sunny", 85.0, 85.0, "FALSE", "no"],
@@ -1119,6 +1151,7 @@ def test_ignore_attributes_dataset(self):
                 paper_url=paper_url,
             )
 
+    @pytest.mark.test_server()
     def test_publish_fetch_ignore_attribute(self):
         """Test to upload and retrieve dataset and check ignore_attributes"""
         data = [
@@ -1237,6 +1270,7 @@ def test_create_dataset_row_id_attribute_error(self):
                 paper_url=paper_url,
             )
 
+    @pytest.mark.test_server()
     def test_create_dataset_row_id_attribute_inference(self):
         # meta-information
         name = f"{self._get_sentinel()}-pandas_testing_dataset"
@@ -1327,11 +1361,13 @@ def test_create_dataset_attributes_auto_without_df(self):
                 paper_url=paper_url,
             )
 
+    @pytest.mark.test_server()
     def test_list_qualities(self):
         qualities = openml.datasets.list_qualities()
         assert isinstance(qualities, list) is True
         assert all(isinstance(q, str) for q in qualities) is True
 
+    @pytest.mark.test_server()
     def test_get_dataset_cache_format_pickle(self):
         dataset = openml.datasets.get_dataset(1)
         dataset.get_data()
@@ -1347,6 +1383,7 @@ def test_get_dataset_cache_format_pickle(self):
         assert len(categorical) == X.shape[1]
         assert len(attribute_names) == X.shape[1]
 
+    @pytest.mark.test_server()
     def test_get_dataset_cache_format_feather(self):
         # This test crashed due to using the parquet file by default, which is downloaded
         # from minio. However, there is a mismatch between OpenML test server and minio IDs.
@@ -1379,6 +1416,7 @@ def test_get_dataset_cache_format_feather(self):
         assert len(categorical) == X.shape[1]
         assert len(attribute_names) == X.shape[1]
 
+    @pytest.mark.test_server()
     def test_data_edit_non_critical_field(self):
         # Case 1
         # All users can edit non-critical fields of datasets
@@ -1400,6 +1438,7 @@ def test_data_edit_non_critical_field(self):
         edited_dataset = openml.datasets.get_dataset(did)
         assert edited_dataset.description == desc
 
+    @pytest.mark.test_server()
     def test_data_edit_critical_field(self):
         # Case 2
         # only owners (or admin) can edit all critical fields of datasets
@@ -1422,10 +1461,12 @@ def test_data_edit_critical_field(self):
                     raise e
                 time.sleep(10)
                 # Delete the cache dir to get the newer version of the dataset
+                
                 shutil.rmtree(
-                    os.path.join(self.workdir, "org", "openml", "test", "datasets", str(did)),
+                    os.path.join(openml.config.get_cache_directory(), "datasets", str(did)),
                 )
 
+    @pytest.mark.test_server()
     def test_data_edit_requires_field(self):
         # Check server exception when no field to edit is provided
         self.assertRaisesRegex(
@@ -1438,6 +1479,7 @@ def test_data_edit_requires_field(self):
             data_id=64,  # blood-transfusion-service-center
         )
 
+    @pytest.mark.test_server()
     def test_data_edit_requires_valid_dataset(self):
         # Check server exception when unknown dataset is provided
         self.assertRaisesRegex(
@@ -1448,6 +1490,7 @@ def test_data_edit_requires_valid_dataset(self):
             description="xor operation dataset",
         )
 
+    @pytest.mark.test_server()
     def test_data_edit_cannot_edit_critical_field_if_dataset_has_task(self):
         # Need to own a dataset to be able to edit meta-data
         # Will be creating a forked version of an existing dataset to allow the unit test user
@@ -1474,6 +1517,7 @@ def test_data_edit_cannot_edit_critical_field_if_dataset_has_task(self):
             default_target_attribute="y",
         )
 
+    @pytest.mark.test_server()
     def test_edit_data_user_cannot_edit_critical_field_of_other_users_dataset(self):
         # Check server exception when a non-owner or non-admin tries to edit critical fields
         self.assertRaisesRegex(
@@ -1485,6 +1529,7 @@ def test_edit_data_user_cannot_edit_critical_field_of_other_users_dataset(self):
             default_target_attribute="y",
         )
 
+    @pytest.mark.test_server()
     def test_data_fork(self):
         did = 1
         result = fork_dataset(did)
@@ -1498,7 +1543,7 @@ def test_data_fork(self):
         )
 
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_list_datasets_with_high_size_parameter(self):
         # Testing on prod since concurrent deletion of uploded datasets make the test fail
         self.use_production_server()
@@ -1683,7 +1728,6 @@ def test_delete_dataset(self):
 
 @mock.patch.object(requests.Session, "delete")
 def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_api_key):
-    openml.config.start_using_configuration_for_example()
     content_file = (
         test_files_directory / "mock_responses" / "datasets" / "data_delete_not_owned.xml"
     )
@@ -1698,14 +1742,13 @@ def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_api_ke
     ):
         openml.datasets.delete_dataset(40_000)
 
-    dataset_url = "https://test.openml.org/api/v1/xml/data/40000"
+    dataset_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/data/40000"
     assert dataset_url == mock_delete.call_args.args[0]
     assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
 
 
 @mock.patch.object(requests.Session, "delete")
 def test_delete_dataset_with_run(mock_delete, test_files_directory, test_api_key):
-    openml.config.start_using_configuration_for_example()
     content_file = (
         test_files_directory / "mock_responses" / "datasets" / "data_delete_has_tasks.xml"
     )
@@ -1720,14 +1763,13 @@ def test_delete_dataset_with_run(mock_delete, test_files_directory, test_api_key
     ):
         openml.datasets.delete_dataset(40_000)
 
-    dataset_url = "https://test.openml.org/api/v1/xml/data/40000"
+    dataset_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/data/40000"
     assert dataset_url == mock_delete.call_args.args[0]
     assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
 
 
 @mock.patch.object(requests.Session, "delete")
 def test_delete_dataset_success(mock_delete, test_files_directory, test_api_key):
-    openml.config.start_using_configuration_for_example()
     content_file = (
         test_files_directory / "mock_responses" / "datasets" / "data_delete_successful.xml"
     )
@@ -1739,14 +1781,13 @@ def test_delete_dataset_success(mock_delete, test_files_directory, test_api_key)
     success = openml.datasets.delete_dataset(40000)
     assert success
 
-    dataset_url = "https://test.openml.org/api/v1/xml/data/40000"
+    dataset_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/data/40000"
     assert dataset_url == mock_delete.call_args.args[0]
     assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
 
 
 @mock.patch.object(requests.Session, "delete")
 def test_delete_unknown_dataset(mock_delete, test_files_directory, test_api_key):
-    openml.config.start_using_configuration_for_example()
     content_file = (
         test_files_directory / "mock_responses" / "datasets" / "data_delete_not_exist.xml"
     )
@@ -1761,7 +1802,7 @@ def test_delete_unknown_dataset(mock_delete, test_files_directory, test_api_key)
     ):
         openml.datasets.delete_dataset(9_999_999)
 
-    dataset_url = "https://test.openml.org/api/v1/xml/data/9999999"
+    dataset_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/data/9999999"
     assert dataset_url == mock_delete.call_args.args[0]
     assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
 
@@ -1776,6 +1817,7 @@ def all_datasets():
     return openml.datasets.list_datasets()
 
 
+@pytest.mark.test_server()
 def test_list_datasets(all_datasets: pd.DataFrame):
     # We can only perform a smoke test here because we test on dynamic
     # data from the internet...
@@ -1784,42 +1826,49 @@ def test_list_datasets(all_datasets: pd.DataFrame):
     _assert_datasets_have_id_and_valid_status(all_datasets)
 
 
+@pytest.mark.test_server()
 def test_list_datasets_by_tag(all_datasets: pd.DataFrame):
     tag_datasets = openml.datasets.list_datasets(tag="study_14")
     assert 0 < len(tag_datasets) < len(all_datasets)
     _assert_datasets_have_id_and_valid_status(tag_datasets)
 
 
+@pytest.mark.test_server()
 def test_list_datasets_by_size():
     datasets = openml.datasets.list_datasets(size=5)
     assert len(datasets) == 5
     _assert_datasets_have_id_and_valid_status(datasets)
 
 
+@pytest.mark.test_server()
 def test_list_datasets_by_number_instances(all_datasets: pd.DataFrame):
     small_datasets = openml.datasets.list_datasets(number_instances="5..100")
     assert 0 < len(small_datasets) <= len(all_datasets)
     _assert_datasets_have_id_and_valid_status(small_datasets)
 
 
+@pytest.mark.test_server()
 def test_list_datasets_by_number_features(all_datasets: pd.DataFrame):
     wide_datasets = openml.datasets.list_datasets(number_features="50..100")
     assert 8 <= len(wide_datasets) < len(all_datasets)
     _assert_datasets_have_id_and_valid_status(wide_datasets)
 
 
+@pytest.mark.test_server()
 def test_list_datasets_by_number_classes(all_datasets: pd.DataFrame):
     five_class_datasets = openml.datasets.list_datasets(number_classes="5")
     assert 3 <= len(five_class_datasets) < len(all_datasets)
     _assert_datasets_have_id_and_valid_status(five_class_datasets)
 
 
+@pytest.mark.test_server()
 def test_list_datasets_by_number_missing_values(all_datasets: pd.DataFrame):
     na_datasets = openml.datasets.list_datasets(number_missing_values="5..100")
     assert 5 <= len(na_datasets) < len(all_datasets)
     _assert_datasets_have_id_and_valid_status(na_datasets)
 
 
+@pytest.mark.test_server()
 def test_list_datasets_combined_filters(all_datasets: pd.DataFrame):
     combined_filter_datasets = openml.datasets.list_datasets(
         tag="study_14",
@@ -1848,9 +1897,8 @@ def _dataset_features_is_downloaded(did: int):
 
 
 def _dataset_data_file_is_downloaded(did: int):
-    parquet_present = _dataset_file_is_downloaded(did, "dataset.pq")
-    arff_present = _dataset_file_is_downloaded(did, "dataset.arff")
-    return parquet_present or arff_present
+    cache_directory = Path(openml.config.get_cache_directory()) / "datasets" / str(did)
+    return any(f.suffix in (".pq", ".arff") for f in cache_directory.iterdir())
 
 
 def _assert_datasets_retrieved_successfully(
@@ -1892,6 +1940,7 @@ def isolate_for_test():
     ("with_data", "with_qualities", "with_features"),
     itertools.product([True, False], repeat=3),
 )
+@pytest.mark.test_server()
 def test_get_dataset_lazy_behavior(
     isolate_for_test, with_data: bool, with_qualities: bool, with_features: bool
 ):
@@ -1918,6 +1967,7 @@ def test_get_dataset_lazy_behavior(
     )
 
 
+@pytest.mark.test_server()
 def test_get_dataset_with_invalid_id() -> None:
     INVALID_ID = 123819023109238  # Well, at some point this will probably be valid...
     with pytest.raises(OpenMLServerNoResult, match="Unknown dataset") as e:
@@ -1945,6 +1995,7 @@ def test_read_features_from_xml_with_whitespace() -> None:
     assert dict[1].nominal_values == [" - 50000.", " 50000+."]
 
 
+@pytest.mark.test_server()
 def test_get_dataset_parquet(requests_mock, test_files_directory):
     # Parquet functionality is disabled on the test server
     # There is no parquet-copy of the test server yet.
@@ -1952,9 +2003,9 @@ def test_get_dataset_parquet(requests_mock, test_files_directory):
             test_files_directory / "mock_responses" / "datasets" / "data_description_61.xml"
     )
     # While the mocked example is from production, unit tests by default connect to the test server.
-    requests_mock.get("https://test.openml.org/api/v1/xml/data/61", text=content_file.read_text())
+    requests_mock.get(f"{openml.config.TEST_SERVER_URL}/api/v1/xml/data/61", text=content_file.read_text())
     dataset = openml.datasets.get_dataset(61, download_data=True)
     assert dataset._parquet_url is not None
     assert dataset.parquet_file is not None
     assert os.path.isfile(dataset.parquet_file)
-    assert dataset.data_file is None  # is alias for arff path
+    assert dataset.data_file is None  # is alias for arff path
\ No newline at end of file
diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py
index ffd3d9f78..e15556d7b 100644
--- a/tests/test_evaluations/test_evaluation_functions.py
+++ b/tests/test_evaluations/test_evaluation_functions.py
@@ -50,7 +50,7 @@ def _check_list_evaluation_setups(self, **kwargs):
             self.assertSequenceEqual(sorted(list1), sorted(list2))
         return evals_setups
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_evaluation_list_filter_task(self):
         self.use_production_server()
 
@@ -70,7 +70,7 @@ def test_evaluation_list_filter_task(self):
             assert evaluations[run_id].value is not None
             assert evaluations[run_id].values is None
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_evaluation_list_filter_uploader_ID_16(self):
         self.use_production_server()
 
@@ -85,7 +85,7 @@ def test_evaluation_list_filter_uploader_ID_16(self):
 
         assert len(evaluations) > 50
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_evaluation_list_filter_uploader_ID_10(self):
         self.use_production_server()
 
@@ -104,7 +104,7 @@ def test_evaluation_list_filter_uploader_ID_10(self):
             assert evaluations[run_id].value is not None
             assert evaluations[run_id].values is None
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_evaluation_list_filter_flow(self):
         self.use_production_server()
 
@@ -124,7 +124,7 @@ def test_evaluation_list_filter_flow(self):
             assert evaluations[run_id].value is not None
             assert evaluations[run_id].values is None
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_evaluation_list_filter_run(self):
         self.use_production_server()
 
@@ -144,7 +144,7 @@ def test_evaluation_list_filter_run(self):
             assert evaluations[run_id].value is not None
             assert evaluations[run_id].values is None
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_evaluation_list_limit(self):
         self.use_production_server()
 
@@ -155,6 +155,7 @@ def test_evaluation_list_limit(self):
         )
         assert len(evaluations) == 100
 
+    @pytest.mark.test_server()
     def test_list_evaluations_empty(self):
         evaluations = openml.evaluations.list_evaluations("unexisting_measure")
         if len(evaluations) > 0:
@@ -162,7 +163,7 @@ def test_list_evaluations_empty(self):
 
         assert isinstance(evaluations, dict)
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_evaluation_list_per_fold(self):
         self.use_production_server()
         size = 1000
@@ -200,7 +201,7 @@ def test_evaluation_list_per_fold(self):
             assert evaluations[run_id].value is not None
             assert evaluations[run_id].values is None
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_evaluation_list_sort(self):
         self.use_production_server()
         size = 10
@@ -232,12 +233,13 @@ def test_evaluation_list_sort(self):
         test_output = sorted(unsorted_output, reverse=True)
         assert test_output[:size] == sorted_output
 
+    @pytest.mark.test_server()
     def test_list_evaluation_measures(self):
         measures = openml.evaluations.list_evaluation_measures()
         assert isinstance(measures, list) is True
         assert all(isinstance(s, str) for s in measures) is True
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_list_evaluations_setups_filter_flow(self):
         self.use_production_server()
         flow_id = [405]
@@ -255,7 +257,8 @@ def test_list_evaluations_setups_filter_flow(self):
         keys = list(evals["parameters"].values[0].keys())
         assert all(elem in columns for elem in keys)
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
+    @pytest.mark.xfail(reason="failures_issue_1544", strict=False)
     def test_list_evaluations_setups_filter_task(self):
         self.use_production_server()
         task_id = [6]
diff --git a/tests/test_evaluations/test_evaluations_example.py b/tests/test_evaluations/test_evaluations_example.py
index a9ad7e8c1..b321f475d 100644
--- a/tests/test_evaluations/test_evaluations_example.py
+++ b/tests/test_evaluations/test_evaluations_example.py
@@ -3,14 +3,13 @@
 
 import unittest
 
-from openml.config import overwrite_config_context
-
+import openml
 
 class TestEvaluationsExample(unittest.TestCase):
     def test_example_python_paper(self):
         # Example script which will appear in the upcoming OpenML-Python paper
         # This test ensures that the example will keep running!
-        with overwrite_config_context(
+        with openml.config.overwrite_config_context(  # noqa: F823
             {
                 "server": "https://www.openml.org/api/v1/xml",
                 "apikey": None,
@@ -18,7 +17,6 @@ def test_example_python_paper(self):
         ):
             import matplotlib.pyplot as plt
             import numpy as np
-            import openml
 
             df = openml.evaluations.list_evaluations_setups(
                 "predictive_accuracy",
diff --git a/tests/test_extensions/test_functions.py b/tests/test_extensions/test_functions.py
index ac4610a15..90fbaa9f1 100644
--- a/tests/test_extensions/test_functions.py
+++ b/tests/test_extensions/test_functions.py
@@ -1,12 +1,14 @@
 # License: BSD 3-Clause
 from __future__ import annotations
 
-import inspect
+from collections import OrderedDict
 
+import inspect
+import numpy as np
 import pytest
-
+from unittest.mock import patch
 import openml.testing
-from openml.extensions import get_extension_by_flow, get_extension_by_model, register_extension
+from openml.extensions import Extension, get_extension_by_flow, get_extension_by_model, register_extension
 
 
 class DummyFlow:
@@ -40,54 +42,197 @@ def can_handle_model(model):
         return False
 
 
-def _unregister():
-    # "Un-register" the test extensions
-    while True:
-        rem_dum_ext1 = False
-        rem_dum_ext2 = False
-        try:
-            openml.extensions.extensions.remove(DummyExtension1)
-            rem_dum_ext1 = True
-        except ValueError:
-            pass
-        try:
-            openml.extensions.extensions.remove(DummyExtension2)
-            rem_dum_ext2 = True
-        except ValueError:
-            pass
-        if not rem_dum_ext1 and not rem_dum_ext2:
-            break
+class DummyExtension(Extension):
+    @classmethod
+    def can_handle_flow(cls, flow):
+        return isinstance(flow, DummyFlow)
+
+    @classmethod
+    def can_handle_model(cls, model):
+        return isinstance(model, DummyModel)
+
+    def flow_to_model(
+        self,
+        flow,
+        initialize_with_defaults=False,
+        strict_version=True,
+    ):
+        if not isinstance(flow, DummyFlow):
+            raise ValueError("Invalid flow")
+
+        model = DummyModel()
+        model.defaults = initialize_with_defaults
+        model.strict_version = strict_version
+        return model
+
+    def model_to_flow(self, model):
+        if not isinstance(model, DummyModel):
+            raise ValueError("Invalid model")
+        return DummyFlow()
+
+    def get_version_information(self):
+        return ["dummy==1.0"]
+
+    def create_setup_string(self, model):
+        return "DummyModel()"
+
+    def is_estimator(self, model):
+        return isinstance(model, DummyModel)
+
+    def seed_model(self, model, seed):
+        model.seed = seed
+        return model
+
+    def _run_model_on_fold(
+        self,
+        model,
+        task,
+        X_train,
+        rep_no,
+        fold_no,
+        y_train=None,
+        X_test=None,
+    ):
+        preds = np.zeros(len(X_train))
+        probs = None
+        measures = OrderedDict()
+        trace = None
+        return preds, probs, measures, trace
+
+    def obtain_parameter_values(self, flow, model=None):
+        return []
+
+    def check_if_model_fitted(self, model):
+        return False
+
+    def instantiate_model_from_hpo_class(self, model, trace_iteration):
+        return DummyModel()
+
 
 
 class TestInit(openml.testing.TestBase):
-    def setUp(self):
-        super().setUp()
-        _unregister()
 
     def test_get_extension_by_flow(self):
-        assert get_extension_by_flow(DummyFlow()) is None
-        with pytest.raises(ValueError, match="No extension registered which can handle flow:"):
-            get_extension_by_flow(DummyFlow(), raise_if_no_extension=True)
-        register_extension(DummyExtension1)
-        assert isinstance(get_extension_by_flow(DummyFlow()), DummyExtension1)
-        register_extension(DummyExtension2)
-        assert isinstance(get_extension_by_flow(DummyFlow()), DummyExtension1)
-        register_extension(DummyExtension1)
-        with pytest.raises(
-            ValueError, match="Multiple extensions registered which can handle flow:"
-        ):
-            get_extension_by_flow(DummyFlow())
+            # We replace the global list with a new empty list [] ONLY for this block
+            with patch("openml.extensions.extensions", []):
+                assert get_extension_by_flow(DummyFlow()) is None
+                
+                with pytest.raises(ValueError, match="No extension registered which can handle flow:"):
+                    get_extension_by_flow(DummyFlow(), raise_if_no_extension=True)
+                
+                register_extension(DummyExtension1)
+                assert isinstance(get_extension_by_flow(DummyFlow()), DummyExtension1)
+                
+                register_extension(DummyExtension2)
+                assert isinstance(get_extension_by_flow(DummyFlow()), DummyExtension1)
+                
+                register_extension(DummyExtension1)
+                with pytest.raises(
+                    ValueError, match="Multiple extensions registered which can handle flow:"
+                ):
+                    get_extension_by_flow(DummyFlow())
 
     def test_get_extension_by_model(self):
-        assert get_extension_by_model(DummyModel()) is None
-        with pytest.raises(ValueError, match="No extension registered which can handle model:"):
-            get_extension_by_model(DummyModel(), raise_if_no_extension=True)
-        register_extension(DummyExtension1)
-        assert isinstance(get_extension_by_model(DummyModel()), DummyExtension1)
-        register_extension(DummyExtension2)
-        assert isinstance(get_extension_by_model(DummyModel()), DummyExtension1)
-        register_extension(DummyExtension1)
-        with pytest.raises(
-            ValueError, match="Multiple extensions registered which can handle model:"
-        ):
-            get_extension_by_model(DummyModel())
+        # Again, we start with a fresh empty list automatically
+        with patch("openml.extensions.extensions", []):
+            assert get_extension_by_model(DummyModel()) is None
+            
+            with pytest.raises(ValueError, match="No extension registered which can handle model:"):
+                get_extension_by_model(DummyModel(), raise_if_no_extension=True)
+            
+            register_extension(DummyExtension1)
+            assert isinstance(get_extension_by_model(DummyModel()), DummyExtension1)
+            
+            register_extension(DummyExtension2)
+            assert isinstance(get_extension_by_model(DummyModel()), DummyExtension1)
+            
+            register_extension(DummyExtension1)
+            with pytest.raises(
+                ValueError, match="Multiple extensions registered which can handle model:"
+            ):
+                get_extension_by_model(DummyModel())
+
+
+def test_flow_to_model_with_defaults():
+    """Test flow_to_model with initialize_with_defaults=True."""
+    ext = DummyExtension()
+    flow = DummyFlow()
+
+    model = ext.flow_to_model(flow, initialize_with_defaults=True)
+
+    assert isinstance(model, DummyModel)
+    assert model.defaults is True
+
+def test_flow_to_model_strict_version():
+    """Test flow_to_model with strict_version parameter."""
+    ext = DummyExtension()
+    flow = DummyFlow()
+
+    model_strict = ext.flow_to_model(flow, strict_version=True)
+    model_non_strict = ext.flow_to_model(flow, strict_version=False)
+
+    assert isinstance(model_strict, DummyModel)
+    assert model_strict.strict_version is True
+
+    assert isinstance(model_non_strict, DummyModel)
+    assert model_non_strict.strict_version is False
+
+def test_model_to_flow_conversion():
+    """Test converting a model back to flow representation."""
+    ext = DummyExtension()
+    model = DummyModel()
+
+    flow = ext.model_to_flow(model)
+
+    assert isinstance(flow, DummyFlow)
+
+
+def test_invalid_flow_raises_error():
+    """Test that invalid flow raises appropriate error."""
+    class InvalidFlow:
+        pass
+
+    ext = DummyExtension()
+    flow = InvalidFlow()
+
+    with pytest.raises(ValueError, match="Invalid flow"):
+        ext.flow_to_model(flow)
+
+
+@patch("openml.extensions.extensions", [])
+def test_extension_not_found_error_message():
+    """Test error message contains helpful information."""
+    class UnknownModel:
+        pass
+
+    with pytest.raises(ValueError, match="No extension registered"):
+        get_extension_by_model(UnknownModel(), raise_if_no_extension=True)
+
+ 
+def test_register_same_extension_twice():
+    """Test behavior when registering same extension twice."""
+    # Using a context manager here to isolate the list
+    with patch("openml.extensions.extensions", []):
+        register_extension(DummyExtension)
+        register_extension(DummyExtension)
+
+        matches = [
+            ext for ext in openml.extensions.extensions
+            if ext is DummyExtension
+        ]
+        assert len(matches) == 2
+
+
+@patch("openml.extensions.extensions", [])
+def test_extension_priority_order():
+    """Test that extensions are checked in registration order."""    
+    class DummyExtensionA(DummyExtension):
+        pass
+    class DummyExtensionB(DummyExtension):
+        pass
+
+    register_extension(DummyExtensionA)
+    register_extension(DummyExtensionB)
+
+    assert openml.extensions.extensions[0] is DummyExtensionA
+    assert openml.extensions.extensions[1] is DummyExtensionB
\ No newline at end of file
diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py
index 0b034c3b4..4e391fd3b 100644
--- a/tests/test_flows/test_flow.py
+++ b/tests/test_flows/test_flow.py
@@ -5,6 +5,7 @@
 import copy
 import hashlib
 import re
+import os
 import time
 from packaging.version import Version
 from unittest import mock
@@ -33,7 +34,6 @@
 from openml.testing import SimpleImputer, TestBase
 
 
-
 class TestFlow(TestBase):
     _multiprocess_can_split_ = True
 
@@ -44,7 +44,7 @@ def setUp(self):
     def tearDown(self):
         super().tearDown()
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_get_flow(self):
         # We need to use the production server here because 4024 is not the
         # test server
@@ -77,7 +77,8 @@ def test_get_flow(self):
         assert subflow_3.parameters["L"] == "-1"
         assert len(subflow_3.components) == 0
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
+    @pytest.mark.xfail(reason="failures_issue_1544", strict=False)
     def test_get_structure(self):
         # also responsible for testing: flow.get_subflow
         # We need to use the production server here because 4024 is not the
@@ -102,6 +103,7 @@ def test_get_structure(self):
                 subflow = flow.get_subflow(structure)
                 assert subflow.flow_id == sub_flow_id
 
+    @pytest.mark.test_server()
     def test_tagging(self):
         flows = openml.flows.list_flows(size=1)
         flow_id = flows["id"].iloc[0]
@@ -119,6 +121,7 @@ def test_tagging(self):
         flows = openml.flows.list_flows(tag=tag)
         assert len(flows) == 0
 
+    @pytest.mark.test_server()
     def test_from_xml_to_xml(self):
         # Get the raw xml thing
         # TODO maybe get this via get_flow(), which would have to be refactored
@@ -178,6 +181,7 @@ def test_to_xml_from_xml(self):
         assert new_flow is not flow
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_publish_flow(self):
         flow = openml.OpenMLFlow(
             name="sklearn.dummy.DummyClassifier",
@@ -219,6 +223,7 @@ def test_publish_existing_flow(self, flow_exists_mock):
         )
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_publish_flow_with_similar_components(self):
         clf = sklearn.ensemble.VotingClassifier(
             [("lr", sklearn.linear_model.LogisticRegression(solver="lbfgs"))],
@@ -269,6 +274,7 @@ def test_publish_flow_with_similar_components(self):
         TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow3.flow_id}")
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_semi_legal_flow(self):
         # TODO: Test if parameters are set correctly!
         # should not throw error as it contains two differentiable forms of
@@ -360,6 +366,7 @@ def test_illegal_flow(self):
         )
         self.assertRaises(ValueError, self.extension.model_to_flow, illegal)
 
+    @pytest.mark.test_server()
     def test_nonexisting_flow_exists(self):
         def get_sentinel():
             # Create a unique prefix for the flow. Necessary because the flow
@@ -377,6 +384,7 @@ def get_sentinel():
         assert not flow_id
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_existing_flow_exists(self):
         # create a flow
         nb = sklearn.naive_bayes.GaussianNB()
@@ -417,6 +425,7 @@ def test_existing_flow_exists(self):
             assert downloaded_flow_id == flow.flow_id
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_sklearn_to_upload_to_flow(self):
         iris = sklearn.datasets.load_iris()
         X = iris.data
@@ -556,7 +565,7 @@ def test_extract_tags(self):
         tags = openml.utils.extract_xml_tags("oml:tag", flow_dict["oml:flow"])
         assert tags == ["OpenmlWeka", "weka"]
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_download_non_scikit_learn_flows(self):
         self.use_production_server()
 
diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py
index ef4759e54..14bb78060 100644
--- a/tests/test_flows/test_flow_functions.py
+++ b/tests/test_flows/test_flow_functions.py
@@ -12,6 +12,7 @@
 from unittest import mock
 from unittest.mock import patch
 
+import os
 import pandas as pd
 import pytest
 import requests
@@ -41,12 +42,13 @@ def _check_flow(self, flow):
         assert isinstance(flow["full_name"], str)
         assert isinstance(flow["version"], str)
         # There are some runs on openml.org that can have an empty external version
+        ext_version = flow["external_version"]
         ext_version_str_or_none = (
-            isinstance(flow["external_version"], str) or flow["external_version"] is None
+            isinstance(ext_version, str) or ext_version is None or pd.isna(ext_version)
         )
         assert ext_version_str_or_none
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_list_flows(self):
         self.use_production_server()
         # We can only perform a smoke test here because we test on dynamic
@@ -57,7 +59,7 @@ def test_list_flows(self):
         for flow in flows.to_dict(orient="index").values():
             self._check_flow(flow)
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_list_flows_output_format(self):
         self.use_production_server()
         # We can only perform a smoke test here because we test on dynamic
@@ -66,14 +68,13 @@ def test_list_flows_output_format(self):
         assert isinstance(flows, pd.DataFrame)
         assert len(flows) >= 1500
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_list_flows_empty(self):
         self.use_production_server()
-        openml.config.server = self.production_server
         flows = openml.flows.list_flows(tag="NoOneEverUsesThisTag123")
         assert flows.empty
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_list_flows_by_tag(self):
         self.use_production_server()
         flows = openml.flows.list_flows(tag="weka")
@@ -81,7 +82,7 @@ def test_list_flows_by_tag(self):
         for flow in flows.to_dict(orient="index").values():
             self._check_flow(flow)
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_list_flows_paginate(self):
         self.use_production_server()
         size = 10
@@ -280,6 +281,8 @@ def test_are_flows_equal_ignore_if_older(self):
         reason="OrdinalEncoder introduced in 0.20. "
         "No known models with list of lists parameters in older versions.",
     )
+    @pytest.mark.test_server()
+    @pytest.mark.xfail(reason="failures_issue_1544", strict=False)
     def test_sklearn_to_flow_list_of_lists(self):
         from sklearn.preprocessing import OrdinalEncoder
 
@@ -299,7 +302,7 @@ def test_sklearn_to_flow_list_of_lists(self):
         assert server_flow.parameters["categories"] == "[[0, 1], [0, 1]]"
         assert server_flow.model.categories == flow.model.categories
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_get_flow1(self):
         # Regression test for issue #305
         # Basically, this checks that a flow without an external version can be loaded
@@ -308,6 +311,7 @@ def test_get_flow1(self):
         assert flow.external_version is None
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_get_flow_reinstantiate_model(self):
         model = ensemble.RandomForestClassifier(n_estimators=33)
         extension = openml.extensions.get_extension_by_model(model)
@@ -319,6 +323,7 @@ def test_get_flow_reinstantiate_model(self):
         downloaded_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True)
         assert isinstance(downloaded_flow.model, sklearn.ensemble.RandomForestClassifier)
 
+    @pytest.mark.test_server()
     def test_get_flow_reinstantiate_model_no_extension(self):
         # Flow 10 is a WEKA flow
         self.assertRaisesRegex(
@@ -334,7 +339,7 @@ def test_get_flow_reinstantiate_model_no_extension(self):
         Version(sklearn.__version__) == Version("0.19.1"),
         reason="Requires scikit-learn!=0.19.1, because target flow is from that version.",
     )
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception(self):
         self.use_production_server()
         flow = 8175
@@ -355,7 +360,7 @@ def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception(
         # Because scikit-learn dropped min_impurity_split hyperparameter in 1.0,
         # and the requested flow is from 1.0.0 exactly.
     )
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_get_flow_reinstantiate_flow_not_strict_post_1(self):
         self.use_production_server()
         flow = openml.flows.get_flow(flow_id=19190, reinstantiate=True, strict_version=False)
@@ -369,7 +374,7 @@ def test_get_flow_reinstantiate_flow_not_strict_post_1(self):
         reason="Requires scikit-learn 0.23.2 or ~0.24.",
         # Because these still have min_impurity_split, but with new scikit-learn module structure."
     )
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_get_flow_reinstantiate_flow_not_strict_023_and_024(self):
         self.use_production_server()
         flow = openml.flows.get_flow(flow_id=18587, reinstantiate=True, strict_version=False)
@@ -381,7 +386,7 @@ def test_get_flow_reinstantiate_flow_not_strict_023_and_024(self):
         Version(sklearn.__version__) > Version("0.23"),
         reason="Requires scikit-learn<=0.23, because the scikit-learn module structure changed.",
     )
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_get_flow_reinstantiate_flow_not_strict_pre_023(self):
         self.use_production_server()
         flow = openml.flows.get_flow(flow_id=8175, reinstantiate=True, strict_version=False)
@@ -389,6 +394,7 @@ def test_get_flow_reinstantiate_flow_not_strict_pre_023(self):
         assert "sklearn==0.19.1" not in flow.dependencies
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_get_flow_id(self):
         if self.long_version:
             list_all = openml.utils._list_all
@@ -417,9 +423,13 @@ def test_get_flow_id(self):
                 name=flow.name,
                 exact_version=False,
             )
-            assert flow_ids_exact_version_True == flow_ids_exact_version_False
             assert flow.flow_id in flow_ids_exact_version_True
+            assert set(flow_ids_exact_version_True).issubset(set(flow_ids_exact_version_False))
+            # instead of the assertion above, the assertion below used to be used.
+            pytest.skip(reason="Not sure why there should only be one version of this flow.")
+            assert flow_ids_exact_version_True == flow_ids_exact_version_False
 
+    @pytest.mark.test_server()
     def test_delete_flow(self):
         flow = openml.OpenMLFlow(
             name="sklearn.dummy.DummyClassifier",
@@ -444,7 +454,6 @@ def test_delete_flow(self):
 
 @mock.patch.object(requests.Session, "delete")
 def test_delete_flow_not_owned(mock_delete, test_files_directory, test_api_key):
-    openml.config.start_using_configuration_for_example()
     content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_not_owned.xml"
     mock_delete.return_value = create_request_response(
         status_code=412,
@@ -457,14 +466,13 @@ def test_delete_flow_not_owned(mock_delete, test_files_directory, test_api_key):
     ):
         openml.flows.delete_flow(40_000)
 
-    flow_url = "https://test.openml.org/api/v1/xml/flow/40000"
+    flow_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/flow/40000"
     assert flow_url == mock_delete.call_args.args[0]
     assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
 
 
 @mock.patch.object(requests.Session, "delete")
 def test_delete_flow_with_run(mock_delete, test_files_directory, test_api_key):
-    openml.config.start_using_configuration_for_example()
     content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_has_runs.xml"
     mock_delete.return_value = create_request_response(
         status_code=412,
@@ -477,14 +485,13 @@ def test_delete_flow_with_run(mock_delete, test_files_directory, test_api_key):
     ):
         openml.flows.delete_flow(40_000)
 
-    flow_url = "https://test.openml.org/api/v1/xml/flow/40000"
+    flow_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/flow/40000"
     assert flow_url == mock_delete.call_args.args[0]
     assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
 
 
 @mock.patch.object(requests.Session, "delete")
 def test_delete_subflow(mock_delete, test_files_directory, test_api_key):
-    openml.config.start_using_configuration_for_example()
     content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_is_subflow.xml"
     mock_delete.return_value = create_request_response(
         status_code=412,
@@ -497,14 +504,13 @@ def test_delete_subflow(mock_delete, test_files_directory, test_api_key):
     ):
         openml.flows.delete_flow(40_000)
 
-    flow_url = "https://test.openml.org/api/v1/xml/flow/40000"
+    flow_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/flow/40000"
     assert flow_url == mock_delete.call_args.args[0]
     assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
 
 
 @mock.patch.object(requests.Session, "delete")
 def test_delete_flow_success(mock_delete, test_files_directory, test_api_key):
-    openml.config.start_using_configuration_for_example()
     content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_successful.xml"
     mock_delete.return_value = create_request_response(
         status_code=200,
@@ -514,14 +520,14 @@ def test_delete_flow_success(mock_delete, test_files_directory, test_api_key):
     success = openml.flows.delete_flow(33364)
     assert success
 
-    flow_url = "https://test.openml.org/api/v1/xml/flow/33364"
+    flow_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/flow/33364"
     assert flow_url == mock_delete.call_args.args[0]
     assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
 
 
 @mock.patch.object(requests.Session, "delete")
+@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
 def test_delete_unknown_flow(mock_delete, test_files_directory, test_api_key):
-    openml.config.start_using_configuration_for_example()
     content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_not_exist.xml"
     mock_delete.return_value = create_request_response(
         status_code=412,
@@ -534,6 +540,6 @@ def test_delete_unknown_flow(mock_delete, test_files_directory, test_api_key):
     ):
         openml.flows.delete_flow(9_999_999)
 
-    flow_url = "https://test.openml.org/api/v1/xml/flow/9999999"
+    flow_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/flow/9999999"
     assert flow_url == mock_delete.call_args.args[0]
     assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py
index da6857b6e..f2a81be9f 100644
--- a/tests/test_openml/test_api_calls.py
+++ b/tests/test_openml/test_api_calls.py
@@ -7,20 +7,22 @@
 
 import minio
 import pytest
+import os
 
 import openml
-from openml.config import ConfigurationForExamples
 import openml.testing
 from openml._api_calls import _download_minio_bucket, API_TOKEN_HELP_LINK
 
 
 class TestConfig(openml.testing.TestBase):
+    @pytest.mark.test_server()
     def test_too_long_uri(self):
         with pytest.raises(openml.exceptions.OpenMLServerError, match="URI too long!"):
             openml.datasets.list_datasets(data_id=list(range(10000)))
 
     @unittest.mock.patch("time.sleep")
     @unittest.mock.patch("requests.Session")
+    @pytest.mark.test_server()
     def test_retry_on_database_error(self, Session_class_mock, _):
         response_mock = unittest.mock.Mock()
         response_mock.text = (
@@ -115,11 +117,12 @@ def test_download_minio_failure(mock_minio, tmp_path: Path) -> None:
         ("task/42", "delete"),  # 460
     ],
 )
+@pytest.mark.test_server()
 def test_authentication_endpoints_requiring_api_key_show_relevant_help_link(
     endpoint: str,
     method: str,
 ) -> None:
     # We need to temporarily disable the API key to test the error message
     with openml.config.overwrite_config_context({"apikey": None}):
-        with pytest.raises(openml.exceptions.OpenMLNotAuthorizedError, match=API_TOKEN_HELP_LINK):
+        with pytest.raises(openml.exceptions.OpenMLAuthenticationError, match=API_TOKEN_HELP_LINK):
             openml._api_calls._perform_api_call(call=endpoint, request_method=method, data=None)
diff --git a/tests/test_openml/test_cli.py b/tests/test_openml/test_cli.py
new file mode 100644
index 000000000..eb213b561
--- /dev/null
+++ b/tests/test_openml/test_cli.py
@@ -0,0 +1,44 @@
+# License: BSD 3-Clause
+from __future__ import annotations
+
+import shutil
+import subprocess
+import sys
+
+import openml
+import pytest
+
+
+def test_cli_version_prints_package_version():
+    # Invoke the CLI via module to avoid relying on console script installation
+    result = subprocess.run(
+        [sys.executable, "-m", "openml.cli", "--version"],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        check=False,
+    )
+
+    # Ensure successful exit and version present in stdout only
+    assert result.returncode == 0
+    assert result.stderr == ""
+    assert openml.__version__ in result.stdout
+
+
+def test_console_script_version_prints_package_version():
+    # Try to locate the console script; skip if not installed in PATH
+    console = shutil.which("openml")
+    if console is None:
+        pytest.skip("'openml' console script not found in PATH")
+
+    result = subprocess.run(
+        [console, "--version"],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        check=False,
+    )
+
+    assert result.returncode == 0
+    assert result.stderr == ""
+    assert openml.__version__ in result.stdout
diff --git a/tests/test_openml/test_config.py b/tests/test_openml/test_config.py
index 0324545a7..f3feca784 100644
--- a/tests/test_openml/test_config.py
+++ b/tests/test_openml/test_config.py
@@ -12,8 +12,9 @@
 
 import pytest
 
-import openml.config
+import openml
 import openml.testing
+from openml.testing import TestBase
 
 
 @contextmanager
@@ -36,7 +37,7 @@ def safe_environ_patcher(key: str, value: Any) -> Iterator[None]:
 
 class TestConfig(openml.testing.TestBase):
     @unittest.mock.patch("openml.config.openml_logger.warning")
-    @unittest.mock.patch("openml.config._create_log_handlers")
+    @unittest.mock.patch("openml._config.OpenMLConfigManager._create_log_handlers")
     @unittest.skipIf(os.name == "nt", "https://github.com/openml/openml-python/issues/1033")
     @unittest.skipIf(
         platform.uname().release.endswith(("-Microsoft", "microsoft-standard-WSL2")),
@@ -76,8 +77,8 @@ def test_get_config_as_dict(self):
         """Checks if the current configuration is returned accurately as a dict."""
         config = openml.config.get_config_as_dict()
         _config = {}
-        _config["apikey"] = "610344db6388d9ba34f6db45a3cf71de"
-        _config["server"] = "https://test.openml.org/api/v1/xml"
+        _config["apikey"] = TestBase.user_key
+        _config["server"] = f"{openml.config.TEST_SERVER_URL}/api/v1/xml"
         _config["cachedir"] = self.workdir
         _config["avoid_duplicate_runs"] = False
         _config["connection_n_retries"] = 20
@@ -90,7 +91,7 @@ def test_get_config_as_dict(self):
     def test_setup_with_config(self):
         """Checks if the OpenML configuration can be updated using _setup()."""
         _config = {}
-        _config["apikey"] = "610344db6388d9ba34f6db45a3cf71de"
+        _config["apikey"] = TestBase.user_key
         _config["server"] = "https://www.openml.org/api/v1/xml"
         _config["cachedir"] = self.workdir
         _config["avoid_duplicate_runs"] = True
@@ -105,29 +106,28 @@ def test_setup_with_config(self):
 
 
 class TestConfigurationForExamples(openml.testing.TestBase):
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_switch_to_example_configuration(self):
         """Verifies the test configuration is loaded properly."""
         # Below is the default test key which would be used anyway, but just for clarity:
-        openml.config.apikey = "610344db6388d9ba34f6db45a3cf71de"
+        openml.config.apikey = "any-api-key"
         openml.config.server = self.production_server
 
         openml.config.start_using_configuration_for_example()
 
-        assert openml.config.apikey == "c0c42819af31e706efe1f4b88c23c6c1"
+        assert openml.config.apikey == TestBase.user_key
         assert openml.config.server == self.test_server
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_switch_from_example_configuration(self):
         """Verifies the previous configuration is loaded after stopping."""
         # Below is the default test key which would be used anyway, but just for clarity:
-        openml.config.apikey = "610344db6388d9ba34f6db45a3cf71de"
+        openml.config.apikey = TestBase.user_key
         openml.config.server = self.production_server
 
         openml.config.start_using_configuration_for_example()
         openml.config.stop_using_configuration_for_example()
-
-        assert openml.config.apikey == "610344db6388d9ba34f6db45a3cf71de"
+        assert openml.config.apikey == TestBase.user_key
         assert openml.config.server == self.production_server
 
     def test_example_configuration_stop_before_start(self):
@@ -135,24 +135,24 @@ def test_example_configuration_stop_before_start(self):
         error_regex = ".*stop_use_example_configuration.*start_use_example_configuration.*first"
         # Tests do not reset the state of this class. Thus, we ensure it is in
         # the original state before the test.
-        openml.config.ConfigurationForExamples._start_last_called = False
+        openml.config._examples._start_last_called = False
         self.assertRaisesRegex(
             RuntimeError,
             error_regex,
             openml.config.stop_using_configuration_for_example,
         )
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_example_configuration_start_twice(self):
         """Checks that the original config can be returned to if `start..` is called twice."""
-        openml.config.apikey = "610344db6388d9ba34f6db45a3cf71de"
+        openml.config.apikey = TestBase.user_key
         openml.config.server = self.production_server
 
         openml.config.start_using_configuration_for_example()
         openml.config.start_using_configuration_for_example()
         openml.config.stop_using_configuration_for_example()
 
-        assert openml.config.apikey == "610344db6388d9ba34f6db45a3cf71de"
+        assert openml.config.apikey == TestBase.user_key
         assert openml.config.server == self.production_server
 
 
diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py
index 034b731aa..22a8bc936 100644
--- a/tests/test_runs/test_run.py
+++ b/tests/test_runs/test_run.py
@@ -25,6 +25,7 @@ class TestRun(TestBase):
     # Splitting not helpful, these test's don't rely on the server and take
     # less than 1 seconds
 
+    @pytest.mark.test_server()
     def test_tagging(self):
         runs = openml.runs.list_runs(size=1)
         assert not runs.empty, "Test server state is incorrect"
@@ -118,6 +119,7 @@ def _check_array(array, type_):
             assert run_prime_trace_content is None
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_to_from_filesystem_vanilla(self):
         model = Pipeline(
             [
@@ -153,6 +155,7 @@ def test_to_from_filesystem_vanilla(self):
 
     @pytest.mark.sklearn()
     @pytest.mark.flaky()
+    @pytest.mark.test_server()
     def test_to_from_filesystem_search(self):
         model = Pipeline(
             [
@@ -187,6 +190,7 @@ def test_to_from_filesystem_search(self):
         )
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_to_from_filesystem_no_model(self):
         model = Pipeline(
             [("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())],
@@ -292,6 +296,7 @@ def assert_run_prediction_data(task, run, model):
             assert_method(y_test, saved_y_test)
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_publish_with_local_loaded_flow(self):
         """
         Publish a run tied to a local flow after it has first been saved to
@@ -335,6 +340,8 @@ def test_publish_with_local_loaded_flow(self):
             openml.runs.get_run(loaded_run.run_id)
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
+    @pytest.mark.skip(reason="https://github.com/openml/openml-python/issues/1586")
     def test_offline_and_online_run_identical(self):
         extension = SklearnExtension()
 
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index b02acdf51..8d5a00f9b 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -398,6 +398,7 @@ def _check_sample_evaluations(
                             assert evaluation < max_time_allowed
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_regression_on_classif_task(self):
         task_id = 259  # collins; crossvalidation; has numeric targets
 
@@ -414,6 +415,7 @@ def test_run_regression_on_classif_task(self):
             )
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_check_erronous_sklearn_flow_fails(self):
         task_id = 115  # diabetes; crossvalidation
         task = openml.tasks.get_task(task_id)
@@ -626,6 +628,7 @@ def _run_and_upload_regression(
         )
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_and_upload_logistic_regression(self):
         lr = LogisticRegression(solver="lbfgs", max_iter=1000)
         task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
@@ -634,6 +637,7 @@ def test_run_and_upload_logistic_regression(self):
         self._run_and_upload_classification(lr, task_id, n_missing_vals, n_test_obs, "62501")
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_and_upload_linear_regression(self):
         lr = LinearRegression()
         task_id = self.TEST_SERVER_TASK_REGRESSION["task_id"]
@@ -664,6 +668,7 @@ def test_run_and_upload_linear_regression(self):
         self._run_and_upload_regression(lr, task_id, n_missing_vals, n_test_obs, "62501")
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_and_upload_pipeline_dummy_pipeline(self):
         pipeline1 = Pipeline(
             steps=[
@@ -681,6 +686,7 @@ def test_run_and_upload_pipeline_dummy_pipeline(self):
         Version(sklearn.__version__) < Version("0.20"),
         reason="columntransformer introduction in 0.20.0",
     )
+    @pytest.mark.test_server()
     def test_run_and_upload_column_transformer_pipeline(self):
         import sklearn.compose
         import sklearn.impute
@@ -793,6 +799,7 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock):
         assert call_count == 3
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_and_upload_gridsearch(self):
         estimator_name = (
             "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
@@ -815,6 +822,7 @@ def test_run_and_upload_gridsearch(self):
         assert len(run.trace.trace_iterations) == 9
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_and_upload_randomsearch(self):
         randomsearch = RandomizedSearchCV(
             RandomForestClassifier(n_estimators=5),
@@ -847,6 +855,7 @@ def test_run_and_upload_randomsearch(self):
         assert len(trace.trace_iterations) == 5
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_and_upload_maskedarrays(self):
         # This testcase is important for 2 reasons:
         # 1) it verifies the correct handling of masked arrays (not all
@@ -874,6 +883,7 @@ def test_run_and_upload_maskedarrays(self):
     ##########################################################################
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_learning_curve_task_1(self):
         task_id = 801  # diabates dataset
         num_test_instances = 6144  # for learning curve
@@ -898,6 +908,7 @@ def test_learning_curve_task_1(self):
         self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples)
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_learning_curve_task_2(self):
         task_id = 801  # diabates dataset
         num_test_instances = 6144  # for learning curve
@@ -938,6 +949,7 @@ def test_learning_curve_task_2(self):
         Version(sklearn.__version__) < Version("0.21"),
         reason="Pipelines don't support indexing (used for the assert check)",
     )
+    @pytest.mark.test_server()
     def test_initialize_cv_from_run(self):
         randomsearch = Pipeline(
             [
@@ -1012,6 +1024,7 @@ def _test_local_evaluations(self, run):
                 assert alt_scores[idx] <= 1
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_local_run_swapped_parameter_order_model(self):
         clf = DecisionTreeClassifier()
         australian_task = 595  # Australian; crossvalidation
@@ -1027,10 +1040,12 @@ def test_local_run_swapped_parameter_order_model(self):
         self._test_local_evaluations(run)
 
     @pytest.mark.sklearn()
+    @pytest.mark.skip("https://github.com/openml/openml-python/issues/1586")
     @unittest.skipIf(
         Version(sklearn.__version__) < Version("0.20"),
         reason="SimpleImputer doesn't handle mixed type DataFrame as input",
     )
+    @pytest.mark.test_server()
     def test_local_run_swapped_parameter_order_flow(self):
         # construct sci-kit learn classifier
         clf = Pipeline(
@@ -1054,11 +1069,13 @@ def test_local_run_swapped_parameter_order_flow(self):
 
         self._test_local_evaluations(run)
 
+    @pytest.mark.skip(reason="https://github.com/openml/openml-python/issues/1586")
     @pytest.mark.sklearn()
     @unittest.skipIf(
         Version(sklearn.__version__) < Version("0.20"),
         reason="SimpleImputer doesn't handle mixed type DataFrame as input",
     )
+    @pytest.mark.test_server()
     def test_local_run_metric_score(self):
         # construct sci-kit learn classifier
         clf = Pipeline(
@@ -1081,7 +1098,7 @@ def test_local_run_metric_score(self):
 
         self._test_local_evaluations(run)
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_online_run_metric_score(self):
         self.use_production_server()
 
@@ -1096,6 +1113,7 @@ def test_online_run_metric_score(self):
         Version(sklearn.__version__) < Version("0.20"),
         reason="SimpleImputer doesn't handle mixed type DataFrame as input",
     )
+    @pytest.mark.test_server()
     def test_initialize_model_from_run(self):
         clf = sklearn.pipeline.Pipeline(
             steps=[
@@ -1157,6 +1175,7 @@ def test_initialize_model_from_run(self):
         Version(sklearn.__version__) < Version("0.20"),
         reason="SimpleImputer doesn't handle mixed type DataFrame as input",
     )
+    @pytest.mark.test_server()
     def test__run_exists(self):
         # would be better to not sentinel these clfs,
         # so we do not have to perform the actual runs
@@ -1212,6 +1231,7 @@ def test__run_exists(self):
             assert run_ids, (run_ids, clf)
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_with_illegal_flow_id(self):
         # check the case where the user adds an illegal flow id to a
         # non-existing flo
@@ -1231,6 +1251,7 @@ def test_run_with_illegal_flow_id(self):
             )
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_with_illegal_flow_id_after_load(self):
         # Same as `test_run_with_illegal_flow_id`, but test this error is also
         # caught if the run is stored to and loaded from disk first.
@@ -1262,6 +1283,7 @@ def test_run_with_illegal_flow_id_after_load(self):
             TestBase.logger.info(f"collected from test_run_functions: {loaded_run.run_id}")
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_with_illegal_flow_id_1(self):
         # Check the case where the user adds an illegal flow id to an existing
         # flow. Comes to a different value error than the previous test
@@ -1287,6 +1309,7 @@ def test_run_with_illegal_flow_id_1(self):
             )
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_with_illegal_flow_id_1_after_load(self):
         # Same as `test_run_with_illegal_flow_id_1`, but test this error is
         # also caught if the run is stored to and loaded from disk first.
@@ -1329,6 +1352,7 @@ def test_run_with_illegal_flow_id_1_after_load(self):
         Version(sklearn.__version__) < Version("0.20"),
         reason="OneHotEncoder cannot handle mixed type DataFrame as input",
     )
+    @pytest.mark.test_server()
     def test__run_task_get_arffcontent(self):
         task = openml.tasks.get_task(7)  # kr-vs-kp; crossvalidation
         num_instances = 3196
@@ -1385,7 +1409,7 @@ def test__create_trace_from_arff(self):
             trace_arff = arff.load(arff_file)
         OpenMLRunTrace.trace_from_arff(trace_arff)
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_get_run(self):
         # this run is not available on test
         self.use_production_server()
@@ -1407,9 +1431,8 @@ def test_get_run(self):
             assert run.fold_evaluations["f_measure"][0][i] == value
         assert "weka" in run.tags
         assert "weka_3.7.12" in run.tags
-        assert run.predictions_url == (
-            "https://api.openml.org/data/download/1667125/"
-            "weka_generated_predictions4575715871712251329.arff"
+        assert run.predictions_url.endswith(
+            "/data/download/1667125/weka_generated_predictions4575715871712251329.arff"
         )
 
     def _check_run(self, run):
@@ -1421,7 +1444,7 @@ def _check_run(self, run):
         assert isinstance(run, dict)
         assert len(run) == 8, str(run)
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_get_runs_list(self):
         # TODO: comes from live, no such lists on test
         self.use_production_server()
@@ -1430,11 +1453,12 @@ def test_get_runs_list(self):
         for run in runs.to_dict(orient="index").values():
             self._check_run(run)
 
+    @pytest.mark.test_server()
     def test_list_runs_empty(self):
         runs = openml.runs.list_runs(task=[0])
         assert runs.empty
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_get_runs_list_by_task(self):
         # TODO: comes from live, no such lists on test
         self.use_production_server()
@@ -1453,7 +1477,7 @@ def test_get_runs_list_by_task(self):
             assert run["task_id"] in task_ids
             self._check_run(run)
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_get_runs_list_by_uploader(self):
         # TODO: comes from live, no such lists on test
         self.use_production_server()
@@ -1475,7 +1499,7 @@ def test_get_runs_list_by_uploader(self):
             assert run["uploader"] in uploader_ids
             self._check_run(run)
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_get_runs_list_by_flow(self):
         # TODO: comes from live, no such lists on test
         self.use_production_server()
@@ -1494,7 +1518,7 @@ def test_get_runs_list_by_flow(self):
             assert run["flow_id"] in flow_ids
             self._check_run(run)
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_get_runs_pagination(self):
         # TODO: comes from live, no such lists on test
         self.use_production_server()
@@ -1507,7 +1531,7 @@ def test_get_runs_pagination(self):
             for run in runs.to_dict(orient="index").values():
                 assert run["uploader"] in uploader_ids
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_get_runs_list_by_filters(self):
         # TODO: comes from live, no such lists on test
         self.use_production_server()
@@ -1544,13 +1568,13 @@ def test_get_runs_list_by_filters(self):
         )
         assert len(runs) == 2
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
+    @pytest.mark.xfail(reason="failures_issue_1544", strict=False)
     def test_get_runs_list_by_tag(self):
-        # TODO: comes from live, no such lists on test
-        # Unit test works on production server only
-
+        # We don't have tagged runs on the test server
         self.use_production_server()
-        runs = openml.runs.list_runs(tag="curves")
+        # Don't remove the size restriction: this query is too expensive without
+        runs = openml.runs.list_runs(tag="curves", size=2)
         assert len(runs) >= 1
 
     @pytest.mark.sklearn()
@@ -1558,6 +1582,7 @@ def test_get_runs_list_by_tag(self):
         Version(sklearn.__version__) < Version("0.20"),
         reason="columntransformer introduction in 0.20.0",
     )
+    @pytest.mark.test_server()
     def test_run_on_dataset_with_missing_labels_dataframe(self):
         # Check that _run_task_get_arffcontent works when one of the class
         # labels only declared in the arff file, but is not present in the
@@ -1594,6 +1619,7 @@ def test_run_on_dataset_with_missing_labels_dataframe(self):
         Version(sklearn.__version__) < Version("0.20"),
         reason="columntransformer introduction in 0.20.0",
     )
+    @pytest.mark.test_server()
     def test_run_on_dataset_with_missing_labels_array(self):
         # Check that _run_task_get_arffcontent works when one of the class
         # labels only declared in the arff file, but is not present in the
@@ -1632,6 +1658,7 @@ def test_run_on_dataset_with_missing_labels_array(self):
             # repeat, fold, row_id, 6 confidences, prediction and correct label
             assert len(row) == 12
 
+    @pytest.mark.test_server()
     def test_get_cached_run(self):
         openml.config.set_root_cache_directory(self.static_cache_dir)
         openml.runs.functions._get_cached_run(1)
@@ -1642,6 +1669,7 @@ def test_get_uncached_run(self):
             openml.runs.functions._get_cached_run(10)
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_run_flow_on_task_downloaded_flow(self):
         model = sklearn.ensemble.RandomForestClassifier(n_estimators=33)
         flow = self.extension.model_to_flow(model)
@@ -1661,7 +1689,7 @@ def test_run_flow_on_task_downloaded_flow(self):
         TestBase._mark_entity_for_removal("run", run.run_id)
         TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {run.run_id}")
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_format_prediction_non_supervised(self):
         # non-supervised tasks don't exist on the test server
         self.use_production_server()
@@ -1672,6 +1700,7 @@ def test_format_prediction_non_supervised(self):
         ):
             format_prediction(clustering, *ignored_input)
 
+    @pytest.mark.test_server()
     def test_format_prediction_classification_no_probabilities(self):
         classification = openml.tasks.get_task(
             self.TEST_SERVER_TASK_SIMPLE["task_id"],
@@ -1681,6 +1710,7 @@ def test_format_prediction_classification_no_probabilities(self):
         with pytest.raises(ValueError, match="`proba` is required for classification task"):
             format_prediction(classification, *ignored_input, proba=None)
 
+    @pytest.mark.test_server()
     def test_format_prediction_classification_incomplete_probabilities(self):
         classification = openml.tasks.get_task(
             self.TEST_SERVER_TASK_SIMPLE["task_id"],
@@ -1691,6 +1721,7 @@ def test_format_prediction_classification_incomplete_probabilities(self):
         with pytest.raises(ValueError, match="Each class should have a predicted probability"):
             format_prediction(classification, *ignored_input, proba=incomplete_probabilities)
 
+    @pytest.mark.test_server()
     def test_format_prediction_task_without_classlabels_set(self):
         classification = openml.tasks.get_task(
             self.TEST_SERVER_TASK_SIMPLE["task_id"],
@@ -1701,6 +1732,7 @@ def test_format_prediction_task_without_classlabels_set(self):
         with pytest.raises(ValueError, match="The classification task must have class labels set"):
             format_prediction(classification, *ignored_input, proba={})
 
+    @pytest.mark.test_server()
     def test_format_prediction_task_learning_curve_sample_not_set(self):
         learning_curve = openml.tasks.get_task(801, download_data=False)  # diabetes;crossvalidation
         probabilities = {c: 0.2 for c in learning_curve.class_labels}
@@ -1708,6 +1740,7 @@ def test_format_prediction_task_learning_curve_sample_not_set(self):
         with pytest.raises(ValueError, match="`sample` can not be none for LearningCurveTask"):
             format_prediction(learning_curve, *ignored_input, sample=None, proba=probabilities)
 
+    @pytest.mark.test_server()
     def test_format_prediction_task_regression(self):
         task_meta_data = self.TEST_SERVER_TASK_REGRESSION["task_meta_data"]
         _task_id = check_task_existence(**task_meta_data)
@@ -1736,12 +1769,12 @@ def test_format_prediction_task_regression(self):
         self.assertListEqual(res, [0] * 5)
 
 
-
     @unittest.skipIf(
         Version(sklearn.__version__) < Version("0.20"),
         reason="SimpleImputer doesn't handle mixed type DataFrame as input",
     )
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_delete_run(self):
         rs = np.random.randint(1, 2**31 - 1)
         clf = sklearn.pipeline.Pipeline(
@@ -1766,6 +1799,7 @@ def test_delete_run(self):
         _run_id = run.run_id
         assert delete_run(_run_id)
 
+    @pytest.mark.skip(reason="run id is in problematic state on test server due to PR#1454")
     @unittest.skipIf(
         Version(sklearn.__version__) < Version("0.20"),
         reason="SimpleImputer doesn't handle mixed type DataFrame as input",
@@ -1780,7 +1814,6 @@ def test_initialize_model_from_run_nonstrict(self):
 
 @mock.patch.object(requests.Session, "delete")
 def test_delete_run_not_owned(mock_delete, test_files_directory, test_api_key):
-    openml.config.start_using_configuration_for_example()
     content_file = test_files_directory / "mock_responses" / "runs" / "run_delete_not_owned.xml"
     mock_delete.return_value = create_request_response(
         status_code=412,
@@ -1793,14 +1826,13 @@ def test_delete_run_not_owned(mock_delete, test_files_directory, test_api_key):
     ):
         openml.runs.delete_run(40_000)
 
-    run_url = "https://test.openml.org/api/v1/xml/run/40000"
+    run_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/run/40000"
     assert run_url == mock_delete.call_args.args[0]
     assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
 
 
 @mock.patch.object(requests.Session, "delete")
 def test_delete_run_success(mock_delete, test_files_directory, test_api_key):
-    openml.config.start_using_configuration_for_example()
     content_file = test_files_directory / "mock_responses" / "runs" / "run_delete_successful.xml"
     mock_delete.return_value = create_request_response(
         status_code=200,
@@ -1810,14 +1842,13 @@ def test_delete_run_success(mock_delete, test_files_directory, test_api_key):
     success = openml.runs.delete_run(10591880)
     assert success
 
-    run_url = "https://test.openml.org/api/v1/xml/run/10591880"
+    run_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/run/10591880"
     assert run_url == mock_delete.call_args.args[0]
     assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
 
 
 @mock.patch.object(requests.Session, "delete")
 def test_delete_unknown_run(mock_delete, test_files_directory, test_api_key):
-    openml.config.start_using_configuration_for_example()
     content_file = test_files_directory / "mock_responses" / "runs" / "run_delete_not_exist.xml"
     mock_delete.return_value = create_request_response(
         status_code=412,
@@ -1830,7 +1861,7 @@ def test_delete_unknown_run(mock_delete, test_files_directory, test_api_key):
     ):
         openml.runs.delete_run(9_999_999)
 
-    run_url = "https://test.openml.org/api/v1/xml/run/9999999"
+    run_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/run/9999999"
     assert run_url == mock_delete.call_args.args[0]
     assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
 
@@ -1840,7 +1871,12 @@ def test_delete_unknown_run(mock_delete, test_files_directory, test_api_key):
     Version(sklearn.__version__) < Version("0.21"),
     reason="couldn't perform local tests successfully w/o bloating RAM",
     )
+@unittest.skipIf(
+    Version(sklearn.__version__) >= Version("1.8"),
+    reason="predictions differ significantly",
+    )
 @mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs")
+@pytest.mark.test_server()
 def test__run_task_get_arffcontent_2(parallel_mock):
     """Tests if a run executed in parallel is collated correctly."""
     task = openml.tasks.get_task(7)  # Supervised Classification on kr-vs-kp
@@ -1931,6 +1967,7 @@ def test__run_task_get_arffcontent_2(parallel_mock):
         (-1, "threading", 10),  # the threading backend does preserve mocks even with parallelizing
     ]
 )
+@pytest.mark.test_server()
 def test_joblib_backends(parallel_mock, n_jobs, backend, call_count):
     """Tests evaluation of a run using various joblib backends and n_jobs."""
     if backend is None:
@@ -1980,6 +2017,7 @@ def test_joblib_backends(parallel_mock, n_jobs, backend, call_count):
         n_jobs=n_jobs,
     )
     from openml_sklearn import SklearnExtension
+
     extension = SklearnExtension()
     with parallel_backend(backend, n_jobs=n_jobs):
         res = openml.runs.functions._run_task_get_arffcontent(
@@ -1996,4 +2034,4 @@ def test_joblib_backends(parallel_mock, n_jobs, backend, call_count):
     # *_time_millis_* not recorded when n_jobs = -1
     assert len(res[2]["predictive_accuracy"][0]) == 10
     assert len(res[3]["predictive_accuracy"][0]) == 10
-    assert parallel_mock.call_count == call_count
+    assert parallel_mock.call_count == call_count
\ No newline at end of file
diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py
index 6fd11638f..30943ea70 100644
--- a/tests/test_setups/test_setup_functions.py
+++ b/tests/test_setups/test_setup_functions.py
@@ -4,7 +4,7 @@
 import hashlib
 import time
 import unittest.mock
-
+import os
 import pandas as pd
 import pytest
 import sklearn.base
@@ -35,6 +35,7 @@ def setUp(self):
         super().setUp()
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_nonexisting_setup_exists(self):
         # first publish a non-existing flow
         sentinel = get_sentinel()
@@ -82,6 +83,7 @@ def _existing_setup_exists(self, classif):
         assert setup_id == run.setup_id
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_existing_setup_exists_1(self):
         def side_effect(self):
             self.var_smoothing = 1e-9
@@ -97,11 +99,13 @@ def side_effect(self):
             self._existing_setup_exists(nb)
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_exisiting_setup_exists_2(self):
         # Check a flow with one hyperparameter
         self._existing_setup_exists(sklearn.naive_bayes.GaussianNB())
 
     @pytest.mark.sklearn()
+    @pytest.mark.test_server()
     def test_existing_setup_exists_3(self):
         # Check a flow with many hyperparameters
         self._existing_setup_exists(
@@ -114,11 +118,10 @@ def test_existing_setup_exists_3(self):
             ),
         )
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_get_setup(self):
+        self.use_production_server()
         # no setups in default test server
-        openml.config.server = "https://www.openml.org/api/v1/xml/"
-
         # contains all special cases, 0 params, 1 param, n params.
         # Non scikitlearn flows.
         setups = [18, 19, 20, 118]
@@ -132,7 +135,7 @@ def test_get_setup(self):
             else:
                 assert len(current.parameters) == num_params[idx]
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_setup_list_filter_flow(self):
         self.use_production_server()
 
@@ -140,10 +143,11 @@ def test_setup_list_filter_flow(self):
 
         setups = openml.setups.list_setups(flow=flow_id)
 
-        assert len(setups) > 0  # TODO: please adjust 0
+        assert len(setups) >= 2
         for setup_id in setups:
             assert setups[setup_id].flow_id == flow_id
 
+    @pytest.mark.test_server()
     def test_list_setups_empty(self):
         setups = openml.setups.list_setups(setup=[0])
         if len(setups) > 0:
@@ -151,7 +155,7 @@ def test_list_setups_empty(self):
 
         assert isinstance(setups, dict)
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_list_setups_output_format(self):
         self.use_production_server()
         flow_id = 6794
@@ -164,6 +168,7 @@ def test_list_setups_output_format(self):
         assert isinstance(setups, pd.DataFrame)
         assert len(setups) == 10
 
+    @pytest.mark.test_server()
     def test_setuplist_offset(self):
         size = 10
         setups = openml.setups.list_setups(offset=0, size=size)
@@ -175,6 +180,7 @@ def test_setuplist_offset(self):
 
         assert len(all) == size * 2
 
+    @pytest.mark.test_server()
     def test_get_cached_setup(self):
         openml.config.set_root_cache_directory(self.static_cache_dir)
         openml.setups.functions._get_cached_setup(1)
diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py
index 40026592f..7dc6b6d2a 100644
--- a/tests/test_study/test_study_functions.py
+++ b/tests/test_study/test_study_functions.py
@@ -12,7 +12,8 @@
 class TestStudyFunctions(TestBase):
     _multiprocess_can_split_ = True
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
+    @pytest.mark.xfail(reason="failures_issue_1544", strict=False)
     def test_get_study_old(self):
         self.use_production_server()
 
@@ -23,7 +24,7 @@ def test_get_study_old(self):
         assert len(study.setups) == 30
         assert study.runs is None
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_get_study_new(self):
         self.use_production_server()
 
@@ -34,7 +35,7 @@ def test_get_study_new(self):
         assert len(study.setups) == 1253
         assert len(study.runs) == 1693
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_get_openml100(self):
         self.use_production_server()
 
@@ -44,7 +45,7 @@ def test_get_openml100(self):
         assert isinstance(study_2, openml.study.OpenMLBenchmarkSuite)
         assert study.study_id == study_2.study_id
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_get_study_error(self):
         self.use_production_server()
 
@@ -53,7 +54,7 @@ def test_get_study_error(self):
         ):
             openml.study.get_study(99)
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_get_suite(self):
         self.use_production_server()
 
@@ -64,7 +65,7 @@ def test_get_suite(self):
         assert study.runs is None
         assert study.setups is None
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_get_suite_error(self):
         self.use_production_server()
 
@@ -73,6 +74,7 @@ def test_get_suite_error(self):
         ):
             openml.study.get_suite(123)
 
+    @pytest.mark.test_server()
     def test_publish_benchmark_suite(self):
         fixture_alias = None
         fixture_name = "unit tested benchmark suite"
@@ -141,13 +143,16 @@ def _test_publish_empty_study_is_allowed(self, explicit: bool):
         assert study_downloaded.main_entity_type == "run"
         assert study_downloaded.runs is None
 
+    @pytest.mark.test_server()
     def test_publish_empty_study_explicit(self):
         self._test_publish_empty_study_is_allowed(explicit=True)
 
+    @pytest.mark.test_server()
     def test_publish_empty_study_implicit(self):
         self._test_publish_empty_study_is_allowed(explicit=False)
 
     @pytest.mark.flaky()
+    @pytest.mark.test_server()
     def test_publish_study(self):
         # get some random runs to attach
         run_list = openml.evaluations.list_evaluations("predictive_accuracy", size=10)
@@ -217,11 +222,12 @@ def test_publish_study(self):
         res = openml.study.delete_study(study.id)
         assert res
 
+    @pytest.mark.test_server()
     def test_study_attach_illegal(self):
         run_list = openml.runs.list_runs(size=10)
         assert len(run_list) == 10
         run_list_more = openml.runs.list_runs(size=20)
-        assert len(run_list_more) == 20
+        assert len(run_list_more) > 10  # a fresh db should have 15 evaluated runs
 
         study = openml.study.create_study(
             alias=None,
diff --git a/tests/test_tasks/test_classification_task.py b/tests/test_tasks/test_classification_task.py
index d4f2ed9d7..65dcebc1d 100644
--- a/tests/test_tasks/test_classification_task.py
+++ b/tests/test_tasks/test_classification_task.py
@@ -18,6 +18,7 @@ def setUp(self, n_levels: int = 1):
         self.task_type = TaskType.SUPERVISED_CLASSIFICATION
         self.estimation_procedure = 5
 
+    @pytest.mark.test_server()
     def test_download_task(self):
         task = super().test_download_task()
         assert task.task_id == self.task_id
@@ -25,12 +26,13 @@ def test_download_task(self):
         assert task.dataset_id == 20
         assert task.estimation_procedure_id == self.estimation_procedure
 
+    @pytest.mark.test_server()
     def test_class_labels(self):
         task = get_task(self.task_id)
         assert task.class_labels == ["tested_negative", "tested_positive"]
 
 
-@pytest.mark.server()
+@pytest.mark.test_server()
 def test_get_X_and_Y():
     task = get_task(119)
     X, Y = task.get_X_and_y()
diff --git a/tests/test_tasks/test_clustering_task.py b/tests/test_tasks/test_clustering_task.py
index dcc024388..29f5663c4 100644
--- a/tests/test_tasks/test_clustering_task.py
+++ b/tests/test_tasks/test_clustering_task.py
@@ -20,14 +20,15 @@ def setUp(self, n_levels: int = 1):
         self.task_type = TaskType.CLUSTERING
         self.estimation_procedure = 17
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_get_dataset(self):
         # no clustering tasks on test server
         self.use_production_server()
         task = openml.tasks.get_task(self.task_id)
         task.get_dataset()
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
+    @pytest.mark.test_server()
     def test_download_task(self):
         # no clustering tasks on test server
         self.use_production_server()
@@ -36,6 +37,7 @@ def test_download_task(self):
         assert task.task_type_id == TaskType.CLUSTERING
         assert task.dataset_id == 36
 
+    @pytest.mark.test_server()
     def test_upload_task(self):
         compatible_datasets = self._get_compatible_rand_dataset()
         for i in range(100):
diff --git a/tests/test_tasks/test_learning_curve_task.py b/tests/test_tasks/test_learning_curve_task.py
index 885f80a27..465d9c0be 100644
--- a/tests/test_tasks/test_learning_curve_task.py
+++ b/tests/test_tasks/test_learning_curve_task.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import pandas as pd
+import pytest
 
 from openml.tasks import TaskType, get_task
 
@@ -17,6 +18,7 @@ def setUp(self, n_levels: int = 1):
         self.task_type = TaskType.LEARNING_CURVE
         self.estimation_procedure = 13
 
+    @pytest.mark.test_server()
     def test_get_X_and_Y(self):
         X, Y = super().test_get_X_and_Y()
         assert X.shape == (768, 8)
@@ -25,12 +27,14 @@ def test_get_X_and_Y(self):
         assert isinstance(Y, pd.Series)
         assert pd.api.types.is_categorical_dtype(Y)
 
+    @pytest.mark.test_server()
     def test_download_task(self):
         task = super().test_download_task()
         assert task.task_id == self.task_id
         assert task.task_type_id == TaskType.LEARNING_CURVE
         assert task.dataset_id == 20
 
+    @pytest.mark.test_server()
     def test_class_labels(self):
         task = get_task(self.task_id)
         assert task.class_labels == ["tested_negative", "tested_positive"]
diff --git a/tests/test_tasks/test_regression_task.py b/tests/test_tasks/test_regression_task.py
index 14ed59470..26d7dc94b 100644
--- a/tests/test_tasks/test_regression_task.py
+++ b/tests/test_tasks/test_regression_task.py
@@ -4,6 +4,7 @@
 import ast
 
 import pandas as pd
+import pytest
 
 import openml
 from openml.exceptions import OpenMLServerException
@@ -48,6 +49,7 @@ def setUp(self, n_levels: int = 1):
         self.task_type = TaskType.SUPERVISED_REGRESSION
 
 
+    @pytest.mark.test_server()
     def test_get_X_and_Y(self):
         X, Y = super().test_get_X_and_Y()
         assert X.shape == (194, 32)
@@ -56,6 +58,7 @@ def test_get_X_and_Y(self):
         assert isinstance(Y, pd.Series)
         assert pd.api.types.is_numeric_dtype(Y)
 
+    @pytest.mark.test_server()
     def test_download_task(self):
         task = super().test_download_task()
         assert task.task_id == self.task_id
diff --git a/tests/test_tasks/test_split.py b/tests/test_tasks/test_split.py
index 12cb632d9..e3320ae80 100644
--- a/tests/test_tasks/test_split.py
+++ b/tests/test_tasks/test_split.py
@@ -3,6 +3,8 @@
 
 import inspect
 import os
+import shutil
+import tempfile
 from pathlib import Path
 
 import numpy as np
@@ -18,8 +20,7 @@ class OpenMLSplitTest(TestBase):
     def setUp(self):
         __file__ = inspect.getfile(OpenMLSplitTest)
         self.directory = os.path.dirname(__file__)
-        # This is for dataset
-        self.arff_filepath = (
+        source_arff = (
             Path(self.directory).parent
             / "files"
             / "org"
@@ -29,13 +30,18 @@ def setUp(self):
             / "1882"
             / "datasplits.arff"
         )
+        # Use a unique temp directory for each test to avoid race conditions
+        # when running tests in parallel (see issue #1641)
+        self._temp_dir = tempfile.TemporaryDirectory()
+        self.arff_filepath = Path(self._temp_dir.name) / "datasplits.arff"
+        shutil.copy(source_arff, self.arff_filepath)
         self.pd_filename = self.arff_filepath.with_suffix(".pkl.py3")
 
     def tearDown(self):
+        # Clean up the entire temp directory
         try:
-            os.remove(self.pd_filename)
+            self._temp_dir.cleanup()
         except (OSError, FileNotFoundError):
-            #  Replaced bare except. Not sure why these exceptions are acceptable.
             pass
 
     def test_eq(self):
diff --git a/tests/test_tasks/test_supervised_task.py b/tests/test_tasks/test_supervised_task.py
index 9c90b7e03..99df3cace 100644
--- a/tests/test_tasks/test_supervised_task.py
+++ b/tests/test_tasks/test_supervised_task.py
@@ -6,6 +6,7 @@
 import pandas as pd
 
 from openml.tasks import get_task
+import pytest
 
 from .test_task import OpenMLTaskTest
 
@@ -27,6 +28,7 @@ def setUpClass(cls):
     def setUp(self, n_levels: int = 1):
         super().setUp()
 
+    @pytest.mark.test_server()
     def test_get_X_and_Y(self) -> tuple[pd.DataFrame, pd.Series]:
         task = get_task(self.task_id)
         X, Y = task.get_X_and_y()
diff --git a/tests/test_tasks/test_task.py b/tests/test_tasks/test_task.py
index e4c9418f2..1d0df1210 100644
--- a/tests/test_tasks/test_task.py
+++ b/tests/test_tasks/test_task.py
@@ -4,6 +4,8 @@
 import unittest
 from random import randint, shuffle
 
+import pytest
+
 from openml.datasets import (
     get_dataset,
     list_datasets,
@@ -30,9 +32,11 @@ def setUpClass(cls):
     def setUp(self, n_levels: int = 1):
         super().setUp()
 
+    @pytest.mark.test_server()
     def test_download_task(self):
         return get_task(self.task_id)
 
+    @pytest.mark.test_server()
     def test_upload_task(self):
         # We don't know if the task in question already exists, so we try a few times. Checking
         # beforehand would not be an option because a concurrent unit test could potentially
diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py
index 856352ac2..df3c0a3b6 100644
--- a/tests/test_tasks/test_task_functions.py
+++ b/tests/test_tasks/test_task_functions.py
@@ -26,6 +26,7 @@ def setUp(self):
     def tearDown(self):
         super().tearDown()
 
+    @pytest.mark.test_server()
     def test__get_cached_tasks(self):
         openml.config.set_root_cache_directory(self.static_cache_dir)
         tasks = openml.tasks.functions._get_cached_tasks()
@@ -33,6 +34,7 @@ def test__get_cached_tasks(self):
         assert len(tasks) == 3
         assert isinstance(next(iter(tasks.values())), OpenMLTask)
 
+    @pytest.mark.test_server()
     def test__get_cached_task(self):
         openml.config.set_root_cache_directory(self.static_cache_dir)
         task = openml.tasks.functions._get_cached_task(1)
@@ -47,16 +49,18 @@ def test__get_cached_task_not_cached(self):
             2,
         )
 
+    @pytest.mark.test_server()
     def test__get_estimation_procedure_list(self):
         estimation_procedures = openml.tasks.functions._get_estimation_procedure_list()
         assert isinstance(estimation_procedures, list)
         assert isinstance(estimation_procedures[0], dict)
         assert estimation_procedures[0]["task_type_id"] == TaskType.SUPERVISED_CLASSIFICATION
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
+    @pytest.mark.xfail(reason="failures_issue_1544", strict=False)
     def test_list_clustering_task(self):
+        self.use_production_server()
         # as shown by #383, clustering tasks can give list/dict casting problems
-        openml.config.server = self.production_server
         openml.tasks.list_tasks(task_type=TaskType.CLUSTERING, size=10)
         # the expected outcome is that it doesn't crash. No assertions.
 
@@ -69,6 +73,7 @@ def _check_task(self, task):
         assert isinstance(task["status"], str)
         assert task["status"] in ["in_preparation", "active", "deactivated"]
 
+    @pytest.mark.test_server()
     def test_list_tasks_by_type(self):
         num_curves_tasks = 198  # number is flexible, check server if fails
         ttid = TaskType.LEARNING_CURVE
@@ -78,28 +83,35 @@ def test_list_tasks_by_type(self):
             assert ttid == task["ttid"]
             self._check_task(task)
 
+    @pytest.mark.test_server()
     def test_list_tasks_length(self):
         ttid = TaskType.LEARNING_CURVE
         tasks = openml.tasks.list_tasks(task_type=ttid)
         assert len(tasks) > 100
 
+    @pytest.mark.test_server()
     def test_list_tasks_empty(self):
         tasks = openml.tasks.list_tasks(tag="NoOneWillEverUseThisTag")
         assert tasks.empty
 
+    @pytest.mark.test_server()
     def test_list_tasks_by_tag(self):
-        num_basic_tasks = 100  # number is flexible, check server if fails
+        # Server starts with 99 active tasks with the tag, and one 'in_preparation',
+        # so depending on the processing of the last dataset, there may be 99 or 100 matches.
+        num_basic_tasks = 99
         tasks = openml.tasks.list_tasks(tag="OpenML100")
         assert len(tasks) >= num_basic_tasks
         for task in tasks.to_dict(orient="index").values():
             self._check_task(task)
 
+    @pytest.mark.test_server()
     def test_list_tasks(self):
         tasks = openml.tasks.list_tasks()
         assert len(tasks) >= 900
         for task in tasks.to_dict(orient="index").values():
             self._check_task(task)
 
+    @pytest.mark.test_server()
     def test_list_tasks_paginate(self):
         size = 10
         max = 100
@@ -109,6 +121,7 @@ def test_list_tasks_paginate(self):
             for task in tasks.to_dict(orient="index").values():
                 self._check_task(task)
 
+    @pytest.mark.test_server()
     def test_list_tasks_per_type_paginate(self):
         size = 40
         max = 100
@@ -125,6 +138,7 @@ def test_list_tasks_per_type_paginate(self):
                     assert j == task["ttid"]
                     self._check_task(task)
 
+    @pytest.mark.test_server()
     def test__get_task(self):
         openml.config.set_root_cache_directory(self.static_cache_dir)
         openml.tasks.get_task(1882)
@@ -132,48 +146,51 @@ def test__get_task(self):
     @unittest.skip(
         "Please await outcome of discussion: https://github.com/openml/OpenML/issues/776",
     )
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test__get_task_live(self):
+        self.use_production_server()
         # Test the following task as it used to throw an Unicode Error.
         # https://github.com/openml/openml-python/issues/378
-        openml.config.server = self.production_server
         openml.tasks.get_task(34536)
 
+    @pytest.mark.test_server()
     def test_get_task(self):
         task = openml.tasks.get_task(1, download_data=True)  # anneal; crossvalidation
         assert isinstance(task, OpenMLTask)
         assert os.path.exists(
-            os.path.join(self.workdir, "org", "openml", "test", "tasks", "1", "task.xml")
+            os.path.join(openml.config.get_cache_directory(), "tasks", "1", "task.xml")
         )
         assert not os.path.exists(
-            os.path.join(self.workdir, "org", "openml", "test", "tasks", "1", "datasplits.arff")
+            os.path.join(openml.config.get_cache_directory(), "tasks", "1", "datasplits.arff")
         )
         assert os.path.exists(
-            os.path.join(self.workdir, "org", "openml", "test", "datasets", "1", "dataset.arff")
+            os.path.join(openml.config.get_cache_directory(), "datasets", "1", "dataset_1.pq")
         )
 
+    @pytest.mark.test_server()
     def test_get_task_lazy(self):
         task = openml.tasks.get_task(2, download_data=False)  # anneal; crossvalidation
         assert isinstance(task, OpenMLTask)
         assert os.path.exists(
-            os.path.join(self.workdir, "org", "openml", "test", "tasks", "2", "task.xml")
+            os.path.join(openml.config.get_cache_directory(), "tasks", "2", "task.xml")
         )
         assert task.class_labels == ["1", "2", "3", "4", "5", "U"]
 
         assert not os.path.exists(
-            os.path.join(self.workdir, "org", "openml", "test", "tasks", "2", "datasplits.arff")
+            os.path.join(openml.config.get_cache_directory(), "tasks", "2", "datasplits.arff")
         )
         # Since the download_data=False is propagated to get_dataset
         assert not os.path.exists(
-            os.path.join(self.workdir, "org", "openml", "test", "datasets", "2", "dataset.arff")
+            os.path.join(openml.config.get_cache_directory(), "datasets", "2", "dataset.arff")
         )
 
         task.download_split()
         assert os.path.exists(
-            os.path.join(self.workdir, "org", "openml", "test", "tasks", "2", "datasplits.arff")
+            os.path.join(openml.config.get_cache_directory(), "tasks", "2", "datasplits.arff")
         )
 
     @mock.patch("openml.tasks.functions.get_dataset")
+    @pytest.mark.test_server()
     def test_removal_upon_download_failure(self, get_dataset):
         class WeirdException(Exception):
             pass
@@ -191,14 +208,15 @@ def assert_and_raise(*args, **kwargs):
         # Now the file should no longer exist
         assert not os.path.exists(os.path.join(os.getcwd(), "tasks", "1", "tasks.xml"))
 
+    @pytest.mark.test_server()
     def test_get_task_with_cache(self):
         openml.config.set_root_cache_directory(self.static_cache_dir)
         task = openml.tasks.get_task(1)
         assert isinstance(task, OpenMLTask)
 
-    @pytest.mark.production()
+    @pytest.mark.production_server()
     def test_get_task_different_types(self):
-        openml.config.server = self.production_server
+        self.use_production_server()
         # Regression task
         openml.tasks.functions.get_task(5001)
         # Learning curve
@@ -206,12 +224,13 @@ def test_get_task_different_types(self):
         # Issue 538, get_task failing with clustering task.
         openml.tasks.functions.get_task(126033)
 
+    @pytest.mark.test_server()
     def test_download_split(self):
         task = openml.tasks.get_task(1)  # anneal; crossvalidation
         split = task.download_split()
         assert type(split) == OpenMLSplit
         assert os.path.exists(
-            os.path.join(self.workdir, "org", "openml", "test", "tasks", "1", "datasplits.arff")
+            os.path.join(openml.config.get_cache_directory(), "tasks", "1", "datasplits.arff")
         )
 
     def test_deletion_of_cache_dir(self):
@@ -227,7 +246,6 @@ def test_deletion_of_cache_dir(self):
 
 @mock.patch.object(requests.Session, "delete")
 def test_delete_task_not_owned(mock_delete, test_files_directory, test_api_key):
-    openml.config.start_using_configuration_for_example()
     content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_not_owned.xml"
     mock_delete.return_value = create_request_response(
         status_code=412,
@@ -240,14 +258,13 @@ def test_delete_task_not_owned(mock_delete, test_files_directory, test_api_key):
     ):
         openml.tasks.delete_task(1)
 
-    task_url = "https://test.openml.org/api/v1/xml/task/1"
+    task_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/task/1"
     assert task_url == mock_delete.call_args.args[0]
     assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
 
 
 @mock.patch.object(requests.Session, "delete")
 def test_delete_task_with_run(mock_delete, test_files_directory, test_api_key):
-    openml.config.start_using_configuration_for_example()
     content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_has_runs.xml"
     mock_delete.return_value = create_request_response(
         status_code=412,
@@ -260,14 +277,13 @@ def test_delete_task_with_run(mock_delete, test_files_directory, test_api_key):
     ):
         openml.tasks.delete_task(3496)
 
-    task_url = "https://test.openml.org/api/v1/xml/task/3496"
+    task_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/task/3496"
     assert task_url == mock_delete.call_args.args[0]
     assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
 
 
 @mock.patch.object(requests.Session, "delete")
 def test_delete_success(mock_delete, test_files_directory, test_api_key):
-    openml.config.start_using_configuration_for_example()
     content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_successful.xml"
     mock_delete.return_value = create_request_response(
         status_code=200,
@@ -277,14 +293,13 @@ def test_delete_success(mock_delete, test_files_directory, test_api_key):
     success = openml.tasks.delete_task(361323)
     assert success
 
-    task_url = "https://test.openml.org/api/v1/xml/task/361323"
+    task_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/task/361323"
     assert task_url == mock_delete.call_args.args[0]
     assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
 
 
 @mock.patch.object(requests.Session, "delete")
 def test_delete_unknown_task(mock_delete, test_files_directory, test_api_key):
-    openml.config.start_using_configuration_for_example()
     content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_not_exist.xml"
     mock_delete.return_value = create_request_response(
         status_code=412,
@@ -297,6 +312,6 @@ def test_delete_unknown_task(mock_delete, test_files_directory, test_api_key):
     ):
         openml.tasks.delete_task(9_999_999)
 
-    task_url = "https://test.openml.org/api/v1/xml/task/9999999"
+    task_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/task/9999999"
     assert task_url == mock_delete.call_args.args[0]
     assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key")
diff --git a/tests/test_tasks/test_task_methods.py b/tests/test_tasks/test_task_methods.py
index 4480c2cbc..9316d0876 100644
--- a/tests/test_tasks/test_task_methods.py
+++ b/tests/test_tasks/test_task_methods.py
@@ -5,6 +5,7 @@
 
 import openml
 from openml.testing import TestBase
+import pytest
 
 
 # Common methods between tasks
@@ -15,6 +16,7 @@ def setUp(self):
     def tearDown(self):
         super().tearDown()
 
+    @pytest.mark.test_server()
     def test_tagging(self):
         task = openml.tasks.get_task(1)  # anneal; crossvalidation
         # tags can be at most 64 alphanumeric (+ underscore) chars
@@ -30,6 +32,7 @@ def test_tagging(self):
         tasks = openml.tasks.list_tasks(tag=tag)
         assert len(tasks) == 0
 
+    @pytest.mark.test_server()
     def test_get_train_and_test_split_indices(self):
         openml.config.set_root_cache_directory(self.static_cache_dir)
         task = openml.tasks.get_task(1882)
diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py
index 3b4a34b57..75f24ebf0 100644
--- a/tests/test_utils/test_utils.py
+++ b/tests/test_utils/test_utils.py
@@ -27,20 +27,20 @@ def min_number_flows_on_test_server() -> int:
 
 @pytest.fixture()
 def min_number_setups_on_test_server() -> int:
-    """After a reset at least 50 setups are on the test server"""
+    """After a reset at least 20 setups are on the test server"""
     return 50
 
 
 @pytest.fixture()
 def min_number_runs_on_test_server() -> int:
     """After a reset at least 21 runs are on the test server"""
-    return 21
+    return 15
 
 
 @pytest.fixture()
 def min_number_evaluations_on_test_server() -> int:
-    """After a reset at least 22 evaluations are on the test server"""
-    return 22
+    """After a reset at least 8 evaluations are on the test server"""
+    return 8
 
 
 def _mocked_perform_api_call(call, request_method):
@@ -48,18 +48,18 @@ def _mocked_perform_api_call(call, request_method):
     return openml._api_calls._download_text_file(url)
 
 
-@pytest.mark.server()
+@pytest.mark.test_server()
 def test_list_all():
     openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks)
 
 
-@pytest.mark.server()
+@pytest.mark.test_server()
 def test_list_all_for_tasks(min_number_tasks_on_test_server):
     tasks = openml.tasks.list_tasks(size=min_number_tasks_on_test_server)
     assert min_number_tasks_on_test_server == len(tasks)
 
 
-@pytest.mark.server()
+@pytest.mark.test_server()
 def test_list_all_with_multiple_batches(min_number_tasks_on_test_server):
     # By setting the batch size one lower than the minimum we guarantee at least two
     # batches and at the same time do as few batches (roundtrips) as possible.
@@ -72,7 +72,7 @@ def test_list_all_with_multiple_batches(min_number_tasks_on_test_server):
     assert min_number_tasks_on_test_server <= sum(len(batch) for batch in batches)
 
 
-@pytest.mark.server()
+@pytest.mark.test_server()
 def test_list_all_for_datasets(min_number_datasets_on_test_server):
     datasets = openml.datasets.list_datasets(
         size=min_number_datasets_on_test_server,
@@ -83,29 +83,29 @@ def test_list_all_for_datasets(min_number_datasets_on_test_server):
         _check_dataset(dataset)
 
 
-@pytest.mark.server()
+@pytest.mark.test_server()
 def test_list_all_for_flows(min_number_flows_on_test_server):
     flows = openml.flows.list_flows(size=min_number_flows_on_test_server)
     assert min_number_flows_on_test_server == len(flows)
 
 
-@pytest.mark.server()
 @pytest.mark.flaky()  # Other tests might need to upload runs first
+@pytest.mark.test_server()
 def test_list_all_for_setups(min_number_setups_on_test_server):
     # TODO apparently list_setups function does not support kwargs
     setups = openml.setups.list_setups(size=min_number_setups_on_test_server)
     assert min_number_setups_on_test_server == len(setups)
 
 
-@pytest.mark.server()
 @pytest.mark.flaky()  # Other tests might need to upload runs first
+@pytest.mark.test_server()
 def test_list_all_for_runs(min_number_runs_on_test_server):
     runs = openml.runs.list_runs(size=min_number_runs_on_test_server)
     assert min_number_runs_on_test_server == len(runs)
 
 
-@pytest.mark.server()
 @pytest.mark.flaky()  # Other tests might need to upload runs first
+@pytest.mark.test_server()
 def test_list_all_for_evaluations(min_number_evaluations_on_test_server):
     # TODO apparently list_evaluations function does not support kwargs
     evaluations = openml.evaluations.list_evaluations(
@@ -115,8 +115,8 @@ def test_list_all_for_evaluations(min_number_evaluations_on_test_server):
     assert min_number_evaluations_on_test_server == len(evaluations)
 
 
-@pytest.mark.server()
 @unittest.mock.patch("openml._api_calls._perform_api_call", side_effect=_mocked_perform_api_call)
+@pytest.mark.test_server()
 def test_list_all_few_results_available(_perform_api_call):
     datasets = openml.datasets.list_datasets(size=1000, data_name="iris", data_version=1)
     assert len(datasets) == 1, "only one iris dataset version 1 should be present"
@@ -141,7 +141,7 @@ def test__create_cache_directory(config_mock, tmp_path):
         openml.utils._create_cache_directory("ghi")
 
 
-@pytest.mark.server()
+@pytest.mark.test_server()
 def test_correct_test_server_download_state():
     """This test verifies that the test server downloads the data from the correct source.
 
@@ -152,3 +152,30 @@ def test_correct_test_server_download_state():
     task = openml.tasks.get_task(119)
     dataset = task.get_dataset()
     assert len(dataset.features) == dataset.get_data()[0].shape[1]
+
+@unittest.mock.patch("openml.config.get_cache_directory")
+def test_get_cache_size(config_mock,tmp_path):
+    """
+    Test that the OpenML cache size utility correctly reports the cache directory
+    size before and after fetching a dataset.
+
+    This test uses a temporary directory (tmp_path) as the cache location by
+    patching the configuration via config_mock. It verifies two conditions:
+    empty cache and after dataset fetch. 
+
+    Parameters
+    ----------
+    config_mock : unittest.mock.Mock
+         A mock that overrides the configured cache directory to point to tmp_path.
+    tmp_path : pathlib.Path
+         A pytest-provided temporary directory used as an isolated cache location.
+    """
+    
+    config_mock.return_value = tmp_path
+    cache_size = openml.utils.get_cache_size()
+    assert cache_size == 0
+    sub_dir = tmp_path / "subdir"
+    sub_dir.mkdir()
+    (sub_dir / "nested_file.txt").write_bytes(b"b" * 100)
+    
+    assert openml.utils.get_cache_size() == 100