diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml
index 286b5f5a85f07..f8522594f36f4 100644
--- a/ci/deps/actions-310-minimum_versions.yaml
+++ b/ci/deps/actions-310-minimum_versions.yaml
@@ -28,10 +28,10 @@ dependencies:
   - beautifulsoup4=4.12.3
   - bottleneck=1.3.6
   - fastparquet=2024.2.0
-  - fsspec=2024.2.0
+  - fsspec=2023.12.2
   - html5lib=1.1
   - hypothesis=6.84.0
-  - gcsfs=2024.2.0
+  - gcsfs=2023.12.2
   - jinja2=3.1.3
   - lxml=4.9.2
   - matplotlib=3.8.3
@@ -42,6 +42,7 @@ dependencies:
   - openpyxl=3.1.2
   - psycopg2=2.9.6
   - pyarrow=10.0.1
+  - pyiceberg=0.7.1
   - pymysql=1.1.0
   - pyqt=5.15.9
   - pyreadstat=1.2.6
@@ -49,7 +50,7 @@ dependencies:
   - python-calamine=0.1.7
   - pytz=2023.4
   - pyxlsb=1.0.10
-  - s3fs=2024.2.0
+  - s3fs=2023.12.2
   - scipy=1.12.0
   - sqlalchemy=2.0.0
   - tabulate=0.9.0
diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml
index 5b38d7abb8540..ea22bc411dedd 100644
--- a/ci/deps/actions-310.yaml
+++ b/ci/deps/actions-310.yaml
@@ -26,10 +26,10 @@ dependencies:
   - beautifulsoup4>=4.12.3
   - bottleneck>=1.3.6
   - fastparquet>=2024.2.0
-  - fsspec>=2024.2.0
+  - fsspec>=2023.12.2
   - html5lib>=1.1
   - hypothesis>=6.84.0
-  - gcsfs>=2024.2.0
+  - gcsfs>=2023.12.2
   - jinja2>=3.1.3
   - lxml>=4.9.2
   - matplotlib>=3.8.3
@@ -40,6 +40,7 @@ dependencies:
   - openpyxl>=3.1.2
   - psycopg2>=2.9.6
   - pyarrow>=10.0.1
+  - pyiceberg>=0.7.1
   - pymysql>=1.1.0
   - pyqt>=5.15.9
   - pyreadstat>=1.2.6
@@ -47,7 +48,7 @@ dependencies:
   - python-calamine>=0.1.7
   - pytz>=2023.4
   - pyxlsb>=1.0.10
-  - s3fs>=2024.2.0
+  - s3fs>=2023.12.2
   - scipy>=1.12.0
   - sqlalchemy>=2.0.0
   - tabulate>=0.9.0
diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml
index 5fac58193f932..e981be9891dec 100644
--- a/ci/deps/actions-311-downstream_compat.yaml
+++ b/ci/deps/actions-311-downstream_compat.yaml
@@ -27,10 +27,10 @@ dependencies:
   - beautifulsoup4>=4.12.3
   - bottleneck>=1.3.6
   - fastparquet>=2024.2.0
-  - fsspec>=2024.2.0
+  - fsspec>=2023.12.2
   - html5lib>=1.1
   - hypothesis>=6.84.0
-  - gcsfs>=2024.2.0
+  - gcsfs>=2023.12.2
   - jinja2>=3.1.3
   - lxml>=4.9.2
   - matplotlib>=3.8.3
@@ -41,6 +41,7 @@ dependencies:
   - openpyxl>=3.1.2
   - psycopg2>=2.9.6
   - pyarrow>=10.0.1
+  - pyiceberg>=0.7.1
   - pymysql>=1.1.0
   - pyqt>=5.15.9
   - pyreadstat>=1.2.6
@@ -48,7 +49,7 @@ dependencies:
   - python-calamine>=0.1.7
   - pytz>=2023.4
   - pyxlsb>=1.0.10
-  - s3fs>=2024.2.0
+  - s3fs>=2023.12.2
   - scipy>=1.12.0
   - sqlalchemy>=2.0.0
   - tabulate>=0.9.0
diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml
index 9840278d22eab..f03d518fd22fb 100644
--- a/ci/deps/actions-311.yaml
+++ b/ci/deps/actions-311.yaml
@@ -26,10 +26,10 @@ dependencies:
   - beautifulsoup4>=4.12.3
   - bottleneck>=1.3.6
   - fastparquet>=2024.2.0
-  - fsspec>=2024.2.0
+  - fsspec>=2023.12.2
   - html5lib>=1.1
   - hypothesis>=6.84.0
-  - gcsfs>=2024.2.0
+  - gcsfs>=2023.12.2
   - jinja2>=3.1.3
   - lxml>=4.9.2
   - matplotlib>=3.8.3
@@ -41,13 +41,14 @@ dependencies:
   - openpyxl>=3.1.2
   - psycopg2>=2.9.6
   - pyarrow>=10.0.1
+  - pyiceberg>=0.7.1
   - pymysql>=1.1.0
   - pyreadstat>=1.2.6
   - pytables>=3.8.0
   - python-calamine>=0.1.7
   - pytz>=2023.4
   - pyxlsb>=1.0.10
-  - s3fs>=2024.2.0
+  - s3fs>=2023.12.2
   - scipy>=1.12.0
   - sqlalchemy>=2.0.0
   - tabulate>=0.9.0
diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml
index 7d3d2ea1a0ec2..58c6c2ca3210c 100644
--- a/ci/deps/actions-312.yaml
+++ b/ci/deps/actions-312.yaml
@@ -26,10 +26,10 @@ dependencies:
   - beautifulsoup4>=4.12.3
   - bottleneck>=1.3.6
   - fastparquet>=2024.2.0
-  - fsspec>=2024.2.0
+  - fsspec>=2023.12.2
   - html5lib>=1.1
  - hypothesis>=6.84.0
-  - gcsfs>=2024.2.0
+  - gcsfs>=2023.12.2
   - jinja2>=3.1.3
   - lxml>=4.9.2
   - matplotlib>=3.8.3
@@ -41,13 +41,14 @@ dependencies:
   - openpyxl>=3.1.2
   - psycopg2>=2.9.6
   - pyarrow>=10.0.1
+  - pyiceberg>=0.7.1
   - pymysql>=1.1.0
   - pyreadstat>=1.2.6
   - pytables>=3.8.0
   - python-calamine>=0.1.7
   - pytz>=2023.4
   - pyxlsb>=1.0.10
-  - s3fs>=2024.2.0
+  - s3fs>=2023.12.2
   - scipy>=1.12.0
   - sqlalchemy>=2.0.0
   - tabulate>=0.9.0
diff --git a/ci/deps/actions-313.yaml b/ci/deps/actions-313.yaml
index 3184ae9724bd3..f94138a98e127 100644
--- a/ci/deps/actions-313.yaml
+++ b/ci/deps/actions-313.yaml
@@ -27,10 +27,10 @@ dependencies:
   - blosc>=1.21.3
   - bottleneck>=1.3.6
   - fastparquet>=2024.2.0
-  - fsspec>=2024.2.0
+  - fsspec>=2023.12.2
   - html5lib>=1.1
   - hypothesis>=6.84.0
-  - gcsfs>=2024.2.0
+  - gcsfs>=2023.12.2
   - jinja2>=3.1.3
   - lxml>=4.9.2
   - matplotlib>=3.8.3
@@ -48,7 +48,7 @@ dependencies:
   - python-calamine>=0.1.7
   - pytz>=2023.4
   - pyxlsb>=1.0.10
-  - s3fs>=2024.2.0
+  - s3fs>=2023.12.2
   - scipy>=1.12.0
   - sqlalchemy>=2.0.0
   - tabulate>=0.9.0
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index 8b847d82a9916..93663c1cced7e 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -299,7 +299,7 @@ Dependency Minimum Versi
 Other data sources
 ^^^^^^^^^^^^^^^^^^
 
-Installable with ``pip install "pandas[hdf5, parquet, feather, spss, excel]"``
+Installable with ``pip install "pandas[hdf5, parquet, iceberg, feather, spss, excel]"``
 
 ====================================================== ================== ================ ==========================================================
 Dependency                                             Minimum Version    pip extra        Notes
@@ -308,6 +308,7 @@ Dependency Minimum Version pip ex
 `zlib `__                                                                 hdf5             Compression for HDF5
 `fastparquet `__                                       2024.2.0           -                Parquet reading / writing (pyarrow is default)
 `pyarrow `__                                           10.0.1             parquet, feather Parquet, ORC, and feather reading / writing
+`PyIceberg `__                                         0.7.1              iceberg          Apache Iceberg reading
 `pyreadstat `__                                        1.2.6              spss             SPSS files (.sav) reading
 `odfpy `__                                             1.4.1              excel            Open document format (.odf, .ods, .odt) reading / writing
 ====================================================== ================== ================ ==========================================================
@@ -328,10 +329,10 @@ Installable with ``pip install "pandas[fss, aws, gcp]"``
 ============================================ ================== =============== ==========================================================
 Dependency                                   Minimum Version    pip extra       Notes
 ============================================ ================== =============== ==========================================================
-`fsspec `__                                  2024.2.0           fss, gcp, aws   Handling files aside from simple local and HTTP (required
+`fsspec `__                                  2023.12.2          fss, gcp, aws   Handling files aside from simple local and HTTP (required
                                                                                 dependency of s3fs, gcsfs).
-`gcsfs `__                                   2024.2.0           gcp             Google Cloud Storage access
-`s3fs `__                                    2024.2.0           aws             Amazon S3 access
+`gcsfs `__                                   2023.12.2          gcp             Google Cloud Storage access
+`s3fs `__                                    2023.12.2          aws             Amazon S3 access
 ============================================ ================== =============== ==========================================================
 
 Clipboard
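
The new ``iceberg`` extra above plugs into the same optional-dependency machinery pandas
uses for the other readers. A minimal sketch of how that gate behaves at call time,
assuming an environment with this patch applied (the exact error text is up to pandas):

.. code-block:: python

    # pandas looks the minimum version up in pandas/compat/_optional.py and
    # raises a helpful ImportError when pyiceberg is absent or too old.
    from pandas.compat._optional import VERSIONS, import_optional_dependency

    VERSIONS["pyiceberg"]  # "0.7.1", matching the pin added in this patch

    # This is the same call read_iceberg() makes internally; it raises
    # ImportError with an installation hint if pyiceberg is missing or
    # older than 0.7.1.
    pyiceberg_catalog = import_optional_dependency("pyiceberg.catalog")
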
diff --git a/doc/source/reference/io.rst b/doc/source/reference/io.rst
index 805fb8b783459..6e5992916f800 100644
--- a/doc/source/reference/io.rst
+++ b/doc/source/reference/io.rst
@@ -156,6 +156,15 @@ Parquet
    read_parquet
    DataFrame.to_parquet
 
+Iceberg
+~~~~~~~
+.. autosummary::
+   :toctree: api/
+
+   read_iceberg
+
+.. warning:: ``read_iceberg`` is experimental and may change without warning.
+
 ORC
 ~~~
 .. autosummary::
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 51af262c20a72..2a7cab701eecf 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -29,6 +29,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
     binary,`HDF5 Format `__, :ref:`read_hdf`, :ref:`to_hdf`
     binary,`Feather Format `__, :ref:`read_feather`, :ref:`to_feather`
     binary,`Parquet Format `__, :ref:`read_parquet`, :ref:`to_parquet`
+    binary,`Apache Iceberg `__, :ref:`read_iceberg` , NA
     binary,`ORC Format `__, :ref:`read_orc`, :ref:`to_orc`
     binary,`Stata `__, :ref:`read_stata`, :ref:`to_stata`
     binary,`SAS `__, :ref:`read_sas` , NA
@@ -5403,6 +5404,102 @@ The above example creates a partitioned dataset that may look like:
     except OSError:
         pass
 
+.. _io.iceberg:
+
+Iceberg
+-------
+
+.. versionadded:: 3.0.0
+
+Apache Iceberg is a high-performance open-source format for large analytic tables.
+Iceberg enables the use of SQL tables for big data while making it possible for different
+engines to safely work with the same tables at the same time.
+
+Iceberg supports predicate pushdown and column pruning, which are available to pandas
+users via the ``row_filter`` and ``selected_fields`` parameters of the :func:`~pandas.read_iceberg`
+function. This is convenient for extracting a subset of a large table that fits in memory as a
+pandas ``DataFrame``.
+
+Internally, pandas uses PyIceberg_ to query Iceberg.
+
+.. _PyIceberg: https://py.iceberg.apache.org/
+
+A simple example loading all data from an Iceberg table ``my_table`` defined in the
+``my_catalog`` catalog:
+
+.. code-block:: python
+
+    df = pd.read_iceberg("my_table", catalog_name="my_catalog")
+
+Catalogs must be defined in the ``.pyiceberg.yaml`` file, usually in the home directory.
+It is possible to change properties of the catalog definition with the
+``catalog_properties`` parameter:
+
+.. code-block:: python
+
+    df = pd.read_iceberg(
+        "my_table",
+        catalog_name="my_catalog",
+        catalog_properties={"s3.secret-access-key": "my_secret"},
+    )
+
+It is also possible to fully specify the catalog in ``catalog_properties`` and not provide
+a ``catalog_name``:
+
+.. code-block:: python
+
+    df = pd.read_iceberg(
+        "my_table",
+        catalog_properties={
+            "uri": "http://127.0.0.1:8181",
+            "s3.endpoint": "http://127.0.0.1:9000",
+        },
+    )
+
+To create the ``DataFrame`` with only a subset of the columns:
+
+.. code-block:: python
+
+    df = pd.read_iceberg(
+        "my_table",
+        catalog_name="my_catalog",
+        selected_fields=["my_column_3", "my_column_7"],
+    )
+
+This executes faster, since the other columns won't be read, and it also saves memory,
+since the data from the other columns won't be loaded into the underlying memory of
+the ``DataFrame``.
+
+To fetch only a subset of the rows, use the ``limit`` parameter:
+
+.. code-block:: python
+
+    df = pd.read_iceberg(
+        "my_table",
+        catalog_name="my_catalog",
+        limit=100,
+    )
+
+This will create a ``DataFrame`` with 100 rows, assuming the table contains at least
+that many rows.
+
+To fetch a subset of the rows based on a condition, use the ``row_filter``
+parameter:
+
+.. code-block:: python
+
+    df = pd.read_iceberg(
+        "my_table",
+        catalog_name="my_catalog",
+        row_filter="distance > 10.0",
+    )
+
+Reading a particular snapshot is also possible by providing the snapshot ID as an
+argument to ``snapshot_id``.
+
+More information about the Iceberg format can be found on the official `Apache Iceberg
+page `__.
+
 .. _io.orc:
 
 ORC
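
The docs above mention ``snapshot_id`` without showing it. A hedged sketch of what such a
call looks like (``snapshot_id`` is a real parameter of the new function; the catalog,
table, and ID value here are made up):

.. code-block:: python

    # Time travel: scan the table as it was at a specific snapshot.
    # Snapshot IDs can be obtained from the table's metadata/history,
    # e.g. through PyIceberg.
    df = pd.read_iceberg(
        "my_table",
        catalog_name="my_catalog",
        snapshot_id=3051729675574597004,  # hypothetical snapshot ID
    )
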
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 8695e196c4f38..6642f5855f4fe 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -78,6 +78,7 @@ Other enhancements
 - :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`)
 - Add ``"delete_rows"`` option to ``if_exists`` argument in :meth:`DataFrame.to_sql` deleting all records of the table before inserting data (:issue:`37210`).
 - Added half-year offset classes :class:`HalfYearBegin`, :class:`HalfYearEnd`, :class:`BHalfYearBegin` and :class:`BHalfYearEnd` (:issue:`60928`)
+- Added support for reading from Apache Iceberg tables with the new :func:`read_iceberg` function (:issue:`61383`)
 - Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`)
 - Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` (:issue:`59091`)
 - Improved deprecation message for offset aliases (:issue:`60820`)
diff --git a/environment.yml b/environment.yml
index 4677614dc7858..6300e32b5a1b5 100644
--- a/environment.yml
+++ b/environment.yml
@@ -29,10 +29,10 @@ dependencies:
   - beautifulsoup4>=4.12.3
   - bottleneck>=1.3.6
   - fastparquet>=2024.2.0
-  - fsspec>=2024.2.0
+  - fsspec>=2023.12.2
   - html5lib>=1.1
   - hypothesis>=6.84.0
-  - gcsfs>=2024.2.0
+  - gcsfs>=2023.12.2
   - ipython
   - pickleshare  # Needed for IPython Sphinx directive in the docs GH#60429
   - jinja2>=3.1.3
@@ -44,13 +44,14 @@ dependencies:
   - odfpy>=1.4.1
   - psycopg2>=2.9.6
   - pyarrow>=10.0.1
+  - pyiceberg>=0.7.1
   - pymysql>=1.1.0
   - pyreadstat>=1.2.6
   - pytables>=3.8.0
   - python-calamine>=0.1.7
   - pytz>=2023.4
   - pyxlsb>=1.0.10
-  - s3fs>=2024.2.0
+  - s3fs>=2023.12.2
   - scipy>=1.12.0
   - sqlalchemy>=2.0.0
   - tabulate>=0.9.0
diff --git a/pandas/__init__.py b/pandas/__init__.py
index 7d6dd7b7c1a88..8b92ad6cdfebb 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -164,6 +164,7 @@
     read_stata,
     read_sas,
     read_spss,
+    read_iceberg,
 )
 
 from pandas.io.json._normalize import json_normalize
@@ -319,6 +320,7 @@
     "read_fwf",
     "read_hdf",
     "read_html",
+    "read_iceberg",
     "read_json",
     "read_orc",
     "read_parquet",
diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index 9f4615d183766..f01dfab0de829 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -25,10 +25,10 @@
     "bs4": "4.12.3",
     "bottleneck": "1.3.6",
     "fastparquet": "2024.2.0",
-    "fsspec": "2024.2.0",
+    "fsspec": "2023.12.2",
     "html5lib": "1.1",
     "hypothesis": "6.84.0",
-    "gcsfs": "2024.2.0",
+    "gcsfs": "2023.12.2",
     "jinja2": "3.1.3",
     "lxml.etree": "4.9.2",
     "matplotlib": "3.8.3",
@@ -39,12 +39,13 @@
     "psycopg2": "2.9.6",  # (dt dec pq3 ext lo64)
     "pymysql": "1.1.0",
     "pyarrow": "10.0.1",
+    "pyiceberg": "0.7.1",
     "pyreadstat": "1.2.6",
     "pytest": "7.3.2",
     "python-calamine": "0.1.7",
     "pytz": "2023.4",
     "pyxlsb": "1.0.10",
-    "s3fs": "2024.2.0",
+    "s3fs": "2023.12.2",
     "scipy": "1.12.0",
     "sqlalchemy": "2.0.0",
     "tables": "3.8.0",
diff --git a/pandas/io/api.py b/pandas/io/api.py
index d4982399a604b..5900c94384384 100644
--- a/pandas/io/api.py
+++ b/pandas/io/api.py
@@ -10,6 +10,7 @@
 )
 from pandas.io.feather_format import read_feather
 from pandas.io.html import read_html
+from pandas.io.iceberg import read_iceberg
 from pandas.io.json import read_json
 from pandas.io.orc import read_orc
 from pandas.io.parquet import read_parquet
@@ -47,6 +48,7 @@
     "read_fwf",
     "read_hdf",
     "read_html",
+    "read_iceberg",
     "read_json",
     "read_orc",
     "read_parquet",
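
The wiring above registers the reader in ``pandas/__init__.py``, ``pandas/io/api.py`` and
both ``__all__`` lists. A quick sanity check one could run against a build with this patch
applied (a sketch, not part of the test suite):

.. code-block:: python

    import pandas as pd

    # read_iceberg is exported at top level and through pandas.io.api,
    # and advertised in the public API list.
    assert pd.read_iceberg is pd.io.api.read_iceberg
    assert "read_iceberg" in pd.__all__
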
diff --git a/pandas/io/iceberg.py b/pandas/io/iceberg.py
new file mode 100644
index 0000000000000..8a3e8f5da49b3
--- /dev/null
+++ b/pandas/io/iceberg.py
@@ -0,0 +1,93 @@
+from typing import (
+    Any,
+)
+
+from pandas.compat._optional import import_optional_dependency
+
+from pandas import DataFrame
+
+
+def read_iceberg(
+    table_identifier: str,
+    catalog_name: str | None = None,
+    catalog_properties: dict[str, Any] | None = None,
+    row_filter: str | None = None,
+    selected_fields: tuple[str, ...] | None = None,
+    case_sensitive: bool = True,
+    snapshot_id: int | None = None,
+    limit: int | None = None,
+    scan_properties: dict[str, Any] | None = None,
+) -> DataFrame:
+    """
+    Read an Apache Iceberg table into a pandas DataFrame.
+
+    .. warning::
+
+       read_iceberg is experimental and may change without warning.
+
+    Parameters
+    ----------
+    table_identifier : str
+        Table identifier.
+    catalog_name : str, optional
+        The name of the catalog.
+    catalog_properties : dict of {str: str}, optional
+        Properties used in addition to the catalog configuration.
+    row_filter : str, optional
+        A string that describes the desired rows.
+    selected_fields : tuple of str, optional
+        A tuple of strings representing the column names to return in the output
+        DataFrame.
+    case_sensitive : bool, default True
+        If True, column matching is case sensitive.
+    snapshot_id : int, optional
+        Snapshot ID to time travel to. By default the table will be scanned as of the
+        current snapshot ID.
+    limit : int, optional
+        An integer representing the number of rows to return in the scan result.
+        By default all matching rows will be fetched.
+    scan_properties : dict of {str: obj}, optional
+        Additional table properties, as a dictionary of string key-value pairs, to
+        use for this scan.
+
+    Returns
+    -------
+    DataFrame
+        DataFrame based on the Iceberg table.
+
+    See Also
+    --------
+    read_parquet : Read a Parquet file.
+
+    Examples
+    --------
+    >>> df = pd.read_iceberg(
+    ...     table_identifier="my_table",
+    ...     catalog_name="my_catalog",
+    ...     catalog_properties={"s3.secret-access-key": "my-secret"},
+    ...     row_filter="trip_distance >= 10.0",
+    ...     selected_fields=("VendorID", "tpep_pickup_datetime"),
+    ... )  # doctest: +SKIP
+    """
+    pyiceberg_catalog = import_optional_dependency("pyiceberg.catalog")
+    pyiceberg_expressions = import_optional_dependency("pyiceberg.expressions")
+
+    if catalog_properties is None:
+        catalog_properties = {}
+    catalog = pyiceberg_catalog.load_catalog(catalog_name, **catalog_properties)
+    table = catalog.load_table(table_identifier)
+    if row_filter is None:
+        row_filter = pyiceberg_expressions.AlwaysTrue()
+    if selected_fields is None:
+        selected_fields = ("*",)
+    if scan_properties is None:
+        scan_properties = {}
+    result = table.scan(
+        row_filter=row_filter,
+        selected_fields=selected_fields,
+        case_sensitive=case_sensitive,
+        snapshot_id=snapshot_id,
+        options=scan_properties,
+        limit=limit,
+    )
+    return result.to_pandas()
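
The function above is a thin wrapper; the equivalent direct PyIceberg calls look roughly
like this (a sketch distilled from the implementation, using the defaults it applies and
the same hypothetical catalog/table names as the docs):

.. code-block:: python

    from pyiceberg.catalog import load_catalog
    from pyiceberg.expressions import AlwaysTrue

    catalog = load_catalog("my_catalog")       # resolved via .pyiceberg.yaml
    table = catalog.load_table("ns.my_table")  # table identifier lookup
    scan = table.scan(
        row_filter=AlwaysTrue(),               # default when row_filter is None
        selected_fields=("*",),                # default: all columns
        case_sensitive=True,
        snapshot_id=None,                      # current snapshot
        options={},                            # scan_properties
        limit=None,                            # all matching rows
    )
    df = scan.to_pandas()                      # materialize as a DataFrame
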
diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
index 2ba90948be399..871e977cbe2f8 100644
--- a/pandas/tests/api/test_api.py
+++ b/pandas/tests/api/test_api.py
@@ -168,6 +168,7 @@ class TestPDApi(Base):
         "read_parquet",
         "read_orc",
         "read_spss",
+        "read_iceberg",
     ]
 
     # top-level json funcs
diff --git a/pandas/tests/io/test_iceberg.py b/pandas/tests/io/test_iceberg.py
new file mode 100644
index 0000000000000..765eccb602434
--- /dev/null
+++ b/pandas/tests/io/test_iceberg.py
@@ -0,0 +1,143 @@
+"""
+Tests for the Apache Iceberg format.
+
+Tests in this file use a simple Iceberg catalog based on SQLite, with the same
+data used for Parquet tests (``pandas/tests/io/data/parquet/simple.parquet``).
+"""
+
+import collections
+import importlib
+import pathlib
+
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+from pandas.io.iceberg import read_iceberg
+
+pytestmark = pytest.mark.single_cpu
+
+pyiceberg = pytest.importorskip("pyiceberg")
+pyiceberg_catalog = pytest.importorskip("pyiceberg.catalog")
+pq = pytest.importorskip("pyarrow.parquet")
+
+Catalog = collections.namedtuple("Catalog", ["name", "uri"])
+
+
+@pytest.fixture
+def catalog(request, tmp_path):
+    # the catalog stores the full path of data files, so the catalog needs to be
+    # created dynamically, and not saved in pandas/tests/io/data as other formats
+    uri = f"sqlite:///{tmp_path}/catalog.sqlite"
+    warehouse = f"file://{tmp_path}"
+    catalog_name = request.param if hasattr(request, "param") else None
+    catalog = pyiceberg_catalog.load_catalog(
+        catalog_name or "default",
+        type="sql",
+        uri=uri,
+        warehouse=warehouse,
+    )
+    catalog.create_namespace("ns")
+
+    df = pq.read_table(
+        pathlib.Path(__file__).parent / "data" / "parquet" / "simple.parquet"
+    )
+    table = catalog.create_table("ns.my_table", schema=df.schema)
+    table.append(df)
+
+    if catalog_name is not None:
+        config_path = pathlib.Path.home() / ".pyiceberg.yaml"
+        with open(config_path, "w", encoding="utf-8") as f:
+            f.write(f"""\
+catalog:
+  {catalog_name}:
+    type: sql
+    uri: {uri}
+    warehouse: {warehouse}""")
+
+        importlib.reload(pyiceberg_catalog)  # needed to reload the config file
+
+    yield Catalog(name=catalog_name or "default", uri=uri)
+
+    if catalog_name is not None:
+        config_path.unlink()
+
+
+class TestIceberg:
+    def test_read(self, catalog):
+        expected = pd.DataFrame(
+            {
+                "A": [1, 2, 3],
+                "B": ["foo", "foo", "foo"],
+            }
+        )
+        result = read_iceberg(
+            "ns.my_table",
+            catalog_properties={"uri": catalog.uri},
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("catalog", ["default", "pandas_tests"], indirect=True)
+    def test_read_by_catalog_name(self, catalog):
+        expected = pd.DataFrame(
+            {
+                "A": [1, 2, 3],
+                "B": ["foo", "foo", "foo"],
+            }
+        )
+        result = read_iceberg(
+            "ns.my_table",
+            catalog_name=catalog.name,
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_read_with_row_filter(self, catalog):
+        expected = pd.DataFrame(
+            {
+                "A": [2, 3],
+                "B": ["foo", "foo"],
+            }
+        )
+        result = read_iceberg(
+            "ns.my_table",
+            catalog_properties={"uri": catalog.uri},
+            row_filter="A > 1",
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_read_with_case_sensitive(self, catalog):
+        expected = pd.DataFrame(
+            {
+                "A": [1, 2, 3],
+            }
+        )
+        result = read_iceberg(
+            "ns.my_table",
+            catalog_properties={"uri": catalog.uri},
+            selected_fields=["a"],
+            case_sensitive=False,
+        )
+        tm.assert_frame_equal(result, expected)
+
+        with pytest.raises(ValueError, match="^Could not find column"):
+            read_iceberg(
+                "ns.my_table",
+                catalog_properties={"uri": catalog.uri},
+                selected_fields=["a"],
+                case_sensitive=True,
+            )
+
+    def test_read_with_limit(self, catalog):
+        expected = pd.DataFrame(
+            {
+                "A": [1, 2],
+                "B": ["foo", "foo"],
+            }
+        )
+        result = read_iceberg(
+            "ns.my_table",
+            catalog_properties={"uri": catalog.uri},
+            limit=2,
+        )
+        tm.assert_frame_equal(result, expected)
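
One coverage gap in the new tests: ``snapshot_id`` is never exercised. A hedged sketch of
what such a test could look like, assuming PyIceberg's ``Table.current_snapshot()``
accessor (which is not used anywhere in this diff):

.. code-block:: python

    def test_read_with_snapshot_id(self, catalog):
        # Hypothetical: resolve the current snapshot ID through PyIceberg,
        # then ask read_iceberg to scan that snapshot explicitly; the result
        # should match a default scan of the freshly created table.
        table = pyiceberg_catalog.load_catalog(
            "default", type="sql", uri=catalog.uri
        ).load_table("ns.my_table")
        snapshot_id = table.current_snapshot().snapshot_id
        result = read_iceberg(
            "ns.my_table",
            catalog_properties={"uri": catalog.uri},
            snapshot_id=snapshot_id,
        )
        expected = pd.DataFrame(
            {
                "A": [1, 2, 3],
                "B": ["foo", "foo", "foo"],
            }
        )
        tm.assert_frame_equal(result, expected)
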
diff --git a/pyproject.toml b/pyproject.toml
index 480e58b62c1d0..adaec5458c035 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,12 +62,13 @@ test = ['hypothesis>=6.84.0', 'pytest>=7.3.2', 'pytest-xdist>=3.4.0']
 pyarrow = ['pyarrow>=10.0.1']
 performance = ['bottleneck>=1.3.6', 'numba>=0.59.0', 'numexpr>=2.9.0']
 computation = ['scipy>=1.12.0', 'xarray>=2024.1.1']
-fss = ['fsspec>=2024.2.0']
-aws = ['s3fs>=2024.2.0']
-gcp = ['gcsfs>=2024.2.0']
+fss = ['fsspec>=2023.12.2']
+aws = ['s3fs>=2023.12.2']
+gcp = ['gcsfs>=2023.12.2']
 excel = ['odfpy>=1.4.1', 'openpyxl>=3.1.2', 'python-calamine>=0.1.7', 'pyxlsb>=1.0.10', 'xlrd>=2.0.1', 'xlsxwriter>=3.2.0']
 parquet = ['pyarrow>=10.0.1']
 feather = ['pyarrow>=10.0.1']
+iceberg = ['pyiceberg>=0.7.1']
 hdf5 = ['tables>=3.8.0']
 spss = ['pyreadstat>=1.2.6']
 postgresql = ['SQLAlchemy>=2.0.0', 'psycopg2>=2.9.6', 'adbc-driver-postgresql>=0.10.0']
@@ -85,8 +86,8 @@ all = ['adbc-driver-postgresql>=0.10.0',
        'beautifulsoup4>=4.12.3',
        'bottleneck>=1.3.6',
        'fastparquet>=2024.2.0',
-       'fsspec>=2024.2.0',
-       'gcsfs>=2024.2.0',
+       'fsspec>=2023.12.2',
+       'gcsfs>=2023.12.2',
        'html5lib>=1.1',
        'hypothesis>=6.84.0',
        'jinja2>=3.1.3',
@@ -98,6 +99,7 @@ all = ['adbc-driver-postgresql>=0.10.0',
        'openpyxl>=3.1.2',
        'psycopg2>=2.9.6',
        'pyarrow>=10.0.1',
+       'pyiceberg>=0.7.1',
        'pymysql>=1.1.0',
        'PyQt5>=5.15.9',
        'pyreadstat>=1.2.6',
@@ -108,7 +110,7 @@ all = ['adbc-driver-postgresql>=0.10.0',
        'pyxlsb>=1.0.10',
        'qtpy>=2.3.0',
        'scipy>=1.12.0',
-       's3fs>=2024.2.0',
+       's3fs>=2023.12.2',
        'SQLAlchemy>=2.0.0',
        'tables>=3.8.0',
        'tabulate>=0.9.0',
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 297f1778495b7..c2bac550bc664 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -18,10 +18,10 @@ numpy<3
 beautifulsoup4>=4.12.3
 bottleneck>=1.3.6
 fastparquet>=2024.2.0
-fsspec>=2024.2.0
+fsspec>=2023.12.2
 html5lib>=1.1
 hypothesis>=6.84.0
-gcsfs>=2024.2.0
+gcsfs>=2023.12.2
 ipython
 pickleshare
 jinja2>=3.1.3
@@ -33,13 +33,14 @@ openpyxl>=3.1.2
 odfpy>=1.4.1
 psycopg2-binary>=2.9.6
 pyarrow>=10.0.1
+pyiceberg>=0.7.1
 pymysql>=1.1.0
 pyreadstat>=1.2.6
 tables>=3.8.0
 python-calamine>=0.1.7
 pytz>=2023.4
 pyxlsb>=1.0.10
-s3fs>=2024.2.0
+s3fs>=2023.12.2
 scipy>=1.12.0
 SQLAlchemy>=2.0.0
 tabulate>=0.9.0