From 8b513906315a0749b9f5cd6f34bf259ab4dd1add Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 1 Feb 2025 08:29:48 -0500 Subject: [PATCH 001/145] feat: remove DataFusion pyarrow feat (#1000) * Add developer instructions to speed up build processes * Remove pyarrow dep from datafusion. Add in PyScalarValue wrapper and rename DataFusionError to PyDataFusionError to be less confusing * Removed unnecessary cloning of scalar value when going from rust to python. Also removed the rust unit tests copied over from upstream repo that were failing due to #941 in pyo3 * Change return types to PyDataFusionError to simplify code * Update exception handling to fix build errors in recent rust toolchains --- Cargo.lock | 145 +++++++++++------- Cargo.toml | 2 +- .../source/contributor-guide/introduction.rst | 53 +++++++ python/tests/test_indexing.py | 3 +- src/catalog.rs | 8 +- src/common/data_type.rs | 14 ++ src/config.rs | 11 +- src/context.rs | 136 ++++++++-------- src/dataframe.rs | 119 +++++++------- src/dataset_exec.rs | 6 +- src/errors.rs | 42 ++--- src/expr.rs | 38 ++--- src/expr/conditional_expr.rs | 6 +- src/expr/literal.rs | 4 +- src/expr/window.rs | 13 +- src/functions.rs | 55 ++++--- src/lib.rs | 1 + src/physical_plan.rs | 13 +- src/pyarrow_filter_expression.rs | 24 +-- src/pyarrow_util.rs | 61 ++++++++ src/record_batch.rs | 3 +- src/sql/exceptions.rs | 16 +- src/sql/logical.rs | 14 +- src/substrait.rs | 54 ++++--- src/udaf.rs | 21 ++- src/udwf.rs | 4 +- src/utils.rs | 6 +- 27 files changed, 524 insertions(+), 348 deletions(-) create mode 100644 src/pyarrow_util.rs diff --git a/Cargo.lock b/Cargo.lock index 5a74a4839..c6590fd21 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -79,7 +79,7 @@ checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", "const-random", - "getrandom", + "getrandom 0.2.15", "once_cell", "version_check", "zerocopy", @@ -449,9 +449,9 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.85" +version = "0.1.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f934833b4b7233644e5848f235df3f57ed8c80f1528a26c3dfa13d2147fa056" +checksum = "644dd749086bf3771a2fbc5f256fdb982d53f011c7d5d560304eafeecebce79d" dependencies = [ "proc-macro2", "quote", @@ -576,9 +576,9 @@ dependencies = [ [[package]] name = "brotli-decompressor" -version = "4.0.1" +version = "4.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a45bd2e4095a8b518033b128020dd4a55aab1c0a381ba4404a472630f4bc362" +checksum = "74fa05ad7d803d413eb8380983b092cbbaf9a85f151b871360e7b00cd7060b37" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -586,9 +586,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.16.0" +version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" [[package]] name = "byteorder" @@ -635,9 +635,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.10" +version = "1.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13208fcbb66eaeffe09b99fffbe1af420f00a7b35aa99ad683dfc1aa76145229" +checksum = "e4730490333d58093109dc02c23174c3f4d490998c3fed3cc8e82d57afedb9cf" dependencies = [ "jobserver", "libc", @@ -692,9 +692,9 @@ dependencies = [ [[package]] name = "cmake" -version = "0.1.52" +version = "0.1.53" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c682c223677e0e5b6b7f63a64b9351844c3f1b1678a68b7ee617e30fb082620e" +checksum = "e24a03c8b52922d68a1589ad61032f2c1aa5a8158d2aa0d93c6e9534944bbad6" dependencies = [ "cc", ] @@ -725,7 +725,7 @@ version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" dependencies = [ - "getrandom", + "getrandom 0.2.15", "once_cell", "tiny-keccak", ] @@ -784,9 +784,9 @@ checksum = "69f3b219d28b6e3b4ac87bc1fc522e0803ab22e055da177bff0068c4150c61a6" [[package]] name = "cpufeatures" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16b80225097f2e5ae4e7179dd2266824648f3e2f49d9134d584b76389d31c4c3" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" dependencies = [ "libc", ] @@ -817,9 +817,9 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crunchy" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" [[package]] name = "crypto-common" @@ -961,7 +961,6 @@ dependencies = [ "object_store", "parquet", "paste", - "pyo3", "recursive", "sqlparser", "tokio", @@ -1411,9 +1410,9 @@ dependencies = [ [[package]] name = "dyn-clone" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125" +checksum = "feeef44e73baff3a26d371801df019877a9866a8c493d315ab00177843314f35" [[package]] name = "either" @@ -1607,10 +1606,22 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "wasm-bindgen", ] +[[package]] +name = "getrandom" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.13.3+wasi-0.2.2", + "windows-targets", +] + [[package]] name = "gimli" version = "0.31.1" @@ -1722,9 +1733,9 @@ dependencies = [ [[package]] name = "httparse" -version = "1.9.5" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d71d3574edd2771538b901e6549113b4006ece66150fb69c0fb6d9a2adae946" +checksum = "f2d708df4e7140240a16cd6ab0ab65c972d7433ab77819ea693fde9c43811e2a" [[package]] name = "humantime" @@ -1734,9 +1745,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" -version = "1.5.2" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "256fb8d4bd6413123cc9d91832d78325c48ff41677595be797d90f42969beae0" +checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" dependencies = [ "bytes", "futures-channel", @@ -1953,9 +1964,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.7.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" +checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" dependencies = [ "equivalent", "hashbrown 0.15.2", @@ -1975,9 +1986,9 @@ checksum = 
"8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] name = "ipnet" -version = "2.10.1" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" [[package]] name = "itertools" @@ -2243,7 +2254,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.52.0", ] @@ -2377,9 +2388,9 @@ checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "openssl-probe" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" [[package]] name = "ordered-float" @@ -2661,9 +2672,9 @@ dependencies = [ [[package]] name = "protobuf-src" -version = "2.1.0+27.1" +version = "2.1.1+27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7edafa3bcc668fa93efafcbdf58d7821bbda0f4b458ac7fae3d57ec0fec8167" +checksum = "6217c3504da19b85a3a4b2e9a5183d635822d83507ba0986624b5c05b83bfc40" dependencies = [ "cmake", ] @@ -2794,7 +2805,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d" dependencies = [ "bytes", - "getrandom", + "getrandom 0.2.15", "rand", "ring", "rustc-hash", @@ -2857,7 +2868,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom", + "getrandom 0.2.15", ] [[package]] @@ -2926,9 +2937,9 @@ checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "regress" -version = "0.10.2" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f56e622c2378013c6c61e2bd776604c46dc1087b2dc5293275a0c20a44f0771" +checksum = "78ef7fa9ed0256d64a688a3747d0fef7a88851c18a5e1d57f115f38ec2e09366" dependencies = [ "hashbrown 0.15.2", "memchr", @@ -2997,7 +3008,7 @@ checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" dependencies = [ "cc", "cfg-if", - "getrandom", + "getrandom 0.2.15", "libc", "spin", "untrusted", @@ -3033,9 +3044,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.43" +version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a78891ee6bf2340288408954ac787aa063d8e8817e9f53abb37c695c6d834ef6" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ "bitflags 2.8.0", "errno", @@ -3046,9 +3057,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.21" +version = "0.23.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f287924602bf649d949c63dc8ac8b235fa5387d394020705b80c4eb597ce5b8" +checksum = "9fb9263ab4eb695e42321db096e3b8fbd715a59b154d5c88d82db2175b681ba7" dependencies = [ "once_cell", "ring", @@ -3081,9 +3092,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.10.1" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d2bf47e6ff922db3825eb750c4e2ff784c6ff8fb9e13046ef6a1d1c5401b0b37" +checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c" dependencies = [ "web-time", ] @@ -3107,9 +3118,9 @@ checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" [[package]] name = "ryu" -version = "1.0.18" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +checksum = "6ea1a2d0a644769cc99faa24c3ad26b379b786fe7c36fd3c546254801650e6dd" [[package]] name = "same-file" @@ -3184,9 +3195,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.24" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cb6eb87a131f756572d7fb904f6e7b68633f09cca868c5df1c4b8d1a694bbba" +checksum = "f79dfe2d285b0488816f30e700a7438c5a73d816b5b7d3ac72fbc48b0d185e03" dependencies = [ "serde", ] @@ -3239,9 +3250,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.136" +version = "1.0.138" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "336a0c23cf42a38d9eaa7cd22c7040d04e1228a19a933890805ffd00a16437d2" +checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" dependencies = [ "itoa", "memchr", @@ -3514,13 +3525,13 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.15.0" +version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704" +checksum = "38c246215d7d24f48ae091a2902398798e05d978b24315d6efbc00ede9a8bb91" dependencies = [ "cfg-if", "fastrand", - "getrandom", + "getrandom 0.3.1", "once_cell", "rustix", "windows-sys 0.59.0", @@ -3831,9 +3842,9 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.14" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034" [[package]] name = "unicode-segmentation" @@ -3890,11 +3901,11 @@ checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" [[package]] name = "uuid" -version = "1.12.0" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "744018581f9a3454a9e15beb8a33b017183f1e7c0cd170232a2d1453b23a51c4" +checksum = "b3758f5e68192bb96cc8f9b7e2c2cfdabb435499a28499a42f8f984092adad4b" dependencies = [ - "getrandom", + "getrandom 0.2.15", "serde", ] @@ -3929,6 +3940,15 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasi" +version = "0.13.3+wasi-0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2" +dependencies = [ + "wit-bindgen-rt", +] + [[package]] name = "wasm-bindgen" version = "0.2.100" @@ -4185,6 +4205,15 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "wit-bindgen-rt" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" +dependencies = [ + "bitflags 2.8.0", +] + [[package]] name = "write16" version = "1.0.0" diff --git a/Cargo.toml b/Cargo.toml index 10cffccb1..003ba36e5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,7 +38,7 @@ tokio = { version = "1.42", features = ["macros", "rt", "rt-multi-thread", "sync pyo3 = { version = "0.22", features = ["extension-module", "abi3", "abi3-py38"] } pyo3-async-runtimes = { version = "0.22", features = ["tokio-runtime"]} arrow = { version = "53", features = ["pyarrow"] } -datafusion = { version = "44.0.0", features = ["pyarrow", "avro", "unicode_expressions"] } +datafusion = { version = "44.0.0", features = ["avro", "unicode_expressions"] } datafusion-substrait = { version = "44.0.0", optional = true } datafusion-proto = { version = "44.0.0" } datafusion-ffi = { version = "44.0.0" } diff --git a/docs/source/contributor-guide/introduction.rst b/docs/source/contributor-guide/introduction.rst index fb98cfd1d..25f2c21a4 100644 --- a/docs/source/contributor-guide/introduction.rst +++ b/docs/source/contributor-guide/introduction.rst @@ -95,3 +95,56 @@ To update dependencies, run .. code-block:: shell uv sync --dev --no-install-package datafusion + +Improving Build Speed +--------------------- + +The `pyo3 `_ dependency of this project contains a ``build.rs`` file which +can cause it to rebuild frequently. You can prevent this from happening by defining a ``PYO3_CONFIG_FILE`` +environment variable that points to a file with your build configuration. Whenever your build configuration +changes, such as during some major version updates, you will need to regenerate this file. This variable +should point to a fully resolved path on your build machine. + +To generate this file, use the following command: + +.. code-block:: shell + + PYO3_PRINT_CONFIG=1 cargo build + +This will generate some output that looks like the following. You will want to copy these contents intro +a file. If you place this file in your project directory with filename ``.pyo3_build_config`` it will +be ignored by ``git``. + +.. code-block:: + + implementation=CPython + version=3.8 + shared=true + abi3=true + lib_name=python3.12 + lib_dir=/opt/homebrew/opt/python@3.12/Frameworks/Python.framework/Versions/3.12/lib + executable=/Users/myusername/src/datafusion-python/.venv/bin/python + pointer_width=64 + build_flags= + suppress_build_script_link_lines=false + +Add the environment variable to your system. + +.. code-block:: shell + + export PYO3_CONFIG_FILE="/Users//myusername/src/datafusion-python/.pyo3_build_config" + +If you are on a Mac and you use VS Code for your IDE, you will want to add these variables +to your settings. You can find the appropriate rust flags by looking in the +``.cargo/config.toml`` file. + +.. code-block:: + + "rust-analyzer.cargo.extraEnv": { + "RUSTFLAGS": "-C link-arg=-undefined -C link-arg=dynamic_lookup", + "PYO3_CONFIG_FILE": "/Users/myusername/src/datafusion-python/.pyo3_build_config" + }, + "rust-analyzer.runnables.extraEnv": { + "RUSTFLAGS": "-C link-arg=-undefined -C link-arg=dynamic_lookup", + "PYO3_CONFIG_FILE": "/Users/myusername/src/personal/datafusion-python/.pyo3_build_config" + } diff --git a/python/tests/test_indexing.py b/python/tests/test_indexing.py index 5b0d08610..327decd2f 100644 --- a/python/tests/test_indexing.py +++ b/python/tests/test_indexing.py @@ -43,7 +43,8 @@ def test_err(df): with pytest.raises(Exception) as e_info: df["c"] - assert "Schema error: No field named c." 
in e_info.value.args[0] + for e in ["SchemaError", "FieldNotFound", 'name: "c"']: + assert e in e_info.value.args[0] with pytest.raises(Exception) as e_info: df[1] diff --git a/src/catalog.rs b/src/catalog.rs index 1ce66a4dc..1e189a5aa 100644 --- a/src/catalog.rs +++ b/src/catalog.rs @@ -21,7 +21,7 @@ use std::sync::Arc; use pyo3::exceptions::PyKeyError; use pyo3::prelude::*; -use crate::errors::DataFusionError; +use crate::errors::{PyDataFusionError, PyDataFusionResult}; use crate::utils::wait_for_future; use datafusion::{ arrow::pyarrow::ToPyArrow, @@ -96,11 +96,13 @@ impl PyDatabase { self.database.table_names().into_iter().collect() } - fn table(&self, name: &str, py: Python) -> PyResult<PyTable> { + fn table(&self, name: &str, py: Python) -> PyDataFusionResult<PyTable> { if let Some(table) = wait_for_future(py, self.database.table(name))? { Ok(PyTable::new(table)) } else { - Err(DataFusionError::Common(format!("Table not found: {name}")).into()) + Err(PyDataFusionError::Common(format!( + "Table not found: {name}" + ))) } } diff --git a/src/common/data_type.rs b/src/common/data_type.rs index 7f9c75bfd..f5f8a6b06 100644 --- a/src/common/data_type.rs +++ b/src/common/data_type.rs @@ -23,6 +23,20 @@ use pyo3::{exceptions::PyValueError, prelude::*}; use crate::errors::py_datafusion_err; +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)] +pub struct PyScalarValue(pub ScalarValue); + +impl From<ScalarValue> for PyScalarValue { + fn from(value: ScalarValue) -> Self { + Self(value) + } +} +impl From<PyScalarValue> for ScalarValue { + fn from(value: PyScalarValue) -> Self { + value.0 + } +} + #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] #[pyclass(eq, eq_int, name = "RexType", module = "datafusion.common")] pub enum RexType { diff --git a/src/config.rs b/src/config.rs index 3f2a05580..cc725b9a3 100644 --- a/src/config.rs +++ b/src/config.rs @@ -21,6 +21,8 @@ use pyo3::types::*; use datafusion::common::ScalarValue; use datafusion::config::ConfigOptions; +use crate::errors::PyDataFusionResult; + #[pyclass(name = "Config", module = "datafusion", subclass)] #[derive(Clone)] pub(crate) struct PyConfig { @@ -38,7 +40,7 @@ impl PyConfig { /// Get configurations from environment variables #[staticmethod] - pub fn from_env() -> PyResult<Self> { + pub fn from_env() -> PyDataFusionResult<Self> { Ok(Self { config: ConfigOptions::from_env()?, }) @@ -56,11 +58,10 @@ impl PyConfig { } /// Set a configuration option - pub fn set(&mut self, key: &str, value: PyObject, py: Python) -> PyResult<()> { + pub fn set(&mut self, key: &str, value: PyObject, py: Python) -> PyDataFusionResult<()> { let scalar_value = py_obj_to_scalar_value(py, value); - self.config - .set(key, scalar_value.to_string().as_str()) - .map_err(|e| e.into()) + self.config.set(key, scalar_value.to_string().as_str())?; + Ok(()) } /// Get all configuration options diff --git a/src/context.rs b/src/context.rs index bab7fd42a..f53b15576 100644 --- a/src/context.rs +++ b/src/context.rs @@ -28,16 +28,17 @@ use object_store::ObjectStore; use url::Url; use uuid::Uuid; -use pyo3::exceptions::{PyKeyError, PyNotImplementedError, PyTypeError, PyValueError}; +use pyo3::exceptions::{PyKeyError, PyValueError}; use pyo3::prelude::*; use crate::catalog::{PyCatalog, PyTable}; use crate::dataframe::PyDataFrame; use crate::dataset::Dataset; -use crate::errors::{py_datafusion_err, DataFusionError}; +use crate::errors::{py_datafusion_err, PyDataFusionResult}; use crate::expr::sort_expr::PySortExpr; use crate::physical_plan::PyExecutionPlan; use crate::record_batch::PyRecordBatchStream; +use
crate::sql::exceptions::py_value_err; use crate::sql::logical::PyLogicalPlan; use crate::store::StorageContexts; use crate::udaf::PyAggregateUDF; @@ -277,7 +278,7 @@ impl PySessionContext { pub fn new( config: Option, runtime: Option, - ) -> PyResult { + ) -> PyDataFusionResult { let config = if let Some(c) = config { c.config } else { @@ -348,7 +349,7 @@ impl PySessionContext { schema: Option>, file_sort_order: Option>>, py: Python, - ) -> PyResult<()> { + ) -> PyDataFusionResult<()> { let options = ListingOptions::new(Arc::new(ParquetFormat::new())) .with_file_extension(file_extension) .with_table_partition_cols(convert_table_partition_cols(table_partition_cols)?) @@ -365,7 +366,7 @@ impl PySessionContext { None => { let state = self.ctx.state(); let schema = options.infer_schema(&state, &table_path); - wait_for_future(py, schema).map_err(DataFusionError::from)? + wait_for_future(py, schema)? } }; let config = ListingTableConfig::new(table_path) @@ -382,9 +383,9 @@ impl PySessionContext { } /// Returns a PyDataFrame whose plan corresponds to the SQL statement. - pub fn sql(&mut self, query: &str, py: Python) -> PyResult { + pub fn sql(&mut self, query: &str, py: Python) -> PyDataFusionResult { let result = self.ctx.sql(query); - let df = wait_for_future(py, result).map_err(DataFusionError::from)?; + let df = wait_for_future(py, result)?; Ok(PyDataFrame::new(df)) } @@ -394,14 +395,14 @@ impl PySessionContext { query: &str, options: Option, py: Python, - ) -> PyResult { + ) -> PyDataFusionResult { let options = if let Some(options) = options { options.options } else { SQLOptions::new() }; let result = self.ctx.sql_with_options(query, options); - let df = wait_for_future(py, result).map_err(DataFusionError::from)?; + let df = wait_for_future(py, result)?; Ok(PyDataFrame::new(df)) } @@ -412,14 +413,14 @@ impl PySessionContext { name: Option<&str>, schema: Option>, py: Python, - ) -> PyResult { + ) -> PyDataFusionResult { let schema = if let Some(schema) = schema { SchemaRef::from(schema.0) } else { partitions.0[0][0].schema() }; - let table = MemTable::try_new(schema, partitions.0).map_err(DataFusionError::from)?; + let table = MemTable::try_new(schema, partitions.0)?; // generate a random (unique) name for this table if none is provided // table name cannot start with numeric digit @@ -433,11 +434,9 @@ impl PySessionContext { } }; - self.ctx - .register_table(&*table_name, Arc::new(table)) - .map_err(DataFusionError::from)?; + self.ctx.register_table(&*table_name, Arc::new(table))?; - let table = wait_for_future(py, self._table(&table_name)).map_err(DataFusionError::from)?; + let table = wait_for_future(py, self._table(&table_name))?; let df = PyDataFrame::new(table); Ok(df) @@ -495,15 +494,14 @@ impl PySessionContext { data: Bound<'_, PyAny>, name: Option<&str>, py: Python, - ) -> PyResult { + ) -> PyDataFusionResult { let (schema, batches) = if let Ok(stream_reader) = ArrowArrayStreamReader::from_pyarrow_bound(&data) { // Works for any object that implements __arrow_c_stream__ in pycapsule. 
let schema = stream_reader.schema().as_ref().to_owned(); let batches = stream_reader - .collect::, arrow::error::ArrowError>>() - .map_err(DataFusionError::from)?; + .collect::, arrow::error::ArrowError>>()?; (schema, batches) } else if let Ok(array) = RecordBatch::from_pyarrow_bound(&data) { @@ -512,8 +510,8 @@ impl PySessionContext { (array.schema().as_ref().to_owned(), vec![array]) } else { - return Err(PyTypeError::new_err( - "Expected either a Arrow Array or Arrow Stream in from_arrow().", + return Err(crate::errors::PyDataFusionError::Common( + "Expected either a Arrow Array or Arrow Stream in from_arrow().".to_string(), )); }; @@ -559,17 +557,13 @@ impl PySessionContext { Ok(df) } - pub fn register_table(&mut self, name: &str, table: &PyTable) -> PyResult<()> { - self.ctx - .register_table(name, table.table()) - .map_err(DataFusionError::from)?; + pub fn register_table(&mut self, name: &str, table: &PyTable) -> PyDataFusionResult<()> { + self.ctx.register_table(name, table.table())?; Ok(()) } - pub fn deregister_table(&mut self, name: &str) -> PyResult<()> { - self.ctx - .deregister_table(name) - .map_err(DataFusionError::from)?; + pub fn deregister_table(&mut self, name: &str) -> PyDataFusionResult<()> { + self.ctx.deregister_table(name)?; Ok(()) } @@ -578,10 +572,10 @@ impl PySessionContext { &mut self, name: &str, provider: Bound<'_, PyAny>, - ) -> PyResult<()> { + ) -> PyDataFusionResult<()> { if provider.hasattr("__datafusion_table_provider__")? { let capsule = provider.getattr("__datafusion_table_provider__")?.call0()?; - let capsule = capsule.downcast::()?; + let capsule = capsule.downcast::().map_err(py_datafusion_err)?; validate_pycapsule(capsule, "datafusion_table_provider")?; let provider = unsafe { capsule.reference::() }; @@ -591,8 +585,9 @@ impl PySessionContext { Ok(()) } else { - Err(PyNotImplementedError::new_err( - "__datafusion_table_provider__ does not exist on Table Provider object.", + Err(crate::errors::PyDataFusionError::Common( + "__datafusion_table_provider__ does not exist on Table Provider object." + .to_string(), )) } } @@ -601,12 +596,10 @@ impl PySessionContext { &mut self, name: &str, partitions: PyArrowType>>, - ) -> PyResult<()> { + ) -> PyDataFusionResult<()> { let schema = partitions.0[0][0].schema(); let table = MemTable::try_new(schema, partitions.0)?; - self.ctx - .register_table(name, Arc::new(table)) - .map_err(DataFusionError::from)?; + self.ctx.register_table(name, Arc::new(table))?; Ok(()) } @@ -628,7 +621,7 @@ impl PySessionContext { schema: Option>, file_sort_order: Option>>, py: Python, - ) -> PyResult<()> { + ) -> PyDataFusionResult<()> { let mut options = ParquetReadOptions::default() .table_partition_cols(convert_table_partition_cols(table_partition_cols)?) 
.parquet_pruning(parquet_pruning) @@ -642,7 +635,7 @@ impl PySessionContext { .collect(); let result = self.ctx.register_parquet(name, path, options); - wait_for_future(py, result).map_err(DataFusionError::from)?; + wait_for_future(py, result)?; Ok(()) } @@ -666,12 +659,12 @@ impl PySessionContext { file_extension: &str, file_compression_type: Option, py: Python, - ) -> PyResult<()> { + ) -> PyDataFusionResult<()> { let delimiter = delimiter.as_bytes(); if delimiter.len() != 1 { - return Err(PyValueError::new_err( + return Err(crate::errors::PyDataFusionError::PythonError(py_value_err( "Delimiter must be a single character", - )); + ))); } let mut options = CsvReadOptions::new() @@ -685,11 +678,11 @@ impl PySessionContext { if path.is_instance_of::() { let paths = path.extract::>()?; let result = self.register_csv_from_multiple_paths(name, paths, options); - wait_for_future(py, result).map_err(DataFusionError::from)?; + wait_for_future(py, result)?; } else { let path = path.extract::()?; let result = self.ctx.register_csv(name, &path, options); - wait_for_future(py, result).map_err(DataFusionError::from)?; + wait_for_future(py, result)?; } Ok(()) @@ -713,7 +706,7 @@ impl PySessionContext { table_partition_cols: Vec<(String, String)>, file_compression_type: Option, py: Python, - ) -> PyResult<()> { + ) -> PyDataFusionResult<()> { let path = path .to_str() .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; @@ -726,7 +719,7 @@ impl PySessionContext { options.schema = schema.as_ref().map(|x| &x.0); let result = self.ctx.register_json(name, path, options); - wait_for_future(py, result).map_err(DataFusionError::from)?; + wait_for_future(py, result)?; Ok(()) } @@ -745,7 +738,7 @@ impl PySessionContext { file_extension: &str, table_partition_cols: Vec<(String, String)>, py: Python, - ) -> PyResult<()> { + ) -> PyDataFusionResult<()> { let path = path .to_str() .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; @@ -756,7 +749,7 @@ impl PySessionContext { options.schema = schema.as_ref().map(|x| &x.0); let result = self.ctx.register_avro(name, path, options); - wait_for_future(py, result).map_err(DataFusionError::from)?; + wait_for_future(py, result)?; Ok(()) } @@ -767,12 +760,10 @@ impl PySessionContext { name: &str, dataset: &Bound<'_, PyAny>, py: Python, - ) -> PyResult<()> { + ) -> PyDataFusionResult<()> { let table: Arc = Arc::new(Dataset::new(dataset, py)?); - self.ctx - .register_table(name, table) - .map_err(DataFusionError::from)?; + self.ctx.register_table(name, table)?; Ok(()) } @@ -824,11 +815,11 @@ impl PySessionContext { Ok(PyDataFrame::new(x)) } - pub fn table_exist(&self, name: &str) -> PyResult { + pub fn table_exist(&self, name: &str) -> PyDataFusionResult { Ok(self.ctx.table_exist(name)?) } - pub fn empty_table(&self) -> PyResult { + pub fn empty_table(&self) -> PyDataFusionResult { Ok(PyDataFrame::new(self.ctx.read_empty()?)) } @@ -847,7 +838,7 @@ impl PySessionContext { table_partition_cols: Vec<(String, String)>, file_compression_type: Option, py: Python, - ) -> PyResult { + ) -> PyDataFusionResult { let path = path .to_str() .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; @@ -859,10 +850,10 @@ impl PySessionContext { let df = if let Some(schema) = schema { options.schema = Some(&schema.0); let result = self.ctx.read_json(path, options); - wait_for_future(py, result).map_err(DataFusionError::from)? + wait_for_future(py, result)? 
} else { let result = self.ctx.read_json(path, options); - wait_for_future(py, result).map_err(DataFusionError::from)? + wait_for_future(py, result)? }; Ok(PyDataFrame::new(df)) } @@ -888,12 +879,12 @@ impl PySessionContext { table_partition_cols: Vec<(String, String)>, file_compression_type: Option, py: Python, - ) -> PyResult { + ) -> PyDataFusionResult { let delimiter = delimiter.as_bytes(); if delimiter.len() != 1 { - return Err(PyValueError::new_err( + return Err(crate::errors::PyDataFusionError::PythonError(py_value_err( "Delimiter must be a single character", - )); + ))); }; let mut options = CsvReadOptions::new() @@ -909,12 +900,12 @@ impl PySessionContext { let paths = path.extract::>()?; let paths = paths.iter().map(|p| p as &str).collect::>(); let result = self.ctx.read_csv(paths, options); - let df = PyDataFrame::new(wait_for_future(py, result).map_err(DataFusionError::from)?); + let df = PyDataFrame::new(wait_for_future(py, result)?); Ok(df) } else { let path = path.extract::()?; let result = self.ctx.read_csv(path, options); - let df = PyDataFrame::new(wait_for_future(py, result).map_err(DataFusionError::from)?); + let df = PyDataFrame::new(wait_for_future(py, result)?); Ok(df) } } @@ -938,7 +929,7 @@ impl PySessionContext { schema: Option>, file_sort_order: Option>>, py: Python, - ) -> PyResult { + ) -> PyDataFusionResult { let mut options = ParquetReadOptions::default() .table_partition_cols(convert_table_partition_cols(table_partition_cols)?) .parquet_pruning(parquet_pruning) @@ -952,7 +943,7 @@ impl PySessionContext { .collect(); let result = self.ctx.read_parquet(path, options); - let df = PyDataFrame::new(wait_for_future(py, result).map_err(DataFusionError::from)?); + let df = PyDataFrame::new(wait_for_future(py, result)?); Ok(df) } @@ -965,26 +956,23 @@ impl PySessionContext { table_partition_cols: Vec<(String, String)>, file_extension: &str, py: Python, - ) -> PyResult { + ) -> PyDataFusionResult { let mut options = AvroReadOptions::default() .table_partition_cols(convert_table_partition_cols(table_partition_cols)?); options.file_extension = file_extension; let df = if let Some(schema) = schema { options.schema = Some(&schema.0); let read_future = self.ctx.read_avro(path, options); - wait_for_future(py, read_future).map_err(DataFusionError::from)? + wait_for_future(py, read_future)? } else { let read_future = self.ctx.read_avro(path, options); - wait_for_future(py, read_future).map_err(DataFusionError::from)? + wait_for_future(py, read_future)? 
}; Ok(PyDataFrame::new(df)) } - pub fn read_table(&self, table: &PyTable) -> PyResult { - let df = self - .ctx - .read_table(table.table()) - .map_err(DataFusionError::from)?; + pub fn read_table(&self, table: &PyTable) -> PyDataFusionResult { + let df = self.ctx.read_table(table.table())?; Ok(PyDataFrame::new(df)) } @@ -1011,7 +999,7 @@ impl PySessionContext { plan: PyExecutionPlan, part: usize, py: Python, - ) -> PyResult { + ) -> PyDataFusionResult { let ctx: TaskContext = TaskContext::from(&self.ctx.state()); // create a Tokio runtime to run the async code let rt = &get_tokio_runtime().0; @@ -1071,13 +1059,13 @@ impl PySessionContext { pub fn convert_table_partition_cols( table_partition_cols: Vec<(String, String)>, -) -> Result, DataFusionError> { +) -> PyDataFusionResult> { table_partition_cols .into_iter() .map(|(name, ty)| match ty.as_str() { "string" => Ok((name, DataType::Utf8)), "int" => Ok((name, DataType::Int32)), - _ => Err(DataFusionError::Common(format!( + _ => Err(crate::errors::PyDataFusionError::Common(format!( "Unsupported data type '{ty}' for partition column. Supported types are 'string' and 'int'" ))), }) diff --git a/src/dataframe.rs b/src/dataframe.rs index b875480a7..6fb08ba25 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -33,20 +33,20 @@ use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; use datafusion::execution::SendableRecordBatchStream; use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; use datafusion::prelude::*; -use pyo3::exceptions::{PyTypeError, PyValueError}; +use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::pybacked::PyBackedStr; use pyo3::types::{PyCapsule, PyTuple, PyTupleMethods}; use tokio::task::JoinHandle; -use crate::errors::py_datafusion_err; +use crate::errors::{py_datafusion_err, PyDataFusionError}; use crate::expr::sort_expr::to_sort_expressions; use crate::physical_plan::PyExecutionPlan; use crate::record_batch::PyRecordBatchStream; use crate::sql::logical::PyLogicalPlan; use crate::utils::{get_tokio_runtime, validate_pycapsule, wait_for_future}; use crate::{ - errors::DataFusionError, + errors::PyDataFusionResult, expr::{sort_expr::PySortExpr, PyExpr}, }; @@ -69,7 +69,7 @@ impl PyDataFrame { #[pymethods] impl PyDataFrame { /// Enable selection for `df[col]`, `df[col1, col2, col3]`, and `df[[col1, col2, col3]]` - fn __getitem__(&self, key: Bound<'_, PyAny>) -> PyResult { + fn __getitem__(&self, key: Bound<'_, PyAny>) -> PyDataFusionResult { if let Ok(key) = key.extract::() { // df[col] self.select_columns(vec![key]) @@ -84,12 +84,12 @@ impl PyDataFrame { // df[[col1, col2, col3]] self.select_columns(keys) } else { - let message = "DataFrame can only be indexed by string index or indices"; - Err(PyTypeError::new_err(message)) + let message = "DataFrame can only be indexed by string index or indices".to_string(); + Err(PyDataFusionError::Common(message)) } } - fn __repr__(&self, py: Python) -> PyResult { + fn __repr__(&self, py: Python) -> PyDataFusionResult { let df = self.df.as_ref().clone().limit(0, Some(10))?; let batches = wait_for_future(py, df.collect())?; let batches_as_string = pretty::pretty_format_batches(&batches); @@ -99,7 +99,7 @@ impl PyDataFrame { } } - fn _repr_html_(&self, py: Python) -> PyResult { + fn _repr_html_(&self, py: Python) -> PyDataFusionResult { let mut html_str = "\n".to_string(); let df = self.df.as_ref().clone().limit(0, Some(10))?; @@ -145,7 +145,7 @@ impl PyDataFrame { } /// Calculate summary statistics for a DataFrame - fn 
describe(&self, py: Python) -> PyResult { + fn describe(&self, py: Python) -> PyDataFusionResult { let df = self.df.as_ref().clone(); let stat_df = wait_for_future(py, df.describe())?; Ok(Self::new(stat_df)) @@ -157,37 +157,37 @@ impl PyDataFrame { } #[pyo3(signature = (*args))] - fn select_columns(&self, args: Vec) -> PyResult { + fn select_columns(&self, args: Vec) -> PyDataFusionResult { let args = args.iter().map(|s| s.as_ref()).collect::>(); let df = self.df.as_ref().clone().select_columns(&args)?; Ok(Self::new(df)) } #[pyo3(signature = (*args))] - fn select(&self, args: Vec) -> PyResult { + fn select(&self, args: Vec) -> PyDataFusionResult { let expr = args.into_iter().map(|e| e.into()).collect(); let df = self.df.as_ref().clone().select(expr)?; Ok(Self::new(df)) } #[pyo3(signature = (*args))] - fn drop(&self, args: Vec) -> PyResult { + fn drop(&self, args: Vec) -> PyDataFusionResult { let cols = args.iter().map(|s| s.as_ref()).collect::>(); let df = self.df.as_ref().clone().drop_columns(&cols)?; Ok(Self::new(df)) } - fn filter(&self, predicate: PyExpr) -> PyResult { + fn filter(&self, predicate: PyExpr) -> PyDataFusionResult { let df = self.df.as_ref().clone().filter(predicate.into())?; Ok(Self::new(df)) } - fn with_column(&self, name: &str, expr: PyExpr) -> PyResult { + fn with_column(&self, name: &str, expr: PyExpr) -> PyDataFusionResult { let df = self.df.as_ref().clone().with_column(name, expr.into())?; Ok(Self::new(df)) } - fn with_columns(&self, exprs: Vec) -> PyResult { + fn with_columns(&self, exprs: Vec) -> PyDataFusionResult { let mut df = self.df.as_ref().clone(); for expr in exprs { let expr: Expr = expr.into(); @@ -199,7 +199,7 @@ impl PyDataFrame { /// Rename one column by applying a new projection. This is a no-op if the column to be /// renamed does not exist. - fn with_column_renamed(&self, old_name: &str, new_name: &str) -> PyResult { + fn with_column_renamed(&self, old_name: &str, new_name: &str) -> PyDataFusionResult { let df = self .df .as_ref() @@ -208,7 +208,7 @@ impl PyDataFrame { Ok(Self::new(df)) } - fn aggregate(&self, group_by: Vec, aggs: Vec) -> PyResult { + fn aggregate(&self, group_by: Vec, aggs: Vec) -> PyDataFusionResult { let group_by = group_by.into_iter().map(|e| e.into()).collect(); let aggs = aggs.into_iter().map(|e| e.into()).collect(); let df = self.df.as_ref().clone().aggregate(group_by, aggs)?; @@ -216,14 +216,14 @@ impl PyDataFrame { } #[pyo3(signature = (*exprs))] - fn sort(&self, exprs: Vec) -> PyResult { + fn sort(&self, exprs: Vec) -> PyDataFusionResult { let exprs = to_sort_expressions(exprs); let df = self.df.as_ref().clone().sort(exprs)?; Ok(Self::new(df)) } #[pyo3(signature = (count, offset=0))] - fn limit(&self, count: usize, offset: usize) -> PyResult { + fn limit(&self, count: usize, offset: usize) -> PyDataFusionResult { let df = self.df.as_ref().clone().limit(offset, Some(count))?; Ok(Self::new(df)) } @@ -232,14 +232,15 @@ impl PyDataFrame { /// Unless some order is specified in the plan, there is no /// guarantee of the order of the result. fn collect(&self, py: Python) -> PyResult> { - let batches = wait_for_future(py, self.df.as_ref().clone().collect())?; + let batches = wait_for_future(py, self.df.as_ref().clone().collect()) + .map_err(PyDataFusionError::from)?; // cannot use PyResult> return type due to // https://github.com/PyO3/pyo3/issues/1813 batches.into_iter().map(|rb| rb.to_pyarrow(py)).collect() } /// Cache DataFrame. 
- fn cache(&self, py: Python) -> PyResult { + fn cache(&self, py: Python) -> PyDataFusionResult { let df = wait_for_future(py, self.df.as_ref().clone().cache())?; Ok(Self::new(df)) } @@ -247,7 +248,8 @@ impl PyDataFrame { /// Executes this DataFrame and collects all results into a vector of vector of RecordBatch /// maintaining the input partitioning. fn collect_partitioned(&self, py: Python) -> PyResult>> { - let batches = wait_for_future(py, self.df.as_ref().clone().collect_partitioned())?; + let batches = wait_for_future(py, self.df.as_ref().clone().collect_partitioned()) + .map_err(PyDataFusionError::from)?; batches .into_iter() @@ -257,13 +259,13 @@ impl PyDataFrame { /// Print the result, 20 lines by default #[pyo3(signature = (num=20))] - fn show(&self, py: Python, num: usize) -> PyResult<()> { + fn show(&self, py: Python, num: usize) -> PyDataFusionResult<()> { let df = self.df.as_ref().clone().limit(0, Some(num))?; print_dataframe(py, df) } /// Filter out duplicate rows - fn distinct(&self) -> PyResult { + fn distinct(&self) -> PyDataFusionResult { let df = self.df.as_ref().clone().distinct()?; Ok(Self::new(df)) } @@ -274,7 +276,7 @@ impl PyDataFrame { how: &str, left_on: Vec, right_on: Vec, - ) -> PyResult { + ) -> PyDataFusionResult { let join_type = match how { "inner" => JoinType::Inner, "left" => JoinType::Left, @@ -283,10 +285,9 @@ impl PyDataFrame { "semi" => JoinType::LeftSemi, "anti" => JoinType::LeftAnti, how => { - return Err(DataFusionError::Common(format!( + return Err(PyDataFusionError::Common(format!( "The join type {how} does not exist or is not implemented" - )) - .into()); + ))); } }; @@ -303,7 +304,12 @@ impl PyDataFrame { Ok(Self::new(df)) } - fn join_on(&self, right: PyDataFrame, on_exprs: Vec, how: &str) -> PyResult { + fn join_on( + &self, + right: PyDataFrame, + on_exprs: Vec, + how: &str, + ) -> PyDataFusionResult { let join_type = match how { "inner" => JoinType::Inner, "left" => JoinType::Left, @@ -312,10 +318,9 @@ impl PyDataFrame { "semi" => JoinType::LeftSemi, "anti" => JoinType::LeftAnti, how => { - return Err(DataFusionError::Common(format!( + return Err(PyDataFusionError::Common(format!( "The join type {how} does not exist or is not implemented" - )) - .into()); + ))); } }; let exprs: Vec = on_exprs.into_iter().map(|e| e.into()).collect(); @@ -330,7 +335,7 @@ impl PyDataFrame { /// Print the query plan #[pyo3(signature = (verbose=false, analyze=false))] - fn explain(&self, py: Python, verbose: bool, analyze: bool) -> PyResult<()> { + fn explain(&self, py: Python, verbose: bool, analyze: bool) -> PyDataFusionResult<()> { let df = self.df.as_ref().clone().explain(verbose, analyze)?; print_dataframe(py, df) } @@ -341,18 +346,18 @@ impl PyDataFrame { } /// Get the optimized logical plan for this `DataFrame` - fn optimized_logical_plan(&self) -> PyResult { + fn optimized_logical_plan(&self) -> PyDataFusionResult { Ok(self.df.as_ref().clone().into_optimized_plan()?.into()) } /// Get the execution plan for this `DataFrame` - fn execution_plan(&self, py: Python) -> PyResult { + fn execution_plan(&self, py: Python) -> PyDataFusionResult { let plan = wait_for_future(py, self.df.as_ref().clone().create_physical_plan())?; Ok(plan.into()) } /// Repartition a `DataFrame` based on a logical partitioning scheme. - fn repartition(&self, num: usize) -> PyResult { + fn repartition(&self, num: usize) -> PyDataFusionResult { let new_df = self .df .as_ref() @@ -363,7 +368,7 @@ impl PyDataFrame { /// Repartition a `DataFrame` based on a logical partitioning scheme. 
#[pyo3(signature = (*args, num))] - fn repartition_by_hash(&self, args: Vec, num: usize) -> PyResult { + fn repartition_by_hash(&self, args: Vec, num: usize) -> PyDataFusionResult { let expr = args.into_iter().map(|py_expr| py_expr.into()).collect(); let new_df = self .df @@ -376,7 +381,7 @@ impl PyDataFrame { /// Calculate the union of two `DataFrame`s, preserving duplicate rows.The /// two `DataFrame`s must have exactly the same schema #[pyo3(signature = (py_df, distinct=false))] - fn union(&self, py_df: PyDataFrame, distinct: bool) -> PyResult { + fn union(&self, py_df: PyDataFrame, distinct: bool) -> PyDataFusionResult { let new_df = if distinct { self.df .as_ref() @@ -391,7 +396,7 @@ impl PyDataFrame { /// Calculate the distinct union of two `DataFrame`s. The /// two `DataFrame`s must have exactly the same schema - fn union_distinct(&self, py_df: PyDataFrame) -> PyResult { + fn union_distinct(&self, py_df: PyDataFrame) -> PyDataFusionResult { let new_df = self .df .as_ref() @@ -401,7 +406,7 @@ impl PyDataFrame { } #[pyo3(signature = (column, preserve_nulls=true))] - fn unnest_column(&self, column: &str, preserve_nulls: bool) -> PyResult { + fn unnest_column(&self, column: &str, preserve_nulls: bool) -> PyDataFusionResult { // TODO: expose RecursionUnnestOptions // REF: https://github.com/apache/datafusion/pull/11577 let unnest_options = UnnestOptions::default().with_preserve_nulls(preserve_nulls); @@ -414,7 +419,11 @@ impl PyDataFrame { } #[pyo3(signature = (columns, preserve_nulls=true))] - fn unnest_columns(&self, columns: Vec, preserve_nulls: bool) -> PyResult { + fn unnest_columns( + &self, + columns: Vec, + preserve_nulls: bool, + ) -> PyDataFusionResult { // TODO: expose RecursionUnnestOptions // REF: https://github.com/apache/datafusion/pull/11577 let unnest_options = UnnestOptions::default().with_preserve_nulls(preserve_nulls); @@ -428,7 +437,7 @@ impl PyDataFrame { } /// Calculate the intersection of two `DataFrame`s. The two `DataFrame`s must have exactly the same schema - fn intersect(&self, py_df: PyDataFrame) -> PyResult { + fn intersect(&self, py_df: PyDataFrame) -> PyDataFusionResult { let new_df = self .df .as_ref() @@ -438,13 +447,13 @@ impl PyDataFrame { } /// Calculate the exception of two `DataFrame`s. The two `DataFrame`s must have exactly the same schema - fn except_all(&self, py_df: PyDataFrame) -> PyResult { + fn except_all(&self, py_df: PyDataFrame) -> PyDataFusionResult { let new_df = self.df.as_ref().clone().except(py_df.df.as_ref().clone())?; Ok(Self::new(new_df)) } /// Write a `DataFrame` to a CSV file. - fn write_csv(&self, path: &str, with_header: bool, py: Python) -> PyResult<()> { + fn write_csv(&self, path: &str, with_header: bool, py: Python) -> PyDataFusionResult<()> { let csv_options = CsvOptions { has_header: Some(with_header), ..Default::default() @@ -472,7 +481,7 @@ impl PyDataFrame { compression: &str, compression_level: Option, py: Python, - ) -> PyResult<()> { + ) -> PyDataFusionResult<()> { fn verify_compression_level(cl: Option) -> Result { cl.ok_or(PyValueError::new_err("compression_level is not defined")) } @@ -496,7 +505,7 @@ impl PyDataFrame { "lz4_raw" => Compression::LZ4_RAW, "uncompressed" => Compression::UNCOMPRESSED, _ => { - return Err(PyValueError::new_err(format!( + return Err(PyDataFusionError::Common(format!( "Unrecognized compression type {compression}" ))); } @@ -522,7 +531,7 @@ impl PyDataFrame { } /// Executes a query and writes the results to a partitioned JSON file. 
- fn write_json(&self, path: &str, py: Python) -> PyResult<()> { + fn write_json(&self, path: &str, py: Python) -> PyDataFusionResult<()> { wait_for_future( py, self.df @@ -551,7 +560,7 @@ impl PyDataFrame { &'py mut self, py: Python<'py>, requested_schema: Option>, - ) -> PyResult> { + ) -> PyDataFusionResult> { let mut batches = wait_for_future(py, self.df.as_ref().clone().collect())?; let mut schema: Schema = self.df.schema().to_owned().into(); @@ -559,15 +568,14 @@ impl PyDataFrame { validate_pycapsule(&schema_capsule, "arrow_schema")?; let schema_ptr = unsafe { schema_capsule.reference::() }; - let desired_schema = Schema::try_from(schema_ptr).map_err(DataFusionError::from)?; + let desired_schema = Schema::try_from(schema_ptr)?; - schema = project_schema(schema, desired_schema).map_err(DataFusionError::ArrowError)?; + schema = project_schema(schema, desired_schema)?; batches = batches .into_iter() .map(|record_batch| record_batch_into_schema(record_batch, &schema)) - .collect::, ArrowError>>() - .map_err(DataFusionError::ArrowError)?; + .collect::, ArrowError>>()?; } let batches_wrapped = batches.into_iter().map(Ok); @@ -578,9 +586,10 @@ impl PyDataFrame { let ffi_stream = FFI_ArrowArrayStream::new(reader); let stream_capsule_name = CString::new("arrow_array_stream").unwrap(); PyCapsule::new_bound(py, ffi_stream, Some(stream_capsule_name)) + .map_err(PyDataFusionError::from) } - fn execute_stream(&self, py: Python) -> PyResult { + fn execute_stream(&self, py: Python) -> PyDataFusionResult { // create a Tokio runtime to run the async code let rt = &get_tokio_runtime().0; let df = self.df.as_ref().clone(); @@ -647,13 +656,13 @@ impl PyDataFrame { } // Executes this DataFrame to get the total number of rows. - fn count(&self, py: Python) -> PyResult { + fn count(&self, py: Python) -> PyDataFusionResult { Ok(wait_for_future(py, self.df.as_ref().clone().count())?) 
} } /// Print DataFrame -fn print_dataframe(py: Python, df: DataFrame) -> PyResult<()> { +fn print_dataframe(py: Python, df: DataFrame) -> PyDataFusionResult<()> { // Get string representation of record batches let batches = wait_for_future(py, df.collect())?; let batches_as_string = pretty::pretty_format_batches(&batches); diff --git a/src/dataset_exec.rs b/src/dataset_exec.rs index 9d2559429..ace42115b 100644 --- a/src/dataset_exec.rs +++ b/src/dataset_exec.rs @@ -42,7 +42,7 @@ use datafusion::physical_plan::{ SendableRecordBatchStream, Statistics, }; -use crate::errors::DataFusionError; +use crate::errors::PyDataFusionResult; use crate::pyarrow_filter_expression::PyArrowFilterExpression; struct PyArrowBatchesAdapter { @@ -83,8 +83,8 @@ impl DatasetExec { dataset: &Bound<'_, PyAny>, projection: Option<Vec<usize>>, filters: &[Expr], - ) -> Result<Self, DataFusionError> { - let columns: Option<Result<Vec<String>, DataFusionError>> = projection.map(|p| { + ) -> PyDataFusionResult<Self> { + let columns: Option<PyDataFusionResult<Vec<String>>> = projection.map(|p| { p.iter() .map(|index| { let name: String = dataset diff --git a/src/errors.rs b/src/errors.rs index d12b6ade1..b02b754a2 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -24,10 +24,10 @@ use datafusion::error::DataFusionError as InnerDataFusionError; use prost::EncodeError; use pyo3::{exceptions::PyException, PyErr}; -pub type Result<T> = std::result::Result<T, DataFusionError>; +pub type PyDataFusionResult<T> = std::result::Result<T, PyDataFusionError>; #[derive(Debug)] -pub enum DataFusionError { +pub enum PyDataFusionError { ExecutionError(InnerDataFusionError), ArrowError(ArrowError), Common(String), @@ -35,46 +35,46 @@ pub enum DataFusionError { EncodeError(EncodeError), } -impl fmt::Display for DataFusionError { +impl fmt::Display for PyDataFusionError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - DataFusionError::ExecutionError(e) => write!(f, "DataFusion error: {e:?}"), - DataFusionError::ArrowError(e) => write!(f, "Arrow error: {e:?}"), - DataFusionError::PythonError(e) => write!(f, "Python error {e:?}"), - DataFusionError::Common(e) => write!(f, "{e}"), - DataFusionError::EncodeError(e) => write!(f, "Failed to encode substrait plan: {e}"), + PyDataFusionError::ExecutionError(e) => write!(f, "DataFusion error: {e:?}"), + PyDataFusionError::ArrowError(e) => write!(f, "Arrow error: {e:?}"), + PyDataFusionError::PythonError(e) => write!(f, "Python error {e:?}"), + PyDataFusionError::Common(e) => write!(f, "{e}"), + PyDataFusionError::EncodeError(e) => write!(f, "Failed to encode substrait plan: {e}"), } } } -impl From<ArrowError> for DataFusionError { - fn from(err: ArrowError) -> DataFusionError { - DataFusionError::ArrowError(err) +impl From<ArrowError> for PyDataFusionError { + fn from(err: ArrowError) -> PyDataFusionError { + PyDataFusionError::ArrowError(err) } } -impl From<InnerDataFusionError> for DataFusionError { - fn from(err: InnerDataFusionError) -> DataFusionError { - DataFusionError::ExecutionError(err) +impl From<InnerDataFusionError> for PyDataFusionError { + fn from(err: InnerDataFusionError) -> PyDataFusionError { + PyDataFusionError::ExecutionError(err) } } -impl From<PyErr> for DataFusionError { - fn from(err: PyErr) -> DataFusionError { - DataFusionError::PythonError(err) +impl From<PyErr> for PyDataFusionError { + fn from(err: PyErr) -> PyDataFusionError { + PyDataFusionError::PythonError(err) } } -impl From<DataFusionError> for PyErr { - fn from(err: DataFusionError) -> PyErr { +impl From<PyDataFusionError> for PyErr { + fn from(err: PyDataFusionError) -> PyErr { match err { - DataFusionError::PythonError(py_err) => py_err, + PyDataFusionError::PythonError(py_err) => py_err, _ => PyException::new_err(err.to_string()), } } }
-impl Error for DataFusionError {} +impl Error for PyDataFusionError {} pub fn py_type_err(e: impl Debug) -> PyErr { PyErr::new::(format!("{e:?}")) diff --git a/src/expr.rs b/src/expr.rs index bca0cd3fa..1e9983d42 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -24,7 +24,6 @@ use std::convert::{From, Into}; use std::sync::Arc; use window::PyWindowFrame; -use arrow::pyarrow::ToPyArrow; use datafusion::arrow::datatypes::{DataType, Field}; use datafusion::arrow::pyarrow::PyArrowType; use datafusion::functions::core::expr_ext::FieldAccessor; @@ -33,15 +32,17 @@ use datafusion::logical_expr::{ expr::{AggregateFunction, InList, InSubquery, ScalarFunction, WindowFunction}, lit, Between, BinaryExpr, Case, Cast, Expr, Like, Operator, TryCast, }; -use datafusion::scalar::ScalarValue; -use crate::common::data_type::{DataTypeMap, NullTreatment, RexType}; -use crate::errors::{py_runtime_err, py_type_err, py_unsupported_variant_err, DataFusionError}; +use crate::common::data_type::{DataTypeMap, NullTreatment, PyScalarValue, RexType}; +use crate::errors::{ + py_runtime_err, py_type_err, py_unsupported_variant_err, PyDataFusionError, PyDataFusionResult, +}; use crate::expr::aggregate_expr::PyAggregateFunction; use crate::expr::binary_expr::PyBinaryExpr; use crate::expr::column::PyColumn; use crate::expr::literal::PyLiteral; use crate::functions::add_builder_fns_to_window; +use crate::pyarrow_util::scalar_to_pyarrow; use crate::sql::logical::PyLogicalPlan; use self::alias::PyAlias; @@ -261,8 +262,8 @@ impl PyExpr { } #[staticmethod] - pub fn literal(value: ScalarValue) -> PyExpr { - lit(value).into() + pub fn literal(value: PyScalarValue) -> PyExpr { + lit(value.0).into() } #[staticmethod] @@ -356,7 +357,7 @@ impl PyExpr { /// Extracts the Expr value into a PyObject that can be shared with Python pub fn python_value(&self, py: Python) -> PyResult { match &self.expr { - Expr::Literal(scalar_value) => Ok(scalar_value.to_pyarrow(py)?), + Expr::Literal(scalar_value) => scalar_to_pyarrow(scalar_value, py), _ => Err(py_type_err(format!( "Non Expr::Literal encountered in types: {:?}", &self.expr @@ -568,7 +569,7 @@ impl PyExpr { window_frame: Option, order_by: Option>, null_treatment: Option, - ) -> PyResult { + ) -> PyDataFusionResult { match &self.expr { Expr::AggregateFunction(agg_fn) => { let window_fn = Expr::WindowFunction(WindowFunction::new( @@ -592,10 +593,9 @@ impl PyExpr { null_treatment, ), _ => Err( - DataFusionError::ExecutionError(datafusion::error::DataFusionError::Plan( + PyDataFusionError::ExecutionError(datafusion::error::DataFusionError::Plan( format!("Using {} with `over` is not allowed. Must use an aggregate or window function.", self.expr.variant_name()), )) - .into(), ), } } @@ -649,34 +649,26 @@ impl PyExprFuncBuilder { .into() } - pub fn build(&self) -> PyResult { - self.builder - .clone() - .build() - .map(|expr| expr.into()) - .map_err(|err| err.into()) + pub fn build(&self) -> PyDataFusionResult { + Ok(self.builder.clone().build().map(|expr| expr.into())?) 
} } impl PyExpr { - pub fn _column_name(&self, plan: &LogicalPlan) -> Result<String, DataFusionError> { + pub fn _column_name(&self, plan: &LogicalPlan) -> PyDataFusionResult<String> { let field = Self::expr_to_field(&self.expr, plan)?; Ok(field.name().to_owned()) } /// Create a [Field] representing an [Expr], given an input [LogicalPlan] to resolve against - pub fn expr_to_field( - expr: &Expr, - input_plan: &LogicalPlan, - ) -> Result<Arc<Field>, DataFusionError> { + pub fn expr_to_field(expr: &Expr, input_plan: &LogicalPlan) -> PyDataFusionResult<Arc<Field>> { match expr { Expr::Wildcard { .. } => { // Since * could be any of the valid column names just return the first one Ok(Arc::new(input_plan.schema().field(0).clone())) } _ => { - let fields = - exprlist_to_fields(&[expr.clone()], input_plan).map_err(PyErr::from)?; + let fields = exprlist_to_fields(&[expr.clone()], input_plan)?; Ok(fields[0].1.clone()) } } diff --git a/src/expr/conditional_expr.rs b/src/expr/conditional_expr.rs index a8a885c54..fe3af2e25 100644 --- a/src/expr/conditional_expr.rs +++ b/src/expr/conditional_expr.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::expr::PyExpr; +use crate::{errors::PyDataFusionResult, expr::PyExpr}; use datafusion::logical_expr::conditional_expressions::CaseBuilder; use pyo3::prelude::*; @@ -44,11 +44,11 @@ impl PyCaseBuilder { } } - fn otherwise(&mut self, else_expr: PyExpr) -> PyResult<PyExpr> { + fn otherwise(&mut self, else_expr: PyExpr) -> PyDataFusionResult<PyExpr> { Ok(self.case_builder.otherwise(else_expr.expr)?.clone().into()) } - fn end(&mut self) -> PyResult<PyExpr> { + fn end(&mut self) -> PyDataFusionResult<PyExpr> { Ok(self.case_builder.end()?.clone().into()) } } diff --git a/src/expr/literal.rs b/src/expr/literal.rs index 43084ba4b..2cb2079f1 100644 --- a/src/expr/literal.rs +++ b/src/expr/literal.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License.
-use crate::errors::DataFusionError; +use crate::errors::PyDataFusionError; use datafusion::common::ScalarValue; use pyo3::prelude::*; @@ -154,5 +154,5 @@ impl PyLiteral { } fn unexpected_literal_value(value: &ScalarValue) -> PyErr { - DataFusionError::Common(format!("getValue() - Unexpected value: {value}")).into() + PyDataFusionError::Common(format!("getValue() - Unexpected value: {value}")).into() } diff --git a/src/expr/window.rs b/src/expr/window.rs index 6486dbb32..4dc6cb9c9 100644 --- a/src/expr/window.rs +++ b/src/expr/window.rs @@ -21,8 +21,9 @@ use datafusion::logical_expr::{Expr, Window, WindowFrame, WindowFrameBound, Wind use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; +use crate::common::data_type::PyScalarValue; use crate::common::df_schema::PyDFSchema; -use crate::errors::py_type_err; +use crate::errors::{py_type_err, PyDataFusionResult}; use crate::expr::logical_node::LogicalNode; use crate::expr::sort_expr::{py_sort_expr_list, PySortExpr}; use crate::expr::PyExpr; @@ -171,8 +172,8 @@ impl PyWindowFrame { #[pyo3(signature=(unit, start_bound, end_bound))] pub fn new( unit: &str, - start_bound: Option, - end_bound: Option, + start_bound: Option, + end_bound: Option, ) -> PyResult { let units = unit.to_ascii_lowercase(); let units = match units.as_str() { @@ -187,7 +188,7 @@ impl PyWindowFrame { } }; let start_bound = match start_bound { - Some(start_bound) => WindowFrameBound::Preceding(start_bound), + Some(start_bound) => WindowFrameBound::Preceding(start_bound.0), None => match units { WindowFrameUnits::Range => WindowFrameBound::Preceding(ScalarValue::UInt64(None)), WindowFrameUnits::Rows => WindowFrameBound::Preceding(ScalarValue::UInt64(None)), @@ -200,7 +201,7 @@ impl PyWindowFrame { }, }; let end_bound = match end_bound { - Some(end_bound) => WindowFrameBound::Following(end_bound), + Some(end_bound) => WindowFrameBound::Following(end_bound.0), None => match units { WindowFrameUnits::Rows => WindowFrameBound::Following(ScalarValue::UInt64(None)), WindowFrameUnits::Range => WindowFrameBound::Following(ScalarValue::UInt64(None)), @@ -253,7 +254,7 @@ impl PyWindowFrameBound { matches!(self.frame_bound, WindowFrameBound::Following(_)) } /// Returns the offset of the window frame - pub fn get_offset(&self) -> PyResult> { + pub fn get_offset(&self) -> PyDataFusionResult> { match &self.frame_bound { WindowFrameBound::Preceding(val) | WindowFrameBound::Following(val) => match val { x if x.is_null() => Ok(None), diff --git a/src/functions.rs b/src/functions.rs index ae032d702..46c748cf8 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -22,8 +22,10 @@ use datafusion::logical_expr::WindowFrame; use pyo3::{prelude::*, wrap_pyfunction}; use crate::common::data_type::NullTreatment; +use crate::common::data_type::PyScalarValue; use crate::context::PySessionContext; -use crate::errors::DataFusionError; +use crate::errors::PyDataFusionError; +use crate::errors::PyDataFusionResult; use crate::expr::conditional_expr::PyCaseBuilder; use crate::expr::sort_expr::to_sort_expressions; use crate::expr::sort_expr::PySortExpr; @@ -44,7 +46,7 @@ fn add_builder_fns_to_aggregate( filter: Option, order_by: Option>, null_treatment: Option, -) -> PyResult { +) -> PyDataFusionResult { // Since ExprFuncBuilder::new() is private, we can guarantee initializing // a builder with an `null_treatment` with option None let mut builder = agg_fn.null_treatment(None); @@ -228,7 +230,10 @@ fn when(when: PyExpr, then: PyExpr) -> PyResult { /// 1) If no function has been found, search default 
aggregate functions. /// /// NOTE: we search the built-ins first because the `UDAF` versions currently do not have the same behavior. -fn find_window_fn(name: &str, ctx: Option) -> PyResult { +fn find_window_fn( + name: &str, + ctx: Option, +) -> PyDataFusionResult { if let Some(ctx) = ctx { // search UDAFs let udaf = ctx @@ -284,7 +289,9 @@ fn find_window_fn(name: &str, ctx: Option) -> PyResult, order_by: Option>, null_treatment: Option - ) -> PyResult { + ) -> PyDataFusionResult { let agg_fn = functions_aggregate::expr_fn::$NAME($($arg.into()),*); add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment) @@ -362,7 +369,7 @@ macro_rules! aggregate_function_vec_args { filter: Option, order_by: Option>, null_treatment: Option - ) -> PyResult { + ) -> PyDataFusionResult { let agg_fn = functions_aggregate::expr_fn::$NAME(vec![$($arg.into()),*]); add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment) @@ -642,7 +649,7 @@ pub fn approx_percentile_cont( percentile: f64, num_centroids: Option, // enforces optional arguments at the end, currently filter: Option, -) -> PyResult { +) -> PyDataFusionResult { let args = if let Some(num_centroids) = num_centroids { vec![expression.expr, lit(percentile), lit(num_centroids)] } else { @@ -661,7 +668,7 @@ pub fn approx_percentile_cont_with_weight( weight: PyExpr, percentile: f64, filter: Option, -) -> PyResult { +) -> PyDataFusionResult { let agg_fn = functions_aggregate::expr_fn::approx_percentile_cont_with_weight( expression.expr, weight.expr, @@ -683,7 +690,7 @@ pub fn first_value( filter: Option, order_by: Option>, null_treatment: Option, -) -> PyResult { +) -> PyDataFusionResult { // If we initialize the UDAF with order_by directly, then it gets over-written by the builder let agg_fn = functions_aggregate::expr_fn::first_value(expr.expr, None); @@ -700,7 +707,7 @@ pub fn nth_value( filter: Option, order_by: Option>, null_treatment: Option, -) -> PyResult { +) -> PyDataFusionResult { let agg_fn = datafusion::functions_aggregate::nth_value::nth_value(expr.expr, n, vec![]); add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment) } @@ -715,7 +722,7 @@ pub fn string_agg( filter: Option, order_by: Option>, null_treatment: Option, -) -> PyResult { +) -> PyDataFusionResult { let agg_fn = datafusion::functions_aggregate::string_agg::string_agg(expr.expr, lit(delimiter)); add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment) } @@ -726,7 +733,7 @@ pub(crate) fn add_builder_fns_to_window( window_frame: Option, order_by: Option>, null_treatment: Option, -) -> PyResult { +) -> PyDataFusionResult { let null_treatment = null_treatment.map(|n| n.into()); let mut builder = window_fn.null_treatment(null_treatment); @@ -748,7 +755,7 @@ pub(crate) fn add_builder_fns_to_window( builder = builder.window_frame(window_frame.into()); } - builder.build().map(|e| e.into()).map_err(|err| err.into()) + Ok(builder.build().map(|e| e.into())?) 
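To make the macro-generated wrappers concrete, here is a hand-expanded sketch of what an aggregate_function! invocation might produce for a hypothetical `sum` binding. The parameter types are assumptions based on the call shape `add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment)` visible above, and the sketch assumes it lives inside src/functions.rs where those crate types are in scope:

use pyo3::prelude::*;
use datafusion::functions_aggregate;

#[pyfunction]
#[pyo3(signature = (expression, distinct=None, filter=None, order_by=None, null_treatment=None))]
fn sum(
    expression: PyExpr,
    distinct: Option<bool>,
    filter: Option<PyExpr>,
    order_by: Option<Vec<PySortExpr>>,
    null_treatment: Option<NullTreatment>,
) -> PyDataFusionResult<PyExpr> {
    // Build the DataFusion aggregate expression, then thread the optional
    // clauses through the shared builder helper above.
    let agg_fn = functions_aggregate::expr_fn::sum(expression.expr);
    add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment)
}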
} #[pyfunction] @@ -756,10 +763,11 @@ pub(crate) fn add_builder_fns_to_window( pub fn lead( arg: PyExpr, shift_offset: i64, - default_value: Option, + default_value: Option, partition_by: Option>, order_by: Option>, -) -> PyResult { +) -> PyDataFusionResult { + let default_value = default_value.map(|v| v.into()); let window_fn = functions_window::expr_fn::lead(arg.expr, Some(shift_offset), default_value); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) @@ -770,10 +778,11 @@ pub fn lead( pub fn lag( arg: PyExpr, shift_offset: i64, - default_value: Option, + default_value: Option, partition_by: Option>, order_by: Option>, -) -> PyResult { +) -> PyDataFusionResult { + let default_value = default_value.map(|v| v.into()); let window_fn = functions_window::expr_fn::lag(arg.expr, Some(shift_offset), default_value); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) @@ -784,7 +793,7 @@ pub fn lag( pub fn row_number( partition_by: Option>, order_by: Option>, -) -> PyResult { +) -> PyDataFusionResult { let window_fn = functions_window::expr_fn::row_number(); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) @@ -795,7 +804,7 @@ pub fn row_number( pub fn rank( partition_by: Option>, order_by: Option>, -) -> PyResult { +) -> PyDataFusionResult { let window_fn = functions_window::expr_fn::rank(); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) @@ -806,7 +815,7 @@ pub fn rank( pub fn dense_rank( partition_by: Option>, order_by: Option>, -) -> PyResult { +) -> PyDataFusionResult { let window_fn = functions_window::expr_fn::dense_rank(); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) @@ -817,7 +826,7 @@ pub fn dense_rank( pub fn percent_rank( partition_by: Option>, order_by: Option>, -) -> PyResult { +) -> PyDataFusionResult { let window_fn = functions_window::expr_fn::percent_rank(); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) @@ -828,7 +837,7 @@ pub fn percent_rank( pub fn cume_dist( partition_by: Option>, order_by: Option>, -) -> PyResult { +) -> PyDataFusionResult { let window_fn = functions_window::expr_fn::cume_dist(); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) @@ -840,7 +849,7 @@ pub fn ntile( arg: PyExpr, partition_by: Option>, order_by: Option>, -) -> PyResult { +) -> PyDataFusionResult { let window_fn = functions_window::expr_fn::ntile(arg.into()); add_builder_fns_to_window(window_fn, partition_by, None, order_by, None) diff --git a/src/lib.rs b/src/lib.rs index 1111d5d06..317c3a49a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -48,6 +48,7 @@ pub mod expr; mod functions; pub mod physical_plan; mod pyarrow_filter_expression; +pub mod pyarrow_util; mod record_batch; pub mod sql; pub mod store; diff --git a/src/physical_plan.rs b/src/physical_plan.rs index 9ef2f0ebb..295908dc7 100644 --- a/src/physical_plan.rs +++ b/src/physical_plan.rs @@ -22,7 +22,7 @@ use std::sync::Arc; use pyo3::{exceptions::PyRuntimeError, prelude::*, types::PyBytes}; -use crate::{context::PySessionContext, errors::DataFusionError}; +use crate::{context::PySessionContext, errors::PyDataFusionResult}; #[pyclass(name = "ExecutionPlan", module = "datafusion", subclass)] #[derive(Debug, Clone)] @@ -58,7 +58,7 @@ impl PyExecutionPlan { format!("{}", d.indent(false)) } - pub fn to_proto<'py>(&'py self, py: Python<'py>) -> PyResult> { + pub fn to_proto<'py>(&'py self, py: Python<'py>) -> PyDataFusionResult> { let codec = DefaultPhysicalExtensionCodec {}; let 
proto = datafusion_proto::protobuf::PhysicalPlanNode::try_from_physical_plan( self.plan.clone(), @@ -70,7 +70,10 @@ impl PyExecutionPlan { } #[staticmethod] - pub fn from_proto(ctx: PySessionContext, proto_msg: Bound<'_, PyBytes>) -> PyResult { + pub fn from_proto( + ctx: PySessionContext, + proto_msg: Bound<'_, PyBytes>, + ) -> PyDataFusionResult { let bytes: &[u8] = proto_msg.extract()?; let proto_plan = datafusion_proto::protobuf::PhysicalPlanNode::decode(bytes).map_err(|e| { @@ -81,9 +84,7 @@ impl PyExecutionPlan { })?; let codec = DefaultPhysicalExtensionCodec {}; - let plan = proto_plan - .try_into_physical_plan(&ctx.ctx, &ctx.ctx.runtime_env(), &codec) - .map_err(DataFusionError::from)?; + let plan = proto_plan.try_into_physical_plan(&ctx.ctx, &ctx.ctx.runtime_env(), &codec)?; Ok(Self::new(plan)) } diff --git a/src/pyarrow_filter_expression.rs b/src/pyarrow_filter_expression.rs index 0f97ea442..314eebf4f 100644 --- a/src/pyarrow_filter_expression.rs +++ b/src/pyarrow_filter_expression.rs @@ -21,11 +21,11 @@ use pyo3::prelude::*; use std::convert::TryFrom; use std::result::Result; -use arrow::pyarrow::ToPyArrow; use datafusion::common::{Column, ScalarValue}; use datafusion::logical_expr::{expr::InList, Between, BinaryExpr, Expr, Operator}; -use crate::errors::DataFusionError; +use crate::errors::{PyDataFusionError, PyDataFusionResult}; +use crate::pyarrow_util::scalar_to_pyarrow; #[derive(Debug)] #[repr(transparent)] @@ -34,7 +34,7 @@ pub(crate) struct PyArrowFilterExpression(PyObject); fn operator_to_py<'py>( operator: &Operator, op: &Bound<'py, PyModule>, -) -> Result, DataFusionError> { +) -> PyDataFusionResult> { let py_op: Bound<'_, PyAny> = match operator { Operator::Eq => op.getattr("eq")?, Operator::NotEq => op.getattr("ne")?, @@ -45,7 +45,7 @@ fn operator_to_py<'py>( Operator::And => op.getattr("and_")?, Operator::Or => op.getattr("or_")?, _ => { - return Err(DataFusionError::Common(format!( + return Err(PyDataFusionError::Common(format!( "Unsupported operator {operator:?}" ))) } @@ -53,8 +53,8 @@ fn operator_to_py<'py>( Ok(py_op) } -fn extract_scalar_list(exprs: &[Expr], py: Python) -> Result, DataFusionError> { - let ret: Result, DataFusionError> = exprs +fn extract_scalar_list(exprs: &[Expr], py: Python) -> PyDataFusionResult> { + let ret = exprs .iter() .map(|expr| match expr { // TODO: should we also leverage `ScalarValue::to_pyarrow` here? 
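As a usage sketch for the conversion being reworked here: a DataFusion predicate such as col("price") > lit(100) is translated into the pyarrow.compute equivalent, roughly pc.field("price") > pc.scalar(100) on the Python side. A minimal in-crate example, assuming datafusion's col/lit prelude helpers:

use std::convert::TryFrom;

use datafusion::prelude::{col, lit};

fn price_filter() -> PyDataFusionResult<PyArrowFilterExpression> {
    // Build the DataFusion-side predicate...
    let filter = col("price").gt(lit(100i64));
    // ...and convert it into a pyarrow.compute expression; the TryFrom
    // impl below acquires the GIL and walks the expression tree.
    PyArrowFilterExpression::try_from(&filter)
}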
@@ -71,11 +71,11 @@ fn extract_scalar_list(exprs: &[Expr], py: Python) -> Result, Data ScalarValue::Float32(Some(f)) => Ok(f.into_py(py)), ScalarValue::Float64(Some(f)) => Ok(f.into_py(py)), ScalarValue::Utf8(Some(s)) => Ok(s.into_py(py)), - _ => Err(DataFusionError::Common(format!( + _ => Err(PyDataFusionError::Common(format!( "PyArrow can't handle ScalarValue: {v:?}" ))), }, - _ => Err(DataFusionError::Common(format!( + _ => Err(PyDataFusionError::Common(format!( "Only a list of Literals are supported got {expr:?}" ))), }) @@ -90,7 +90,7 @@ impl PyArrowFilterExpression { } impl TryFrom<&Expr> for PyArrowFilterExpression { - type Error = DataFusionError; + type Error = PyDataFusionError; // Converts a Datafusion filter Expr into an expression string that can be evaluated by Python // Note that pyarrow.compute.{field,scalar} are put into Python globals() when evaluated @@ -100,9 +100,9 @@ impl TryFrom<&Expr> for PyArrowFilterExpression { Python::with_gil(|py| { let pc = Python::import_bound(py, "pyarrow.compute")?; let op_module = Python::import_bound(py, "operator")?; - let pc_expr: Result, DataFusionError> = match expr { + let pc_expr: PyDataFusionResult> = match expr { Expr::Column(Column { name, .. }) => Ok(pc.getattr("field")?.call1((name,))?), - Expr::Literal(scalar) => Ok(scalar.to_pyarrow(py)?.into_bound(py)), + Expr::Literal(scalar) => Ok(scalar_to_pyarrow(scalar, py)?.into_bound(py)), Expr::BinaryExpr(BinaryExpr { left, op, right }) => { let operator = operator_to_py(op, &op_module)?; let left = PyArrowFilterExpression::try_from(left.as_ref())?.0; @@ -167,7 +167,7 @@ impl TryFrom<&Expr> for PyArrowFilterExpression { Ok(if *negated { invert.call1((ret,))? } else { ret }) } - _ => Err(DataFusionError::Common(format!( + _ => Err(PyDataFusionError::Common(format!( "Unsupported Datafusion expression {expr:?}" ))), }; diff --git a/src/pyarrow_util.rs b/src/pyarrow_util.rs new file mode 100644 index 000000000..2b31467f8 --- /dev/null +++ b/src/pyarrow_util.rs @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Conversions between PyArrow and DataFusion types + +use arrow::array::{Array, ArrayData}; +use arrow::pyarrow::{FromPyArrow, ToPyArrow}; +use datafusion::scalar::ScalarValue; +use pyo3::types::{PyAnyMethods, PyList}; +use pyo3::{Bound, FromPyObject, PyAny, PyObject, PyResult, Python}; + +use crate::common::data_type::PyScalarValue; +use crate::errors::PyDataFusionError; + +impl FromPyArrow for PyScalarValue { + fn from_pyarrow_bound(value: &Bound<'_, PyAny>) -> PyResult { + let py = value.py(); + let typ = value.getattr("type")?; + let val = value.call_method0("as_py")?; + + // construct pyarrow array from the python value and pyarrow type + let factory = py.import_bound("pyarrow")?.getattr("array")?; + let args = PyList::new_bound(py, [val]); + let array = factory.call1((args, typ))?; + + // convert the pyarrow array to rust array using C data interface + let array = arrow::array::make_array(ArrayData::from_pyarrow_bound(&array)?); + let scalar = ScalarValue::try_from_array(&array, 0).map_err(PyDataFusionError::from)?; + + Ok(PyScalarValue(scalar)) + } +} + +impl<'source> FromPyObject<'source> for PyScalarValue { + fn extract_bound(value: &Bound<'source, PyAny>) -> PyResult { + Self::from_pyarrow_bound(value) + } +} + +pub fn scalar_to_pyarrow(scalar: &ScalarValue, py: Python) -> PyResult { + let array = scalar.to_array().map_err(PyDataFusionError::from)?; + // convert to pyarrow array using C data interface + let pyarray = array.to_data().to_pyarrow(py)?; + let pyscalar = pyarray.call_method1(py, "__getitem__", (0,))?; + + Ok(pyscalar) +} diff --git a/src/record_batch.rs b/src/record_batch.rs index eacdb5867..ec61c263f 100644 --- a/src/record_batch.rs +++ b/src/record_batch.rs @@ -17,6 +17,7 @@ use std::sync::Arc; +use crate::errors::PyDataFusionError; use crate::utils::wait_for_future; use datafusion::arrow::pyarrow::ToPyArrow; use datafusion::arrow::record_batch::RecordBatch; @@ -90,7 +91,7 @@ async fn next_stream( let mut stream = stream.lock().await; match stream.next().await { Some(Ok(batch)) => Ok(batch.into()), - Some(Err(e)) => Err(e.into()), + Some(Err(e)) => Err(PyDataFusionError::from(e))?, None => { // Depending on whether the iteration is sync or not, we raise either a // StopIteration or a StopAsyncIteration diff --git a/src/sql/exceptions.rs b/src/sql/exceptions.rs index c458402a0..cfb02274b 100644 --- a/src/sql/exceptions.rs +++ b/src/sql/exceptions.rs @@ -17,13 +17,7 @@ use std::fmt::{Debug, Display}; -use pyo3::{create_exception, PyErr}; - -// Identifies exceptions that occur while attempting to generate a `LogicalPlan` from a SQL string -create_exception!(rust, ParsingException, pyo3::exceptions::PyException); - -// Identifies exceptions that occur during attempts to optimization an existing `LogicalPlan` -create_exception!(rust, OptimizationException, pyo3::exceptions::PyException); +use pyo3::PyErr; pub fn py_type_err(e: impl Debug + Display) -> PyErr { PyErr::new::(format!("{e}")) @@ -33,10 +27,6 @@ pub fn py_runtime_err(e: impl Debug + Display) -> PyErr { PyErr::new::(format!("{e}")) } -pub fn py_parsing_exp(e: impl Debug + Display) -> PyErr { - PyErr::new::(format!("{e}")) -} - -pub fn py_optimization_exp(e: impl Debug + Display) -> PyErr { - PyErr::new::(format!("{e}")) +pub fn py_value_err(e: impl Debug + Display) -> PyErr { + PyErr::new::(format!("{e}")) } diff --git a/src/sql/logical.rs b/src/sql/logical.rs index a541889c7..1be33b75f 100644 --- a/src/sql/logical.rs +++ b/src/sql/logical.rs @@ -17,6 +17,7 @@ use std::sync::Arc; +use 
crate::errors::PyDataFusionResult; use crate::expr::aggregate::PyAggregate; use crate::expr::analyze::PyAnalyze; use crate::expr::distinct::PyDistinct; @@ -34,7 +35,7 @@ use crate::expr::table_scan::PyTableScan; use crate::expr::unnest::PyUnnest; use crate::expr::window::PyWindowExpr; use crate::{context::PySessionContext, errors::py_unsupported_variant_err}; -use datafusion::{error::DataFusionError, logical_expr::LogicalPlan}; +use datafusion::logical_expr::LogicalPlan; use datafusion_proto::logical_plan::{AsLogicalPlan, DefaultLogicalExtensionCodec}; use prost::Message; use pyo3::{exceptions::PyRuntimeError, prelude::*, types::PyBytes}; @@ -125,7 +126,7 @@ impl PyLogicalPlan { format!("{}", self.plan.display_graphviz()) } - pub fn to_proto<'py>(&'py self, py: Python<'py>) -> PyResult> { + pub fn to_proto<'py>(&'py self, py: Python<'py>) -> PyDataFusionResult> { let codec = DefaultLogicalExtensionCodec {}; let proto = datafusion_proto::protobuf::LogicalPlanNode::try_from_logical_plan(&self.plan, &codec)?; @@ -135,7 +136,10 @@ impl PyLogicalPlan { } #[staticmethod] - pub fn from_proto(ctx: PySessionContext, proto_msg: Bound<'_, PyBytes>) -> PyResult { + pub fn from_proto( + ctx: PySessionContext, + proto_msg: Bound<'_, PyBytes>, + ) -> PyDataFusionResult { let bytes: &[u8] = proto_msg.extract()?; let proto_plan = datafusion_proto::protobuf::LogicalPlanNode::decode(bytes).map_err(|e| { @@ -146,9 +150,7 @@ impl PyLogicalPlan { })?; let codec = DefaultLogicalExtensionCodec {}; - let plan = proto_plan - .try_into_logical_plan(&ctx.ctx, &codec) - .map_err(DataFusionError::from)?; + let plan = proto_plan.try_into_logical_plan(&ctx.ctx, &codec)?; Ok(Self::new(plan)) } } diff --git a/src/substrait.rs b/src/substrait.rs index 16e8c9507..8dcf3e8a7 100644 --- a/src/substrait.rs +++ b/src/substrait.rs @@ -18,7 +18,7 @@ use pyo3::{prelude::*, types::PyBytes}; use crate::context::PySessionContext; -use crate::errors::{py_datafusion_err, DataFusionError}; +use crate::errors::{py_datafusion_err, PyDataFusionError, PyDataFusionResult}; use crate::sql::logical::PyLogicalPlan; use crate::utils::wait_for_future; @@ -39,7 +39,7 @@ impl PyPlan { let mut proto_bytes = Vec::::new(); self.plan .encode(&mut proto_bytes) - .map_err(DataFusionError::EncodeError)?; + .map_err(PyDataFusionError::EncodeError)?; Ok(PyBytes::new_bound(py, &proto_bytes).unbind().into()) } } @@ -66,41 +66,47 @@ pub struct PySubstraitSerializer; #[pymethods] impl PySubstraitSerializer { #[staticmethod] - pub fn serialize(sql: &str, ctx: PySessionContext, path: &str, py: Python) -> PyResult<()> { - wait_for_future(py, serializer::serialize(sql, &ctx.ctx, path)) - .map_err(DataFusionError::from)?; + pub fn serialize( + sql: &str, + ctx: PySessionContext, + path: &str, + py: Python, + ) -> PyDataFusionResult<()> { + wait_for_future(py, serializer::serialize(sql, &ctx.ctx, path))?; Ok(()) } #[staticmethod] - pub fn serialize_to_plan(sql: &str, ctx: PySessionContext, py: Python) -> PyResult { - match PySubstraitSerializer::serialize_bytes(sql, ctx, py) { - Ok(proto_bytes) => { - let proto_bytes = proto_bytes.bind(py).downcast::().unwrap(); - PySubstraitSerializer::deserialize_bytes(proto_bytes.as_bytes().to_vec(), py) - } - Err(e) => Err(py_datafusion_err(e)), - } + pub fn serialize_to_plan( + sql: &str, + ctx: PySessionContext, + py: Python, + ) -> PyDataFusionResult { + PySubstraitSerializer::serialize_bytes(sql, ctx, py).and_then(|proto_bytes| { + let proto_bytes = proto_bytes.bind(py).downcast::().unwrap(); + 
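Taken together, these bindings expose a bytes-level Substrait round trip. A hedged sketch of the same flow against the underlying serializer/consumer modules (async, so it would run on the Tokio runtime these bindings block on via wait_for_future; `serializer` and `consumer` are assumed to be the datafusion-substrait modules imported at the top of this file):

use datafusion::prelude::SessionContext;

async fn substrait_roundtrip(
    ctx: &SessionContext,
    sql: &str,
) -> PyDataFusionResult<PyLogicalPlan> {
    // SQL -> Substrait protobuf bytes.
    let bytes = serializer::serialize_bytes(sql, ctx).await?;
    // Bytes -> Substrait plan.
    let plan = serializer::deserialize_bytes(bytes).await?;
    // Substrait plan -> DataFusion logical plan.
    let state = ctx.state();
    let logical_plan = consumer::from_substrait_plan(&state, &plan).await?;
    Ok(PyLogicalPlan::new(logical_plan))
}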
PySubstraitSerializer::deserialize_bytes(proto_bytes.as_bytes().to_vec(), py) + }) } #[staticmethod] - pub fn serialize_bytes(sql: &str, ctx: PySessionContext, py: Python) -> PyResult { - let proto_bytes: Vec = wait_for_future(py, serializer::serialize_bytes(sql, &ctx.ctx)) - .map_err(DataFusionError::from)?; + pub fn serialize_bytes( + sql: &str, + ctx: PySessionContext, + py: Python, + ) -> PyDataFusionResult { + let proto_bytes: Vec = wait_for_future(py, serializer::serialize_bytes(sql, &ctx.ctx))?; Ok(PyBytes::new_bound(py, &proto_bytes).unbind().into()) } #[staticmethod] - pub fn deserialize(path: &str, py: Python) -> PyResult { - let plan = - wait_for_future(py, serializer::deserialize(path)).map_err(DataFusionError::from)?; + pub fn deserialize(path: &str, py: Python) -> PyDataFusionResult { + let plan = wait_for_future(py, serializer::deserialize(path))?; Ok(PyPlan { plan: *plan }) } #[staticmethod] - pub fn deserialize_bytes(proto_bytes: Vec, py: Python) -> PyResult { - let plan = wait_for_future(py, serializer::deserialize_bytes(proto_bytes)) - .map_err(DataFusionError::from)?; + pub fn deserialize_bytes(proto_bytes: Vec, py: Python) -> PyDataFusionResult { + let plan = wait_for_future(py, serializer::deserialize_bytes(proto_bytes))?; Ok(PyPlan { plan: *plan }) } } @@ -134,10 +140,10 @@ impl PySubstraitConsumer { ctx: &mut PySessionContext, plan: PyPlan, py: Python, - ) -> PyResult { + ) -> PyDataFusionResult { let session_state = ctx.ctx.state(); let result = consumer::from_substrait_plan(&session_state, &plan.plan); - let logical_plan = wait_for_future(py, result).map_err(DataFusionError::from)?; + let logical_plan = wait_for_future(py, result)?; Ok(PyLogicalPlan::new(logical_plan)) } } diff --git a/src/udaf.rs b/src/udaf.rs index a6aa59ac3..5f21533e0 100644 --- a/src/udaf.rs +++ b/src/udaf.rs @@ -28,6 +28,7 @@ use datafusion::logical_expr::{ create_udaf, Accumulator, AccumulatorFactoryFunction, AggregateUDF, }; +use crate::common::data_type::PyScalarValue; use crate::expr::PyExpr; use crate::utils::parse_volatility; @@ -44,13 +45,25 @@ impl RustAccumulator { impl Accumulator for RustAccumulator { fn state(&mut self) -> Result> { - Python::with_gil(|py| self.accum.bind(py).call_method0("state")?.extract()) - .map_err(|e| DataFusionError::Execution(format!("{e}"))) + Python::with_gil(|py| { + self.accum + .bind(py) + .call_method0("state")? + .extract::>() + }) + .map(|v| v.into_iter().map(|x| x.0).collect()) + .map_err(|e| DataFusionError::Execution(format!("{e}"))) } fn evaluate(&mut self) -> Result { - Python::with_gil(|py| self.accum.bind(py).call_method0("evaluate")?.extract()) - .map_err(|e| DataFusionError::Execution(format!("{e}"))) + Python::with_gil(|py| { + self.accum + .bind(py) + .call_method0("evaluate")? 
+ .extract::() + }) + .map(|v| v.0) + .map_err(|e| DataFusionError::Execution(format!("{e}"))) } fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { diff --git a/src/udwf.rs b/src/udwf.rs index 689eb79e3..04a4a1640 100644 --- a/src/udwf.rs +++ b/src/udwf.rs @@ -26,6 +26,7 @@ use datafusion::scalar::ScalarValue; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; +use crate::common::data_type::PyScalarValue; use crate::expr::PyExpr; use crate::utils::parse_volatility; use datafusion::arrow::datatypes::DataType; @@ -133,7 +134,8 @@ impl PartitionEvaluator for RustPartitionEvaluator { self.evaluator .bind(py) .call_method1("evaluate", py_args) - .and_then(|v| v.extract()) + .and_then(|v| v.extract::()) + .map(|v| v.0) .map_err(|e| DataFusionError::Execution(format!("{e}"))) }) } diff --git a/src/utils.rs b/src/utils.rs index 795589752..ed224b364 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::errors::DataFusionError; +use crate::errors::{PyDataFusionError, PyDataFusionResult}; use crate::TokioRuntime; use datafusion::logical_expr::Volatility; use pyo3::exceptions::PyValueError; @@ -47,13 +47,13 @@ where py.allow_threads(|| runtime.block_on(f)) } -pub(crate) fn parse_volatility(value: &str) -> Result { +pub(crate) fn parse_volatility(value: &str) -> PyDataFusionResult { Ok(match value { "immutable" => Volatility::Immutable, "stable" => Volatility::Stable, "volatile" => Volatility::Volatile, value => { - return Err(DataFusionError::Common(format!( + return Err(PyDataFusionError::Common(format!( "Unsupportad volatility type: `{value}`, supported \ values are: immutable, stable and volatile." ))) From d3c4dabe3c24d419911106bdde3dfe1244e1224c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 5 Feb 2025 09:42:03 -0500 Subject: [PATCH 002/145] Fix verify-release-candidate script by removing reference to requirements-310.txt (#1012) * Fix verify-release-candidate script by removing requirements.txt * Update dev/release/verify-release-candidate.sh Co-authored-by: Kevin Liu --------- Co-authored-by: Kevin Liu --- dev/release/verify-release-candidate.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 1a9104b55..2bfce0e2d 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -128,7 +128,7 @@ test_source_distribution() { python3 -m venv .venv source .venv/bin/activate python3 -m pip install -U pip - python3 -m pip install -r requirements-310.txt + python3 -m pip install -U maturin maturin develop #TODO: we should really run tests here as well From 93ac6a820353b3ddea014be1eddad8bd004b0fce Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 7 Feb 2025 10:39:51 -0500 Subject: [PATCH 003/145] Prepare release 44.0.0 (#1009) --- Cargo.lock | 2 +- Cargo.toml | 2 +- dev/changelog/44.0.0.md | 58 ++++++++ pyproject.toml | 1 + uv.lock | 301 +++++++++++++++++++++++++++++++++++++++- 5 files changed, 361 insertions(+), 3 deletions(-) create mode 100644 dev/changelog/44.0.0.md diff --git a/Cargo.lock b/Cargo.lock index c6590fd21..50809696b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1327,7 +1327,7 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "43.0.0" +version = "44.0.0" dependencies = [ "arrow", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index 003ba36e5..44e6e2244 100644 --- a/Cargo.toml +++ b/Cargo.toml 
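Stepping back to the UDAF and UDWF hunks in the first patch of this series: both follow the same extraction pattern, pulling a PyScalarValue out of the Python call, unwrapping it, and translating any failure into DataFusionError::Execution. The shared pattern as a standalone sketch:

use datafusion::error::{DataFusionError, Result};
use datafusion::scalar::ScalarValue;
use pyo3::prelude::*;

// Call a Python object's `evaluate` method and unwrap the PyScalarValue
// wrapper back into the ScalarValue that DataFusion's traits expect.
fn evaluate_py(obj: &PyObject) -> Result<ScalarValue> {
    Python::with_gil(|py| {
        obj.bind(py)
            .call_method0("evaluate")?
            .extract::<PyScalarValue>()
    })
    .map(|v| v.0)
    .map_err(|e| DataFusionError::Execution(format!("{e}")))
}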
@@ -17,7 +17,7 @@ [package] name = "datafusion-python" -version = "43.0.0" +version = "44.0.0" homepage = "https://datafusion.apache.org/python" repository = "https://github.com/apache/datafusion-python" authors = ["Apache DataFusion "] diff --git a/dev/changelog/44.0.0.md b/dev/changelog/44.0.0.md new file mode 100644 index 000000000..c5ed4bdb0 --- /dev/null +++ b/dev/changelog/44.0.0.md @@ -0,0 +1,58 @@ + + +# Apache DataFusion Python 44.0.0 Changelog + +This release consists of 12 commits from 5 contributors. See credits at the end of this changelog for more information. + +**Implemented enhancements:** + +- feat: support enable_url_table config [#980](https://github.com/apache/datafusion-python/pull/980) (chenkovsky) +- feat: remove DataFusion pyarrow feat [#1000](https://github.com/apache/datafusion-python/pull/1000) (timsaucer) + +**Fixed bugs:** + +- fix: correct LZ0 to LZO in compression options [#995](https://github.com/apache/datafusion-python/pull/995) (kosiew) + +**Other:** + +- Add arrow cast [#962](https://github.com/apache/datafusion-python/pull/962) (kosiew) +- Fix small issues in pyproject.toml [#976](https://github.com/apache/datafusion-python/pull/976) (kylebarron) +- chore: set validation and type hint for ffi tableprovider [#983](https://github.com/apache/datafusion-python/pull/983) (ion-elgreco) +- Support async iteration of RecordBatchStream [#975](https://github.com/apache/datafusion-python/pull/975) (kylebarron) +- Chore/upgrade datafusion 44 [#973](https://github.com/apache/datafusion-python/pull/973) (timsaucer) +- Default to ZSTD compression when writing Parquet [#981](https://github.com/apache/datafusion-python/pull/981) (kosiew) +- Feat/use uv python management [#994](https://github.com/apache/datafusion-python/pull/994) (timsaucer) +- minor: Update dependencies prior to release [#999](https://github.com/apache/datafusion-python/pull/999) (timsaucer) +- Apply import ordering in ruff check [#1001](https://github.com/apache/datafusion-python/pull/1001) (timsaucer) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 5 Tim Saucer + 3 kosiew + 2 Kyle Barron + 1 Chongchen Chen + 1 Ion Koutsouris +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. 
+ diff --git a/pyproject.toml b/pyproject.toml index 32bb28d21..f416e02a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,6 +89,7 @@ dev = [ "pytest>=7.4.4", "ruff>=0.9.1", "toml>=0.10.2", + "pygithub==2.5.0", ] docs = [ "sphinx>=7.1.2", diff --git a/uv.lock b/uv.lock index 75d9ed018..587ddc8b7 100644 --- a/uv.lock +++ b/uv.lock @@ -139,6 +139,83 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a5/32/8f6669fc4798494966bf446c8c4a162e0b5d893dff088afddf76414f70e1/certifi-2024.12.14-py3-none-any.whl", hash = "sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56", size = 164927 }, ] +[[package]] +name = "cffi" +version = "1.17.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pycparser" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fc/97/c783634659c2920c3fc70419e3af40972dbaf758daa229a7d6ea6135c90d/cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824", size = 516621 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/90/07/f44ca684db4e4f08a3fdc6eeb9a0d15dc6883efc7b8c90357fdbf74e186c/cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14", size = 182191 }, + { url = "https://files.pythonhosted.org/packages/08/fd/cc2fedbd887223f9f5d170c96e57cbf655df9831a6546c1727ae13fa977a/cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67", size = 178592 }, + { url = "https://files.pythonhosted.org/packages/de/cc/4635c320081c78d6ffc2cab0a76025b691a91204f4aa317d568ff9280a2d/cffi-1.17.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edae79245293e15384b51f88b00613ba9f7198016a5948b5dddf4917d4d26382", size = 426024 }, + { url = "https://files.pythonhosted.org/packages/b6/7b/3b2b250f3aab91abe5f8a51ada1b717935fdaec53f790ad4100fe2ec64d1/cffi-1.17.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45398b671ac6d70e67da8e4224a065cec6a93541bb7aebe1b198a61b58c7b702", size = 448188 }, + { url = "https://files.pythonhosted.org/packages/d3/48/1b9283ebbf0ec065148d8de05d647a986c5f22586b18120020452fff8f5d/cffi-1.17.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ad9413ccdeda48c5afdae7e4fa2192157e991ff761e7ab8fdd8926f40b160cc3", size = 455571 }, + { url = "https://files.pythonhosted.org/packages/40/87/3b8452525437b40f39ca7ff70276679772ee7e8b394934ff60e63b7b090c/cffi-1.17.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5da5719280082ac6bd9aa7becb3938dc9f9cbd57fac7d2871717b1feb0902ab6", size = 436687 }, + { url = "https://files.pythonhosted.org/packages/8d/fb/4da72871d177d63649ac449aec2e8a29efe0274035880c7af59101ca2232/cffi-1.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bb1a08b8008b281856e5971307cc386a8e9c5b625ac297e853d36da6efe9c17", size = 446211 }, + { url = "https://files.pythonhosted.org/packages/ab/a0/62f00bcb411332106c02b663b26f3545a9ef136f80d5df746c05878f8c4b/cffi-1.17.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:045d61c734659cc045141be4bae381a41d89b741f795af1dd018bfb532fd0df8", size = 461325 }, + { url = "https://files.pythonhosted.org/packages/36/83/76127035ed2e7e27b0787604d99da630ac3123bfb02d8e80c633f218a11d/cffi-1.17.1-cp310-cp310-musllinux_1_1_i686.whl", hash = 
"sha256:6883e737d7d9e4899a8a695e00ec36bd4e5e4f18fabe0aca0efe0a4b44cdb13e", size = 438784 }, + { url = "https://files.pythonhosted.org/packages/21/81/a6cd025db2f08ac88b901b745c163d884641909641f9b826e8cb87645942/cffi-1.17.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6b8b4a92e1c65048ff98cfe1f735ef8f1ceb72e3d5f0c25fdb12087a23da22be", size = 461564 }, + { url = "https://files.pythonhosted.org/packages/f8/fe/4d41c2f200c4a457933dbd98d3cf4e911870877bd94d9656cc0fcb390681/cffi-1.17.1-cp310-cp310-win32.whl", hash = "sha256:c9c3d058ebabb74db66e431095118094d06abf53284d9c81f27300d0e0d8bc7c", size = 171804 }, + { url = "https://files.pythonhosted.org/packages/d1/b6/0b0f5ab93b0df4acc49cae758c81fe4e5ef26c3ae2e10cc69249dfd8b3ab/cffi-1.17.1-cp310-cp310-win_amd64.whl", hash = "sha256:0f048dcf80db46f0098ccac01132761580d28e28bc0f78ae0d58048063317e15", size = 181299 }, + { url = "https://files.pythonhosted.org/packages/6b/f4/927e3a8899e52a27fa57a48607ff7dc91a9ebe97399b357b85a0c7892e00/cffi-1.17.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a45e3c6913c5b87b3ff120dcdc03f6131fa0065027d0ed7ee6190736a74cd401", size = 182264 }, + { url = "https://files.pythonhosted.org/packages/6c/f5/6c3a8efe5f503175aaddcbea6ad0d2c96dad6f5abb205750d1b3df44ef29/cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30c5e0cb5ae493c04c8b42916e52ca38079f1b235c2f8ae5f4527b963c401caf", size = 178651 }, + { url = "https://files.pythonhosted.org/packages/94/dd/a3f0118e688d1b1a57553da23b16bdade96d2f9bcda4d32e7d2838047ff7/cffi-1.17.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f75c7ab1f9e4aca5414ed4d8e5c0e303a34f4421f8a0d47a4d019ceff0ab6af4", size = 445259 }, + { url = "https://files.pythonhosted.org/packages/2e/ea/70ce63780f096e16ce8588efe039d3c4f91deb1dc01e9c73a287939c79a6/cffi-1.17.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1ed2dd2972641495a3ec98445e09766f077aee98a1c896dcb4ad0d303628e41", size = 469200 }, + { url = "https://files.pythonhosted.org/packages/1c/a0/a4fa9f4f781bda074c3ddd57a572b060fa0df7655d2a4247bbe277200146/cffi-1.17.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:46bf43160c1a35f7ec506d254e5c890f3c03648a4dbac12d624e4490a7046cd1", size = 477235 }, + { url = "https://files.pythonhosted.org/packages/62/12/ce8710b5b8affbcdd5c6e367217c242524ad17a02fe5beec3ee339f69f85/cffi-1.17.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a24ed04c8ffd54b0729c07cee15a81d964e6fee0e3d4d342a27b020d22959dc6", size = 459721 }, + { url = "https://files.pythonhosted.org/packages/ff/6b/d45873c5e0242196f042d555526f92aa9e0c32355a1be1ff8c27f077fd37/cffi-1.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:610faea79c43e44c71e1ec53a554553fa22321b65fae24889706c0a84d4ad86d", size = 467242 }, + { url = "https://files.pythonhosted.org/packages/1a/52/d9a0e523a572fbccf2955f5abe883cfa8bcc570d7faeee06336fbd50c9fc/cffi-1.17.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a9b15d491f3ad5d692e11f6b71f7857e7835eb677955c00cc0aefcd0669adaf6", size = 477999 }, + { url = "https://files.pythonhosted.org/packages/44/74/f2a2460684a1a2d00ca799ad880d54652841a780c4c97b87754f660c7603/cffi-1.17.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:de2ea4b5833625383e464549fec1bc395c1bdeeb5f25c4a3a82b5a8c756ec22f", size = 454242 }, + { url = 
"https://files.pythonhosted.org/packages/f8/4a/34599cac7dfcd888ff54e801afe06a19c17787dfd94495ab0c8d35fe99fb/cffi-1.17.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fc48c783f9c87e60831201f2cce7f3b2e4846bf4d8728eabe54d60700b318a0b", size = 478604 }, + { url = "https://files.pythonhosted.org/packages/34/33/e1b8a1ba29025adbdcda5fb3a36f94c03d771c1b7b12f726ff7fef2ebe36/cffi-1.17.1-cp311-cp311-win32.whl", hash = "sha256:85a950a4ac9c359340d5963966e3e0a94a676bd6245a4b55bc43949eee26a655", size = 171727 }, + { url = "https://files.pythonhosted.org/packages/3d/97/50228be003bb2802627d28ec0627837ac0bf35c90cf769812056f235b2d1/cffi-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:caaf0640ef5f5517f49bc275eca1406b0ffa6aa184892812030f04c2abf589a0", size = 181400 }, + { url = "https://files.pythonhosted.org/packages/5a/84/e94227139ee5fb4d600a7a4927f322e1d4aea6fdc50bd3fca8493caba23f/cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4", size = 183178 }, + { url = "https://files.pythonhosted.org/packages/da/ee/fb72c2b48656111c4ef27f0f91da355e130a923473bf5ee75c5643d00cca/cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c", size = 178840 }, + { url = "https://files.pythonhosted.org/packages/cc/b6/db007700f67d151abadf508cbfd6a1884f57eab90b1bb985c4c8c02b0f28/cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36", size = 454803 }, + { url = "https://files.pythonhosted.org/packages/1a/df/f8d151540d8c200eb1c6fba8cd0dfd40904f1b0682ea705c36e6c2e97ab3/cffi-1.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5", size = 478850 }, + { url = "https://files.pythonhosted.org/packages/28/c0/b31116332a547fd2677ae5b78a2ef662dfc8023d67f41b2a83f7c2aa78b1/cffi-1.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff", size = 485729 }, + { url = "https://files.pythonhosted.org/packages/91/2b/9a1ddfa5c7f13cab007a2c9cc295b70fbbda7cb10a286aa6810338e60ea1/cffi-1.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99", size = 471256 }, + { url = "https://files.pythonhosted.org/packages/b2/d5/da47df7004cb17e4955df6a43d14b3b4ae77737dff8bf7f8f333196717bf/cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93", size = 479424 }, + { url = "https://files.pythonhosted.org/packages/0b/ac/2a28bcf513e93a219c8a4e8e125534f4f6db03e3179ba1c45e949b76212c/cffi-1.17.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3", size = 484568 }, + { url = "https://files.pythonhosted.org/packages/d4/38/ca8a4f639065f14ae0f1d9751e70447a261f1a30fa7547a828ae08142465/cffi-1.17.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8", size = 488736 }, + { url = "https://files.pythonhosted.org/packages/86/c5/28b2d6f799ec0bdecf44dced2ec5ed43e0eb63097b0f58c293583b406582/cffi-1.17.1-cp312-cp312-win32.whl", hash = 
"sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65", size = 172448 }, + { url = "https://files.pythonhosted.org/packages/50/b9/db34c4755a7bd1cb2d1603ac3863f22bcecbd1ba29e5ee841a4bc510b294/cffi-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903", size = 181976 }, + { url = "https://files.pythonhosted.org/packages/8d/f8/dd6c246b148639254dad4d6803eb6a54e8c85c6e11ec9df2cffa87571dbe/cffi-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e", size = 182989 }, + { url = "https://files.pythonhosted.org/packages/8b/f1/672d303ddf17c24fc83afd712316fda78dc6fce1cd53011b839483e1ecc8/cffi-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2", size = 178802 }, + { url = "https://files.pythonhosted.org/packages/0e/2d/eab2e858a91fdff70533cab61dcff4a1f55ec60425832ddfdc9cd36bc8af/cffi-1.17.1-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3", size = 454792 }, + { url = "https://files.pythonhosted.org/packages/75/b2/fbaec7c4455c604e29388d55599b99ebcc250a60050610fadde58932b7ee/cffi-1.17.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683", size = 478893 }, + { url = "https://files.pythonhosted.org/packages/4f/b7/6e4a2162178bf1935c336d4da8a9352cccab4d3a5d7914065490f08c0690/cffi-1.17.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5", size = 485810 }, + { url = "https://files.pythonhosted.org/packages/c7/8a/1d0e4a9c26e54746dc08c2c6c037889124d4f59dffd853a659fa545f1b40/cffi-1.17.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4", size = 471200 }, + { url = "https://files.pythonhosted.org/packages/26/9f/1aab65a6c0db35f43c4d1b4f580e8df53914310afc10ae0397d29d697af4/cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd", size = 479447 }, + { url = "https://files.pythonhosted.org/packages/5f/e4/fb8b3dd8dc0e98edf1135ff067ae070bb32ef9d509d6cb0f538cd6f7483f/cffi-1.17.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed", size = 484358 }, + { url = "https://files.pythonhosted.org/packages/f1/47/d7145bf2dc04684935d57d67dff9d6d795b2ba2796806bb109864be3a151/cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9", size = 488469 }, + { url = "https://files.pythonhosted.org/packages/bf/ee/f94057fa6426481d663b88637a9a10e859e492c73d0384514a17d78ee205/cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d", size = 172475 }, + { url = "https://files.pythonhosted.org/packages/7c/fc/6a8cb64e5f0324877d503c854da15d76c1e50eb722e320b15345c4d0c6de/cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a", size = 182009 }, + { url = 
"https://files.pythonhosted.org/packages/48/08/15bf6b43ae9bd06f6b00ad8a91f5a8fe1069d4c9fab550a866755402724e/cffi-1.17.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:636062ea65bd0195bc012fea9321aca499c0504409f413dc88af450b57ffd03b", size = 182457 }, + { url = "https://files.pythonhosted.org/packages/c2/5b/f1523dd545f92f7df468e5f653ffa4df30ac222f3c884e51e139878f1cb5/cffi-1.17.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7eac2ef9b63c79431bc4b25f1cd649d7f061a28808cbc6c47b534bd789ef964", size = 425932 }, + { url = "https://files.pythonhosted.org/packages/53/93/7e547ab4105969cc8c93b38a667b82a835dd2cc78f3a7dad6130cfd41e1d/cffi-1.17.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e221cf152cff04059d011ee126477f0d9588303eb57e88923578ace7baad17f9", size = 448585 }, + { url = "https://files.pythonhosted.org/packages/56/c4/a308f2c332006206bb511de219efeff090e9d63529ba0a77aae72e82248b/cffi-1.17.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31000ec67d4221a71bd3f67df918b1f88f676f1c3b535a7eb473255fdc0b83fc", size = 456268 }, + { url = "https://files.pythonhosted.org/packages/ca/5b/b63681518265f2f4060d2b60755c1c77ec89e5e045fc3773b72735ddaad5/cffi-1.17.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f17be4345073b0a7b8ea599688f692ac3ef23ce28e5df79c04de519dbc4912c", size = 436592 }, + { url = "https://files.pythonhosted.org/packages/bb/19/b51af9f4a4faa4a8ac5a0e5d5c2522dcd9703d07fac69da34a36c4d960d3/cffi-1.17.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2b1fac190ae3ebfe37b979cc1ce69c81f4e4fe5746bb401dca63a9062cdaf1", size = 446512 }, + { url = "https://files.pythonhosted.org/packages/e2/63/2bed8323890cb613bbecda807688a31ed11a7fe7afe31f8faaae0206a9a3/cffi-1.17.1-cp38-cp38-win32.whl", hash = "sha256:7596d6620d3fa590f677e9ee430df2958d2d6d6de2feeae5b20e82c00b76fbf8", size = 171576 }, + { url = "https://files.pythonhosted.org/packages/2f/70/80c33b044ebc79527447fd4fbc5455d514c3bb840dede4455de97da39b4d/cffi-1.17.1-cp38-cp38-win_amd64.whl", hash = "sha256:78122be759c3f8a014ce010908ae03364d00a1f81ab5c7f4a7a5120607ea56e1", size = 181229 }, + { url = "https://files.pythonhosted.org/packages/b9/ea/8bb50596b8ffbc49ddd7a1ad305035daa770202a6b782fc164647c2673ad/cffi-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16", size = 182220 }, + { url = "https://files.pythonhosted.org/packages/ae/11/e77c8cd24f58285a82c23af484cf5b124a376b32644e445960d1a4654c3a/cffi-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36", size = 178605 }, + { url = "https://files.pythonhosted.org/packages/ed/65/25a8dc32c53bf5b7b6c2686b42ae2ad58743f7ff644844af7cdb29b49361/cffi-1.17.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8", size = 424910 }, + { url = "https://files.pythonhosted.org/packages/42/7a/9d086fab7c66bd7c4d0f27c57a1b6b068ced810afc498cc8c49e0088661c/cffi-1.17.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca74b8dbe6e8e8263c0ffd60277de77dcee6c837a3d0881d8c1ead7268c9e576", size = 447200 }, + { url = 
"https://files.pythonhosted.org/packages/da/63/1785ced118ce92a993b0ec9e0d0ac8dc3e5dbfbcaa81135be56c69cabbb6/cffi-1.17.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7f5baafcc48261359e14bcd6d9bff6d4b28d9103847c9e136694cb0501aef87", size = 454565 }, + { url = "https://files.pythonhosted.org/packages/74/06/90b8a44abf3556599cdec107f7290277ae8901a58f75e6fe8f970cd72418/cffi-1.17.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98e3969bcff97cae1b2def8ba499ea3d6f31ddfdb7635374834cf89a1a08ecf0", size = 435635 }, + { url = "https://files.pythonhosted.org/packages/bd/62/a1f468e5708a70b1d86ead5bab5520861d9c7eacce4a885ded9faa7729c3/cffi-1.17.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdf5ce3acdfd1661132f2a9c19cac174758dc2352bfe37d98aa7512c6b7178b3", size = 445218 }, + { url = "https://files.pythonhosted.org/packages/5b/95/b34462f3ccb09c2594aa782d90a90b045de4ff1f70148ee79c69d37a0a5a/cffi-1.17.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9755e4345d1ec879e3849e62222a18c7174d65a6a92d5b346b1863912168b595", size = 460486 }, + { url = "https://files.pythonhosted.org/packages/fc/fc/a1e4bebd8d680febd29cf6c8a40067182b64f00c7d105f8f26b5bc54317b/cffi-1.17.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f1e22e8c4419538cb197e4dd60acc919d7696e5ef98ee4da4e01d3f8cfa4cc5a", size = 437911 }, + { url = "https://files.pythonhosted.org/packages/e6/c3/21cab7a6154b6a5ea330ae80de386e7665254835b9e98ecc1340b3a7de9a/cffi-1.17.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c03e868a0b3bc35839ba98e74211ed2b05d2119be4e8a0f224fba9384f1fe02e", size = 460632 }, + { url = "https://files.pythonhosted.org/packages/cb/b5/fd9f8b5a84010ca169ee49f4e4ad6f8c05f4e3545b72ee041dbbcb159882/cffi-1.17.1-cp39-cp39-win32.whl", hash = "sha256:e31ae45bc2e29f6b2abd0de1cc3b9d5205aa847cafaecb8af1476a609a2f6eb7", size = 171820 }, + { url = "https://files.pythonhosted.org/packages/8c/52/b08750ce0bce45c143e1b5d7357ee8c55341b52bdef4b0f081af1eb248c2/cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662", size = 181290 }, +] + [[package]] name = "charset-normalizer" version = "3.4.1" @@ -235,9 +312,46 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, ] +[[package]] +name = "cryptography" +version = "44.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi", marker = "platform_python_implementation != 'PyPy'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/91/4c/45dfa6829acffa344e3967d6006ee4ae8be57af746ae2eba1c431949b32c/cryptography-44.0.0.tar.gz", hash = "sha256:cd4e834f340b4293430701e772ec543b0fbe6c2dea510a5286fe0acabe153a02", size = 710657 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/55/09/8cc67f9b84730ad330b3b72cf867150744bf07ff113cda21a15a1c6d2c7c/cryptography-44.0.0-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:84111ad4ff3f6253820e6d3e58be2cc2a00adb29335d4cacb5ab4d4d34f2a123", size = 6541833 }, + { url = "https://files.pythonhosted.org/packages/7e/5b/3759e30a103144e29632e7cb72aec28cedc79e514b2ea8896bb17163c19b/cryptography-44.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b15492a11f9e1b62ba9d73c210e2416724633167de94607ec6069ef724fad092", size = 3922710 }, + { 
url = "https://files.pythonhosted.org/packages/5f/58/3b14bf39f1a0cfd679e753e8647ada56cddbf5acebffe7db90e184c76168/cryptography-44.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:831c3c4d0774e488fdc83a1923b49b9957d33287de923d58ebd3cec47a0ae43f", size = 4137546 },
+ { url = "https://files.pythonhosted.org/packages/98/65/13d9e76ca19b0ba5603d71ac8424b5694415b348e719db277b5edc985ff5/cryptography-44.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:761817a3377ef15ac23cd7834715081791d4ec77f9297ee694ca1ee9c2c7e5eb", size = 3915420 },
+ { url = "https://files.pythonhosted.org/packages/b1/07/40fe09ce96b91fc9276a9ad272832ead0fddedcba87f1190372af8e3039c/cryptography-44.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3c672a53c0fb4725a29c303be906d3c1fa99c32f58abe008a82705f9ee96f40b", size = 4154498 },
+ { url = "https://files.pythonhosted.org/packages/75/ea/af65619c800ec0a7e4034207aec543acdf248d9bffba0533342d1bd435e1/cryptography-44.0.0-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:4ac4c9f37eba52cb6fbeaf5b59c152ea976726b865bd4cf87883a7e7006cc543", size = 3932569 },
+ { url = "https://files.pythonhosted.org/packages/c7/af/d1deb0c04d59612e3d5e54203159e284d3e7a6921e565bb0eeb6269bdd8a/cryptography-44.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ed3534eb1090483c96178fcb0f8893719d96d5274dfde98aa6add34614e97c8e", size = 4016721 },
+ { url = "https://files.pythonhosted.org/packages/bd/69/7ca326c55698d0688db867795134bdfac87136b80ef373aaa42b225d6dd5/cryptography-44.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f3f6fdfa89ee2d9d496e2c087cebef9d4fcbb0ad63c40e821b39f74bf48d9c5e", size = 4240915 },
+ { url = "https://files.pythonhosted.org/packages/ef/d4/cae11bf68c0f981e0413906c6dd03ae7fa864347ed5fac40021df1ef467c/cryptography-44.0.0-cp37-abi3-win32.whl", hash = "sha256:eb33480f1bad5b78233b0ad3e1b0be21e8ef1da745d8d2aecbb20671658b9053", size = 2757925 },
+ { url = "https://files.pythonhosted.org/packages/64/b1/50d7739254d2002acae64eed4fc43b24ac0cc44bf0a0d388d1ca06ec5bb1/cryptography-44.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:abc998e0c0eee3c8a1904221d3f67dcfa76422b23620173e28c11d3e626c21bd", size = 3202055 },
+ { url = "https://files.pythonhosted.org/packages/11/18/61e52a3d28fc1514a43b0ac291177acd1b4de00e9301aaf7ef867076ff8a/cryptography-44.0.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:660cb7312a08bc38be15b696462fa7cc7cd85c3ed9c576e81f4dc4d8b2b31591", size = 6542801 },
+ { url = "https://files.pythonhosted.org/packages/1a/07/5f165b6c65696ef75601b781a280fc3b33f1e0cd6aa5a92d9fb96c410e97/cryptography-44.0.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1923cb251c04be85eec9fda837661c67c1049063305d6be5721643c22dd4e2b7", size = 3922613 },
+ { url = "https://files.pythonhosted.org/packages/28/34/6b3ac1d80fc174812486561cf25194338151780f27e438526f9c64e16869/cryptography-44.0.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:404fdc66ee5f83a1388be54300ae978b2efd538018de18556dde92575e05defc", size = 4137925 },
+ { url = "https://files.pythonhosted.org/packages/d0/c7/c656eb08fd22255d21bc3129625ed9cd5ee305f33752ef2278711b3fa98b/cryptography-44.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c5eb858beed7835e5ad1faba59e865109f3e52b3783b9ac21e7e47dc5554e289", size = 3915417 },
+ { url = "https://files.pythonhosted.org/packages/ef/82/72403624f197af0db6bac4e58153bc9ac0e6020e57234115db9596eee85d/cryptography-44.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f53c2c87e0fb4b0c00fa9571082a057e37690a8f12233306161c8f4b819960b7", size = 4155160 },
+ { url = "https://files.pythonhosted.org/packages/a2/cd/2f3c440913d4329ade49b146d74f2e9766422e1732613f57097fea61f344/cryptography-44.0.0-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:9e6fc8a08e116fb7c7dd1f040074c9d7b51d74a8ea40d4df2fc7aa08b76b9e6c", size = 3932331 },
+ { url = "https://files.pythonhosted.org/packages/7f/df/8be88797f0a1cca6e255189a57bb49237402b1880d6e8721690c5603ac23/cryptography-44.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:d2436114e46b36d00f8b72ff57e598978b37399d2786fd39793c36c6d5cb1c64", size = 4017372 },
+ { url = "https://files.pythonhosted.org/packages/af/36/5ccc376f025a834e72b8e52e18746b927f34e4520487098e283a719c205e/cryptography-44.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a01956ddfa0a6790d594f5b34fc1bfa6098aca434696a03cfdbe469b8ed79285", size = 4239657 },
+ { url = "https://files.pythonhosted.org/packages/46/b0/f4f7d0d0bcfbc8dd6296c1449be326d04217c57afb8b2594f017eed95533/cryptography-44.0.0-cp39-abi3-win32.whl", hash = "sha256:eca27345e1214d1b9f9490d200f9db5a874479be914199194e746c893788d417", size = 2758672 },
+ { url = "https://files.pythonhosted.org/packages/97/9b/443270b9210f13f6ef240eff73fd32e02d381e7103969dc66ce8e89ee901/cryptography-44.0.0-cp39-abi3-win_amd64.whl", hash = "sha256:708ee5f1bafe76d041b53a4f95eb28cdeb8d18da17e597d46d7833ee59b97ede", size = 3202071 },
+ { url = "https://files.pythonhosted.org/packages/77/d4/fea74422326388bbac0c37b7489a0fcb1681a698c3b875959430ba550daa/cryptography-44.0.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:37d76e6863da3774cd9db5b409a9ecfd2c71c981c38788d3fcfaf177f447b731", size = 3338857 },
+ { url = "https://files.pythonhosted.org/packages/1a/aa/ba8a7467c206cb7b62f09b4168da541b5109838627f582843bbbe0235e8e/cryptography-44.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:f677e1268c4e23420c3acade68fac427fffcb8d19d7df95ed7ad17cdef8404f4", size = 3850615 },
+ { url = "https://files.pythonhosted.org/packages/89/fa/b160e10a64cc395d090105be14f399b94e617c879efd401188ce0fea39ee/cryptography-44.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f5e7cb1e5e56ca0933b4873c0220a78b773b24d40d186b6738080b73d3d0a756", size = 4081622 },
+ { url = "https://files.pythonhosted.org/packages/47/8f/20ff0656bb0cf7af26ec1d01f780c5cfbaa7666736063378c5f48558b515/cryptography-44.0.0-pp310-pypy310_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:8b3e6eae66cf54701ee7d9c83c30ac0a1e3fa17be486033000f2a73a12ab507c", size = 3867546 },
+ { url = "https://files.pythonhosted.org/packages/38/d9/28edf32ee2fcdca587146bcde90102a7319b2f2c690edfa627e46d586050/cryptography-44.0.0-pp310-pypy310_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:be4ce505894d15d5c5037167ffb7f0ae90b7be6f2a98f9a5c3442395501c32fa", size = 4090937 },
+ { url = "https://files.pythonhosted.org/packages/cc/9d/37e5da7519de7b0b070a3fedd4230fe76d50d2a21403e0f2153d70ac4163/cryptography-44.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:62901fb618f74d7d81bf408c8719e9ec14d863086efe4185afd07c352aee1d2c", size = 3128774 },
+]
+
[[package]]
name = "datafusion"
-version = "43.0.0"
+version = "44.0.0"
source = { editable = "." }
dependencies = [
{ name = "pyarrow", version = "17.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" },
@@ -249,6 +363,7 @@ dependencies = [
dev = [
{ name = "maturin" },
{ name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
+ { name = "pygithub" },
{ name = "pytest" },
{ name = "ruff" },
{ name = "toml" },
@@ -282,6 +397,7 @@ requires-dist = [
dev = [
{ name = "maturin", specifier = ">=1.8.1" },
{ name = "numpy", marker = "python_full_version >= '3.10'", specifier = ">1.24.4" },
+ { name = "pygithub", specifier = "==2.5.0" },
{ name = "pytest", specifier = ">=7.4.4" },
{ name = "ruff", specifier = ">=0.9.1" },
{ name = "toml", specifier = ">=0.10.2" },
@@ -307,6 +423,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186", size = 9073 },
]

+[[package]]
+name = "deprecated"
+version = "1.2.18"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "wrapt" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/98/97/06afe62762c9a8a86af0cfb7bfdab22a43ad17138b07af5b1a58442690a2/deprecated-1.2.18.tar.gz", hash = "sha256:422b6f6d859da6f2ef57857761bfb392480502a64c3028ca9bbe86085d72115d", size = 2928744 }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl", hash = "sha256:bd5011788200372a32418f888e326a09ff80d0214bd961147cfed01b5c018eec", size = 9998 },
+]
+
[[package]]
name = "docutils"
version = "0.20.1"
@@ -1189,6 +1317,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/87/d8/94161a7ca5c55199484e926165e9e33f318ea1d1b0d7cdbcbc3652b933ec/pyarrow-18.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:a1880dd6772b685e803011a6b43a230c23b566859a6e0c9a276c1e0faf4f4052", size = 25301373 },
]

+[[package]]
+name = "pycparser"
+version = "2.22"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/1d/b2/31537cf4b1ca988837256c910a668b553fceb8f069bedc4b1c826024b52c/pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6", size = 172736 }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/13/a3/a812df4e2dd5696d1f351d58b8fe16a405b234ad2886a0dab9183fb78109/pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc", size = 117552 },
+]
+
[[package]]
name = "pydata-sphinx-theme"
version = "0.8.0"
@@ -1206,6 +1343,25 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/91/26/0694318d46c7d90ab602ae27b24431e939f1600f9a4c69d1e727ec57289f/pydata_sphinx_theme-0.8.0-py3-none-any.whl", hash = "sha256:fbcbb833a07d3ad8dd997dd40dc94da18d98b41c68123ab0182b58fe92271204", size = 3284997 },
]

+[[package]]
+name = "pygithub"
+version = "2.5.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "deprecated" },
+ { name = "pyjwt", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, extra = ["crypto"], marker = "python_full_version < '3.9'" },
+ { name = "pyjwt", version = "2.10.1", source = { registry = "https://pypi.org/simple" }, extra = ["crypto"], marker = "python_full_version >= '3.9'" },
+ { name = "pynacl" },
+ { name = "requests" },
+ { name = "typing-extensions" },
+ { name = "urllib3", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" },
+ { name = "urllib3", version = "2.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/16/ce/aa91d30040d9552c274e7ea8bd10a977600d508d579a4bb262b95eccf961/pygithub-2.5.0.tar.gz", hash = "sha256:e1613ac508a9be710920d26eb18b1905ebd9926aa49398e88151c1b526aad3cf", size = 3552804 }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/37/05/bfbdbbc5d8aafd8dae9b3b6877edca561fccd8528ef5edc4e7b6d23721b5/PyGithub-2.5.0-py3-none-any.whl", hash = "sha256:b0b635999a658ab8e08720bdd3318893ff20e2275f6446fcf35bf3f44f2c0fd2", size = 375935 },
+]
+
[[package]]
name = "pygments"
version = "2.19.1"
@@ -1215,6 +1371,63 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 },
]

+[[package]]
+name = "pyjwt"
+version = "2.9.0"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+ "python_full_version < '3.9'",
+]
+sdist = { url = "https://files.pythonhosted.org/packages/fb/68/ce067f09fca4abeca8771fe667d89cc347d1e99da3e093112ac329c6020e/pyjwt-2.9.0.tar.gz", hash = "sha256:7e1e5b56cc735432a7369cbfa0efe50fa113ebecdc04ae6922deba8b84582d0c", size = 78825 }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/79/84/0fdf9b18ba31d69877bd39c9cd6052b47f3761e9910c15de788e519f079f/PyJWT-2.9.0-py3-none-any.whl", hash = "sha256:3b02fb0f44517787776cf48f2ae25d8e14f300e6d7545a4315cee571a415e850", size = 22344 },
+]
+
+[package.optional-dependencies]
+crypto = [
+ { name = "cryptography", marker = "python_full_version < '3.9'" },
+]
+
+[[package]]
+name = "pyjwt"
+version = "2.10.1"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+ "python_full_version >= '3.12'",
+ "python_full_version == '3.11.*'",
+ "python_full_version == '3.10.*'",
+ "python_full_version == '3.9.*'",
+]
+sdist = { url = "https://files.pythonhosted.org/packages/e7/46/bd74733ff231675599650d3e47f361794b22ef3e3770998dda30d3b63726/pyjwt-2.10.1.tar.gz", hash = "sha256:3cc5772eb20009233caf06e9d8a0577824723b44e6648ee0a2aedb6cf9381953", size = 87785 }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/61/ad/689f02752eeec26aed679477e80e632ef1b682313be70793d798c1d5fc8f/PyJWT-2.10.1-py3-none-any.whl", hash = "sha256:dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb", size = 22997 },
+]
+
+[package.optional-dependencies]
+crypto = [
+ { name = "cryptography", marker = "python_full_version >= '3.9'" },
+]
+
+[[package]]
+name = "pynacl"
+version = "1.5.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "cffi" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a7/22/27582568be639dfe22ddb3902225f91f2f17ceff88ce80e4db396c8986da/PyNaCl-1.5.0.tar.gz", hash = "sha256:8ac7448f09ab85811607bdd21ec2464495ac8b7c66d146bf545b0f08fb9220ba", size = 3392854 }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/ce/75/0b8ede18506041c0bf23ac4d8e2971b4161cd6ce630b177d0a08eb0d8857/PyNaCl-1.5.0-cp36-abi3-macosx_10_10_universal2.whl", hash = "sha256:401002a4aaa07c9414132aaed7f6836ff98f59277a234704ff66878c2ee4a0d1", size = 349920 },
+ { url = "https://files.pythonhosted.org/packages/59/bb/fddf10acd09637327a97ef89d2a9d621328850a72f1fdc8c08bdf72e385f/PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:52cb72a79269189d4e0dc537556f4740f7f0a9ec41c1322598799b0bdad4ef92", size = 601722 },
+ { url = "https://files.pythonhosted.org/packages/5d/70/87a065c37cca41a75f2ce113a5a2c2aa7533be648b184ade58971b5f7ccc/PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a36d4a9dda1f19ce6e03c9a784a2921a4b726b02e1c736600ca9c22029474394", size = 680087 },
+ { url = "https://files.pythonhosted.org/packages/ee/87/f1bb6a595f14a327e8285b9eb54d41fef76c585a0edef0a45f6fc95de125/PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:0c84947a22519e013607c9be43706dd42513f9e6ae5d39d3613ca1e142fba44d", size = 856678 },
+ { url = "https://files.pythonhosted.org/packages/66/28/ca86676b69bf9f90e710571b67450508484388bfce09acf8a46f0b8c785f/PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06b8f6fa7f5de8d5d2f7573fe8c863c051225a27b61e6860fd047b1775807858", size = 1133660 },
+ { url = "https://files.pythonhosted.org/packages/3d/85/c262db650e86812585e2bc59e497a8f59948a005325a11bbbc9ecd3fe26b/PyNaCl-1.5.0-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:a422368fc821589c228f4c49438a368831cb5bbc0eab5ebe1d7fac9dded6567b", size = 663824 },
+ { url = "https://files.pythonhosted.org/packages/fd/1a/cc308a884bd299b651f1633acb978e8596c71c33ca85e9dc9fa33a5399b9/PyNaCl-1.5.0-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:61f642bf2378713e2c2e1de73444a3778e5f0a38be6fee0fe532fe30060282ff", size = 1117912 },
+ { url = "https://files.pythonhosted.org/packages/25/2d/b7df6ddb0c2a33afdb358f8af6ea3b8c4d1196ca45497dd37a56f0c122be/PyNaCl-1.5.0-cp36-abi3-win32.whl", hash = "sha256:e46dae94e34b085175f8abb3b0aaa7da40767865ac82c928eeb9e57e1ea8a543", size = 204624 },
+ { url = "https://files.pythonhosted.org/packages/5e/22/d3db169895faaf3e2eda892f005f433a62db2decbcfbc2f61e6517adfa87/PyNaCl-1.5.0-cp36-abi3-win_amd64.whl", hash = "sha256:20f42270d27e1b6a29f54032090b972d97f0a1b0948cc52392041ef7831fee93", size = 212141 },
+]
+
[[package]]
name = "pytest"
version = "8.3.4"
@@ -1817,6 +2030,92 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859", size = 34166 },
]

+[[package]]
+name = "wrapt"
+version = "1.17.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/c3/fc/e91cc220803d7bc4db93fb02facd8461c37364151b8494762cc88b0fbcef/wrapt-1.17.2.tar.gz", hash = "sha256:41388e9d4d1522446fe79d3213196bd9e3b301a336965b9e27ca2788ebd122f3", size = 55531 }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/5a/d1/1daec934997e8b160040c78d7b31789f19b122110a75eca3d4e8da0049e1/wrapt-1.17.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3d57c572081fed831ad2d26fd430d565b76aa277ed1d30ff4d40670b1c0dd984", size = 53307 },
+ { url = "https://files.pythonhosted.org/packages/1b/7b/13369d42651b809389c1a7153baa01d9700430576c81a2f5c5e460df0ed9/wrapt-1.17.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b5e251054542ae57ac7f3fba5d10bfff615b6c2fb09abeb37d2f1463f841ae22", size = 38486 },
+ { url = "https://files.pythonhosted.org/packages/62/bf/e0105016f907c30b4bd9e377867c48c34dc9c6c0c104556c9c9126bd89ed/wrapt-1.17.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:80dd7db6a7cb57ffbc279c4394246414ec99537ae81ffd702443335a61dbf3a7", size = 38777 },
+ { url = "https://files.pythonhosted.org/packages/27/70/0f6e0679845cbf8b165e027d43402a55494779295c4b08414097b258ac87/wrapt-1.17.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a6e821770cf99cc586d33833b2ff32faebdbe886bd6322395606cf55153246c", size = 83314 },
+ { url = "https://files.pythonhosted.org/packages/0f/77/0576d841bf84af8579124a93d216f55d6f74374e4445264cb378a6ed33eb/wrapt-1.17.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b60fb58b90c6d63779cb0c0c54eeb38941bae3ecf7a73c764c52c88c2dcb9d72", size = 74947 },
+ { url = "https://files.pythonhosted.org/packages/90/ec/00759565518f268ed707dcc40f7eeec38637d46b098a1f5143bff488fe97/wrapt-1.17.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b870b5df5b71d8c3359d21be8f0d6c485fa0ebdb6477dda51a1ea54a9b558061", size = 82778 },
+ { url = "https://files.pythonhosted.org/packages/f8/5a/7cffd26b1c607b0b0c8a9ca9d75757ad7620c9c0a9b4a25d3f8a1480fafc/wrapt-1.17.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4011d137b9955791f9084749cba9a367c68d50ab8d11d64c50ba1688c9b457f2", size = 81716 },
+ { url = "https://files.pythonhosted.org/packages/7e/09/dccf68fa98e862df7e6a60a61d43d644b7d095a5fc36dbb591bbd4a1c7b2/wrapt-1.17.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:1473400e5b2733e58b396a04eb7f35f541e1fb976d0c0724d0223dd607e0f74c", size = 74548 },
+ { url = "https://files.pythonhosted.org/packages/b7/8e/067021fa3c8814952c5e228d916963c1115b983e21393289de15128e867e/wrapt-1.17.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3cedbfa9c940fdad3e6e941db7138e26ce8aad38ab5fe9dcfadfed9db7a54e62", size = 81334 },
+ { url = "https://files.pythonhosted.org/packages/4b/0d/9d4b5219ae4393f718699ca1c05f5ebc0c40d076f7e65fd48f5f693294fb/wrapt-1.17.2-cp310-cp310-win32.whl", hash = "sha256:582530701bff1dec6779efa00c516496968edd851fba224fbd86e46cc6b73563", size = 36427 },
+ { url = "https://files.pythonhosted.org/packages/72/6a/c5a83e8f61aec1e1aeef939807602fb880e5872371e95df2137142f5c58e/wrapt-1.17.2-cp310-cp310-win_amd64.whl", hash = "sha256:58705da316756681ad3c9c73fd15499aa4d8c69f9fd38dc8a35e06c12468582f", size = 38774 },
+ { url = "https://files.pythonhosted.org/packages/cd/f7/a2aab2cbc7a665efab072344a8949a71081eed1d2f451f7f7d2b966594a2/wrapt-1.17.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ff04ef6eec3eee8a5efef2401495967a916feaa353643defcc03fc74fe213b58", size = 53308 },
+ { url = "https://files.pythonhosted.org/packages/50/ff/149aba8365fdacef52b31a258c4dc1c57c79759c335eff0b3316a2664a64/wrapt-1.17.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4db983e7bca53819efdbd64590ee96c9213894272c776966ca6306b73e4affda", size = 38488 },
+ { url = "https://files.pythonhosted.org/packages/65/46/5a917ce85b5c3b490d35c02bf71aedaa9f2f63f2d15d9949cc4ba56e8ba9/wrapt-1.17.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9abc77a4ce4c6f2a3168ff34b1da9b0f311a8f1cfd694ec96b0603dff1c79438", size = 38776 },
+ { url = "https://files.pythonhosted.org/packages/ca/74/336c918d2915a4943501c77566db41d1bd6e9f4dbc317f356b9a244dfe83/wrapt-1.17.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b929ac182f5ace000d459c59c2c9c33047e20e935f8e39371fa6e3b85d56f4a", size = 83776 },
+ { url = "https://files.pythonhosted.org/packages/09/99/c0c844a5ccde0fe5761d4305485297f91d67cf2a1a824c5f282e661ec7ff/wrapt-1.17.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f09b286faeff3c750a879d336fb6d8713206fc97af3adc14def0cdd349df6000", size = 75420 },
+ { url = "https://files.pythonhosted.org/packages/b4/b0/9fc566b0fe08b282c850063591a756057c3247b2362b9286429ec5bf1721/wrapt-1.17.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1a7ed2d9d039bd41e889f6fb9364554052ca21ce823580f6a07c4ec245c1f5d6", size = 83199 },
+ { url = "https://files.pythonhosted.org/packages/9d/4b/71996e62d543b0a0bd95dda485219856def3347e3e9380cc0d6cf10cfb2f/wrapt-1.17.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:129a150f5c445165ff941fc02ee27df65940fcb8a22a61828b1853c98763a64b", size = 82307 },
+ { url = "https://files.pythonhosted.org/packages/39/35/0282c0d8789c0dc9bcc738911776c762a701f95cfe113fb8f0b40e45c2b9/wrapt-1.17.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1fb5699e4464afe5c7e65fa51d4f99e0b2eadcc176e4aa33600a3df7801d6662", size = 75025 },
+ { url = "https://files.pythonhosted.org/packages/4f/6d/90c9fd2c3c6fee181feecb620d95105370198b6b98a0770cba090441a828/wrapt-1.17.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9a2bce789a5ea90e51a02dfcc39e31b7f1e662bc3317979aa7e5538e3a034f72", size = 81879 },
+ { url = "https://files.pythonhosted.org/packages/8f/fa/9fb6e594f2ce03ef03eddbdb5f4f90acb1452221a5351116c7c4708ac865/wrapt-1.17.2-cp311-cp311-win32.whl", hash = "sha256:4afd5814270fdf6380616b321fd31435a462019d834f83c8611a0ce7484c7317", size = 36419 },
+ { url = "https://files.pythonhosted.org/packages/47/f8/fb1773491a253cbc123c5d5dc15c86041f746ed30416535f2a8df1f4a392/wrapt-1.17.2-cp311-cp311-win_amd64.whl", hash = "sha256:acc130bc0375999da18e3d19e5a86403667ac0c4042a094fefb7eec8ebac7cf3", size = 38773 },
+ { url = "https://files.pythonhosted.org/packages/a1/bd/ab55f849fd1f9a58ed7ea47f5559ff09741b25f00c191231f9f059c83949/wrapt-1.17.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:d5e2439eecc762cd85e7bd37161d4714aa03a33c5ba884e26c81559817ca0925", size = 53799 },
+ { url = "https://files.pythonhosted.org/packages/53/18/75ddc64c3f63988f5a1d7e10fb204ffe5762bc663f8023f18ecaf31a332e/wrapt-1.17.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3fc7cb4c1c744f8c05cd5f9438a3caa6ab94ce8344e952d7c45a8ed59dd88392", size = 38821 },
+ { url = "https://files.pythonhosted.org/packages/48/2a/97928387d6ed1c1ebbfd4efc4133a0633546bec8481a2dd5ec961313a1c7/wrapt-1.17.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8fdbdb757d5390f7c675e558fd3186d590973244fab0c5fe63d373ade3e99d40", size = 38919 },
+ { url = "https://files.pythonhosted.org/packages/73/54/3bfe5a1febbbccb7a2f77de47b989c0b85ed3a6a41614b104204a788c20e/wrapt-1.17.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb1d0dbf99411f3d871deb6faa9aabb9d4e744d67dcaaa05399af89d847a91d", size = 88721 },
+ { url = "https://files.pythonhosted.org/packages/25/cb/7262bc1b0300b4b64af50c2720ef958c2c1917525238d661c3e9a2b71b7b/wrapt-1.17.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d18a4865f46b8579d44e4fe1e2bcbc6472ad83d98e22a26c963d46e4c125ef0b", size = 80899 },
+ { url = "https://files.pythonhosted.org/packages/2a/5a/04cde32b07a7431d4ed0553a76fdb7a61270e78c5fd5a603e190ac389f14/wrapt-1.17.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc570b5f14a79734437cb7b0500376b6b791153314986074486e0b0fa8d71d98", size = 89222 },
+ { url = "https://files.pythonhosted.org/packages/09/28/2e45a4f4771fcfb109e244d5dbe54259e970362a311b67a965555ba65026/wrapt-1.17.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6d9187b01bebc3875bac9b087948a2bccefe464a7d8f627cf6e48b1bbae30f82", size = 86707 },
+ { url = "https://files.pythonhosted.org/packages/c6/d2/dcb56bf5f32fcd4bd9aacc77b50a539abdd5b6536872413fd3f428b21bed/wrapt-1.17.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:9e8659775f1adf02eb1e6f109751268e493c73716ca5761f8acb695e52a756ae", size = 79685 },
+ { url = "https://files.pythonhosted.org/packages/80/4e/eb8b353e36711347893f502ce91c770b0b0929f8f0bed2670a6856e667a9/wrapt-1.17.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e8b2816ebef96d83657b56306152a93909a83f23994f4b30ad4573b00bd11bb9", size = 87567 },
+ { url = "https://files.pythonhosted.org/packages/17/27/4fe749a54e7fae6e7146f1c7d914d28ef599dacd4416566c055564080fe2/wrapt-1.17.2-cp312-cp312-win32.whl", hash = "sha256:468090021f391fe0056ad3e807e3d9034e0fd01adcd3bdfba977b6fdf4213ea9", size = 36672 },
+ { url = "https://files.pythonhosted.org/packages/15/06/1dbf478ea45c03e78a6a8c4be4fdc3c3bddea5c8de8a93bc971415e47f0f/wrapt-1.17.2-cp312-cp312-win_amd64.whl", hash = "sha256:ec89ed91f2fa8e3f52ae53cd3cf640d6feff92ba90d62236a81e4e563ac0e991", size = 38865 },
+ { url = "https://files.pythonhosted.org/packages/ce/b9/0ffd557a92f3b11d4c5d5e0c5e4ad057bd9eb8586615cdaf901409920b14/wrapt-1.17.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6ed6ffac43aecfe6d86ec5b74b06a5be33d5bb9243d055141e8cabb12aa08125", size = 53800 },
+ { url = "https://files.pythonhosted.org/packages/c0/ef/8be90a0b7e73c32e550c73cfb2fa09db62234227ece47b0e80a05073b375/wrapt-1.17.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:35621ae4c00e056adb0009f8e86e28eb4a41a4bfa8f9bfa9fca7d343fe94f998", size = 38824 },
+ { url = "https://files.pythonhosted.org/packages/36/89/0aae34c10fe524cce30fe5fc433210376bce94cf74d05b0d68344c8ba46e/wrapt-1.17.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a604bf7a053f8362d27eb9fefd2097f82600b856d5abe996d623babd067b1ab5", size = 38920 },
+ { url = "https://files.pythonhosted.org/packages/3b/24/11c4510de906d77e0cfb5197f1b1445d4fec42c9a39ea853d482698ac681/wrapt-1.17.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5cbabee4f083b6b4cd282f5b817a867cf0b1028c54d445b7ec7cfe6505057cf8", size = 88690 },
+ { url = "https://files.pythonhosted.org/packages/71/d7/cfcf842291267bf455b3e266c0c29dcb675b5540ee8b50ba1699abf3af45/wrapt-1.17.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:49703ce2ddc220df165bd2962f8e03b84c89fee2d65e1c24a7defff6f988f4d6", size = 80861 },
+ { url = "https://files.pythonhosted.org/packages/d5/66/5d973e9f3e7370fd686fb47a9af3319418ed925c27d72ce16b791231576d/wrapt-1.17.2-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8112e52c5822fc4253f3901b676c55ddf288614dc7011634e2719718eaa187dc", size = 89174 },
+ { url = "https://files.pythonhosted.org/packages/a7/d3/8e17bb70f6ae25dabc1aaf990f86824e4fd98ee9cadf197054e068500d27/wrapt-1.17.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9fee687dce376205d9a494e9c121e27183b2a3df18037f89d69bd7b35bcf59e2", size = 86721 },
+ { url = "https://files.pythonhosted.org/packages/6f/54/f170dfb278fe1c30d0ff864513cff526d624ab8de3254b20abb9cffedc24/wrapt-1.17.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:18983c537e04d11cf027fbb60a1e8dfd5190e2b60cc27bc0808e653e7b218d1b", size = 79763 },
+ { url = "https://files.pythonhosted.org/packages/4a/98/de07243751f1c4a9b15c76019250210dd3486ce098c3d80d5f729cba029c/wrapt-1.17.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:703919b1633412ab54bcf920ab388735832fdcb9f9a00ae49387f0fe67dad504", size = 87585 },
+ { url = "https://files.pythonhosted.org/packages/f9/f0/13925f4bd6548013038cdeb11ee2cbd4e37c30f8bfd5db9e5a2a370d6e20/wrapt-1.17.2-cp313-cp313-win32.whl", hash = "sha256:abbb9e76177c35d4e8568e58650aa6926040d6a9f6f03435b7a522bf1c487f9a", size = 36676 },
+ { url = "https://files.pythonhosted.org/packages/bf/ae/743f16ef8c2e3628df3ddfd652b7d4c555d12c84b53f3d8218498f4ade9b/wrapt-1.17.2-cp313-cp313-win_amd64.whl", hash = "sha256:69606d7bb691b50a4240ce6b22ebb319c1cfb164e5f6569835058196e0f3a845", size = 38871 },
+ { url = "https://files.pythonhosted.org/packages/3d/bc/30f903f891a82d402ffb5fda27ec1d621cc97cb74c16fea0b6141f1d4e87/wrapt-1.17.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:4a721d3c943dae44f8e243b380cb645a709ba5bd35d3ad27bc2ed947e9c68192", size = 56312 },
+ { url = "https://files.pythonhosted.org/packages/8a/04/c97273eb491b5f1c918857cd26f314b74fc9b29224521f5b83f872253725/wrapt-1.17.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:766d8bbefcb9e00c3ac3b000d9acc51f1b399513f44d77dfe0eb026ad7c9a19b", size = 40062 },
+ { url = "https://files.pythonhosted.org/packages/4e/ca/3b7afa1eae3a9e7fefe499db9b96813f41828b9fdb016ee836c4c379dadb/wrapt-1.17.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e496a8ce2c256da1eb98bd15803a79bee00fc351f5dfb9ea82594a3f058309e0", size = 40155 },
+ { url = "https://files.pythonhosted.org/packages/89/be/7c1baed43290775cb9030c774bc53c860db140397047cc49aedaf0a15477/wrapt-1.17.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d615e4fe22f4ad3528448c193b218e077656ca9ccb22ce2cb20db730f8d306", size = 113471 },
+ { url = "https://files.pythonhosted.org/packages/32/98/4ed894cf012b6d6aae5f5cc974006bdeb92f0241775addad3f8cd6ab71c8/wrapt-1.17.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a5aaeff38654462bc4b09023918b7f21790efb807f54c000a39d41d69cf552cb", size = 101208 },
+ { url = "https://files.pythonhosted.org/packages/ea/fd/0c30f2301ca94e655e5e057012e83284ce8c545df7661a78d8bfca2fac7a/wrapt-1.17.2-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a7d15bbd2bc99e92e39f49a04653062ee6085c0e18b3b7512a4f2fe91f2d681", size = 109339 },
+ { url = "https://files.pythonhosted.org/packages/75/56/05d000de894c4cfcb84bcd6b1df6214297b8089a7bd324c21a4765e49b14/wrapt-1.17.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e3890b508a23299083e065f435a492b5435eba6e304a7114d2f919d400888cc6", size = 110232 },
+ { url = "https://files.pythonhosted.org/packages/53/f8/c3f6b2cf9b9277fb0813418e1503e68414cd036b3b099c823379c9575e6d/wrapt-1.17.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:8c8b293cd65ad716d13d8dd3624e42e5a19cc2a2f1acc74b30c2c13f15cb61a6", size = 100476 },
+ { url = "https://files.pythonhosted.org/packages/a7/b1/0bb11e29aa5139d90b770ebbfa167267b1fc548d2302c30c8f7572851738/wrapt-1.17.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4c82b8785d98cdd9fed4cac84d765d234ed3251bd6afe34cb7ac523cb93e8b4f", size = 106377 },
+ { url = "https://files.pythonhosted.org/packages/6a/e1/0122853035b40b3f333bbb25f1939fc1045e21dd518f7f0922b60c156f7c/wrapt-1.17.2-cp313-cp313t-win32.whl", hash = "sha256:13e6afb7fe71fe7485a4550a8844cc9ffbe263c0f1a1eea569bc7091d4898555", size = 37986 },
+ { url = "https://files.pythonhosted.org/packages/09/5e/1655cf481e079c1f22d0cabdd4e51733679932718dc23bf2db175f329b76/wrapt-1.17.2-cp313-cp313t-win_amd64.whl", hash = "sha256:eaf675418ed6b3b31c7a989fd007fa7c3be66ce14e5c3b27336383604c9da85c", size = 40750 },
+ { url = "https://files.pythonhosted.org/packages/0c/66/95b9e90e6e1274999b183c9c3f984996d870e933ca9560115bd1cd1d6f77/wrapt-1.17.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5c803c401ea1c1c18de70a06a6f79fcc9c5acfc79133e9869e730ad7f8ad8ef9", size = 53234 },
+ { url = "https://files.pythonhosted.org/packages/a4/b6/6eced5e2db5924bf6d9223d2bb96b62e00395aae77058e6a9e11bf16b3bd/wrapt-1.17.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f917c1180fdb8623c2b75a99192f4025e412597c50b2ac870f156de8fb101119", size = 38462 },
+ { url = "https://files.pythonhosted.org/packages/5d/a4/c8472fe2568978b5532df84273c53ddf713f689d408a4335717ab89547e0/wrapt-1.17.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ecc840861360ba9d176d413a5489b9a0aff6d6303d7e733e2c4623cfa26904a6", size = 38730 },
+ { url = "https://files.pythonhosted.org/packages/3c/70/1d259c6b1ad164eb23ff70e3e452dd1950f96e6473f72b7207891d0fd1f0/wrapt-1.17.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb87745b2e6dc56361bfde481d5a378dc314b252a98d7dd19a651a3fa58f24a9", size = 86225 },
+ { url = "https://files.pythonhosted.org/packages/a9/68/6b83367e1afb8de91cbea4ef8e85b58acdf62f034f05d78c7b82afaa23d8/wrapt-1.17.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:58455b79ec2661c3600e65c0a716955adc2410f7383755d537584b0de41b1d8a", size = 78055 },
+ { url = "https://files.pythonhosted.org/packages/0d/21/09573d2443916705c57fdab85d508f592c0a58d57becc53e15755d67fba2/wrapt-1.17.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4e42a40a5e164cbfdb7b386c966a588b1047558a990981ace551ed7e12ca9c2", size = 85592 },
+ { url = "https://files.pythonhosted.org/packages/45/ce/700e17a852dd5dec894e241c72973ea82363486bcc1fb05d47b4fbd1d683/wrapt-1.17.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:91bd7d1773e64019f9288b7a5101f3ae50d3d8e6b1de7edee9c2ccc1d32f0c0a", size = 83906 },
+ { url = "https://files.pythonhosted.org/packages/37/14/bd210faf0a66faeb8529d42b6b45a25d6aa6ce25ddfc19168e4161aed227/wrapt-1.17.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:bb90fb8bda722a1b9d48ac1e6c38f923ea757b3baf8ebd0c82e09c5c1a0e7a04", size = 76763 },
+ { url = "https://files.pythonhosted.org/packages/34/0c/85af70d291f44659c422416f0272046109e785bf6db8c081cfeeae5715c5/wrapt-1.17.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:08e7ce672e35efa54c5024936e559469436f8b8096253404faeb54d2a878416f", size = 83573 },
+ { url = "https://files.pythonhosted.org/packages/f8/1e/b215068e824878f69ea945804fa26c176f7c2735a3ad5367d78930bd076a/wrapt-1.17.2-cp38-cp38-win32.whl", hash = "sha256:410a92fefd2e0e10d26210e1dfb4a876ddaf8439ef60d6434f21ef8d87efc5b7", size = 36408 },
+ { url = "https://files.pythonhosted.org/packages/52/27/3dd9ad5f1097b33c95d05929e409cc86d7c765cb5437b86694dc8f8e9af0/wrapt-1.17.2-cp38-cp38-win_amd64.whl", hash = "sha256:95c658736ec15602da0ed73f312d410117723914a5c91a14ee4cdd72f1d790b3", size = 38737 },
+ { url = "https://files.pythonhosted.org/packages/8a/f4/6ed2b8f6f1c832933283974839b88ec7c983fd12905e01e97889dadf7559/wrapt-1.17.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:99039fa9e6306880572915728d7f6c24a86ec57b0a83f6b2491e1d8ab0235b9a", size = 53308 },
+ { url = "https://files.pythonhosted.org/packages/a2/a9/712a53f8f4f4545768ac532619f6e56d5d0364a87b2212531685e89aeef8/wrapt-1.17.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2696993ee1eebd20b8e4ee4356483c4cb696066ddc24bd70bcbb80fa56ff9061", size = 38489 },
+ { url = "https://files.pythonhosted.org/packages/fa/9b/e172c8f28a489a2888df18f953e2f6cb8d33b1a2e78c9dfc52d8bf6a5ead/wrapt-1.17.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:612dff5db80beef9e649c6d803a8d50c409082f1fedc9dbcdfde2983b2025b82", size = 38776 },
+ { url = "https://files.pythonhosted.org/packages/cf/cb/7a07b51762dcd59bdbe07aa97f87b3169766cadf240f48d1cbe70a1be9db/wrapt-1.17.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:62c2caa1585c82b3f7a7ab56afef7b3602021d6da34fbc1cf234ff139fed3cd9", size = 83050 },
+ { url = "https://files.pythonhosted.org/packages/a5/51/a42757dd41032afd6d8037617aa3bc6803ba971850733b24dfb7d5c627c4/wrapt-1.17.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c958bcfd59bacc2d0249dcfe575e71da54f9dcf4a8bdf89c4cb9a68a1170d73f", size = 74718 },
+ { url = "https://files.pythonhosted.org/packages/bf/bb/d552bfe47db02fcfc950fc563073a33500f8108efa5f7b41db2f83a59028/wrapt-1.17.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc78a84e2dfbc27afe4b2bd7c80c8db9bca75cc5b85df52bfe634596a1da846b", size = 82590 },
+ { url = "https://files.pythonhosted.org/packages/77/99/77b06b3c3c410dbae411105bf22496facf03a5496bfaca8fbcf9da381889/wrapt-1.17.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:ba0f0eb61ef00ea10e00eb53a9129501f52385c44853dbd6c4ad3f403603083f", size = 81462 },
+ { url = "https://files.pythonhosted.org/packages/2d/21/cf0bd85ae66f92600829ea1de8e1da778e5e9f6e574ccbe74b66db0d95db/wrapt-1.17.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1e1fe0e6ab7775fd842bc39e86f6dcfc4507ab0ffe206093e76d61cde37225c8", size = 74309 },
+ { url = "https://files.pythonhosted.org/packages/6d/16/112d25e9092398a0dd6fec50ab7ac1b775a0c19b428f049785096067ada9/wrapt-1.17.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c86563182421896d73858e08e1db93afdd2b947a70064b813d515d66549e15f9", size = 81081 },
+ { url = "https://files.pythonhosted.org/packages/2b/49/364a615a0cc0872685646c495c7172e4fc7bf1959e3b12a1807a03014e05/wrapt-1.17.2-cp39-cp39-win32.whl", hash = "sha256:f393cda562f79828f38a819f4788641ac7c4085f30f1ce1a68672baa686482bb", size = 36423 },
+ { url = "https://files.pythonhosted.org/packages/00/ad/5d2c1b34ba3202cd833d9221833e74d6500ce66730974993a8dc9a94fb8c/wrapt-1.17.2-cp39-cp39-win_amd64.whl", hash = "sha256:36ccae62f64235cf8ddb682073a60519426fdd4725524ae38874adf72b5f2aeb", size = 38772 },
+ { url = "https://files.pythonhosted.org/packages/2d/82/f56956041adef78f849db6b289b282e72b55ab8045a75abad81898c28d19/wrapt-1.17.2-py3-none-any.whl", hash = "sha256:b18f2d1533a71f069c7f82d524a52599053d4c7166e9dd374ae2136b7f40f7c8", size = 23594 },
+]
+
[[package]]
name = "zipp"
version = "3.20.2"

From d635d56ecdc0cf2667c01cfcc51f26733ec796dc Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Fri, 7 Feb 2025 14:36:54 -0500
Subject: [PATCH 004/145] Chore/upgrade datafusion 45 (#1010)

* upgrade dep

* resolve errors

* match new pyo3 version

* upgrade dep

* back to 43

* use released v45

* remove unnecessary pyarrow feature

* Update unit test return type

* fix test_relational_expr

---------

Co-authored-by: Tim Saucer
---
 Cargo.lock                             | 400 +++++++++++---------
 Cargo.toml                             |  16 +-
 examples/ffi-table-provider/Cargo.lock | 493 ++++++++++++++-----------
 examples/ffi-table-provider/Cargo.toml |  14 +-
 examples/ffi-table-provider/src/lib.rs |   2 +-
 python/tests/test_expr.py              |   3 +-
 python/tests/test_functions.py         |   2 +-
 src/context.rs                         |   2 +-
 src/dataframe.rs                       |   2 +-
 9 files changed, 514 insertions(+), 420 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 50809696b..f1b1ed50a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -179,9 +179,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"

[[package]]
name = "arrow"
-version = "53.4.0"
+version = "54.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eaf3437355979f1e93ba84ba108c38be5767713051f3c8ffbf07c094e2e61f9f"
+checksum = "6422e12ac345a0678d7a17e316238e3a40547ae7f92052b77bd86d5e0239f3fc"
dependencies = [
"arrow-arith",
"arrow-array",
@@ -201,24 +201,23 @@ dependencies = [

[[package]]
name = "arrow-arith"
-version = "53.4.0"
+version = "54.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "31dce77d2985522288edae7206bffd5fc4996491841dda01a13a58415867e681"
+checksum = "23cf34bb1f48c41d3475927bcc7be498665b8e80b379b88f62a840337f8b8248"
dependencies = [
"arrow-array",
"arrow-buffer",
"arrow-data",
"arrow-schema",
"chrono",
- "half",
"num",
]

[[package]]
name = "arrow-array"
-version = "53.4.0"
+version = "54.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2d45fe6d3faed0435b7313e59a02583b14c6c6339fa7729e94c32a20af319a79"
+checksum = "fb4a06d507f54b70a277be22a127c8ffe0cec6cd98c0ad8a48e77779bbda8223"
dependencies = [
"ahash",
"arrow-buffer",
@@ -233,9 +232,9 @@ dependencies = [

[[package]]
name = "arrow-buffer"
-version = "53.4.0"
+version = "54.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2b02656a35cc103f28084bc80a0159668e0a680d919cef127bd7e0aaccb06ec1"
+checksum = "d69d326d5ad1cb82dcefa9ede3fee8fdca98f9982756b16f9cb142f4aa6edc89"
dependencies = [
"bytes",
"half",
@@ -244,9 +243,9 @@ dependencies = [

[[package]]
name = "arrow-cast"
-version = "53.4.0"
+version = "54.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c73c6233c5b5d635a56f6010e6eb1ab9e30e94707db21cea03da317f67d84cf3"
+checksum = "626e65bd42636a84a238bed49d09c8777e3d825bf81f5087a70111c2831d9870"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -265,28 +264,25 @@ dependencies = [

[[package]]
name = "arrow-csv"
-version = "53.4.0"
+version = "54.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec222848d70fea5a32af9c3602b08f5d740d5e2d33fbd76bf6fd88759b5b13a7"
+checksum = "71c8f959f7a1389b1dbd883cdcd37c3ed12475329c111912f7f69dad8195d8c6"
dependencies = [
"arrow-array",
- "arrow-buffer",
"arrow-cast",
- "arrow-data",
"arrow-schema",
"chrono",
"csv",
"csv-core",
"lazy_static",
- "lexical-core",
"regex",
]

[[package]]
name = "arrow-data"
-version = "53.4.0"
+version = "54.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7f2861ffa86f107b8ab577d86cff7c7a490243eabe961ba1e1af4f27542bb79"
+checksum = "1858e7c7d01c44cf71c21a85534fd1a54501e8d60d1195d0d6fbcc00f4b10754"
dependencies = [
"arrow-buffer",
"arrow-schema",
@@ -296,13 +292,12 @@ dependencies = [

[[package]]
name = "arrow-ipc"
-version = "53.4.0"
+version = "54.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0270dc511f11bb5fa98a25020ad51a99ca5b08d8a8dfbd17503bb9dba0388f0b"
+checksum = "a6bb3f727f049884c7603f0364bc9315363f356b59e9f605ea76541847e06a1e"
dependencies = [
"arrow-array",
"arrow-buffer",
- "arrow-cast",
"arrow-data",
"arrow-schema",
"flatbuffers",
@@ -311,9 +306,9 @@ dependencies = [

[[package]]
name = "arrow-json"
-version = "53.4.0"
+version = "54.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0eff38eeb8a971ad3a4caf62c5d57f0cff8a48b64a55e3207c4fd696a9234aad"
+checksum = "35de94f165ed8830aede72c35f238763794f0d49c69d30c44d49c9834267ff8c"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -331,26 +326,23 @@ dependencies = [

[[package]]
name = "arrow-ord"
-version = "53.4.0"
+version = "54.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c6f202a879d287099139ff0d121e7f55ae5e0efe634b8cf2106ebc27a8715dee"
+checksum = "8aa06e5f267dc53efbacb933485c79b6fc1685d3ffbe870a16ce4e696fb429da"
dependencies = [
"arrow-array",
"arrow-buffer",
"arrow-data",
"arrow-schema",
"arrow-select",
- "half",
- "num",
]

[[package]]
name = "arrow-row"
-version = "53.4.0"
+version = "54.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a8f936954991c360ba762dff23f5dda16300774fafd722353d9683abd97630ae"
+checksum = "66f1144bb456a2f9d82677bd3abcea019217e572fc8f07de5a7bac4b2c56eb2c"
dependencies = [
- "ahash",
"arrow-array",
"arrow-buffer",
"arrow-data",
@@ -360,18 +352,18 @@ dependencies = [

[[package]]
name = "arrow-schema"
-version = "53.4.0"
+version = "54.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9579b9d8bce47aa41389fe344f2c6758279983b7c0ebb4013e283e3e91bb450e"
+checksum = "105f01ec0090259e9a33a9263ec18ff223ab91a0ea9fbc18042f7e38005142f6"
dependencies = [
"bitflags 2.8.0",
]

[[package]]
name = "arrow-select"
-version = "53.4.0"
+version = "54.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7471ba126d0b0aaa24b50a36bc6c25e4e74869a1fd1a5553357027a0b1c8d1f1"
+checksum = "f690752fdbd2dee278b5f1636fefad8f2f7134c85e20fd59c4199e15a39a6807"
dependencies = [
"ahash",
"arrow-array",
@@ -383,9 +375,9 @@ dependencies = [

[[package]]
name = "arrow-string"
-version = "53.4.0"
+version = "54.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "72993b01cb62507b06f1fb49648d7286c8989ecfabdb7b77a750fcb54410731b"
+checksum = "d0fff9cd745a7039b66c47ecaf5954460f9fa12eed628f65170117ea93e64ee0"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -444,7 +436,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
@@ -455,7 +447,7 @@ checksum = "644dd749086bf3771a2fbc5f256fdb982d53f011c7d5d560304eafeecebce79d"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
@@ -598,9 +590,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"

[[package]]
name = "bytes"
-version = "1.9.0"
+version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b"
+checksum = "f61dac84819c6588b558454b194026eb1f09c293b9036ae9b159e74e73ab6cf9"

[[package]]
name = "bzip2"
@@ -635,9 +627,9 @@ dependencies = [

[[package]]
name = "cc"
-version = "1.2.11"
+version = "1.2.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e4730490333d58093109dc02c23174c3f4d490998c3fed3cc8e82d57afedb9cf"
+checksum = "755717a7de9ec452bf7f3f1a3099085deabd7f2962b861dae91ecd7a365903d2"
dependencies = [
"jobserver",
"libc",
@@ -874,9 +866,9 @@ dependencies = [

[[package]]
name = "datafusion"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "014fc8c384ecacedaabb3bc8359c2a6c6e9d8f7bea65be3434eccacfc37f52d9"
+checksum = "eae420e7a5b0b7f1c39364cc76cbcd0f5fdc416b2514ae3847c2676bbd60702a"
dependencies = [
"apache-avro",
"arrow",
@@ -888,7 +880,6 @@ dependencies = [
"bytes",
"bzip2 0.5.0",
"chrono",
- "dashmap",
"datafusion-catalog",
"datafusion-common",
"datafusion-common-runtime",
@@ -908,7 +899,7 @@ dependencies = [
"flate2",
"futures",
"glob",
- "itertools",
+ "itertools 0.14.0",
"log",
"num-traits",
"object_store",
@@ -928,31 +919,39 @@ dependencies = [

[[package]]
name = "datafusion-catalog"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ee60d33e210ef96070377ae667ece7caa0e959c8387496773d4a1a72f1a5012e"
+checksum = "6f27987bc22b810939e8dfecc55571e9d50355d6ea8ec1c47af8383a76a6d0e1"
dependencies = [
- "arrow-schema",
+ "arrow",
"async-trait",
+ "dashmap",
"datafusion-common",
"datafusion-execution",
"datafusion-expr",
"datafusion-physical-plan",
+ "datafusion-sql",
+ "futures",
+ "itertools 0.14.0",
+ "log",
"parking_lot",
+ "sqlparser",
]

[[package]]
name = "datafusion-common"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0b42b7d720fe21ed9cca2ebb635f3f13a12cfab786b41e0fba184fb2e620525b"
+checksum = "e3f6d5b8c9408cc692f7c194b8aa0c0f9b253e065a8d960ad9cdc2a13e697602"
dependencies = [
"ahash",
"apache-avro",
"arrow",
"arrow-array",
"arrow-buffer",
+ "arrow-ipc",
"arrow-schema",
+ "base64 0.22.1",
"half",
"hashbrown 0.14.5",
"indexmap",
@@ -969,9 +968,9 @@ dependencies = [

[[package]]
name = "datafusion-common-runtime"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "72fbf14d4079f7ce5306393084fe5057dddfdc2113577e0049310afa12e94281"
+checksum = "0d4603c8e8a4baf77660ab7074cc66fc15cc8a18f2ce9dfadb755fc6ee294e48"
dependencies = [
"log",
"tokio",
]

[[package]]
name = "datafusion-doc"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c278dbd64860ed0bb5240fc1f4cb6aeea437153910aea69bcf7d5a8d6d0454f3"
+checksum = "e5bf4bc68623a5cf231eed601ed6eb41f46a37c4d15d11a0bff24cbc8396cd66"

[[package]]
name = "datafusion-execution"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e22cb02af47e756468b3cbfee7a83e3d4f2278d452deb4b033ba933c75169486"
+checksum = "88b491c012cdf8e051053426013429a76f74ee3c2db68496c79c323ca1084d27"
dependencies = [
"arrow",
"dashmap",
@@ -1004,9 +1003,9 @@ dependencies = [

[[package]]
name = "datafusion-expr"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62298eadb1d15b525df1315e61a71519ffc563d41d5c3b2a30fda2d70f77b93c"
+checksum = "e5a181408d4fc5dc22f9252781a8f39f2d0e5d1b33ec9bde242844980a2689c1"
dependencies = [
"arrow",
"chrono",
@@ -1025,23 +1024,26 @@ dependencies = [

[[package]]
name = "datafusion-expr-common"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dda7f73c5fc349251cd3dcb05773c5bf55d2505a698ef9d38dfc712161ea2f55"
+checksum = "d1129b48e8534d8c03c6543bcdccef0b55c8ac0c1272a15a56c67068b6eb1885"
dependencies = [
"arrow",
"datafusion-common",
- "itertools",
+ "itertools 0.14.0",
+ "paste",
]

[[package]]
name = "datafusion-ffi"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "114e944790756b84c2cc5971eae24f5430980149345601939ac222885d4db5f7"
+checksum = "ff47a79d442207c168c6e3e1d970c248589c148e4800e5b285ac1b2cb1a230f8"
dependencies = [
"abi_stable",
"arrow",
+ "arrow-array",
+ "arrow-schema",
"async-ffi",
"async-trait",
"datafusion",
@@ -1049,13 +1051,15 @@ dependencies = [
"futures",
"log",
"prost",
+ "semver",
+ "tokio",
]

[[package]]
name = "datafusion-functions"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fd197f3b2975424d3a4898ea46651be855a46721a56727515dbd5c9e2fb597da"
+checksum = "6125874e4856dfb09b59886784fcb74cde5cfc5930b3a80a1a728ef7a010df6b"
dependencies = [
"arrow",
"arrow-buffer",
@@ -1071,7 +1075,7 @@ dependencies = [
"datafusion-macros",
"hashbrown 0.14.5",
"hex",
- "itertools",
+ "itertools 0.14.0",
"log",
"md-5",
"rand",
@@ -1083,12 +1087,13 @@ dependencies = [

[[package]]
name = "datafusion-functions-aggregate"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aabbe48fba18f9981b134124381bee9e46f93518b8ad2f9721ee296cef5affb9"
+checksum = "f3add7b1d3888e05e7c95f2b281af900ca69ebdcb21069ba679b33bde8b3b9d6"
dependencies = [
"ahash",
"arrow",
+ "arrow-buffer",
"arrow-schema",
"datafusion-common",
"datafusion-doc",
@@ -1105,9 +1110,9 @@ dependencies = [

[[package]]
name = "datafusion-functions-aggregate-common"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d7a3fefed9c8c11268d446d924baca8cabf52fe32f73fdaa20854bac6473590c"
+checksum = "6e18baa4cfc3d2f144f74148ed68a1f92337f5072b6dde204a0dbbdf3324989c"
dependencies = [
"ahash",
"arrow",
@@ -1118,9 +1123,9 @@ dependencies = [

[[package]]
name = "datafusion-functions-nested"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6360f27464fab857bec698af39b2ae331dc07c8bf008fb4de387a19cdc6815a5"
+checksum = "3ec5ee8cecb0dc370291279673097ddabec03a011f73f30d7f1096457127e03e"
dependencies = [
"arrow",
"arrow-array",
@@ -1128,21 +1133,23 @@ dependencies = [
"arrow-ord",
"arrow-schema",
"datafusion-common",
+ "datafusion-doc",
"datafusion-execution",
"datafusion-expr",
"datafusion-functions",
"datafusion-functions-aggregate",
+ "datafusion-macros",
"datafusion-physical-expr-common",
- "itertools",
+ "itertools 0.14.0",
"log",
"paste",
]

[[package]]
name = "datafusion-functions-table"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c35c070eb705c12795dab399c3809f4dfbc290678c624d3989490ca9b8449c1"
+checksum = "2c403ddd473bbb0952ba880008428b3c7febf0ed3ce1eec35a205db20efb2a36"
dependencies = [
"arrow",
"async-trait",
@@ -1156,9 +1163,9 @@ dependencies = [

[[package]]
name = "datafusion-functions-window"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "52229bca26b590b140900752226c829f15fc1a99840e1ca3ce1a9534690b82a8"
+checksum = "1ab18c2fb835614d06a75f24a9e09136d3a8c12a92d97c95a6af316a1787a9c5"
dependencies = [
"datafusion-common",
"datafusion-doc",
@@ -1173,9 +1180,9 @@ dependencies = [

[[package]]
name = "datafusion-functions-window-common"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "367befc303b64a668a10ae6988a064a9289e1999e71a7f8e526b6e14d6bdd9d6"
+checksum = "a77b73bc15e7d1967121fdc7a55d819bfb9d6c03766a6c322247dce9094a53a4"
dependencies = [
"datafusion-common",
"datafusion-physical-expr-common",
]

[[package]]
name = "datafusion-macros"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f5de3c8f386ea991696553afe241a326ecbc3c98a12c562867e4be754d3a060c"
+checksum = "09369b8d962291e808977cf94d495fd8b5b38647232d7ef562c27ac0f495b0af"
dependencies = [
+ "datafusion-expr",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
name = "datafusion-optimizer"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "53b520413906f755910422b016fb73884ae6e9e1b376de4f9584b6c0e031da75"
+checksum = "2403a7e4a84637f3de7d8d4d7a9ccc0cc4be92d89b0161ba3ee5be82f0531c54"
dependencies = [
"arrow",
"chrono",
@@ -1203,7 +1211,7 @@ dependencies = [
"datafusion-expr",
"datafusion-physical-expr",
"indexmap",
- "itertools",
+ "itertools 0.14.0",
"log",
"recursive",
"regex",
@@ -1212,9 +1220,9 @@ dependencies = [

[[package]]
name = "datafusion-physical-expr"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "acd6ddc378f6ad19af95ccd6790dec8f8e1264bc4c70e99ddc1830c1a1c78ccd"
+checksum = "86ff72ac702b62dbf2650c4e1d715ebd3e4aab14e3885e72e8549e250307347c"
dependencies = [
"ahash",
"arrow",
@@ -1229,48 +1237,54 @@ dependencies = [
"half",
"hashbrown 0.14.5",
"indexmap",
- "itertools",
+ "itertools 0.14.0",
"log",
"paste",
- "petgraph",
+ "petgraph 0.7.1",
]

[[package]]
name = "datafusion-physical-expr-common"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "06e6c05458eccd74b4c77ed6a1fe63d52434240711de7f6960034794dad1caf5"
+checksum = "60982b7d684e25579ee29754b4333057ed62e2cc925383c5f0bd8cab7962f435"
dependencies = [
"ahash",
"arrow",
+ "arrow-buffer",
"datafusion-common",
"datafusion-expr-common",
"hashbrown 0.14.5",
- "itertools",
+ "itertools 0.14.0",
]

[[package]]
name = "datafusion-physical-optimizer"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9dc3a82190f49c37d377f31317e07ab5d7588b837adadba8ac367baad5dc2351"
+checksum = "ac5e85c189d5238a5cf181a624e450c4cd4c66ac77ca551d6f3ff9080bac90bb"
dependencies = [
"arrow",
+ "arrow-schema",
"datafusion-common",
"datafusion-execution",
+ "datafusion-expr",
"datafusion-expr-common",
"datafusion-physical-expr",
+ "datafusion-physical-expr-common",
"datafusion-physical-plan",
- "itertools",
+ "futures",
+ "itertools 0.14.0",
"log",
"recursive",
+ "url",
]

[[package]]
name = "datafusion-physical-plan"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a6608bc9844b4ddb5ed4e687d173e6c88700b1d0482f43894617d18a1fe75da"
+checksum = "c36bf163956d7e2542657c78b3383fdc78f791317ef358a359feffcdb968106f"
dependencies = [
"ahash",
"arrow",
@@ -1291,7 +1305,7 @@ dependencies = [
"half",
"hashbrown 0.14.5",
"indexmap",
- "itertools",
+ "itertools 0.14.0",
"log",
"parking_lot",
"pin-project-lite",
@@ -1300,9 +1314,9 @@ dependencies = [

[[package]]
name = "datafusion-proto"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e23b0998195e495bfa7b37cdceb317129a6c40522219f6872d2e0c9ae9f4fcb"
+checksum = "2db5d79f0c974041787b899d24dc91bdab2ff112d1942dd71356a4ce3b407e6c"
dependencies = [
"arrow",
"chrono",
@@ -1316,9 +1330,9 @@ dependencies = [

[[package]]
name = "datafusion-proto-common"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cfc59992a29eed2d2c1dd779deac99083b217774ebcf90ee121840607a4d866f"
+checksum = "de21bde1603aac0ff32cf478e47081be6e3583c6861fe8f57034da911efe7578"
dependencies = [
"arrow",
"datafusion-common",
@@ -1350,9 +1364,9 @@ dependencies = [

[[package]]
name = "datafusion-sql"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a884061c79b33d0c8e84a6f4f4be8bdc12c0f53f5af28ddf5d6d95ac0b15fdc"
+checksum = "e13caa4daede211ecec53c78b13c503b592794d125f9a3cc3afe992edf9e7f43"
dependencies = [
"arrow",
"arrow-array",
@@ -1369,16 +1383,16 @@ dependencies = [

[[package]]
name = "datafusion-substrait"
-version = "44.0.0"
+version = "45.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d2ec36dd38512b1ecc7a3bb92e72046b944611b2f0d709445c1e51b0143bffd4"
+checksum = "1634405abd8bd3c64c352f2da2f2aec6d80a815930257e0db0ce4ff5daf00944"
dependencies = [
"arrow-buffer",
"async-recursion",
"async-trait",
"chrono",
"datafusion",
- "itertools",
+ "itertools 0.14.0",
"object_store",
"pbjson-types",
"prost",
@@ -1405,7 +1419,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
@@ -1448,6 +1462,12 @@ version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80"

+[[package]]
+name = "fixedbitset"
+version = "0.5.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99"
+
[[package]]
name = "flatbuffers"
version = "24.12.23"
@@ -1545,7 +1565,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
@@ -1938,7 +1958,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
@@ -1999,6 +2019,15 @@ dependencies = [
"either",
]

+[[package]]
+name = "itertools"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
+dependencies = [
+ "either",
+]
+
[[package]]
name = "itoa"
version = "1.0.14"
@@ -2362,7 +2391,7 @@ dependencies = [
"httparse",
"humantime",
"hyper",
- "itertools",
+ "itertools 0.13.0",
"md-5",
"parking_lot",
"percent-encoding",
@@ -2382,9 +2411,9 @@ dependencies = [

[[package]]
name = "once_cell"
-version = "1.20.2"
+version = "1.20.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
+checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e"

[[package]]
name = "openssl-probe"
@@ -2426,9 +2455,9 @@ dependencies = [

[[package]]
name = "parquet"
-version = "53.4.0"
+version = "54.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8957c0c95a6a1804f3e51a18f69df29be53856a8c5768cc9b6d00fcafcd2917c"
+checksum = "8a01a0efa30bbd601ae85b375c728efdb211ade54390281628a7b16708beb235"
dependencies = [
"ahash",
"arrow-array",
@@ -2452,6 +2481,7 @@ dependencies = [
"object_store",
"paste",
"seq-macro",
+ "simdutf8",
"snap",
"thrift",
"tokio",
@@ -2492,7 +2522,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6eea3058763d6e656105d1403cb04e0a41b7bbac6362d413e7c33be0c32279c9"
dependencies = [
"heck",
- "itertools",
+ "itertools 0.13.0",
"prost",
"prost-types",
]

@@ -2524,7 +2554,17 @@ version = "0.6.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db"
dependencies = [
- "fixedbitset",
+ "fixedbitset 0.4.2",
+ "indexmap",
+]
+
+[[package]]
+name = "petgraph"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772"
+dependencies = [
+ "fixedbitset 0.5.7",
"indexmap",
]

@@ -2606,7 +2646,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6924ced06e1f7dfe3fa48d57b9f74f55d8915f5036121bef647ef4b204895fac"
dependencies = [
"proc-macro2",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
@@ -2635,16 +2675,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0f3e5beed80eb580c68e2c600937ac2c4eedabdfd5ef1e5b7ea4f3fba84497b"
dependencies = [
"heck",
- "itertools",
+ "itertools 0.13.0",
"log",
"multimap",
"once_cell",
- "petgraph",
+ "petgraph 0.6.5",
"prettyplease",
"prost",
"prost-types",
"regex",
- "syn 2.0.96",
+ "syn 2.0.98",
"tempfile",
]

@@ -2655,10 +2695,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "157c5a9d7ea5c2ed2d9fb8f495b64759f7816c7eaea54ba3978f0d63000162e3"
dependencies = [
"anyhow",
- "itertools",
+ "itertools 0.13.0",
"proc-macro2",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
@@ -2690,9 +2730,9 @@ dependencies = [

[[package]]
name = "pyo3"
-version = "0.22.6"
+version = "0.23.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884"
+checksum = "57fe09249128b3173d092de9523eaa75136bf7ba85e0d69eca241c7939c933cc"
dependencies = [
"cfg-if",
"indoc",
@@ -2708,9 +2748,9 @@ dependencies = [

[[package]]
name = "pyo3-async-runtimes"
-version = "0.22.0"
+version = "0.23.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2529f0be73ffd2be0cc43c013a640796558aa12d7ca0aab5cc14f375b4733031"
+checksum = "977dc837525cfd22919ba6a831413854beb7c99a256c03bf8624ad707e45810e"
dependencies = [
"futures",
"once_cell",
@@ -2721,9 +2761,9 @@ dependencies = [

[[package]]
name = "pyo3-build-config"
-version = "0.22.6"
+version = "0.23.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38"
+checksum = "1cd3927b5a78757a0d71aa9dff669f903b1eb64b54142a9bd9f757f8fde65fd7"
dependencies = [
"once_cell",
"target-lexicon",
@@ -2731,9 +2771,9 @@ dependencies = [

[[package]]
name = "pyo3-ffi"
-version = "0.22.6"
+version = "0.23.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636"
+checksum = "dab6bb2102bd8f991e7749f130a70d05dd557613e39ed2deeee8e9ca0c4d548d"
dependencies = [
"libc",
"pyo3-build-config",
@@ -2741,27 +2781,27 @@ dependencies = [

[[package]]
name = "pyo3-macros"
-version = "0.22.6"
+version = "0.23.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453"
+checksum = "91871864b353fd5ffcb3f91f2f703a22a9797c91b9ab497b1acac7b07ae509c7"
dependencies = [
"proc-macro2",
"pyo3-macros-backend",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
name = "pyo3-macros-backend"
-version = "0.22.6"
+version = "0.23.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe"
+checksum = "43abc3b80bc20f3facd86cd3c60beed58c3e2aa26213f3cda368de39c60a27e4"
dependencies = [
"heck",
"proc-macro2",
"pyo3-build-config",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
@@ -2888,7 +2928,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b"
dependencies = [
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
@@ -3029,9 +3069,9 @@ checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"

[[package]]
name = "rustc-hash"
-version = "2.1.0"
+version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497"
+checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"

[[package]]
name = "rustc_version"
@@ -3161,7 +3201,7 @@ dependencies = [
"proc-macro2",
"quote",
"serde_derive_internals",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
@@ -3234,7 +3274,7 @@ checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
@@ -3245,7 +3285,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
@@ -3269,7 +3309,7 @@ dependencies = [
"proc-macro2",
"quote",
"serde",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
@@ -3314,6 +3354,12 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"

+[[package]]
+name = "simdutf8"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e"
+
[[package]]
name = "siphasher"
version = "1.0.1"
@@ -3353,7 +3399,7 @@ dependencies = [
"heck",
"proc-macro2",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
@@ -3396,7 +3442,7 @@ checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
@@ -3440,14 +3486,14 @@ dependencies = [
"proc-macro2",
"quote",
"rustversion",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
name = "substrait"
-version = "0.50.4"
+version = "0.52.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b1772d041c37cc7e6477733c76b2acf4ee36bd52b2ae4d9ea0ec9c87d003db32"
+checksum = "5db15789cecbfdf6b1fcf2db807e767c92273bdc407ac057c2194b070c597756"
dependencies = [
"heck",
"pbjson",
@@ -3464,7 +3510,7 @@ dependencies = [
"serde",
"serde_json",
"serde_yaml",
- "syn 2.0.96",
+ "syn 2.0.98",
"typify",
"walkdir",
]

@@ -3488,9 +3534,9 @@ dependencies = [

[[package]]
name = "syn"
-version = "2.0.96"
+version = "2.0.98"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80"
+checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1"
dependencies = [
"proc-macro2",
"quote",
@@ -3514,7 +3560,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
@@ -3563,7 +3609,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
@@ -3574,7 +3620,7 @@ checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
@@ -3646,7 +3692,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
@@ -3718,7 +3764,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
@@ -3784,7 +3830,7 @@ checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
@@ -3795,9 +3841,9 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825"

[[package]]
name = "typify"
-version = "0.2.0"
+version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b4c644dda9862f0fef3a570d8ddb3c2cfb1d5ac824a1f2ddfa7bc8f071a5ad8a"
+checksum = "e03ba3643450cfd95a1aca2e1938fef63c1c1994489337998aff4ad771f21ef8"
dependencies = [
"typify-impl",
"typify-macro",
@@ -3805,9 +3851,9 @@ dependencies = [

[[package]]
name = "typify-impl"
-version = "0.2.0"
+version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d59ab345b6c0d8ae9500b9ff334a4c7c0d316c1c628dc55726b95887eb8dbd11"
+checksum = "bce48219a2f3154aaa2c56cbf027728b24a3c8fe0a47ed6399781de2b3f3eeaf"
dependencies = [
"heck",
"log",
@@ -3818,16 +3864,16 @@ dependencies = [
"semver",
"serde",
"serde_json",
- "syn 2.0.96",
- "thiserror 1.0.69",
+ "syn 2.0.98",
+ "thiserror 2.0.11",
"unicode-ident",
]

[[package]]
name = "typify-macro"
-version = "0.2.0"
+version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "785e2cdcef0df8160fdd762ed548a637aaec1e83704fdbc14da0df66013ee8d0"
+checksum = "68b5780d745920ed73c5b7447496a9b5c42ed2681a9b70859377aec423ecf02b"
dependencies = [
"proc-macro2",
"quote",
@@ -3836,7 +3882,7 @@ dependencies = [
"serde",
"serde_json",
"serde_tokenstream",
- "syn 2.0.96",
+ "syn 2.0.98",
"typify-impl",
]

@@ -3901,11 +3947,11 @@ checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"

[[package]]
name = "uuid"
-version = "1.12.1"
+version = "1.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b3758f5e68192bb96cc8f9b7e2c2cfdabb435499a28499a42f8f984092adad4b"
+checksum = "ced87ca4be083373936a67f8de945faa23b6b42384bd5b64434850802c6dccd0"
dependencies = [
- "getrandom 0.2.15",
+ "getrandom 0.3.1",
"serde",
]

@@ -3971,7 +4017,7 @@ dependencies = [
"log",
"proc-macro2",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
"wasm-bindgen-shared",
]

@@ -4006,7 +4052,7 @@ checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]

@@ -4255,7 +4301,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
"synstructure",
]

@@ -4277,7 +4323,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
]

@@ -4297,7 +4343,7 @@ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
"synstructure",
]

@@ -4326,7 +4372,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.96",
+ "syn 2.0.98",
]

[[package]]
diff --git a/Cargo.toml b/Cargo.toml
index 44e6e2244..d18e0e8f0 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -35,13 +35,13 @@ substrait = ["dep:datafusion-substrait"]

[dependencies]
tokio = { version = "1.42", features = ["macros", "rt", "rt-multi-thread", "sync"] }
-pyo3 = { version = "0.22", features = ["extension-module", "abi3", "abi3-py38"] }
-pyo3-async-runtimes = { version = "0.22", features = ["tokio-runtime"]}
-arrow = { version = "53", features = ["pyarrow"] }
-datafusion = { version = "44.0.0", features = ["avro", "unicode_expressions"] }
-datafusion-substrait = { version = "44.0.0", optional = true }
-datafusion-proto = { version = "44.0.0" }
-datafusion-ffi = { version = "44.0.0" }
+pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py38"] }
+pyo3-async-runtimes = { version = "0.23", features = ["tokio-runtime"]}
+arrow = { version = "54", features = ["pyarrow"] }
+datafusion = { version = "45.0.0", features = ["avro", "unicode_expressions"] }
+datafusion-substrait = { version = "45.0.0", optional = true }
+datafusion-proto = { version = "45.0.0" }
+datafusion-ffi = { version = "45.0.0" }
prost = "0.13" # keep in line with `datafusion-substrait`
uuid = { version = "1.12", features = ["v4"] }
mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] }
@@ -52,7 +52,7 @@ url = "2"

[build-dependencies]
prost-types = "0.13" # keep in line with `datafusion-substrait`
-pyo3-build-config = "0.22"
+pyo3-build-config = "0.23"

[lib]
name = "datafusion_python"
diff --git a/examples/ffi-table-provider/Cargo.lock b/examples/ffi-table-provider/Cargo.lock
index 3b57cac75..32af85180 100644
--- a/examples/ffi-table-provider/Cargo.lock
+++ b/examples/ffi-table-provider/Cargo.lock
@@ -1,6 +1,6 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
-version = 3 +version = 4 [[package]] name = "abi_stable" @@ -144,9 +144,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4caf25cdc4a985f91df42ed9e9308e1adbcd341a31a72605c697033fcef163e3" +checksum = "6422e12ac345a0678d7a17e316238e3a40547ae7f92052b77bd86d5e0239f3fc" dependencies = [ "arrow-arith", "arrow-array", @@ -165,24 +165,23 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91f2dfd1a7ec0aca967dfaa616096aec49779adc8eccec005e2f5e4111b1192a" +checksum = "23cf34bb1f48c41d3475927bcc7be498665b8e80b379b88f62a840337f8b8248" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", - "half", "num", ] [[package]] name = "arrow-array" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d39387ca628be747394890a6e47f138ceac1aa912eab64f02519fed24b637af8" +checksum = "fb4a06d507f54b70a277be22a127c8ffe0cec6cd98c0ad8a48e77779bbda8223" dependencies = [ "ahash", "arrow-buffer", @@ -191,15 +190,15 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown 0.14.5", + "hashbrown 0.15.1", "num", ] [[package]] name = "arrow-buffer" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e51e05228852ffe3eb391ce7178a0f97d2cf80cc6ef91d3c4a6b3cb688049ec" +checksum = "d69d326d5ad1cb82dcefa9ede3fee8fdca98f9982756b16f9cb142f4aa6edc89" dependencies = [ "bytes", "half", @@ -208,9 +207,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d09aea56ec9fa267f3f3f6cdab67d8a9974cbba90b3aa38c8fe9d0bb071bd8c1" +checksum = "626e65bd42636a84a238bed49d09c8777e3d825bf81f5087a70111c2831d9870" dependencies = [ "arrow-array", "arrow-buffer", @@ -229,28 +228,25 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c07b5232be87d115fde73e32f2ca7f1b353bff1b44ac422d3c6fc6ae38f11f0d" +checksum = "71c8f959f7a1389b1dbd883cdcd37c3ed12475329c111912f7f69dad8195d8c6" dependencies = [ "arrow-array", - "arrow-buffer", "arrow-cast", - "arrow-data", "arrow-schema", "chrono", "csv", "csv-core", "lazy_static", - "lexical-core", "regex", ] [[package]] name = "arrow-data" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b98ae0af50890b494cebd7d6b04b35e896205c1d1df7b29a6272c5d0d0249ef5" +checksum = "1858e7c7d01c44cf71c21a85534fd1a54501e8d60d1195d0d6fbcc00f4b10754" dependencies = [ "arrow-buffer", "arrow-schema", @@ -260,13 +256,12 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ed91bdeaff5a1c00d28d8f73466bcb64d32bbd7093b5a30156b4b9f4dba3eee" +checksum = "a6bb3f727f049884c7603f0364bc9315363f356b59e9f605ea76541847e06a1e" dependencies = [ "arrow-array", "arrow-buffer", - "arrow-cast", "arrow-data", "arrow-schema", "flatbuffers", @@ -275,9 +270,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "53.2.0" +version = "54.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "0471f51260a5309307e5d409c9dc70aede1cd9cf1d4ff0f0a1e8e1a2dd0e0d3c" +checksum = "35de94f165ed8830aede72c35f238763794f0d49c69d30c44d49c9834267ff8c" dependencies = [ "arrow-array", "arrow-buffer", @@ -295,26 +290,23 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2883d7035e0b600fb4c30ce1e50e66e53d8656aa729f2bfa4b51d359cf3ded52" +checksum = "8aa06e5f267dc53efbacb933485c79b6fc1685d3ffbe870a16ce4e696fb429da" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", - "half", - "num", ] [[package]] name = "arrow-row" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "552907e8e587a6fde4f8843fd7a27a576a260f65dab6c065741ea79f633fc5be" +checksum = "66f1144bb456a2f9d82677bd3abcea019217e572fc8f07de5a7bac4b2c56eb2c" dependencies = [ - "ahash", "arrow-array", "arrow-buffer", "arrow-data", @@ -324,18 +316,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "539ada65246b949bd99ffa0881a9a15a4a529448af1a07a9838dd78617dafab1" +checksum = "105f01ec0090259e9a33a9263ec18ff223ab91a0ea9fbc18042f7e38005142f6" dependencies = [ "bitflags 2.6.0", ] [[package]] name = "arrow-select" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6259e566b752da6dceab91766ed8b2e67bf6270eb9ad8a6e07a33c1bede2b125" +checksum = "f690752fdbd2dee278b5f1636fefad8f2f7134c85e20fd59c4199e15a39a6807" dependencies = [ "ahash", "arrow-array", @@ -347,9 +339,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3179ccbd18ebf04277a095ba7321b93fd1f774f18816bd5f6b3ce2f594edb6c" +checksum = "d0fff9cd745a7039b66c47ecaf5954460f9fa12eed628f65170117ea93e64ee0" dependencies = [ "arrow-array", "arrow-buffer", @@ -380,10 +372,9 @@ version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0cb8f1d480b0ea3783ab015936d2a55c87e219676f0c0b7dec61494043f21857" dependencies = [ - "bzip2", + "bzip2 0.4.4", "flate2", "futures-core", - "futures-io", "memchr", "pin-project-lite", "tokio", @@ -448,6 +439,19 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bigdecimal" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f31f3af01c5c65a07985c804d3366560e6fa7883d640a122819b14ec327482c" +dependencies = [ + "autocfg", + "libm", + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -540,6 +544,16 @@ dependencies = [ "libc", ] +[[package]] +name = "bzip2" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bafdbf26611df8c14810e268ddceda071c297570a5fb360ceddf617fe417ef58" +dependencies = [ + "bzip2-sys", + "libc", +] + [[package]] name = "bzip2-sys" version = "0.1.11+1.0.8" @@ -751,11 +765,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"cbba0799cf6913b456ed07a94f0f3b6e12c62a5d88b10809e2284a0f2b915c05" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ - "ahash", "arrow", "arrow-array", "arrow-ipc", @@ -763,9 +775,8 @@ dependencies = [ "async-compression", "async-trait", "bytes", - "bzip2", + "bzip2 0.5.0", "chrono", - "dashmap", "datafusion-catalog", "datafusion-common", "datafusion-common-runtime", @@ -774,6 +785,7 @@ dependencies = [ "datafusion-functions", "datafusion-functions-aggregate", "datafusion-functions-nested", + "datafusion-functions-table", "datafusion-functions-window", "datafusion-optimizer", "datafusion-physical-expr", @@ -784,18 +796,13 @@ dependencies = [ "flate2", "futures", "glob", - "half", - "hashbrown 0.14.5", - "indexmap", - "itertools", + "itertools 0.14.0", "log", - "num_cpus", "object_store", "parking_lot", "parquet", - "paste", - "pin-project-lite", "rand", + "regex", "sqlparser", "tempfile", "tokio", @@ -808,67 +815,74 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7493c5c2d40eec435b13d92e5703554f4efc7059451fcb8d3a79580ff0e45560" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ - "arrow-schema", + "arrow", "async-trait", + "dashmap", "datafusion-common", "datafusion-execution", "datafusion-expr", "datafusion-physical-plan", + "datafusion-sql", + "futures", + "itertools 0.14.0", + "log", "parking_lot", + "sqlparser", ] [[package]] name = "datafusion-common" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24953049ebbd6f8964f91f60aa3514e121b5e81e068e33b60e77815ab369b25c" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "ahash", "arrow", "arrow-array", "arrow-buffer", + "arrow-ipc", "arrow-schema", - "chrono", + "base64", "half", "hashbrown 0.14.5", "indexmap", - "instant", "libc", - "num_cpus", + "log", "object_store", "parquet", "paste", + "recursive", "sqlparser", "tokio", + "web-time", ] [[package]] name = "datafusion-common-runtime" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f06df4ef76872e11c924d3c814fd2a8dd09905ed2e2195f71c857d78abd19685" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "log", "tokio", ] +[[package]] +name = "datafusion-doc" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" + [[package]] name = "datafusion-execution" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bbdcb628d690f3ce5fea7de81642b514486d58ff9779a51f180a69a4eadb361" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "arrow", - "chrono", "dashmap", "datafusion-common", "datafusion-expr", "futures", - "hashbrown 0.14.5", "log", "object_store", "parking_lot", @@ -879,63 +893,59 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8036495980e3131f706b7d33ab00b4492d73dc714e3cb74d11b50f9602a73246" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ - "ahash", "arrow", - "arrow-array", - "arrow-buffer", "chrono", "datafusion-common", + "datafusion-doc", "datafusion-expr-common", "datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr-common", "indexmap", "paste", + "recursive", "serde_json", "sqlparser", - "strum", - "strum_macros", ] [[package]] name = "datafusion-expr-common" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4da0f3cb4669f9523b403d6b5a0ec85023e0ab3bf0183afd1517475b3e64fdd2" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "arrow", "datafusion-common", - "itertools", + "itertools 0.14.0", "paste", ] [[package]] name = "datafusion-ffi" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e923c459b53a26d92a8806d1f6a37fdf48bde51507a39eaed6f42a60f2bfd160" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "abi_stable", "arrow", + "arrow-array", + "arrow-schema", "async-ffi", "async-trait", "datafusion", "datafusion-proto", - "doc-comment", "futures", "log", "prost", + "semver", + "tokio", ] [[package]] name = "datafusion-functions" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f52c4012648b34853e40a2c6bcaa8772f837831019b68aca384fb38436dba162" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "arrow", "arrow-buffer", @@ -944,11 +954,14 @@ dependencies = [ "blake3", "chrono", "datafusion-common", + "datafusion-doc", "datafusion-execution", "datafusion-expr", + "datafusion-expr-common", + "datafusion-macros", "hashbrown 0.14.5", "hex", - "itertools", + "itertools 0.14.0", "log", "md-5", "rand", @@ -960,44 +973,42 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5b8bb624597ba28ed7446df4a9bd7c7a7bde7c578b6b527da3f47371d5f6741" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "ahash", "arrow", + "arrow-buffer", "arrow-schema", "datafusion-common", + "datafusion-doc", "datafusion-execution", "datafusion-expr", "datafusion-functions-aggregate-common", + "datafusion-macros", "datafusion-physical-expr", "datafusion-physical-expr-common", "half", - "indexmap", "log", "paste", ] [[package]] name = "datafusion-functions-aggregate-common" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fb06208fc470bc8cf1ce2d9a1159d42db591f2c7264a8c1776b53ad8f675143" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "ahash", "arrow", "datafusion-common", "datafusion-expr-common", "datafusion-physical-expr-common", - "rand", ] [[package]] name = "datafusion-functions-nested" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"fca25bbb87323716d05e54114666e942172ccca23c5a507e9c7851db6e965317" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "arrow", "arrow-array", @@ -1005,26 +1016,43 @@ dependencies = [ "arrow-ord", "arrow-schema", "datafusion-common", + "datafusion-doc", "datafusion-execution", "datafusion-expr", "datafusion-functions", "datafusion-functions-aggregate", + "datafusion-macros", "datafusion-physical-expr-common", - "itertools", + "itertools 0.14.0", "log", "paste", - "rand", +] + +[[package]] +name = "datafusion-functions-table" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot", + "paste", ] [[package]] name = "datafusion-functions-window" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ae23356c634e54c59f7c51acb7a5b9f6240ffb2cf997049a1a24a8a88598dbe" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "datafusion-common", + "datafusion-doc", "datafusion-expr", "datafusion-functions-window-common", + "datafusion-macros", "datafusion-physical-expr", "datafusion-physical-expr-common", "log", @@ -1033,48 +1061,51 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4b3d6ff7794acea026de36007077a06b18b89e4f9c3fea7f2215f9f7dd9059b" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", ] +[[package]] +name = "datafusion-macros" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +dependencies = [ + "datafusion-expr", + "quote", + "syn 2.0.87", +] + [[package]] name = "datafusion-optimizer" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bec6241eb80c595fa0e1a8a6b69686b5cf3bd5fdacb8319582a0943b0bd788aa" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "arrow", - "async-trait", "chrono", "datafusion-common", "datafusion-expr", "datafusion-physical-expr", - "hashbrown 0.14.5", "indexmap", - "itertools", + "itertools 0.14.0", "log", - "paste", + "recursive", + "regex", "regex-syntax", ] [[package]] name = "datafusion-physical-expr" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3370357b8fc75ec38577700644e5d1b0bc78f38babab99c0b8bd26bafb3e4335" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "ahash", "arrow", "arrow-array", "arrow-buffer", - "arrow-ord", "arrow-schema", - "arrow-string", - "chrono", "datafusion-common", "datafusion-expr", "datafusion-expr-common", @@ -1083,7 +1114,7 @@ dependencies = [ "half", "hashbrown 0.14.5", "indexmap", - "itertools", + "itertools 0.14.0", "log", "paste", "petgraph", @@ -1091,39 +1122,43 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" 
-version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8b7734d94bf2fa6f6e570935b0ddddd8421179ce200065be97874e13d46a47b" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "ahash", "arrow", + "arrow-buffer", "datafusion-common", "datafusion-expr-common", "hashbrown 0.14.5", - "rand", + "itertools 0.14.0", ] [[package]] name = "datafusion-physical-optimizer" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7eee8c479522df21d7b395640dff88c5ed05361852dce6544d7c98e9dbcebffe" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "arrow", "arrow-schema", "datafusion-common", "datafusion-execution", + "datafusion-expr", "datafusion-expr-common", "datafusion-physical-expr", + "datafusion-physical-expr-common", "datafusion-physical-plan", - "itertools", + "futures", + "itertools 0.14.0", + "log", + "recursive", + "url", ] [[package]] name = "datafusion-physical-plan" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17e1fc2e2c239d14e8556f2622b19a726bf6bc6962cc00c71fc52626274bee24" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "ahash", "arrow", @@ -1137,7 +1172,6 @@ dependencies = [ "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", - "datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr", "datafusion-physical-expr-common", @@ -1145,20 +1179,17 @@ dependencies = [ "half", "hashbrown 0.14.5", "indexmap", - "itertools", + "itertools 0.14.0", "log", - "once_cell", "parking_lot", "pin-project-lite", - "rand", "tokio", ] [[package]] name = "datafusion-proto" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f730f7fc5a20134d4e5ecdf7bbf392002ac58163d58423ea28a702dc077b06e1" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "arrow", "chrono", @@ -1172,33 +1203,30 @@ dependencies = [ [[package]] name = "datafusion-proto-common" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12c225fe49e4f943e35446b263613ada7a9e9f8d647544e6b07037b9803567df" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "arrow", - "chrono", "datafusion-common", - "object_store", "prost", ] [[package]] name = "datafusion-sql" -version = "43.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63e3a4ed41dbee20a5d947a59ca035c225d67dc9cbe869c10f66dcdf25e7ce51" +version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" dependencies = [ "arrow", "arrow-array", "arrow-schema", + "bigdecimal", "datafusion-common", "datafusion-expr", "indexmap", "log", + "recursive", "regex", "sqlparser", - "strum", ] [[package]] @@ -1223,12 +1251,6 @@ dependencies = [ "syn 2.0.87", ] -[[package]] -name = "doc-comment" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" - 
[[package]] name = "either" version = "1.13.0" @@ -1272,15 +1294,15 @@ dependencies = [ [[package]] name = "fixedbitset" -version = "0.4.2" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" [[package]] name = "flatbuffers" -version = "24.3.25" +version = "24.12.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8add37afff2d4ffa83bc748a70b4b1370984f6980768554182424ef71447c35f" +checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096" dependencies = [ "bitflags 1.3.2", "rustc_version", @@ -1469,12 +1491,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "hermit-abi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" - [[package]] name = "hex" version = "0.4.3" @@ -1651,9 +1667,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.6.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" +checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" dependencies = [ "equivalent", "hashbrown 0.15.1", @@ -1665,18 +1681,6 @@ version = "2.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" -[[package]] -name = "instant" -version = "0.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" -dependencies = [ - "cfg-if", - "js-sys", - "wasm-bindgen", - "web-sys", -] - [[package]] name = "integer-encoding" version = "3.0.4" @@ -1692,6 +1696,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.11" @@ -1964,16 +1977,6 @@ dependencies = [ "libm", ] -[[package]] -name = "num_cpus" -version = "1.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" -dependencies = [ - "hermit-abi", - "libc", -] - [[package]] name = "object" version = "0.36.5" @@ -1994,7 +1997,7 @@ dependencies = [ "chrono", "futures", "humantime", - "itertools", + "itertools 0.13.0", "parking_lot", "percent-encoding", "snafu", @@ -2044,9 +2047,9 @@ dependencies = [ [[package]] name = "parquet" -version = "53.2.0" +version = "54.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dea02606ba6f5e856561d8d507dba8bac060aefca2a6c0f1aa1d361fed91ff3e" +checksum = "8a01a0efa30bbd601ae85b375c728efdb211ade54390281628a7b16708beb235" dependencies = [ "ahash", "arrow-array", @@ -2063,13 +2066,14 @@ dependencies = [ "flate2", "futures", "half", - "hashbrown 0.14.5", + "hashbrown 0.15.1", "lz4_flex", "num", "num-bigint", "object_store", "paste", "seq-macro", + "simdutf8", "snap", "thrift", "tokio", @@ -2101,9 +2105,9 @@ checksum = 
"e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "petgraph" -version = "0.6.5" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" +checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" dependencies = [ "fixedbitset", "indexmap", @@ -2206,17 +2210,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" dependencies = [ "anyhow", - "itertools", + "itertools 0.13.0", "proc-macro2", "quote", "syn 2.0.87", ] +[[package]] +name = "psm" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "200b9ff220857e53e184257720a14553b2f4aa02577d2ed9842d45d4b9654810" +dependencies = [ + "cc", +] + [[package]] name = "pyo3" -version = "0.22.6" +version = "0.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884" +checksum = "57fe09249128b3173d092de9523eaa75136bf7ba85e0d69eca241c7939c933cc" dependencies = [ "cfg-if", "indoc", @@ -2232,9 +2245,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.22.6" +version = "0.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38" +checksum = "1cd3927b5a78757a0d71aa9dff669f903b1eb64b54142a9bd9f757f8fde65fd7" dependencies = [ "once_cell", "target-lexicon", @@ -2242,9 +2255,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.22.6" +version = "0.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636" +checksum = "dab6bb2102bd8f991e7749f130a70d05dd557613e39ed2deeee8e9ca0c4d548d" dependencies = [ "libc", "pyo3-build-config", @@ -2252,9 +2265,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.22.6" +version = "0.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453" +checksum = "91871864b353fd5ffcb3f91f2f703a22a9797c91b9ab497b1acac7b07ae509c7" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -2264,9 +2277,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.22.6" +version = "0.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe" +checksum = "43abc3b80bc20f3facd86cd3c60beed58c3e2aa26213f3cda368de39c60a27e4" dependencies = [ "heck", "proc-macro2", @@ -2314,6 +2327,26 @@ dependencies = [ "getrandom", ] +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn 2.0.87", +] + [[package]] name = "redox_syscall" version = "0.5.7" @@ -2418,9 +2451,9 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "semver" -version = "1.0.23" +version = 
"1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" +checksum = "f79dfe2d285b0488816f30e700a7438c5a73d816b5b7d3ac72fbc48b0d185e03" [[package]] name = "seq-macro" @@ -2477,6 +2510,12 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + [[package]] name = "siphasher" version = "0.3.11" @@ -2527,9 +2566,9 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] name = "sqlparser" -version = "0.51.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fe11944a61da0da3f592e19a45ebe5ab92dc14a779907ff1f08fbb797bfefc7" +checksum = "05a528114c392209b3264855ad491fcce534b94a38771b0a0b97a79379275ce8" dependencies = [ "log", "sqlparser_derive", @@ -2537,9 +2576,9 @@ dependencies = [ [[package]] name = "sqlparser_derive" -version = "0.2.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" +checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" dependencies = [ "proc-macro2", "quote", @@ -2552,6 +2591,19 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "stacker" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "799c883d55abdb5e98af1a7b3f23b9b6de8ecada0ecac058672d7635eb48ca7b" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.59.0", +] + [[package]] name = "static_assertions" version = "1.1.0" @@ -2563,9 +2615,6 @@ name = "strum" version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" -dependencies = [ - "strum_macros", -] [[package]] name = "strum_macros" @@ -2798,9 +2847,9 @@ checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" [[package]] name = "url" -version = "2.5.3" +version = "2.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d157f1b96d14500ffdc1f10ba712e780825526c03d9a49b4d0324b0d9113ada" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" dependencies = [ "form_urlencoded", "idna", @@ -2906,10 +2955,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" [[package]] -name = "web-sys" -version = "0.3.72" +name = "web-time" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/examples/ffi-table-provider/Cargo.toml b/examples/ffi-table-provider/Cargo.toml index 4e6f91f33..0e558fdd0 100644 --- a/examples/ffi-table-provider/Cargo.toml +++ b/examples/ffi-table-provider/Cargo.toml @@ -21,15 +21,15 @@ version = "0.1.0" edition = "2021" [dependencies] 
-datafusion = { version = "44.0.0" } -datafusion-ffi = { version = "44.0.0" } -pyo3 = { version = "0.22.6", features = ["extension-module", "abi3", "abi3-py38"] } -arrow = { version = "53.2.0" } -arrow-array = { version = "53.2.0" } -arrow-schema = { version = "53.2.0" } +datafusion = { version = "45.0.0" } +datafusion-ffi = { version = "45.0.0" } +pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py38"] } +arrow = { version = "54" } +arrow-array = { version = "54" } +arrow-schema = { version = "54" } [build-dependencies] -pyo3-build-config = "0.22.6" +pyo3-build-config = "0.23" [lib] name = "ffi_table_provider" diff --git a/examples/ffi-table-provider/src/lib.rs b/examples/ffi-table-provider/src/lib.rs index 473244d88..88deeece2 100644 --- a/examples/ffi-table-provider/src/lib.rs +++ b/examples/ffi-table-provider/src/lib.rs @@ -102,7 +102,7 @@ impl MyTableProvider { let provider = self .create_table() .map_err(|e| PyRuntimeError::new_err(e.to_string()))?; - let provider = FFI_TableProvider::new(Arc::new(provider), false); + let provider = FFI_TableProvider::new(Arc::new(provider), false, None); PyCapsule::new_bound(py, provider, Some(name.clone())) } diff --git a/python/tests/test_expr.py b/python/tests/test_expr.py index 77f88aa44..354c7e180 100644 --- a/python/tests/test_expr.py +++ b/python/tests/test_expr.py @@ -148,8 +148,7 @@ def test_relational_expr(test_ctx): assert df.filter(col("b") == "beta").count() == 1 assert df.filter(col("b") != "beta").count() == 2 - with pytest.raises(Exception): - df.filter(col("a") == "beta").count() + assert df.filter(col("a") == "beta").count() == 0 def test_expr_to_variant(): diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index ad6aa7c0a..796b1f76e 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -732,7 +732,7 @@ def test_array_function_obj_tests(stmt, py_expr): ), ( f.regexp_match(column("a"), literal("(ell|orl)")), - pa.array([["ell"], ["orl"], None]), + pa.array([["ell"], ["orl"], None], type=pa.list_(pa.string_view())), ), ( f.regexp_replace(column("a"), literal("(ell|orl)"), literal("-")), diff --git a/src/context.rs b/src/context.rs index f53b15576..ebe7db230 100644 --- a/src/context.rs +++ b/src/context.rs @@ -48,7 +48,7 @@ use crate::utils::{get_tokio_runtime, validate_pycapsule, wait_for_future}; use datafusion::arrow::datatypes::{DataType, Schema, SchemaRef}; use datafusion::arrow::pyarrow::PyArrowType; use datafusion::arrow::record_batch::RecordBatch; -use datafusion::catalog_common::TableReference; +use datafusion::common::TableReference; use datafusion::common::{exec_err, ScalarValue}; use datafusion::datasource::file_format::file_compression_type::FileCompressionType; use datafusion::datasource::file_format::parquet::ParquetFormat; diff --git a/src/dataframe.rs b/src/dataframe.rs index 6fb08ba25..13d7ae838 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -546,7 +546,7 @@ impl PyDataFrame { /// Collect the batches and pass to Arrow Table fn to_arrow_table(&self, py: Python<'_>) -> PyResult { let batches = self.collect(py)?.to_object(py); - let schema: PyObject = self.schema().into_py(py); + let schema: PyObject = self.schema().into_pyobject(py)?.to_object(py); // Instantiate pyarrow Table object and use its from_batches method let table_class = py.import_bound("pyarrow")?.getattr("Table")?; From 40a61c150adee6beb9961302fece81c33639082e Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sun, 16 Feb 2025 02:31:00 +0800 Subject: [PATCH 005/145] 
add to_timestamp_nanos (#1020) --- python/datafusion/functions.py | 1 + python/tests/test_functions.py | 4 ++++ src/functions.rs | 2 ++ 3 files changed, 7 insertions(+) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 7c2fa9a8f..5c260aade 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -252,6 +252,7 @@ "to_hex", "to_timestamp", "to_timestamp_micros", + "to_timestamp_nanos", "to_timestamp_millis", "to_timestamp_seconds", "to_unixtime", diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 796b1f76e..b1a739b49 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -871,6 +871,7 @@ def test_temporal_functions(df): f.to_timestamp_millis(literal("2023-09-07 05:06:14.523952")), f.to_timestamp_micros(literal("2023-09-07 05:06:14.523952")), f.extract(literal("day"), column("d")), + f.to_timestamp_nanos(literal("2023-09-07 05:06:14.523952")), ) result = df.collect() assert len(result) == 1 @@ -909,6 +910,9 @@ def test_temporal_functions(df): [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("us") ) assert result.column(10) == pa.array([31, 26, 2], type=pa.int32()) + assert result.column(11) == pa.array( + [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") + ) def test_arrow_cast(df): diff --git a/src/functions.rs b/src/functions.rs index 46c748cf8..6a8abb18d 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -553,6 +553,7 @@ expr_fn!( expr_fn!(now); expr_fn_vec!(to_timestamp); expr_fn_vec!(to_timestamp_millis); +expr_fn_vec!(to_timestamp_nanos); expr_fn_vec!(to_timestamp_micros); expr_fn_vec!(to_timestamp_seconds); expr_fn_vec!(to_unixtime); @@ -977,6 +978,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(to_hex))?; m.add_wrapped(wrap_pyfunction!(to_timestamp))?; m.add_wrapped(wrap_pyfunction!(to_timestamp_millis))?; + m.add_wrapped(wrap_pyfunction!(to_timestamp_nanos))?; m.add_wrapped(wrap_pyfunction!(to_timestamp_micros))?; m.add_wrapped(wrap_pyfunction!(to_timestamp_seconds))?; m.add_wrapped(wrap_pyfunction!(to_unixtime))?; From 3584bec8900bcfb33bcae4b85a3c47a46b82c72e Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Wed, 19 Feb 2025 20:50:31 -0500 Subject: [PATCH 006/145] [infra] Fail Clippy on rust build warnings (#1029) * pyo3 update required changes to deprecated interfaces * Substrait feature clippy updates * PyTuple was called twice * add -D warnings option --------- Co-authored-by: Tim Saucer --- .github/workflows/test.yaml | 2 +- .pre-commit-config.yaml | 2 +- src/config.rs | 10 +++--- src/context.rs | 12 +++---- src/dataframe.rs | 17 +++++---- src/dataset.rs | 2 +- src/dataset_exec.rs | 8 ++--- src/errors.rs | 4 +++ src/expr.rs | 61 ++++++++++++++++---------------- src/expr/aggregate.rs | 6 ++-- src/expr/analyze.rs | 6 ++-- src/expr/create_memory_table.rs | 6 ++-- src/expr/create_view.rs | 6 ++-- src/expr/distinct.rs | 6 ++-- src/expr/drop_table.rs | 6 ++-- src/expr/empty_relation.rs | 6 ++-- src/expr/explain.rs | 6 ++-- src/expr/extension.rs | 6 ++-- src/expr/filter.rs | 6 ++-- src/expr/join.rs | 6 ++-- src/expr/limit.rs | 6 ++-- src/expr/literal.rs | 6 ++-- src/expr/logical_node.rs | 4 +-- src/expr/projection.rs | 6 ++-- src/expr/repartition.rs | 6 ++-- src/expr/sort.rs | 6 ++-- src/expr/subquery.rs | 6 ++-- src/expr/subquery_alias.rs | 6 ++-- src/expr/table_scan.rs | 6 ++-- src/expr/union.rs | 6 ++-- src/expr/unnest.rs | 6 ++-- src/expr/window.rs | 6 ++-- src/lib.rs | 
10 +++--- src/physical_plan.rs | 2 +- src/pyarrow_filter_expression.rs | 36 ++++++++++--------- src/pyarrow_util.rs | 4 +-- src/sql/logical.rs | 4 +-- src/substrait.rs | 4 +-- src/udaf.rs | 5 +-- src/udf.rs | 5 +-- src/udwf.rs | 44 +++++++++++------------ 41 files changed, 188 insertions(+), 180 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index c93d4c06f..c1d9ac838 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -71,7 +71,7 @@ jobs: - name: Run Clippy if: ${{ matrix.python-version == '3.10' && matrix.toolchain == 'stable' }} - run: cargo clippy --all-targets --all-features -- -D clippy::all -A clippy::redundant_closure + run: cargo clippy --all-targets --all-features -- -D clippy::all -D warnings -A clippy::redundant_closure - name: Install dependencies and build uses: astral-sh/setup-uv@v5 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e20fedf5c..b548ff18f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,7 +40,7 @@ repos: - id: rust-clippy name: Rust clippy description: Run cargo clippy on files included in the commit. clippy should be installed before-hand. - entry: cargo clippy --all-targets --all-features -- -Dclippy::all -Aclippy::redundant_closure + entry: cargo clippy --all-targets --all-features -- -Dclippy::all -D warnings -Aclippy::redundant_closure pass_filenames: false types: [file, rust] language: system diff --git a/src/config.rs b/src/config.rs index cc725b9a3..667d5c590 100644 --- a/src/config.rs +++ b/src/config.rs @@ -47,14 +47,14 @@ impl PyConfig { } /// Get a configuration option - pub fn get(&mut self, key: &str, py: Python) -> PyResult<PyObject> { + pub fn get<'py>(&mut self, key: &str, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> { let options = self.config.to_owned(); for entry in options.entries() { if entry.key == key { - return Ok(entry.value.into_py(py)); + return Ok(entry.value.into_pyobject(py)?); } } - Ok(None::<String>.into_py(py)) + Ok(None::<String>.into_pyobject(py)?)
} /// Set a configuration option @@ -66,10 +66,10 @@ impl PyConfig { /// Get all configuration options pub fn get_all(&mut self, py: Python) -> PyResult<PyObject> { - let dict = PyDict::new_bound(py); + let dict = PyDict::new(py); let options = self.config.to_owned(); for entry in options.entries() { - dict.set_item(entry.key, entry.value.clone().into_py(py))?; + dict.set_item(entry.key, entry.value.clone().into_pyobject(py)?)?; } Ok(dict.into()) } diff --git a/src/context.rs b/src/context.rs index ebe7db230..0f962638e 100644 --- a/src/context.rs +++ b/src/context.rs @@ -458,8 +458,8 @@ impl PySessionContext { let py = data.py(); // Instantiate pyarrow Table object & convert to Arrow Table - let table_class = py.import_bound("pyarrow")?.getattr("Table")?; - let args = PyTuple::new_bound(py, &[data]); + let table_class = py.import("pyarrow")?.getattr("Table")?; + let args = PyTuple::new(py, &[data])?; let table = table_class.call_method1("from_pylist", args)?; // Convert Arrow Table to datafusion DataFrame @@ -478,8 +478,8 @@ impl PySessionContext { let py = data.py(); // Instantiate pyarrow Table object & convert to Arrow Table - let table_class = py.import_bound("pyarrow")?.getattr("Table")?; - let args = PyTuple::new_bound(py, &[data]); + let table_class = py.import("pyarrow")?.getattr("Table")?; + let args = PyTuple::new(py, &[data])?; let table = table_class.call_method1("from_pydict", args)?; // Convert Arrow Table to datafusion DataFrame @@ -533,8 +533,8 @@ impl PySessionContext { let py = data.py(); // Instantiate pyarrow Table object & convert to Arrow Table - let table_class = py.import_bound("pyarrow")?.getattr("Table")?; - let args = PyTuple::new_bound(py, &[data]); + let table_class = py.import("pyarrow")?.getattr("Table")?; + let args = PyTuple::new(py, &[data])?; let table = table_class.call_method1("from_pandas", args)?; // Convert Arrow Table to datafusion DataFrame diff --git a/src/dataframe.rs b/src/dataframe.rs index 13d7ae838..ed9578a71 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -545,12 +545,12 @@ impl PyDataFrame { /// Convert to Arrow Table /// Collect the batches and pass to Arrow Table fn to_arrow_table(&self, py: Python<'_>) -> PyResult<PyObject> { - let batches = self.collect(py)?.to_object(py); - let schema: PyObject = self.schema().into_pyobject(py)?.to_object(py); + let batches = self.collect(py)?.into_pyobject(py)?; + let schema = self.schema().into_pyobject(py)?; // Instantiate pyarrow Table object and use its from_batches method - let table_class = py.import_bound("pyarrow")?.getattr("Table")?; - let args = PyTuple::new_bound(py, &[batches, schema]); + let table_class = py.import("pyarrow")?.getattr("Table")?; + let args = PyTuple::new(py, &[batches, schema])?; let table: PyObject = table_class.call_method1("from_batches", args)?.into(); Ok(table) } @@ -585,8 +585,7 @@ impl PyDataFrame { let ffi_stream = FFI_ArrowArrayStream::new(reader); let stream_capsule_name = CString::new("arrow_array_stream").unwrap(); - PyCapsule::new_bound(py, ffi_stream, Some(stream_capsule_name)) .map_err(PyDataFusionError::from) + PyCapsule::new(py, ffi_stream, Some(stream_capsule_name)).map_err(PyDataFusionError::from) } fn execute_stream(&self, py: Python) -> PyDataFusionResult<PyRecordBatchStream> { @@ -649,8 +648,8 @@ impl PyDataFrame { /// Collect the batches, pass to Arrow Table & then convert to polars DataFrame fn to_polars(&self, py: Python<'_>) -> PyResult<PyObject> { let table = self.to_arrow_table(py)?; - let dataframe = py.import_bound("polars")?.getattr("DataFrame")?; - let args =
PyTuple::new_bound(py, &[table]); + let dataframe = py.import("polars")?.getattr("DataFrame")?; + let args = PyTuple::new(py, &[table])?; let result: PyObject = dataframe.call1(args)?.into(); Ok(result) } @@ -673,7 +672,7 @@ fn print_dataframe(py: Python, df: DataFrame) -> PyDataFusionResult<()> { // Import the Python 'builtins' module to access the print function // Note that println! does not print to the Python debug console and is not visible in notebooks for instance - let print = py.import_bound("builtins")?.getattr("print")?; + let print = py.import("builtins")?.getattr("print")?; print.call1((result,))?; Ok(()) } diff --git a/src/dataset.rs b/src/dataset.rs index a8fa21ec5..0baf4da2a 100644 --- a/src/dataset.rs +++ b/src/dataset.rs @@ -48,7 +48,7 @@ impl Dataset { // Creates a Python PyArrow.Dataset pub fn new(dataset: &Bound<'_, PyAny>, py: Python) -> PyResult { // Ensure that we were passed an instance of pyarrow.dataset.Dataset - let ds = PyModule::import_bound(py, "pyarrow.dataset")?; + let ds = PyModule::import(py, "pyarrow.dataset")?; let ds_attr = ds.getattr("Dataset")?; let ds_type = ds_attr.downcast::()?; if dataset.is_instance(ds_type)? { diff --git a/src/dataset_exec.rs b/src/dataset_exec.rs index ace42115b..445e4fe74 100644 --- a/src/dataset_exec.rs +++ b/src/dataset_exec.rs @@ -104,7 +104,7 @@ impl DatasetExec { }) .transpose()?; - let kwargs = PyDict::new_bound(py); + let kwargs = PyDict::new(py); kwargs.set_item("columns", columns.clone())?; kwargs.set_item( @@ -121,7 +121,7 @@ impl DatasetExec { .0, ); - let builtins = Python::import_bound(py, "builtins")?; + let builtins = Python::import(py, "builtins")?; let pylist = builtins.getattr("list")?; // Get the fragments or partitions of the dataset @@ -198,7 +198,7 @@ impl ExecutionPlan for DatasetExec { let dataset_schema = dataset .getattr("schema") .map_err(|err| InnerDataFusionError::External(Box::new(err)))?; - let kwargs = PyDict::new_bound(py); + let kwargs = PyDict::new(py); kwargs .set_item("columns", self.columns.clone()) .map_err(|err| InnerDataFusionError::External(Box::new(err)))?; @@ -223,7 +223,7 @@ impl ExecutionPlan for DatasetExec { let record_batches: Bound<'_, PyIterator> = scanner .call_method0("to_batches") .map_err(|err| InnerDataFusionError::External(Box::new(err)))? 
- .iter() + .try_iter() .map_err(|err| InnerDataFusionError::External(Box::new(err)))?; let record_batches = PyArrowBatchesAdapter { diff --git a/src/errors.rs b/src/errors.rs index b02b754a2..f1d5aeb23 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -91,3 +91,7 @@ pub fn py_datafusion_err(e: impl Debug) -> PyErr { pub fn py_unsupported_variant_err(e: impl Debug) -> PyErr { PyErr::new::(format!("{e:?}")) } + +pub fn to_datafusion_err(e: impl Debug) -> InnerDataFusionError { + InnerDataFusionError::Execution(format!("{e:?}")) +} diff --git a/src/expr.rs b/src/expr.rs index 1e9983d42..e750be6a4 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -19,6 +19,7 @@ use datafusion::logical_expr::utils::exprlist_to_fields; use datafusion::logical_expr::{ ExprFuncBuilder, ExprFunctionExt, LogicalPlan, WindowFunctionDefinition, }; +use pyo3::IntoPyObjectExt; use pyo3::{basic::CompareOp, prelude::*}; use std::convert::{From, Into}; use std::sync::Arc; @@ -126,35 +127,35 @@ pub fn py_expr_list(expr: &[Expr]) -> PyResult<Vec<PyExpr>> { #[pymethods] impl PyExpr { /// Return the specific expression - fn to_variant(&self, py: Python) -> PyResult<PyObject> { + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> { Python::with_gil(|_| { match &self.expr { - Expr::Alias(alias) => Ok(PyAlias::from(alias.clone()).into_py(py)), - Expr::Column(col) => Ok(PyColumn::from(col.clone()).into_py(py)), + Expr::Alias(alias) => Ok(PyAlias::from(alias.clone()).into_bound_py_any(py)?), + Expr::Column(col) => Ok(PyColumn::from(col.clone()).into_bound_py_any(py)?), Expr::ScalarVariable(data_type, variables) => { - Ok(PyScalarVariable::new(data_type, variables).into_py(py)) + Ok(PyScalarVariable::new(data_type, variables).into_bound_py_any(py)?) } - Expr::Like(value) => Ok(PyLike::from(value.clone()).into_py(py)), - Expr::Literal(value) => Ok(PyLiteral::from(value.clone()).into_py(py)), - Expr::BinaryExpr(expr) => Ok(PyBinaryExpr::from(expr.clone()).into_py(py)), - Expr::Not(expr) => Ok(PyNot::new(*expr.clone()).into_py(py)), - Expr::IsNotNull(expr) => Ok(PyIsNotNull::new(*expr.clone()).into_py(py)), - Expr::IsNull(expr) => Ok(PyIsNull::new(*expr.clone()).into_py(py)), - Expr::IsTrue(expr) => Ok(PyIsTrue::new(*expr.clone()).into_py(py)), - Expr::IsFalse(expr) => Ok(PyIsFalse::new(*expr.clone()).into_py(py)), - Expr::IsUnknown(expr) => Ok(PyIsUnknown::new(*expr.clone()).into_py(py)), - Expr::IsNotTrue(expr) => Ok(PyIsNotTrue::new(*expr.clone()).into_py(py)), - Expr::IsNotFalse(expr) => Ok(PyIsNotFalse::new(*expr.clone()).into_py(py)), - Expr::IsNotUnknown(expr) => Ok(PyIsNotUnknown::new(*expr.clone()).into_py(py)), - Expr::Negative(expr) => Ok(PyNegative::new(*expr.clone()).into_py(py)), + Expr::Like(value) => Ok(PyLike::from(value.clone()).into_bound_py_any(py)?), + Expr::Literal(value) => Ok(PyLiteral::from(value.clone()).into_bound_py_any(py)?), + Expr::BinaryExpr(expr) => Ok(PyBinaryExpr::from(expr.clone()).into_bound_py_any(py)?), + Expr::Not(expr) => Ok(PyNot::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsNotNull(expr) => Ok(PyIsNotNull::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsNull(expr) => Ok(PyIsNull::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsTrue(expr) => Ok(PyIsTrue::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsFalse(expr) => Ok(PyIsFalse::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsUnknown(expr) => Ok(PyIsUnknown::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsNotTrue(expr) => Ok(PyIsNotTrue::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsNotFalse(expr) =>
Ok(PyIsNotFalse::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsNotUnknown(expr) => Ok(PyIsNotUnknown::new(*expr.clone()).into_bound_py_any(py)?), + Expr::Negative(expr) => Ok(PyNegative::new(*expr.clone()).into_bound_py_any(py)?), Expr::AggregateFunction(expr) => { - Ok(PyAggregateFunction::from(expr.clone()).into_py(py)) + Ok(PyAggregateFunction::from(expr.clone()).into_bound_py_any(py)?) } - Expr::SimilarTo(value) => Ok(PySimilarTo::from(value.clone()).into_py(py)), - Expr::Between(value) => Ok(between::PyBetween::from(value.clone()).into_py(py)), - Expr::Case(value) => Ok(case::PyCase::from(value.clone()).into_py(py)), - Expr::Cast(value) => Ok(cast::PyCast::from(value.clone()).into_py(py)), - Expr::TryCast(value) => Ok(cast::PyTryCast::from(value.clone()).into_py(py)), + Expr::SimilarTo(value) => Ok(PySimilarTo::from(value.clone()).into_bound_py_any(py)?), + Expr::Between(value) => Ok(between::PyBetween::from(value.clone()).into_bound_py_any(py)?), + Expr::Case(value) => Ok(case::PyCase::from(value.clone()).into_bound_py_any(py)?), + Expr::Cast(value) => Ok(cast::PyCast::from(value.clone()).into_bound_py_any(py)?), + Expr::TryCast(value) => Ok(cast::PyTryCast::from(value.clone()).into_bound_py_any(py)?), Expr::ScalarFunction(value) => Err(py_unsupported_variant_err(format!( "Converting Expr::ScalarFunction to a Python object is not implemented: {:?}", value @@ -163,29 +164,29 @@ impl PyExpr { "Converting Expr::WindowFunction to a Python object is not implemented: {:?}", value ))), - Expr::InList(value) => Ok(in_list::PyInList::from(value.clone()).into_py(py)), - Expr::Exists(value) => Ok(exists::PyExists::from(value.clone()).into_py(py)), + Expr::InList(value) => Ok(in_list::PyInList::from(value.clone()).into_bound_py_any(py)?), + Expr::Exists(value) => Ok(exists::PyExists::from(value.clone()).into_bound_py_any(py)?), Expr::InSubquery(value) => { - Ok(in_subquery::PyInSubquery::from(value.clone()).into_py(py)) + Ok(in_subquery::PyInSubquery::from(value.clone()).into_bound_py_any(py)?) } Expr::ScalarSubquery(value) => { - Ok(scalar_subquery::PyScalarSubquery::from(value.clone()).into_py(py)) + Ok(scalar_subquery::PyScalarSubquery::from(value.clone()).into_bound_py_any(py)?) } Expr::Wildcard { qualifier, options } => Err(py_unsupported_variant_err(format!( "Converting Expr::Wildcard to a Python object is not implemented : {:?} {:?}", qualifier, options ))), Expr::GroupingSet(value) => { - Ok(grouping_set::PyGroupingSet::from(value.clone()).into_py(py)) + Ok(grouping_set::PyGroupingSet::from(value.clone()).into_bound_py_any(py)?) } Expr::Placeholder(value) => { - Ok(placeholder::PyPlaceholder::from(value.clone()).into_py(py)) + Ok(placeholder::PyPlaceholder::from(value.clone()).into_bound_py_any(py)?) 
} Expr::OuterReferenceColumn(data_type, column) => Err(py_unsupported_variant_err(format!( "Converting Expr::OuterReferenceColumn to a Python object is not implemented: {:?} - {:?}", data_type, column ))), - Expr::Unnest(value) => Ok(unnest_expr::PyUnnestExpr::from(value.clone()).into_py(py)), + Expr::Unnest(value) => Ok(unnest_expr::PyUnnestExpr::from(value.clone()).into_bound_py_any(py)?), } }) } diff --git a/src/expr/aggregate.rs b/src/expr/aggregate.rs index 389bfb332..8fc9da5b0 100644 --- a/src/expr/aggregate.rs +++ b/src/expr/aggregate.rs @@ -19,7 +19,7 @@ use datafusion::common::DataFusionError; use datafusion::logical_expr::expr::{AggregateFunction, Alias}; use datafusion::logical_expr::logical_plan::Aggregate; use datafusion::logical_expr::Expr; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use super::logical_node::LogicalNode; @@ -151,7 +151,7 @@ impl LogicalNode for PyAggregate { vec![PyLogicalPlan::from((*self.aggregate.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/analyze.rs b/src/expr/analyze.rs index 084513971..62f93cd26 100644 --- a/src/expr/analyze.rs +++ b/src/expr/analyze.rs @@ -16,7 +16,7 @@ // under the License. use datafusion::logical_expr::logical_plan::Analyze; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use super::logical_node::LogicalNode; @@ -78,7 +78,7 @@ impl LogicalNode for PyAnalyze { vec![PyLogicalPlan::from((*self.analyze.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/create_memory_table.rs b/src/expr/create_memory_table.rs index 01ebb66b0..8872b2d47 100644 --- a/src/expr/create_memory_table.rs +++ b/src/expr/create_memory_table.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::CreateMemoryTable; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::sql::logical::PyLogicalPlan; @@ -91,7 +91,7 @@ impl LogicalNode for PyCreateMemoryTable { vec![PyLogicalPlan::from((*self.create.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/create_view.rs b/src/expr/create_view.rs index d119f5c21..87bb76876 100644 --- a/src/expr/create_view.rs +++ b/src/expr/create_view.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::{CreateView, DdlStatement, LogicalPlan}; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::{errors::py_type_err, sql::logical::PyLogicalPlan}; @@ -88,8 +88,8 @@ impl LogicalNode for PyCreateView { vec![PyLogicalPlan::from((*self.create.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/distinct.rs b/src/expr/distinct.rs index 061ab4824..b62b776f8 100644 --- a/src/expr/distinct.rs +++ b/src/expr/distinct.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::Distinct; -use 
pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::sql::logical::PyLogicalPlan; @@ -89,7 +89,7 @@ impl LogicalNode for PyDistinct { } } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/drop_table.rs b/src/expr/drop_table.rs index 330156abe..96983c1cf 100644 --- a/src/expr/drop_table.rs +++ b/src/expr/drop_table.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::logical_plan::DropTable; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::sql::logical::PyLogicalPlan; @@ -83,7 +83,7 @@ impl LogicalNode for PyDropTable { vec![] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/empty_relation.rs b/src/expr/empty_relation.rs index ce7163466..a1534ac15 100644 --- a/src/expr/empty_relation.rs +++ b/src/expr/empty_relation.rs @@ -17,7 +17,7 @@ use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; use datafusion::logical_expr::EmptyRelation; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use super::logical_node::LogicalNode; @@ -79,7 +79,7 @@ impl LogicalNode for PyEmptyRelation { vec![] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/explain.rs b/src/expr/explain.rs index 8e7fb8843..fc02fe2b5 100644 --- a/src/expr/explain.rs +++ b/src/expr/explain.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::{logical_plan::Explain, LogicalPlan}; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::{common::df_schema::PyDFSchema, errors::py_type_err, sql::logical::PyLogicalPlan}; @@ -104,7 +104,7 @@ impl LogicalNode for PyExplain { vec![] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/extension.rs b/src/expr/extension.rs index a29802b0b..1e3fbb199 100644 --- a/src/expr/extension.rs +++ b/src/expr/extension.rs @@ -16,7 +16,7 @@ // under the License. use datafusion::logical_expr::Extension; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::sql::logical::PyLogicalPlan; @@ -46,7 +46,7 @@ impl LogicalNode for PyExtension { vec![] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/filter.rs b/src/expr/filter.rs index a6d8aa7ee..9bdb667cd 100644 --- a/src/expr/filter.rs +++ b/src/expr/filter.rs @@ -16,7 +16,7 @@ // under the License. 
use datafusion::logical_expr::logical_plan::Filter; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; @@ -81,7 +81,7 @@ impl LogicalNode for PyFilter { vec![PyLogicalPlan::from((*self.filter.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/join.rs b/src/expr/join.rs index 66e677f8a..76ec532e7 100644 --- a/src/expr/join.rs +++ b/src/expr/join.rs @@ -16,7 +16,7 @@ // under the License. use datafusion::logical_expr::logical_plan::{Join, JoinConstraint, JoinType}; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; @@ -193,7 +193,7 @@ impl LogicalNode for PyJoin { ] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/limit.rs b/src/expr/limit.rs index 84ad7d68b..c2a33ff89 100644 --- a/src/expr/limit.rs +++ b/src/expr/limit.rs @@ -16,7 +16,7 @@ // under the License. use datafusion::logical_expr::logical_plan::Limit; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; @@ -90,7 +90,7 @@ impl LogicalNode for PyLimit { vec![PyLogicalPlan::from((*self.limit.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/literal.rs b/src/expr/literal.rs index 2cb2079f1..a660ac914 100644 --- a/src/expr/literal.rs +++ b/src/expr/literal.rs @@ -17,7 +17,7 @@ use crate::errors::PyDataFusionError; use datafusion::common::ScalarValue; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; #[pyclass(name = "Literal", module = "datafusion.expr", subclass)] #[derive(Clone)] @@ -144,8 +144,8 @@ impl PyLiteral { } #[allow(clippy::wrong_self_convention)] - fn into_type(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn into_type<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } fn __repr__(&self) -> PyResult { diff --git a/src/expr/logical_node.rs b/src/expr/logical_node.rs index 757e4f94b..5aff70059 100644 --- a/src/expr/logical_node.rs +++ b/src/expr/logical_node.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use pyo3::{PyObject, PyResult, Python}; +use pyo3::{Bound, PyAny, PyResult, Python}; use crate::sql::logical::PyLogicalPlan; @@ -25,5 +25,5 @@ pub trait LogicalNode { /// The input plan to the current logical node instance. 
fn inputs(&self) -> Vec; - fn to_variant(&self, py: Python) -> PyResult; + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult>; } diff --git a/src/expr/projection.rs b/src/expr/projection.rs index 36534fdb2..dc7e5e3c1 100644 --- a/src/expr/projection.rs +++ b/src/expr/projection.rs @@ -17,7 +17,7 @@ use datafusion::logical_expr::logical_plan::Projection; use datafusion::logical_expr::Expr; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; @@ -113,7 +113,7 @@ impl LogicalNode for PyProjection { vec![PyLogicalPlan::from((*self.projection.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/repartition.rs b/src/expr/repartition.rs index 4e680e181..3e782d6af 100644 --- a/src/expr/repartition.rs +++ b/src/expr/repartition.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::{logical_plan::Repartition, Expr, Partitioning}; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::{errors::py_type_err, sql::logical::PyLogicalPlan}; @@ -121,7 +121,7 @@ impl LogicalNode for PyRepartition { vec![PyLogicalPlan::from((*self.repartition.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/sort.rs b/src/expr/sort.rs index a1803ccaf..ed4947591 100644 --- a/src/expr/sort.rs +++ b/src/expr/sort.rs @@ -17,7 +17,7 @@ use datafusion::common::DataFusionError; use datafusion::logical_expr::logical_plan::Sort; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; @@ -96,7 +96,7 @@ impl LogicalNode for PySort { vec![PyLogicalPlan::from((*self.sort.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/subquery.rs b/src/expr/subquery.rs index dac8d0a2b..5ebfe6927 100644 --- a/src/expr/subquery.rs +++ b/src/expr/subquery.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::Subquery; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::sql::logical::PyLogicalPlan; @@ -75,7 +75,7 @@ impl LogicalNode for PySubquery { vec![] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/subquery_alias.rs b/src/expr/subquery_alias.rs index a83cff96d..267a4d485 100644 --- a/src/expr/subquery_alias.rs +++ b/src/expr/subquery_alias.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::SubqueryAlias; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; @@ -85,7 +85,7 @@ impl LogicalNode for PySubqueryAlias { vec![PyLogicalPlan::from((*self.subquery_alias.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff 
--git a/src/expr/table_scan.rs b/src/expr/table_scan.rs index f61be7fe4..6a0d53f0f 100644 --- a/src/expr/table_scan.rs +++ b/src/expr/table_scan.rs @@ -17,7 +17,7 @@ use datafusion::common::TableReference; use datafusion::logical_expr::logical_plan::TableScan; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::expr::logical_node::LogicalNode; @@ -146,7 +146,7 @@ impl LogicalNode for PyTableScan { vec![] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/union.rs b/src/expr/union.rs index 62488d9a1..5a08ccc13 100644 --- a/src/expr/union.rs +++ b/src/expr/union.rs @@ -16,7 +16,7 @@ // under the License. use datafusion::logical_expr::logical_plan::Union; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; @@ -83,7 +83,7 @@ impl LogicalNode for PyUnion { .collect() } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/unnest.rs b/src/expr/unnest.rs index adc705035..8e70e0990 100644 --- a/src/expr/unnest.rs +++ b/src/expr/unnest.rs @@ -16,7 +16,7 @@ // under the License. use datafusion::logical_expr::logical_plan::Unnest; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; @@ -79,7 +79,7 @@ impl LogicalNode for PyUnnest { vec![PyLogicalPlan::from((*self.unnest_.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/window.rs b/src/expr/window.rs index 4dc6cb9c9..13deaec25 100644 --- a/src/expr/window.rs +++ b/src/expr/window.rs @@ -18,7 +18,7 @@ use datafusion::common::{DataFusionError, ScalarValue}; use datafusion::logical_expr::expr::WindowFunction; use datafusion::logical_expr::{Expr, Window, WindowFrame, WindowFrameBound, WindowFrameUnits}; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::data_type::PyScalarValue; @@ -289,7 +289,7 @@ impl LogicalNode for PyWindowExpr { vec![self.window.input.as_ref().clone().into()] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/lib.rs b/src/lib.rs index 317c3a49a..ce93ff0c3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -94,21 +94,21 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; // Register `common` as a submodule. Matching `datafusion-common` https://docs.rs/datafusion-common/latest/datafusion_common/ - let common = PyModule::new_bound(py, "common")?; + let common = PyModule::new(py, "common")?; common::init_module(&common)?; m.add_submodule(&common)?; // Register `expr` as a submodule. 
Matching `datafusion-expr` https://docs.rs/datafusion-expr/latest/datafusion_expr/ - let expr = PyModule::new_bound(py, "expr")?; + let expr = PyModule::new(py, "expr")?; expr::init_module(&expr)?; m.add_submodule(&expr)?; // Register the functions as a submodule - let funcs = PyModule::new_bound(py, "functions")?; + let funcs = PyModule::new(py, "functions")?; functions::init_module(&funcs)?; m.add_submodule(&funcs)?; - let store = PyModule::new_bound(py, "object_store")?; + let store = PyModule::new(py, "object_store")?; store::init_module(&store)?; m.add_submodule(&store)?; @@ -121,7 +121,7 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { #[cfg(feature = "substrait")] fn setup_substrait_module(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { - let substrait = PyModule::new_bound(py, "substrait")?; + let substrait = PyModule::new(py, "substrait")?; substrait::init_module(&substrait)?; m.add_submodule(&substrait)?; Ok(()) diff --git a/src/physical_plan.rs b/src/physical_plan.rs index 295908dc7..f0be45c6a 100644 --- a/src/physical_plan.rs +++ b/src/physical_plan.rs @@ -66,7 +66,7 @@ impl PyExecutionPlan { )?; let bytes = proto.encode_to_vec(); - Ok(PyBytes::new_bound(py, &bytes)) + Ok(PyBytes::new(py, &bytes)) } #[staticmethod] diff --git a/src/pyarrow_filter_expression.rs b/src/pyarrow_filter_expression.rs index 314eebf4f..4b4c86597 100644 --- a/src/pyarrow_filter_expression.rs +++ b/src/pyarrow_filter_expression.rs @@ -16,7 +16,7 @@ // under the License. /// Converts a Datafusion logical plan expression (Expr) into a PyArrow compute expression -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::convert::TryFrom; use std::result::Result; @@ -53,24 +53,28 @@ fn operator_to_py<'py>( Ok(py_op) } -fn extract_scalar_list(exprs: &[Expr], py: Python) -> PyDataFusionResult> { +fn extract_scalar_list<'py>( + exprs: &[Expr], + py: Python<'py>, +) -> PyDataFusionResult>> { let ret = exprs .iter() .map(|expr| match expr { // TODO: should we also leverage `ScalarValue::to_pyarrow` here? 
Expr::Literal(v) => match v { - ScalarValue::Boolean(Some(b)) => Ok(b.into_py(py)), - ScalarValue::Int8(Some(i)) => Ok(i.into_py(py)), - ScalarValue::Int16(Some(i)) => Ok(i.into_py(py)), - ScalarValue::Int32(Some(i)) => Ok(i.into_py(py)), - ScalarValue::Int64(Some(i)) => Ok(i.into_py(py)), - ScalarValue::UInt8(Some(i)) => Ok(i.into_py(py)), - ScalarValue::UInt16(Some(i)) => Ok(i.into_py(py)), - ScalarValue::UInt32(Some(i)) => Ok(i.into_py(py)), - ScalarValue::UInt64(Some(i)) => Ok(i.into_py(py)), - ScalarValue::Float32(Some(f)) => Ok(f.into_py(py)), - ScalarValue::Float64(Some(f)) => Ok(f.into_py(py)), - ScalarValue::Utf8(Some(s)) => Ok(s.into_py(py)), + // These conversions are infallible, so the `?` below never produces an error + ScalarValue::Boolean(Some(b)) => Ok(b.into_bound_py_any(py)?), + ScalarValue::Int8(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::Int16(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::Int32(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::Int64(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::UInt8(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::UInt16(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::UInt32(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::UInt64(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::Float32(Some(f)) => Ok(f.into_bound_py_any(py)?), + ScalarValue::Float64(Some(f)) => Ok(f.into_bound_py_any(py)?), + ScalarValue::Utf8(Some(s)) => Ok(s.into_bound_py_any(py)?), _ => Err(PyDataFusionError::Common(format!( "PyArrow can't handle ScalarValue: {v:?}" ))), @@ -98,8 +102,8 @@ impl TryFrom<&Expr> for PyArrowFilterExpression { // https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Expression.html#pyarrow-dataset-expression fn try_from(expr: &Expr) -> Result<Self, Self::Error> { Python::with_gil(|py| { - let pc = Python::import_bound(py, "pyarrow.compute")?; - let op_module = Python::import_bound(py, "operator")?; + let pc = Python::import(py, "pyarrow.compute")?; + let op_module = Python::import(py, "operator")?; let pc_expr: PyDataFusionResult<Bound<'_, PyAny>> = match expr { Expr::Column(Column { name, ..
}) => Ok(pc.getattr("field")?.call1((name,))?), Expr::Literal(scalar) => Ok(scalar_to_pyarrow(scalar, py)?.into_bound(py)), diff --git a/src/pyarrow_util.rs b/src/pyarrow_util.rs index 2b31467f8..cab708458 100644 --- a/src/pyarrow_util.rs +++ b/src/pyarrow_util.rs @@ -33,8 +33,8 @@ impl FromPyArrow for PyScalarValue { let val = value.call_method0("as_py")?; // construct pyarrow array from the python value and pyarrow type - let factory = py.import_bound("pyarrow")?.getattr("array")?; - let args = PyList::new_bound(py, [val]); + let factory = py.import("pyarrow")?.getattr("array")?; + let args = PyList::new(py, [val])?; let array = factory.call1((args, typ))?; // convert the pyarrow array to rust array using C data interface diff --git a/src/sql/logical.rs b/src/sql/logical.rs index 1be33b75f..96561c434 100644 --- a/src/sql/logical.rs +++ b/src/sql/logical.rs @@ -64,7 +64,7 @@ impl PyLogicalPlan { #[pymethods] impl PyLogicalPlan { /// Return the specific logical operator - pub fn to_variant(&self, py: Python) -> PyResult { + pub fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { match self.plan.as_ref() { LogicalPlan::Aggregate(plan) => PyAggregate::from(plan.clone()).to_variant(py), LogicalPlan::Analyze(plan) => PyAnalyze::from(plan.clone()).to_variant(py), @@ -132,7 +132,7 @@ impl PyLogicalPlan { datafusion_proto::protobuf::LogicalPlanNode::try_from_logical_plan(&self.plan, &codec)?; let bytes = proto.encode_to_vec(); - Ok(PyBytes::new_bound(py, &bytes)) + Ok(PyBytes::new(py, &bytes)) } #[staticmethod] diff --git a/src/substrait.rs b/src/substrait.rs index 8dcf3e8a7..1fefc0bbd 100644 --- a/src/substrait.rs +++ b/src/substrait.rs @@ -40,7 +40,7 @@ impl PyPlan { self.plan .encode(&mut proto_bytes) .map_err(PyDataFusionError::EncodeError)?; - Ok(PyBytes::new_bound(py, &proto_bytes).unbind().into()) + Ok(PyBytes::new(py, &proto_bytes).into()) } } @@ -95,7 +95,7 @@ impl PySubstraitSerializer { py: Python, ) -> PyDataFusionResult { let proto_bytes: Vec = wait_for_future(py, serializer::serialize_bytes(sql, &ctx.ctx))?; - Ok(PyBytes::new_bound(py, &proto_bytes).unbind().into()) + Ok(PyBytes::new(py, &proto_bytes).into()) } #[staticmethod] diff --git a/src/udaf.rs b/src/udaf.rs index 5f21533e0..34a9cd51d 100644 --- a/src/udaf.rs +++ b/src/udaf.rs @@ -29,6 +29,7 @@ use datafusion::logical_expr::{ }; use crate::common::data_type::PyScalarValue; +use crate::errors::to_datafusion_err; use crate::expr::PyExpr; use crate::utils::parse_volatility; @@ -73,7 +74,7 @@ impl Accumulator for RustAccumulator { .iter() .map(|arg| arg.into_data().to_pyarrow(py).unwrap()) .collect::>(); - let py_args = PyTuple::new_bound(py, py_args); + let py_args = PyTuple::new(py, py_args).map_err(to_datafusion_err)?; // 2. call function self.accum @@ -119,7 +120,7 @@ impl Accumulator for RustAccumulator { .iter() .map(|arg| arg.into_data().to_pyarrow(py).unwrap()) .collect::>(); - let py_args = PyTuple::new_bound(py, py_args); + let py_args = PyTuple::new(py, py_args).map_err(to_datafusion_err)?; // 2. 
call function self.accum diff --git a/src/udf.rs b/src/udf.rs index 4570e77a6..574c9d7b5 100644 --- a/src/udf.rs +++ b/src/udf.rs @@ -28,6 +28,7 @@ use datafusion::logical_expr::function::ScalarFunctionImplementation; use datafusion::logical_expr::ScalarUDF; use datafusion::logical_expr::{create_udf, ColumnarValue}; +use crate::errors::to_datafusion_err; use crate::expr::PyExpr; use crate::utils::parse_volatility; @@ -46,11 +47,11 @@ fn pyarrow_function_to_rust( .map_err(|e| DataFusionError::Execution(format!("{e:?}"))) }) .collect::, _>>()?; - let py_args = PyTuple::new_bound(py, py_args); + let py_args = PyTuple::new(py, py_args).map_err(to_datafusion_err)?; // 2. call function let value = func - .call_bound(py, py_args, None) + .call(py, py_args, None) .map_err(|e| DataFusionError::Execution(format!("{e:?}")))?; // 3. cast to arrow::array::Array diff --git a/src/udwf.rs b/src/udwf.rs index 04a4a1640..defd9c522 100644 --- a/src/udwf.rs +++ b/src/udwf.rs @@ -27,6 +27,7 @@ use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use crate::common::data_type::PyScalarValue; +use crate::errors::to_datafusion_err; use crate::expr::PyExpr; use crate::utils::parse_volatility; use datafusion::arrow::datatypes::DataType; @@ -56,8 +57,8 @@ impl PartitionEvaluator for RustPartitionEvaluator { fn get_range(&self, idx: usize, n_rows: usize) -> Result> { Python::with_gil(|py| { - let py_args = vec![idx.to_object(py), n_rows.to_object(py)]; - let py_args = PyTuple::new_bound(py, py_args); + let py_args = vec![idx.into_pyobject(py)?, n_rows.into_pyobject(py)?]; + let py_args = PyTuple::new(py, py_args)?; self.evaluator .bind(py) @@ -93,17 +94,14 @@ impl PartitionEvaluator for RustPartitionEvaluator { fn evaluate_all(&mut self, values: &[ArrayRef], num_rows: usize) -> Result { println!("evaluate all called with number of values {}", values.len()); Python::with_gil(|py| { - let py_values = PyList::new_bound( + let py_values = PyList::new( py, values .iter() .map(|arg| arg.into_data().to_pyarrow(py).unwrap()), - ); - let py_num_rows = num_rows.to_object(py).into_bound(py); - let py_args = PyTuple::new_bound( - py, - PyTuple::new_bound(py, vec![py_values.as_any(), &py_num_rows]), - ); + )?; + let py_num_rows = num_rows.into_pyobject(py)?; + let py_args = PyTuple::new(py, vec![py_values.as_any(), &py_num_rows])?; self.evaluator .bind(py) @@ -112,32 +110,28 @@ impl PartitionEvaluator for RustPartitionEvaluator { let array_data = ArrayData::from_pyarrow_bound(&v).unwrap(); make_array(array_data) }) - .map_err(|e| DataFusionError::Execution(format!("{e}"))) }) + .map_err(to_datafusion_err) } fn evaluate(&mut self, values: &[ArrayRef], range: &Range) -> Result { Python::with_gil(|py| { - let py_values = PyList::new_bound( + let py_values = PyList::new( py, values .iter() .map(|arg| arg.into_data().to_pyarrow(py).unwrap()), - ); - let range_tuple = - PyTuple::new_bound(py, vec![range.start.to_object(py), range.end.to_object(py)]); - let py_args = PyTuple::new_bound( - py, - PyTuple::new_bound(py, vec![py_values.as_any(), range_tuple.as_any()]), - ); + )?; + let range_tuple = PyTuple::new(py, vec![range.start, range.end])?; + let py_args = PyTuple::new(py, vec![py_values.as_any(), range_tuple.as_any()])?; self.evaluator .bind(py) .call_method1("evaluate", py_args) .and_then(|v| v.extract::()) .map(|v| v.0) - .map_err(|e| DataFusionError::Execution(format!("{e}"))) }) + .map_err(to_datafusion_err) } fn evaluate_all_with_rank( @@ -148,23 +142,27 @@ impl PartitionEvaluator for RustPartitionEvaluator { 
Python::with_gil(|py| { let ranks = ranks_in_partition .iter() - .map(|r| PyTuple::new_bound(py, vec![r.start, r.end])); + .map(|r| PyTuple::new(py, vec![r.start, r.end])) + .collect::>>()?; // 1. cast args to Pyarrow array - let py_args = vec![num_rows.to_object(py), PyList::new_bound(py, ranks).into()]; + let py_args = vec![ + num_rows.into_pyobject(py)?.into_any(), + PyList::new(py, ranks)?.into_any(), + ]; - let py_args = PyTuple::new_bound(py, py_args); + let py_args = PyTuple::new(py, py_args)?; // 2. call function self.evaluator .bind(py) .call_method1("evaluate_all_with_rank", py_args) - .map_err(|e| DataFusionError::Execution(format!("{e}"))) .map(|v| { let array_data = ArrayData::from_pyarrow_bound(&v).unwrap(); make_array(array_data) }) }) + .map_err(to_datafusion_err) } fn supports_bounded_execution(&self) -> bool { From e6f6e66c1d180246ad933f8bcc0d40faa8426dfa Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 21 Feb 2025 16:03:36 -0500 Subject: [PATCH 007/145] Add user documentation for the FFI approach (#1031) * Initial commit for FFI user documentation * Update readme to point to the online documentation. Fix a small typo. * Small text adjustments for clarity and formatting --- README.md | 11 +- docs/source/contributor-guide/ffi.rst | 212 ++++++++++++++++++++++++++ docs/source/index.rst | 1 + 3 files changed, 220 insertions(+), 4 deletions(-) create mode 100644 docs/source/contributor-guide/ffi.rst diff --git a/README.md b/README.md index 5aaf7f5f3..9c56b62dd 100644 --- a/README.md +++ b/README.md @@ -30,10 +30,8 @@ DataFusion's Python bindings can be used as a foundation for building new data s planning, and logical plan optimizations, and then transpiles the logical plan to Dask operations for execution. - [DataFusion Ballista](https://github.com/apache/datafusion-ballista) is a distributed SQL query engine that extends DataFusion's Python bindings for distributed use cases. - -It is also possible to use these Python bindings directly for DataFrame and SQL operations, but you may find that -[Polars](http://pola.rs/) and [DuckDB](http://www.duckdb.org/) are more suitable for this use case, since they have -more of an end-user focus and are more actively maintained than these Python bindings. +- [DataFusion Ray](https://github.com/apache/datafusion-ray) is another distributed query engine that uses + DataFusion's Python bindings. ## Features @@ -114,6 +112,11 @@ Printing the context will show the current configuration settings. print(ctx) ``` +## Extensions + +For information about how to extend DataFusion Python, please see the extensions page of the +[online documentation](https://datafusion.apache.org/python/). + ## More Examples See [examples](examples/README.md) for more information. diff --git a/docs/source/contributor-guide/ffi.rst b/docs/source/contributor-guide/ffi.rst new file mode 100644 index 000000000..c1f9806b3 --- /dev/null +++ b/docs/source/contributor-guide/ffi.rst @@ -0,0 +1,212 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. 
software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Python Extensions
+=================
+
+The DataFusion in Python project is designed to allow users to extend its functionality in a few core
+areas. Ideally, many users would like to package their extensions as a Python package and easily
+integrate that package with this project. This page serves to describe some of the challenges we face
+when doing these integrations and the approach our project uses.
+
+The Primary Issue
+-----------------
+
+Suppose you wish to use DataFusion and you have a custom data source that can produce tables that
+can then be queried against, similar to how you can register a :ref:`CSV ` or
+:ref:`Parquet ` file. In DataFusion terminology, you likely want to implement a
+:ref:`Custom Table Provider `. In an effort to make your data source
+as performant as possible and to utilize the features of DataFusion, you may decide to write
+your source in Rust and then expose it through `PyO3 `_ as a Python library.
+
+At first glance, it may appear the best way to do this is to add the ``datafusion-python``
+crate as a dependency, provide a ``PyTable``, and then register it with the
+``SessionContext``. Unfortunately, this will not work.
+
+When you produce your code as a Python library and it needs to interact with the DataFusion
+library, at the lowest level they communicate through an Application Binary Interface (ABI).
+The acronym sounds similar to API (Application Programming Interface), but it is distinctly
+different.
+
+The ABI sets the standard for how these libraries can share data and functions between each
+other. One of the key differences between Rust and other programming languages is that Rust
+does not have a stable ABI. What this means in practice is that if you compile a Rust library
+with one version of the ``rustc`` compiler and I compile another library to interface with it
+but I use a different version of the compiler, there is no guarantee the interface will be
+the same.
+
+In practice, this means that a Python library built with ``datafusion-python`` as a Rust
+dependency will generally **not** be compatible with the DataFusion Python package, even
+if they reference the same version of ``datafusion-python``. If you attempt to do this, it may
+work on your local computer if you have built both packages with the same optimizations.
+This can sometimes lead to a false expectation that the code will work, but it frequently
+breaks the moment you try to use your package against the released packages.
+
+You can find more information about the Rust ABI in their
+`online documentation `_.
+
+The FFI Approach
+----------------
+
+Rust supports interacting with other programming languages through its Foreign Function
+Interface (FFI). The advantage of using the FFI is that it enables you to write data structures
+and functions that have a stable ABI. This allows you to use Rust code with C, Python, and
+other languages. In fact, the `PyO3 `_ library uses the FFI to share data
+and functions between Python and Rust.
+
+The approach we are taking in the DataFusion in Python project is to incrementally expose
+more portions of the DataFusion project via FFI interfaces. This allows users to write Rust
+code that does **not** require the ``datafusion-python`` crate as a dependency, expose their
+code in Python via PyO3, and have it interact with the DataFusion Python package.
+
+Early adopters of this approach include `delta-rs `_,
+which has adapted its Table Provider for use in ``datafusion-python`` with only a few lines
+of code. Also, the DataFusion Python project uses the existing definitions from the
+`Apache Arrow CStream Interface `_
+to support importing **and** exporting tables. Any Python package that supports reading
+the Arrow C Stream interface can work with DataFusion Python out of the box! You can read
+more about working with Arrow sources in the :ref:`Data Sources `
+page.
+
+To learn more about the Foreign Function Interface in Rust, the
+`Rustonomicon `_ is a good resource.
+
+Inspiration from Arrow
+----------------------
+
+DataFusion is built upon `Apache Arrow `_. The canonical Python
+Arrow implementation, `pyarrow `_, provides
+an excellent way to share Arrow data between Python projects without performing any copy
+operations on the data. They do this by using a well-defined set of interfaces. You can
+find the details about their stream interface
+`here `_. The
+`Rust Arrow Implementation `_ also supports these
+``C`` style definitions via the Foreign Function Interface.
+
+In addition to using these interfaces to transfer Arrow data between libraries, ``pyarrow``
+goes one step further to make sharing the interfaces easier in Python. They do this
+by exposing PyCapsules that contain the expected functionality.
+
+You can learn more about PyCapsules from the official
+`Python online documentation `_. PyCapsules
+have excellent support in PyO3 already. The
+`PyO3 online documentation `_ is a good source
+for more details on using PyCapsules in Rust.
+
+Two lessons we leverage from the Arrow project in DataFusion Python are:
+
+- We reuse the existing Arrow FFI functionality wherever possible.
+- We expose PyCapsules that contain an FFI-stable struct.
+
+Implementation Details
+----------------------
+
+The bulk of the code necessary to perform our FFI operations is in the upstream
+`DataFusion `_ core repository. You can review the code and
+documentation in the `datafusion-ffi`_ crate.
+
+Our FFI implementation is narrowly focused on sharing data and functions with Rust-backed
+libraries. This allows us to use the `abi_stable crate `_.
+This is an excellent crate that allows for easy conversion between Rust native types
+and FFI-safe alternatives. For example, if you needed to pass a ``Vec`` via FFI,
+you can simply convert it to an ``RVec`` in an intuitive manner. It also supports
+features like ``RResult`` and ``ROption`` that do not have an obvious translation to a
+C equivalent.
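+
+As a minimal sketch of how those conversions look in practice (illustrative code that only
+assumes the ``abi_stable`` crate as a dependency, not something taken from the
+`datafusion-ffi`_ crate itself), moving between the standard library types and their
+FFI-safe mirrors is largely a matter of calling ``into()``:
+
+.. code-block:: rust
+
+    use abi_stable::std_types::{ROption, RResult, RVec};
+
+    fn main() {
+        // A Vec converts into the FFI-safe RVec and back again.
+        let native: Vec<u32> = vec![1, 2, 3];
+        let ffi_safe: RVec<u32> = native.into();
+        let round_trip: Vec<u32> = ffi_safe.into_vec();
+        assert_eq!(round_trip, vec![1, 2, 3]);
+
+        // Option and Result have FFI-safe counterparts as well.
+        let maybe: ROption<u32> = Some(42).into();
+        assert_eq!(maybe.into_option(), Some(42));
+
+        let outcome: RResult<u32, RVec<u8>> = RResult::ROk(7);
+        assert_eq!(outcome.into_result().ok(), Some(7));
+    }
+
+The FFI structs rely on exactly this property: the ``R*`` types have a defined layout
+that does not change from one compiler version to the next.
+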
+The `datafusion-ffi`_ crate has been designed to make it easy to convert from DataFusion
+traits into their FFI counterparts. For example, if you have defined a custom
+`TableProvider `_
+and you want to create a sharable FFI counterpart, you could write:
+
+.. code-block:: rust
+
+    let my_provider = MyTableProvider::default();
+    let ffi_provider = FFI_TableProvider::new(Arc::new(my_provider), false, None);
+
+If you were interfacing with a library that provided the above ``FFI_TableProvider`` and
+you needed to turn it back into a ``TableProvider``, you can turn it into a
+``ForeignTableProvider``, which implements the ``TableProvider`` trait.
+
+.. code-block:: rust
+
+    let foreign_provider: ForeignTableProvider = ffi_provider.into();
+
+If you review the code in `datafusion-ffi`_ you will find that each of the traits we share
+across the boundary has two portions, one with a ``FFI_`` prefix and one with a ``Foreign``
+prefix. This is used to distinguish which side of the FFI boundary a struct is
+designed to be used on. The structures with the ``FFI_`` prefix are to be used by the
+**provider** of the structure. In the example we're showing, this means the code that has
+written the underlying ``TableProvider`` implementation to access your custom data source.
+The structures with the ``Foreign`` prefix are to be used by the receiver. In this case,
+it is the ``datafusion-python`` library.
+
+In order to share these FFI structures, we need to wrap them in some kind of Python object
+that can be used to interface from one package to another. As described in the above
+section on our inspiration from Arrow, we use ``PyCapsule``. We can create a ``PyCapsule``
+for our provider as follows:
+
+.. code-block:: rust
+
+    let name = CString::new("datafusion_table_provider")?;
+    let my_capsule = PyCapsule::new_bound(py, provider, Some(name))?;
+
+On the receiving side, turn this PyCapsule object into the ``FFI_TableProvider``, which
+can then be turned into a ``ForeignTableProvider``; the associated code is:
+
+.. code-block:: rust
+
+    let capsule = capsule.downcast::<PyCapsule>()?;
+    let provider = unsafe { capsule.reference::<FFI_TableProvider>() };
+
+By convention, the ``datafusion-python`` library expects a Python object that has a
+``TableProvider`` PyCapsule to have this capsule accessible by calling a function named
+``__datafusion_table_provider__``. You can see a complete working example of how to
+share a ``TableProvider`` from one Python library to DataFusion Python in the
+`repository examples folder `_.
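+
+As a sketch of what this convention can look like on the provider side, the exporting
+library might write something like the following. This is illustrative only: the class
+name is made up, ``MyTableProvider`` is the same hypothetical ``TableProvider``
+implementation used in the snippets above, and only the method name and capsule name
+are fixed by the convention:
+
+.. code-block:: rust
+
+    use std::{ffi::CString, sync::Arc};
+
+    use datafusion_ffi::table_provider::FFI_TableProvider;
+    use pyo3::{prelude::*, types::PyCapsule};
+
+    #[pyclass]
+    struct MyTableProviderExporter;
+
+    #[pymethods]
+    impl MyTableProviderExporter {
+        /// DataFusion Python looks this method up by name to obtain the capsule.
+        fn __datafusion_table_provider__<'py>(
+            &self,
+            py: Python<'py>,
+        ) -> PyResult<Bound<'py, PyCapsule>> {
+            // MyTableProvider stands in for your own TableProvider implementation.
+            let provider = Arc::new(MyTableProvider::default());
+            let ffi_provider = FFI_TableProvider::new(provider, false, None);
+            let name = CString::new("datafusion_table_provider")?;
+            PyCapsule::new_bound(py, ffi_provider, Some(name))
+        }
+    }
+
+An instance of such a class can then be handed directly to DataFusion Python, as shown in
+the repository examples folder referenced above.
+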
+This section has been written using ``TableProvider`` as an example. It is the first
+extension that has been written using this approach and the most thoroughly implemented.
+As we continue to expose more of the DataFusion features, we intend to follow this same
+design pattern.
+
+Alternative Approach
+--------------------
+
+Suppose you needed to expose some other features of DataFusion and you could not wait
+for the upstream repository to implement the FFI approach we describe. In this case,
+you might decide to take a dependency on the ``datafusion-python`` crate instead.
+
+As we discussed, this is not guaranteed to work across different compiler versions and
+optimization levels. If you wish to go down this route, there are two approaches we
+have identified that you can use.
+
+#. Re-export all of ``datafusion-python`` yourself with your extensions built in.
+#. Carefully synchronize your software releases with the ``datafusion-python`` CI build
+   system so that your libraries use the exact same compiler, features, and
+   optimization level.
+
+We currently do not recommend either of these approaches as they are difficult to
+maintain over a long period. Additionally, they require a tight version coupling
+between libraries.
+
+Status of Work
+--------------
+
+At the time of this writing, the FFI features are under active development. To see
+the latest status, we recommend reviewing the code in the `datafusion-ffi`_ crate.
+
+..
_datafusion-ffi: https://crates.io/crates/datafusion-ffi diff --git a/docs/source/index.rst b/docs/source/index.rst index 34eb23b28..558b2d572 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -85,6 +85,7 @@ Example :caption: CONTRIBUTOR GUIDE contributor-guide/introduction + contributor-guide/ffi .. _toc.api: .. toctree:: From 3f3983cc86ffe267cff97480241e8a588ac38fa3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 23 Feb 2025 08:00:52 -0500 Subject: [PATCH 008/145] build(deps): bump arrow from 54.1.0 to 54.2.0 (#1035) Bumps [arrow](https://github.com/apache/arrow-rs) from 54.1.0 to 54.2.0. - [Release notes](https://github.com/apache/arrow-rs/releases) - [Changelog](https://github.com/apache/arrow-rs/blob/main/CHANGELOG-old.md) - [Commits](https://github.com/apache/arrow-rs/compare/54.1.0...54.2.0) --- updated-dependencies: - dependency-name: arrow dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 56 +++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f1b1ed50a..d23ed6169 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -179,9 +179,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6422e12ac345a0678d7a17e316238e3a40547ae7f92052b77bd86d5e0239f3fc" +checksum = "755b6da235ac356a869393c23668c663720b8749dd6f15e52b6c214b4b964cc7" dependencies = [ "arrow-arith", "arrow-array", @@ -201,9 +201,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23cf34bb1f48c41d3475927bcc7be498665b8e80b379b88f62a840337f8b8248" +checksum = "64656a1e0b13ca766f8440752e9a93e11014eec7b67909986f83ed0ab1fe37b8" dependencies = [ "arrow-array", "arrow-buffer", @@ -215,9 +215,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb4a06d507f54b70a277be22a127c8ffe0cec6cd98c0ad8a48e77779bbda8223" +checksum = "57a4a6d2896083cfbdf84a71a863b22460d0708f8206a8373c52e326cc72ea1a" dependencies = [ "ahash", "arrow-buffer", @@ -232,9 +232,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d69d326d5ad1cb82dcefa9ede3fee8fdca98f9982756b16f9cb142f4aa6edc89" +checksum = "cef870583ce5e4f3b123c181706f2002fb134960f9a911900f64ba4830c7a43a" dependencies = [ "bytes", "half", @@ -243,9 +243,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "626e65bd42636a84a238bed49d09c8777e3d825bf81f5087a70111c2831d9870" +checksum = "1ac7eba5a987f8b4a7d9629206ba48e19a1991762795bbe5d08497b7736017ee" dependencies = [ "arrow-array", "arrow-buffer", @@ -264,9 +264,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"71c8f959f7a1389b1dbd883cdcd37c3ed12475329c111912f7f69dad8195d8c6" +checksum = "90f12542b8164398fc9ec595ff783c4cf6044daa89622c5a7201be920e4c0d4c" dependencies = [ "arrow-array", "arrow-cast", @@ -280,9 +280,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1858e7c7d01c44cf71c21a85534fd1a54501e8d60d1195d0d6fbcc00f4b10754" +checksum = "b095e8a4f3c309544935d53e04c3bfe4eea4e71c3de6fe0416d1f08bb4441a83" dependencies = [ "arrow-buffer", "arrow-schema", @@ -292,9 +292,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6bb3f727f049884c7603f0364bc9315363f356b59e9f605ea76541847e06a1e" +checksum = "65c63da4afedde2b25ef69825cd4663ca76f78f79ffe2d057695742099130ff6" dependencies = [ "arrow-array", "arrow-buffer", @@ -306,9 +306,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35de94f165ed8830aede72c35f238763794f0d49c69d30c44d49c9834267ff8c" +checksum = "9551d9400532f23a370cabbea1dc5a53c49230397d41f96c4c8eedf306199305" dependencies = [ "arrow-array", "arrow-buffer", @@ -326,9 +326,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8aa06e5f267dc53efbacb933485c79b6fc1685d3ffbe870a16ce4e696fb429da" +checksum = "6c07223476f8219d1ace8cd8d85fa18c4ebd8d945013f25ef5c72e85085ca4ee" dependencies = [ "arrow-array", "arrow-buffer", @@ -339,9 +339,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66f1144bb456a2f9d82677bd3abcea019217e572fc8f07de5a7bac4b2c56eb2c" +checksum = "91b194b38bfd89feabc23e798238989c6648b2506ad639be42ec8eb1658d82c4" dependencies = [ "arrow-array", "arrow-buffer", @@ -352,18 +352,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "105f01ec0090259e9a33a9263ec18ff223ab91a0ea9fbc18042f7e38005142f6" +checksum = "0f40f6be8f78af1ab610db7d9b236e21d587b7168e368a36275d2e5670096735" dependencies = [ "bitflags 2.8.0", ] [[package]] name = "arrow-select" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f690752fdbd2dee278b5f1636fefad8f2f7134c85e20fd59c4199e15a39a6807" +checksum = "ac265273864a820c4a179fc67182ccc41ea9151b97024e1be956f0f2369c2539" dependencies = [ "ahash", "arrow-array", @@ -375,9 +375,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0fff9cd745a7039b66c47ecaf5954460f9fa12eed628f65170117ea93e64ee0" +checksum = "d44c8eed43be4ead49128370f7131f054839d3d6003e52aebf64322470b8fbd0" dependencies = [ "arrow-array", "arrow-buffer", From 69ebf70bd821d0ae516d2f61d96058e2252a7a1f Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 24 Feb 2025 21:30:52 +0100 Subject: [PATCH 009/145] Chore: Release datafusion-python 45 (#1024) * Bump version number to prepare for release * Add changelog 45.0.0 * Add deprecated marker from either typing or typing_extensions based on the python version * Limit pyarrow 
version per issue # 1023 * Bumping the version number to support new release candidate * There was no guarantee that the record batches would be returned in a single partition, so update the unit test to check all partitions. * Revert "Limit pyarrow version per issue # 1023" This reverts commit b48d5872661017ec21ea71f7dbb9569f2f0bf797. * Correct import for python 3.13 and above * Bump minor version due to pypi requirement * Update cargo lock --- Cargo.lock | 113 +++++++++++++-------------------- Cargo.toml | 2 +- dev/changelog/45.0.0.md | 42 ++++++++++++ python/datafusion/context.py | 5 +- python/datafusion/dataframe.py | 5 +- python/datafusion/expr.py | 6 +- python/datafusion/substrait.py | 5 +- python/tests/test_dataframe.py | 21 ++++-- 8 files changed, 118 insertions(+), 81 deletions(-) create mode 100644 dev/changelog/45.0.0.md diff --git a/Cargo.lock b/Cargo.lock index d23ed6169..5c7f2bf3c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -606,19 +606,18 @@ dependencies = [ [[package]] name = "bzip2" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bafdbf26611df8c14810e268ddceda071c297570a5fb360ceddf617fe417ef58" +checksum = "75b89e7c29231c673a61a46e722602bcd138298f6b9e81e71119693534585f5c" dependencies = [ "bzip2-sys", - "libc", ] [[package]] name = "bzip2-sys" -version = "0.1.11+1.0.8" +version = "0.1.12+1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +checksum = "72ebc2f1a417f01e1da30ef264ee86ae31d2dcd2d603ea283d3c244a883ca2a9" dependencies = [ "cc", "libc", @@ -627,9 +626,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.12" +version = "1.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "755717a7de9ec452bf7f3f1a3099085deabd7f2962b861dae91ecd7a365903d2" +checksum = "0c3d1b2e905a3a7b00a6141adb0e4c0bb941d11caf55349d863942a1cc44e3c9" dependencies = [ "jobserver", "libc", @@ -684,21 +683,20 @@ dependencies = [ [[package]] name = "cmake" -version = "0.1.53" +version = "0.1.54" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e24a03c8b52922d68a1589ad61032f2c1aa5a8158d2aa0d93c6e9534944bbad6" +checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0" dependencies = [ "cc", ] [[package]] name = "comfy-table" -version = "7.1.3" +version = "7.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24f165e7b643266ea80cb858aed492ad9280e3e05ce24d4a99d7d7b889b6a4d9" +checksum = "4a65ebfec4fb190b6f90e944a817d60499ee0744e582530e2c9900a22e591d9a" dependencies = [ - "strum", - "strum_macros", + "unicode-segmentation", "unicode-width", ] @@ -837,9 +835,9 @@ dependencies = [ [[package]] name = "csv-core" -version = "0.1.11" +version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" dependencies = [ "memchr", ] @@ -878,7 +876,7 @@ dependencies = [ "async-compression", "async-trait", "bytes", - "bzip2 0.5.0", + "bzip2 0.5.1", "chrono", "datafusion-catalog", "datafusion-common", @@ -1240,7 +1238,7 @@ dependencies = [ "itertools 0.14.0", "log", "paste", - "petgraph 0.7.1", + "petgraph", ] [[package]] @@ -1341,7 +1339,7 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "44.0.0" +version = "45.2.0" dependencies = [ "arrow", 
"async-trait", @@ -1436,9 +1434,9 @@ checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" [[package]] name = "equivalent" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "errno" @@ -1456,12 +1454,6 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" -[[package]] -name = "fixedbitset" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" - [[package]] name = "fixedbitset" version = "0.5.7" @@ -2269,9 +2261,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "miniz_oxide" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8402cab7aefae129c6977bb0ff1b8fd9a04eb5b51efc50a70bea51cda0c7924" +checksum = "b3b1c9bd4fe1f0f8b387f6eb9eb3b4a1aa26185e5750efb9140301703f62cd1b" dependencies = [ "adler2", ] @@ -2548,23 +2540,13 @@ version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" -[[package]] -name = "petgraph" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" -dependencies = [ - "fixedbitset 0.4.2", - "indexmap", -] - [[package]] name = "petgraph" version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" dependencies = [ - "fixedbitset 0.5.7", + "fixedbitset", "indexmap", ] @@ -2660,9 +2642,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.13.4" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c0fef6c4230e4ccf618a35c59d7ede15dea37de8427500f50aff708806e42ec" +checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" dependencies = [ "bytes", "prost-derive", @@ -2670,16 +2652,16 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.13.4" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0f3e5beed80eb580c68e2c600937ac2c4eedabdfd5ef1e5b7ea4f3fba84497b" +checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" dependencies = [ "heck", - "itertools 0.13.0", + "itertools 0.14.0", "log", "multimap", "once_cell", - "petgraph 0.6.5", + "petgraph", "prettyplease", "prost", "prost-types", @@ -2690,12 +2672,12 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.13.4" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "157c5a9d7ea5c2ed2d9fb8f495b64759f7816c7eaea54ba3978f0d63000162e3" +checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", - "itertools 0.13.0", + "itertools 0.14.0", "proc-macro2", "quote", "syn 2.0.98", @@ -2703,9 +2685,9 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.13.4" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"cc2f1e56baa61e93533aebc21af4d2134b70f66275e0fcdf3cbe43d77ff7e8fc" +checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" dependencies = [ "prost", ] @@ -2721,9 +2703,9 @@ dependencies = [ [[package]] name = "psm" -version = "0.1.24" +version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "200b9ff220857e53e184257720a14553b2f4aa02577d2ed9842d45d4b9654810" +checksum = "f58e5423e24c18cc840e1c98370b3993c6649cd1678b4d24318bcf0a083cbe88" dependencies = [ "cc", ] @@ -2860,9 +2842,9 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.9" +version = "0.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c40286217b4ba3a71d644d752e6a0b71f13f1b6a2c5311acfcbe0c2418ed904" +checksum = "e46f3055866785f6b92bc6164b76be02ca8f2eb4b002c0354b28cf4c119e5944" dependencies = [ "cfg_aliases", "libc", @@ -3042,15 +3024,14 @@ dependencies = [ [[package]] name = "ring" -version = "0.17.8" +version = "0.17.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" +checksum = "e75ec5e92c4d8aede845126adc388046234541629e76029599ed35a003c7ed24" dependencies = [ "cc", "cfg-if", "getrandom 0.2.15", "libc", - "spin", "untrusted", "windows-sys 0.52.0", ] @@ -3097,9 +3078,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.22" +version = "0.23.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb9263ab4eb695e42321db096e3b8fbd715a59b154d5c88d82db2175b681ba7" +checksum = "47796c98c480fce5406ef69d1c76378375492c3b0a0de587be0c1d9feb12f395" dependencies = [ "once_cell", "ring", @@ -3377,9 +3358,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.13.2" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" +checksum = "7fcf8323ef1faaee30a44a340193b1ac6814fd9b7b4e88e9d4519a3e4abe1cfd" [[package]] name = "snafu" @@ -3418,12 +3399,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "spin" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" - [[package]] name = "sqlparser" version = "0.53.0" @@ -3453,9 +3428,9 @@ checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" [[package]] name = "stacker" -version = "0.1.17" +version = "0.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "799c883d55abdb5e98af1a7b3f23b9b6de8ecada0ecac058672d7635eb48ca7b" +checksum = "1d08feb8f695b465baed819b03c128dc23f57a694510ab1f06c77f763975685e" dependencies = [ "cc", "cfg-if", diff --git a/Cargo.toml b/Cargo.toml index d18e0e8f0..5358b1836 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "datafusion-python" -version = "44.0.0" +version = "45.2.0" homepage = "https://datafusion.apache.org/python" repository = "https://github.com/apache/datafusion-python" authors = ["Apache DataFusion "] diff --git a/dev/changelog/45.0.0.md b/dev/changelog/45.0.0.md new file mode 100644 index 000000000..93659b171 --- /dev/null +++ b/dev/changelog/45.0.0.md @@ -0,0 +1,42 @@ + + +# Apache DataFusion Python 45.0.0 Changelog + +This release consists of 2 commits from 2 contributors. See credits at the end of this changelog for more information. 
+ +**Fixed bugs:** + +- fix: add to_timestamp_nanos [#1020](https://github.com/apache/datafusion-python/pull/1020) (chenkovsky) + +**Other:** + +- Chore/upgrade datafusion 45 [#1010](https://github.com/apache/datafusion-python/pull/1010) (kevinjqliu) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 1 Kevin Liu + 1 Tim Saucer +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. + diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 864ef1c8b..21955b6d1 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -21,7 +21,10 @@ from typing import TYPE_CHECKING, Any, Protocol -from typing_extensions import deprecated +try: + from warnings import deprecated # Python 3.13+ +except ImportError: + from typing_extensions import deprecated # Python 3.12 from datafusion.catalog import Catalog, Table from datafusion.dataframe import DataFrame diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 7413a5fa3..23b5d630b 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -33,7 +33,10 @@ overload, ) -from typing_extensions import deprecated +try: + from warnings import deprecated # Python 3.13+ +except ImportError: + from typing_extensions import deprecated # Python 3.12 from datafusion.plan import ExecutionPlan, LogicalPlan from datafusion.record_batch import RecordBatchStream diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 68ddd7c9a..e3d7158eb 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -25,7 +25,11 @@ from typing import TYPE_CHECKING, Any, Optional, Type import pyarrow as pa -from typing_extensions import deprecated + +try: + from warnings import deprecated # Python 3.13+ +except ImportError: + from typing_extensions import deprecated # Python 3.12 from datafusion.common import DataTypeMap, NullTreatment, RexType diff --git a/python/datafusion/substrait.py b/python/datafusion/substrait.py index 402184d3f..06302fe38 100644 --- a/python/datafusion/substrait.py +++ b/python/datafusion/substrait.py @@ -26,7 +26,10 @@ import pathlib from typing import TYPE_CHECKING -from typing_extensions import deprecated +try: + from warnings import deprecated # Python 3.13+ +except ImportError: + from typing_extensions import deprecated # Python 3.12 from datafusion.plan import LogicalPlan diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 5bc3fb094..c636e896a 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -755,13 +755,20 @@ def test_execution_plan(aggregate_df): assert "CsvExec:" in indent ctx = SessionContext() - stream = ctx.execute(plan, 0) - # get the one and only batch - batch = stream.next() - assert batch is not None - # there should be no more batches - with pytest.raises(StopIteration): - stream.next() + rows_returned = 0 + for idx in range(0, plan.partition_count): + stream = ctx.execute(plan, idx) + try: + batch = stream.next() + assert batch is not None + rows_returned += len(batch.to_pyarrow()[0]) + except StopIteration: + # This is one of the partitions with no values + pass + with pytest.raises(StopIteration): + stream.next() + + assert rows_returned == 5 def test_repartition(df): From a80a788f69cf46ef002b3c537837548cc103748c Mon Sep 17 00:00:00 2001 From: kosiew Date: Sat, 8 Mar 2025 21:22:36 +0800 
Subject: [PATCH 010/145] Enable Dataframe to be converted into views which can be used in register_table (#1016) * add test_view * feat: add into_view method to register DataFrame as a view * add pytableprovider * feat: add as_table method to PyTableProvider and update into_view to return PyTable * refactor: simplify as_table method and update documentation for into_view * test: improve test_register_filtered_dataframe by removing redundant comments and assertions * test: enhance test_register_filtered_dataframe with additional assertions for DataFrame results * ruff formatted * cleanup: remove unused imports from test_view.py * docs: add example for registering a DataFrame as a view in README.md * docs: update docstring for into_view method to clarify usage as ViewTable * chore: add license header to test_view.py * ruff correction * refactor: rename into_view method to _into_view * ruff lint * refactor: simplify into_view method and update Rust binding convention * docs: add views section to user guide with example on registering views * feat: add register_view method to SessionContext for DataFrame registration * docs: update README and user guide to reflect register_view method for DataFrame registration * docs: remove some documentation from PyDataFrame --- README.md | 40 +++++++++++++ .../user-guide/common-operations/index.rst | 1 + .../user-guide/common-operations/views.rst | 58 +++++++++++++++++++ python/datafusion/context.py | 12 ++++ python/datafusion/dataframe.py | 4 ++ python/tests/test_view.py | 49 ++++++++++++++++ src/dataframe.rs | 39 +++++++++++++ 7 files changed, 203 insertions(+) create mode 100644 docs/source/user-guide/common-operations/views.rst create mode 100644 python/tests/test_view.py diff --git a/README.md b/README.md index 9c56b62dd..4f80dbe18 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,46 @@ This produces the following chart: ![Chart](examples/chart.png) +## Registering a DataFrame as a View + +You can use SessionContext's `register_view` method to convert a DataFrame into a view and register it with the context. + +```python +from datafusion import SessionContext, col, literal + +# Create a DataFusion context +ctx = SessionContext() + +# Create sample data +data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]} + +# Create a DataFrame from the dictionary +df = ctx.from_pydict(data, "my_table") + +# Filter the DataFrame (for example, keep rows where a > 2) +df_filtered = df.filter(col("a") > literal(2)) + +# Register the dataframe as a view with the context +ctx.register_view("view1", df_filtered) + +# Now run a SQL query against the registered view +df_view = ctx.sql("SELECT * FROM view1") + +# Collect the results +results = df_view.collect() + +# Convert results to a list of dictionaries for display +result_dicts = [batch.to_pydict() for batch in results] + +print(result_dicts) +``` + +This will output: + +```python +[{'a': [3, 4, 5], 'b': [30, 40, 50]}] +``` + ## Configuration It is possible to configure runtime (memory and disk settings) and configuration settings when creating a context. diff --git a/docs/source/user-guide/common-operations/index.rst b/docs/source/user-guide/common-operations/index.rst index d7c708c21..7abd1f138 100644 --- a/docs/source/user-guide/common-operations/index.rst +++ b/docs/source/user-guide/common-operations/index.rst @@ -23,6 +23,7 @@ The contents of this section are designed to guide a new user through how to use .. 
toctree::
   :maxdepth: 2
 
+   views
    basic-info
    select-and-filter
    expressions
diff --git a/docs/source/user-guide/common-operations/views.rst b/docs/source/user-guide/common-operations/views.rst
new file mode 100644
index 000000000..df11e3abe
--- /dev/null
+++ b/docs/source/user-guide/common-operations/views.rst
@@ -0,0 +1,58 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+======================
+Registering Views
+======================
+
+You can use the context's ``register_view`` method to register a DataFrame as a view:
+
+.. code-block:: python
+
+    from datafusion import SessionContext, col, literal
+
+    # Create a DataFusion context
+    ctx = SessionContext()
+
+    # Create sample data
+    data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}
+
+    # Create a DataFrame from the dictionary
+    df = ctx.from_pydict(data, "my_table")
+
+    # Filter the DataFrame (for example, keep rows where a > 2)
+    df_filtered = df.filter(col("a") > literal(2))
+
+    # Register the dataframe as a view with the context
+    ctx.register_view("view1", df_filtered)
+
+    # Now run a SQL query against the registered view
+    df_view = ctx.sql("SELECT * FROM view1")
+
+    # Collect the results
+    results = df_view.collect()
+
+    # Convert results to a list of dictionaries for display
+    result_dicts = [batch.to_pydict() for batch in results]
+
+    print(result_dicts)
+
+This will output:
+
+.. code-block:: python
+
+    [{'a': [3, 4, 5], 'b': [30, 40, 50]}]
diff --git a/python/datafusion/context.py b/python/datafusion/context.py
index 21955b6d1..befc4dce6 100644
--- a/python/datafusion/context.py
+++ b/python/datafusion/context.py
@@ -707,6 +707,18 @@ def from_polars(self, data: polars.DataFrame, name: str | None = None) -> DataFr
         """
         return DataFrame(self.ctx.from_polars(data, name))
 
+    # https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116
+    # is the discussion on how we arrived at adding register_view
+    def register_view(self, name: str, df: DataFrame):
+        """Register a :py:class:`~datafusion.dataframe.DataFrame` as a view.
+
+        Args:
+            name (str): The name to register the view under.
+            df (DataFrame): The DataFrame to be converted into a view and registered.
+        """
+        view = df.into_view()
+        self.ctx.register_table(name, view)
+
     def register_table(self, name: str, table: Table) -> None:
         """Register a :py:class: `~datafusion.catalog.Table` as a table.
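Worth noting before the next diff: ``register_view`` is a thin convenience wrapper over the ``into_view`` method introduced below. A minimal sketch of the equivalence (the view names and the internal ``ctx.ctx`` attribute access are illustrative, mirroring the method body above):

```python
from datafusion import SessionContext, col, literal

ctx = SessionContext()
df = ctx.from_pydict({"a": [1, 2, 3, 4, 5]}, "t").filter(col("a") > literal(2))

# The one-step helper added in this patch:
ctx.register_view("view1", df)

# Roughly the equivalent manual steps:
view = df.into_view()                  # DataFrame -> view table provider
ctx.ctx.register_table("view2", view)  # register on the wrapped Rust context
```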
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
index 23b5d630b..85a179ec9 100644
--- a/python/datafusion/dataframe.py
+++ b/python/datafusion/dataframe.py
@@ -124,6 +124,10 @@ def __init__(self, df: DataFrameInternal) -> None:
         """
         self.df = df
 
+    def into_view(self) -> pa.Table:
+        """Convert DataFrame into a ViewTable which can be used in register_table."""
+        return self.df.into_view()
+
     def __getitem__(self, key: str | List[str]) -> DataFrame:
         """Return a new :py:class`DataFrame` with the specified column or columns.
diff --git a/python/tests/test_view.py b/python/tests/test_view.py
new file mode 100644
index 000000000..1d92cc0d4
--- /dev/null
+++ b/python/tests/test_view.py
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+from datafusion import SessionContext, col, literal
+
+
+def test_register_filtered_dataframe():
+    ctx = SessionContext()
+
+    data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}
+
+    df = ctx.from_pydict(data, "my_table")
+
+    df_filtered = df.filter(col("a") > literal(2))
+
+    ctx.register_view("view1", df_filtered)
+
+    df_view = ctx.sql("SELECT * FROM view1")
+
+    filtered_results = df_view.collect()
+
+    result_dicts = [batch.to_pydict() for batch in filtered_results]
+
+    expected_results = [{"a": [3, 4, 5], "b": [30, 40, 50]}]
+
+    assert result_dicts == expected_results
+
+    df_results = df.collect()
+
+    df_result_dicts = [batch.to_pydict() for batch in df_results]
+
+    expected_df_results = [{"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}]
+
+    assert df_result_dicts == expected_df_results
diff --git a/src/dataframe.rs b/src/dataframe.rs
index ed9578a71..243e2e14f 100644
--- a/src/dataframe.rs
+++ b/src/dataframe.rs
@@ -30,6 +30,7 @@ use datafusion::arrow::util::pretty;
 use datafusion::common::UnnestOptions;
 use datafusion::config::{CsvOptions, TableParquetOptions};
 use datafusion::dataframe::{DataFrame, DataFrameWriteOptions};
+use datafusion::datasource::TableProvider;
 use datafusion::execution::SendableRecordBatchStream;
 use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel};
 use datafusion::prelude::*;
@@ -39,6 +40,7 @@ use pyo3::pybacked::PyBackedStr;
 use pyo3::types::{PyCapsule, PyTuple, PyTupleMethods};
 use tokio::task::JoinHandle;
 
+use crate::catalog::PyTable;
 use crate::errors::{py_datafusion_err, PyDataFusionError};
 use crate::expr::sort_expr::to_sort_expressions;
 use crate::physical_plan::PyExecutionPlan;
@@ -50,6 +52,25 @@ use crate::{
     expr::{sort_expr::PySortExpr, PyExpr},
 };
 
+// https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116
+// - we have not decided on the table_provider approach yet
+// this is an interim implementation
+#[pyclass(name = "TableProvider", module = "datafusion")]
+pub struct PyTableProvider {
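+    // The provider is stored behind an `Arc`, so `as_table` below can hand
+    // out additional references to the same TableProvider without copying
+    // the underlying provider.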
+    provider: Arc<dyn TableProvider>,
+}
+
+impl PyTableProvider {
+    pub fn new(provider: Arc<dyn TableProvider>) -> Self {
+        Self { provider }
+    }
+
+    pub fn as_table(&self) -> PyTable {
+        let table_provider: Arc<dyn TableProvider> = self.provider.clone();
+        PyTable::new(table_provider)
+    }
+}
+
 /// A PyDataFrame is a representation of a logical plan and an API to compose statements.
 /// Use it to build a plan and `.collect()` to execute the plan and collect the result.
 /// The actual execution of a plan runs natively on Rust and Arrow on a multi-threaded environment.
@@ -156,6 +177,24 @@ impl PyDataFrame {
         PyArrowType(self.df.schema().into())
     }
 
+    /// Convert this DataFrame into a Table that can be used in register_table
+    /// By convention, into_... methods consume self and return the new object.
+    /// Disabling the clippy lint, so we can use &self
+    /// because we're working with Python bindings
+    /// where objects are shared
+    /// https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116
+    /// - we have not decided on the table_provider approach yet
+    #[allow(clippy::wrong_self_convention)]
+    fn into_view(&self) -> PyDataFusionResult<PyTable> {
+        // Call the underlying Rust DataFrame::into_view method.
+        // Note that the Rust method consumes self; here we clone the inner Arc
+        // so that we don’t invalidate this PyDataFrame.
+        let table_provider = self.df.as_ref().clone().into_view();
+        let table_provider = PyTableProvider::new(table_provider);
+
+        Ok(table_provider.as_table())
+    }
+
     #[pyo3(signature = (*args))]
     fn select_columns(&self, args: Vec<PyBackedStr>) -> PyDataFusionResult<Self> {
         let args = args.iter().map(|s| s.as_ref()).collect::<Vec<&str>>();

From 9027b4d79fdd7a41dd9c1f25c2ecebc1fabf50f2 Mon Sep 17 00:00:00 2001
From: Chen Chongchen
Date: Sat, 8 Mar 2025 21:24:02 +0800
""" - self.ctx.register_table(name, table) + self.ctx.register_table(name, table.table) def deregister_table(self, name: str) -> None: """Remove a table from the session.""" @@ -767,7 +767,7 @@ def register_parquet( file_extension: str = ".parquet", skip_metadata: bool = True, schema: pyarrow.Schema | None = None, - file_sort_order: list[list[Expr]] | None = None, + file_sort_order: list[list[SortExpr]] | None = None, ) -> None: """Register a Parquet file as a table. @@ -798,7 +798,9 @@ def register_parquet( file_extension, skip_metadata, schema, - file_sort_order, + [sort_list_to_raw_sort_list(exprs) for exprs in file_sort_order] + if file_sort_order is not None + else None, ) def register_csv( @@ -934,7 +936,7 @@ def register_udwf(self, udwf: WindowUDF) -> None: def catalog(self, name: str = "datafusion") -> Catalog: """Retrieve a catalog by name.""" - return self.ctx.catalog(name) + return Catalog(self.ctx.catalog(name)) @deprecated( "Use the catalog provider interface ``SessionContext.Catalog`` to " @@ -1054,7 +1056,7 @@ def read_parquet( file_extension: str = ".parquet", skip_metadata: bool = True, schema: pyarrow.Schema | None = None, - file_sort_order: list[list[Expr]] | None = None, + file_sort_order: list[list[Expr | SortExpr]] | None = None, ) -> DataFrame: """Read a Parquet source into a :py:class:`~datafusion.dataframe.Dataframe`. @@ -1078,6 +1080,11 @@ def read_parquet( """ if table_partition_cols is None: table_partition_cols = [] + file_sort_order = ( + [sort_list_to_raw_sort_list(f) for f in file_sort_order] + if file_sort_order is not None + else None + ) return DataFrame( self.ctx.read_parquet( str(path), @@ -1121,7 +1128,7 @@ def read_table(self, table: Table) -> DataFrame: :py:class:`~datafusion.catalog.ListingTable`, create a :py:class:`~datafusion.dataframe.DataFrame`. 
""" - return DataFrame(self.ctx.read_table(table)) + return DataFrame(self.ctx.read_table(table.table)) def execute(self, plan: ExecutionPlan, partitions: int) -> RecordBatchStream: """Execute the ``plan`` and return the results.""" diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 85a179ec9..de5d8376e 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -52,6 +52,7 @@ from enum import Enum from datafusion._internal import DataFrame as DataFrameInternal +from datafusion._internal import expr as expr_internal from datafusion.expr import Expr, SortExpr, sort_or_default @@ -277,7 +278,7 @@ def with_columns( def _simplify_expression( *exprs: Expr | Iterable[Expr], **named_exprs: Expr - ) -> list[Expr]: + ) -> list[expr_internal.Expr]: expr_list = [] for expr in exprs: if isinstance(expr, Expr): diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index e3d7158eb..3639abec6 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -176,7 +176,7 @@ def sort_or_default(e: Expr | SortExpr) -> expr_internal.SortExpr: """Helper function to return a default Sort if an Expr is provided.""" if isinstance(e, SortExpr): return e.raw_sort - return SortExpr(e.expr, True, True).raw_sort + return SortExpr(e, True, True).raw_sort def sort_list_to_raw_sort_list( @@ -231,7 +231,7 @@ def variant_name(self) -> str: def __richcmp__(self, other: Expr, op: int) -> Expr: """Comparison operator.""" - return Expr(self.expr.__richcmp__(other, op)) + return Expr(self.expr.__richcmp__(other.expr, op)) def __repr__(self) -> str: """Generate a string representation of this expression.""" @@ -417,7 +417,7 @@ def sort(self, ascending: bool = True, nulls_first: bool = True) -> SortExpr: ascending: If true, sort in ascending order. nulls_first: Return null values first. """ - return SortExpr(self.expr, ascending=ascending, nulls_first=nulls_first) + return SortExpr(self, ascending=ascending, nulls_first=nulls_first) def is_null(self) -> Expr: """Returns ``True`` if this expression is null.""" @@ -789,7 +789,7 @@ class SortExpr: def __init__(self, expr: Expr, ascending: bool, nulls_first: bool) -> None: """This constructor should not be called by the end user.""" - self.raw_sort = expr_internal.SortExpr(expr, ascending, nulls_first) + self.raw_sort = expr_internal.SortExpr(expr.expr, ascending, nulls_first) def expr(self) -> Expr: """Return the raw expr backing the SortExpr.""" diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 5c260aade..b449c4868 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -366,7 +366,7 @@ def concat_ws(separator: str, *args: Expr) -> Expr: def order_by(expr: Expr, ascending: bool = True, nulls_first: bool = True) -> SortExpr: """Creates a new sort expression.""" - return SortExpr(expr.expr, ascending=ascending, nulls_first=nulls_first) + return SortExpr(expr, ascending=ascending, nulls_first=nulls_first) def alias(expr: Expr, name: str) -> Expr: @@ -942,6 +942,7 @@ def to_timestamp_millis(arg: Expr, *formatters: Expr) -> Expr: See :py:func:`to_timestamp` for a description on how to use formatters. """ + formatters = [f.expr for f in formatters] return Expr(f.to_timestamp_millis(arg.expr, *formatters)) @@ -950,6 +951,7 @@ def to_timestamp_micros(arg: Expr, *formatters: Expr) -> Expr: See :py:func:`to_timestamp` for a description on how to use formatters. 
""" + formatters = [f.expr for f in formatters] return Expr(f.to_timestamp_micros(arg.expr, *formatters)) @@ -958,6 +960,7 @@ def to_timestamp_nanos(arg: Expr, *formatters: Expr) -> Expr: See :py:func:`to_timestamp` for a description on how to use formatters. """ + formatters = [f.expr for f in formatters] return Expr(f.to_timestamp_nanos(arg.expr, *formatters)) @@ -966,6 +969,7 @@ def to_timestamp_seconds(arg: Expr, *formatters: Expr) -> Expr: See :py:func:`to_timestamp` for a description on how to use formatters. """ + formatters = [f.expr for f in formatters] return Expr(f.to_timestamp_seconds(arg.expr, *formatters)) @@ -1078,9 +1082,9 @@ def range(start: Expr, stop: Expr, step: Expr) -> Expr: return Expr(f.range(start.expr, stop.expr, step.expr)) -def uuid(arg: Expr) -> Expr: +def uuid() -> Expr: """Returns uuid v4 as a string value.""" - return Expr(f.uuid(arg.expr)) + return Expr(f.uuid()) def struct(*args: Expr) -> Expr: diff --git a/python/datafusion/input/location.py b/python/datafusion/input/location.py index a8252b53c..517cd1578 100644 --- a/python/datafusion/input/location.py +++ b/python/datafusion/input/location.py @@ -37,12 +37,12 @@ def is_correct_input(self, input_item: Any, table_name: str, **kwargs): def build_table( self, - input_file: str, + input_item: str, table_name: str, **kwargs, ) -> SqlTable: """Create a table from the input source.""" - _, extension = os.path.splitext(input_file) + _, extension = os.path.splitext(input_item) format = extension.lstrip(".").lower() num_rows = 0 # Total number of rows in the file. Used for statistics columns = [] @@ -50,7 +50,7 @@ def build_table( import pyarrow.parquet as pq # Read the Parquet metadata - metadata = pq.read_metadata(input_file) + metadata = pq.read_metadata(input_item) num_rows = metadata.num_rows # Iterate through the schema and build the SqlTable for col in metadata.schema: @@ -69,7 +69,7 @@ def build_table( # to get that information. However, this should only be occurring # at table creation time and therefore shouldn't # slow down query performance. - with open(input_file, "r") as file: + with open(input_item, "r") as file: reader = csv.reader(file) header_row = next(reader) print(header_row) @@ -84,6 +84,6 @@ def build_table( ) # Input could possibly be multiple files. 
Create a list if so - input_files = glob.glob(input_file) + input_files = glob.glob(input_item) return SqlTable(table_name, columns, num_rows, input_files) diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py index c97f453d0..0bba3d723 100644 --- a/python/datafusion/udf.py +++ b/python/datafusion/udf.py @@ -85,7 +85,7 @@ class ScalarUDF: def __init__( self, - name: Optional[str], + name: str, func: Callable[..., _R], input_types: pyarrow.DataType | list[pyarrow.DataType], return_type: _R, @@ -182,7 +182,7 @@ class AggregateUDF: def __init__( self, - name: Optional[str], + name: str, accumulator: Callable[[], Accumulator], input_types: list[pyarrow.DataType], return_type: pyarrow.DataType, @@ -277,6 +277,7 @@ def sum_bias_10() -> Summarize: ) if name is None: name = accum.__call__().__class__.__qualname__.lower() + assert name is not None if isinstance(input_types, pyarrow.DataType): input_types = [input_types] return AggregateUDF( @@ -462,7 +463,7 @@ class WindowUDF: def __init__( self, - name: Optional[str], + name: str, func: Callable[[], WindowEvaluator], input_types: list[pyarrow.DataType], return_type: pyarrow.DataType, diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index b1a739b49..fca05bb8f 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -871,7 +871,22 @@ def test_temporal_functions(df): f.to_timestamp_millis(literal("2023-09-07 05:06:14.523952")), f.to_timestamp_micros(literal("2023-09-07 05:06:14.523952")), f.extract(literal("day"), column("d")), + f.to_timestamp( + literal("2023-09-07 05:06:14.523952000"), literal("%Y-%m-%d %H:%M:%S.%f") + ), + f.to_timestamp_seconds( + literal("2023-09-07 05:06:14.523952000"), literal("%Y-%m-%d %H:%M:%S.%f") + ), + f.to_timestamp_millis( + literal("2023-09-07 05:06:14.523952000"), literal("%Y-%m-%d %H:%M:%S.%f") + ), + f.to_timestamp_micros( + literal("2023-09-07 05:06:14.523952000"), literal("%Y-%m-%d %H:%M:%S.%f") + ), f.to_timestamp_nanos(literal("2023-09-07 05:06:14.523952")), + f.to_timestamp_nanos( + literal("2023-09-07 05:06:14.523952000"), literal("%Y-%m-%d %H:%M:%S.%f") + ), ) result = df.collect() assert len(result) == 1 @@ -913,6 +928,21 @@ def test_temporal_functions(df): assert result.column(11) == pa.array( [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") ) + assert result.column(12) == pa.array( + [datetime(2023, 9, 7, 5, 6, 14)] * 3, type=pa.timestamp("s") + ) + assert result.column(13) == pa.array( + [datetime(2023, 9, 7, 5, 6, 14, 523000)] * 3, type=pa.timestamp("ms") + ) + assert result.column(14) == pa.array( + [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("us") + ) + assert result.column(15) == pa.array( + [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") + ) + assert result.column(16) == pa.array( + [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") + ) def test_arrow_cast(df): From acd70409f73f299a144e7ff4115c6e6035c3ffb5 Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Sat, 8 Mar 2025 16:37:10 +0100 Subject: [PATCH 012/145] feat: reads using global ctx (#982) * feat: reads using global ctx * Add text to io methods to describe the context they are using --------- Co-authored-by: Tim Saucer --- python/datafusion/__init__.py | 5 + python/datafusion/io.py | 199 ++++++++++++++++++++++++++ python/tests/test_io.py | 95 ++++++++++++ python/tests/test_wrapper_coverage.py | 2 + src/context.rs | 12 +- src/utils.rs | 8 ++ 6 files 
changed, 319 insertions(+), 2 deletions(-) create mode 100644 python/datafusion/io.py create mode 100644 python/tests/test_io.py diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 85aefcce7..f11ce54a6 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -45,6 +45,7 @@ Expr, WindowFrame, ) +from .io import read_avro, read_csv, read_json, read_parquet from .plan import ExecutionPlan, LogicalPlan from .record_batch import RecordBatch, RecordBatchStream from .udf import Accumulator, AggregateUDF, ScalarUDF, WindowUDF @@ -81,6 +82,10 @@ "functions", "object_store", "substrait", + "read_parquet", + "read_avro", + "read_csv", + "read_json", ] diff --git a/python/datafusion/io.py b/python/datafusion/io.py new file mode 100644 index 000000000..7f3b77efa --- /dev/null +++ b/python/datafusion/io.py @@ -0,0 +1,199 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""IO read functions using global context.""" + +import pathlib + +import pyarrow + +from datafusion.dataframe import DataFrame +from datafusion.expr import Expr + +from ._internal import SessionContext as SessionContextInternal + + +def read_parquet( + path: str | pathlib.Path, + table_partition_cols: list[tuple[str, str]] | None = None, + parquet_pruning: bool = True, + file_extension: str = ".parquet", + skip_metadata: bool = True, + schema: pyarrow.Schema | None = None, + file_sort_order: list[list[Expr]] | None = None, +) -> DataFrame: + """Read a Parquet source into a :py:class:`~datafusion.dataframe.Dataframe`. + + This function will use the global context. Any functions or tables registered + with another context may not be accessible when used with a DataFrame created + using this function. + + Args: + path: Path to the Parquet file. + table_partition_cols: Partition columns. + parquet_pruning: Whether the parquet reader should use the predicate + to prune row groups. + file_extension: File extension; only files with this extension are + selected for data input. + skip_metadata: Whether the parquet reader should skip any metadata + that may be in the file schema. This can help avoid schema + conflicts due to metadata. + schema: An optional schema representing the parquet files. If None, + the parquet reader will try to infer it based on data in the + file. + file_sort_order: Sort order for the file. 
+
+    Returns:
+        DataFrame representation of the read Parquet files
+    """
+    if table_partition_cols is None:
+        table_partition_cols = []
+    return DataFrame(
+        SessionContextInternal._global_ctx().read_parquet(
+            str(path),
+            table_partition_cols,
+            parquet_pruning,
+            file_extension,
+            skip_metadata,
+            schema,
+            file_sort_order,
+        )
+    )
+
+
+def read_json(
+    path: str | pathlib.Path,
+    schema: pyarrow.Schema | None = None,
+    schema_infer_max_records: int = 1000,
+    file_extension: str = ".json",
+    table_partition_cols: list[tuple[str, str]] | None = None,
+    file_compression_type: str | None = None,
+) -> DataFrame:
+    """Read a line-delimited JSON data source.
+
+    This function will use the global context. Any functions or tables registered
+    with another context may not be accessible when used with a DataFrame created
+    using this function.
+
+    Args:
+        path: Path to the JSON file.
+        schema: The data source schema.
+        schema_infer_max_records: Maximum number of rows to read from JSON
+            files for schema inference if needed.
+        file_extension: File extension; only files with this extension are
+            selected for data input.
+        table_partition_cols: Partition columns.
+        file_compression_type: File compression type.
+
+    Returns:
+        DataFrame representation of the read JSON files.
+    """
+    if table_partition_cols is None:
+        table_partition_cols = []
+    return DataFrame(
+        SessionContextInternal._global_ctx().read_json(
+            str(path),
+            schema,
+            schema_infer_max_records,
+            file_extension,
+            table_partition_cols,
+            file_compression_type,
+        )
+    )
+
+
+def read_csv(
+    path: str | pathlib.Path | list[str] | list[pathlib.Path],
+    schema: pyarrow.Schema | None = None,
+    has_header: bool = True,
+    delimiter: str = ",",
+    schema_infer_max_records: int = 1000,
+    file_extension: str = ".csv",
+    table_partition_cols: list[tuple[str, str]] | None = None,
+    file_compression_type: str | None = None,
+) -> DataFrame:
+    """Read a CSV data source.
+
+    This function will use the global context. Any functions or tables registered
+    with another context may not be accessible when used with a DataFrame created
+    using this function.
+
+    Args:
+        path: Path to the CSV file.
+        schema: An optional schema representing the CSV files. If None, the
+            CSV reader will try to infer it based on data in the file.
+        has_header: Whether the CSV file has a header. If schema inference
+            is run on a file with no headers, default column names are
+            created.
+        delimiter: An optional column delimiter.
+        schema_infer_max_records: Maximum number of rows to read from CSV
+            files for schema inference if needed.
+        file_extension: File extension; only files with this extension are
+            selected for data input.
+        table_partition_cols: Partition columns.
+        file_compression_type: File compression type.
+
+    Returns:
+        DataFrame representation of the read CSV files
+    """
+    if table_partition_cols is None:
+        table_partition_cols = []
+
+    path = [str(p) for p in path] if isinstance(path, list) else str(path)
+
+    return DataFrame(
+        SessionContextInternal._global_ctx().read_csv(
+            path,
+            schema,
+            has_header,
+            delimiter,
+            schema_infer_max_records,
+            file_extension,
+            table_partition_cols,
+            file_compression_type,
+        )
+    )
+
+
+def read_avro(
+    path: str | pathlib.Path,
+    schema: pyarrow.Schema | None = None,
+    file_partition_cols: list[tuple[str, str]] | None = None,
+    file_extension: str = ".avro",
+) -> DataFrame:
+    """Create a :py:class:`DataFrame` for reading an Avro data source.
+
+    This function will use the global context.
Any functions or tables registered + with another context may not be accessible when used with a DataFrame created + using this function. + + Args: + path: Path to the Avro file. + schema: The data source schema. + file_partition_cols: Partition columns. + file_extension: File extension to select. + + Returns: + DataFrame representation of the read Avro file + """ + if file_partition_cols is None: + file_partition_cols = [] + return DataFrame( + SessionContextInternal._global_ctx().read_avro( + str(path), schema, file_partition_cols, file_extension + ) + ) diff --git a/python/tests/test_io.py b/python/tests/test_io.py new file mode 100644 index 000000000..21ad188ee --- /dev/null +++ b/python/tests/test_io.py @@ -0,0 +1,95 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import os +import pathlib + +import pyarrow as pa +from datafusion import column +from datafusion.io import read_avro, read_csv, read_json, read_parquet + + +def test_read_json_global_ctx(ctx): + path = os.path.dirname(os.path.abspath(__file__)) + + # Default + test_data_path = os.path.join(path, "data_test_context", "data.json") + df = read_json(test_data_path) + result = df.collect() + + assert result[0].column(0) == pa.array(["a", "b", "c"]) + assert result[0].column(1) == pa.array([1, 2, 3]) + + # Schema + schema = pa.schema( + [ + pa.field("A", pa.string(), nullable=True), + ] + ) + df = read_json(test_data_path, schema=schema) + result = df.collect() + + assert result[0].column(0) == pa.array(["a", "b", "c"]) + assert result[0].schema == schema + + # File extension + test_data_path = os.path.join(path, "data_test_context", "data.json") + df = read_json(test_data_path, file_extension=".json") + result = df.collect() + + assert result[0].column(0) == pa.array(["a", "b", "c"]) + assert result[0].column(1) == pa.array([1, 2, 3]) + + +def test_read_parquet_global(): + parquet_df = read_parquet(path="parquet/data/alltypes_plain.parquet") + parquet_df.show() + assert parquet_df is not None + + path = pathlib.Path.cwd() / "parquet/data/alltypes_plain.parquet" + parquet_df = read_parquet(path=path) + assert parquet_df is not None + + +def test_read_csv(): + csv_df = read_csv(path="testing/data/csv/aggregate_test_100.csv") + csv_df.select(column("c1")).show() + + +def test_read_csv_list(): + csv_df = read_csv(path=["testing/data/csv/aggregate_test_100.csv"]) + expected = csv_df.count() * 2 + + double_csv_df = read_csv( + path=[ + "testing/data/csv/aggregate_test_100.csv", + "testing/data/csv/aggregate_test_100.csv", + ] + ) + actual = double_csv_df.count() + + double_csv_df.select(column("c1")).show() + assert actual == expected + + +def test_read_avro(): + avro_df = read_avro(path="testing/data/avro/alltypes_plain.avro") + avro_df.show() + assert avro_df is not None + + path = 
pathlib.Path.cwd() / "testing/data/avro/alltypes_plain.avro"
+    avro_df = read_avro(path=path)
+    assert avro_df is not None
diff --git a/python/tests/test_wrapper_coverage.py b/python/tests/test_wrapper_coverage.py
index 86f2d57f2..ac064ba95 100644
--- a/python/tests/test_wrapper_coverage.py
+++ b/python/tests/test_wrapper_coverage.py
@@ -34,6 +34,8 @@ def missing_exports(internal_obj, wrapped_obj) -> None:
         return
 
     for attr in dir(internal_obj):
+        if attr in ["_global_ctx"]:
+            continue
         assert attr in dir(wrapped_obj)
 
         internal_attr = getattr(internal_obj, attr)
diff --git a/src/context.rs b/src/context.rs
index 0f962638e..9ba87eb8a 100644
--- a/src/context.rs
+++ b/src/context.rs
@@ -44,7 +44,7 @@ use crate::store::StorageContexts;
 use crate::udaf::PyAggregateUDF;
 use crate::udf::PyScalarUDF;
 use crate::udwf::PyWindowUDF;
-use crate::utils::{get_tokio_runtime, validate_pycapsule, wait_for_future};
+use crate::utils::{get_global_ctx, get_tokio_runtime, validate_pycapsule, wait_for_future};
 use datafusion::arrow::datatypes::{DataType, Schema, SchemaRef};
 use datafusion::arrow::pyarrow::PyArrowType;
 use datafusion::arrow::record_batch::RecordBatch;
@@ -69,7 +69,7 @@ use datafusion::prelude::{
     AvroReadOptions, CsvReadOptions, DataFrame, NdJsonReadOptions, ParquetReadOptions,
 };
 use datafusion_ffi::table_provider::{FFI_TableProvider, ForeignTableProvider};
-use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple};
+use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple, PyType};
 use tokio::task::JoinHandle;
 
 /// Configuration options for a SessionContext
@@ -306,6 +306,14 @@ impl PySessionContext {
         })
     }
 
+    #[classmethod]
+    #[pyo3(signature = ())]
+    fn _global_ctx(_cls: &Bound<'_, PyType>) -> PyResult<Self> {
+        Ok(Self {
+            ctx: get_global_ctx().clone(),
+        })
+    }
+
     /// Register an object store with the given name
     #[pyo3(signature = (scheme, store, host=None))]
     pub fn register_object_store(
diff --git a/src/utils.rs b/src/utils.rs
index ed224b364..999aad755 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -17,6 +17,7 @@
 
 use crate::errors::{PyDataFusionError, PyDataFusionResult};
 use crate::TokioRuntime;
+use datafusion::execution::context::SessionContext;
 use datafusion::logical_expr::Volatility;
 use pyo3::exceptions::PyValueError;
 use pyo3::prelude::*;
@@ -37,6 +38,13 @@ pub(crate) fn get_tokio_runtime() -> &'static TokioRuntime {
     RUNTIME.get_or_init(|| TokioRuntime(tokio::runtime::Runtime::new().unwrap()))
 }
 
+/// Utility to get the Global DataFusion CTX
+#[inline]
+pub(crate) fn get_global_ctx() -> &'static SessionContext {
+    static CTX: OnceLock<SessionContext> = OnceLock::new();
+    CTX.get_or_init(|| SessionContext::new())
+}
+
 /// Utility to collect rust futures with GIL released
 pub fn wait_for_future<F>(py: Python, f: F) -> F::Output
 where

From 973d7ec4a8196a78bc4fb32db4f24e523997ba4c Mon Sep 17 00:00:00 2001
From: Crystal Zhou <45134936+CrystalZhou0529@users.noreply.github.com>
Date: Sat, 8 Mar 2025 16:23:54 -0500
Subject: [PATCH 013/145] feat: Implementation of udf and udaf decorator (#1040)

* Implementation of udf and udaf decorator
* Rename decorators back to udf and udaf, update documentations
* Minor typo fixes
* Fixing linting errors
* ruff formatting

---------

Co-authored-by: Tim Saucer

---
 python/datafusion/udf.py  | 257 +++++++++++++++++++++++++++-----------
 python/tests/test_udaf.py |  42 +++++++
 python/tests/test_udf.py  |  42 ++++++-
 3 files changed, 265 insertions(+), 76 deletions(-)

diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py
index 0bba3d723..af7bcf2ed 100644
--- a/python/datafusion/udf.py
+++
b/python/datafusion/udf.py @@ -19,6 +19,7 @@ from __future__ import annotations +import functools from abc import ABCMeta, abstractmethod from enum import Enum from typing import TYPE_CHECKING, Callable, List, Optional, TypeVar @@ -110,43 +111,102 @@ def __call__(self, *args: Expr) -> Expr: args_raw = [arg.expr for arg in args] return Expr(self._udf.__call__(*args_raw)) - @staticmethod - def udf( - func: Callable[..., _R], - input_types: list[pyarrow.DataType], - return_type: _R, - volatility: Volatility | str, - name: Optional[str] = None, - ) -> ScalarUDF: - """Create a new User-Defined Function. + class udf: + """Create a new User-Defined Function (UDF). + + This class can be used both as a **function** and as a **decorator**. + + Usage: + - **As a function**: Call `udf(func, input_types, return_type, volatility, + name)`. + - **As a decorator**: Use `@udf(input_types, return_type, volatility, + name)`. In this case, do **not** pass `func` explicitly. Args: - func: A callable python function. - input_types: The data types of the arguments to ``func``. This list - must be of the same length as the number of arguments. - return_type: The data type of the return value from the python - function. - volatility: See ``Volatility`` for allowed values. - name: A descriptive name for the function. + func (Callable, optional): **Only needed when calling as a function.** + Skip this argument when using `udf` as a decorator. + input_types (list[pyarrow.DataType]): The data types of the arguments + to `func`. This list must be of the same length as the number of + arguments. + return_type (_R): The data type of the return value from the function. + volatility (Volatility | str): See `Volatility` for allowed values. + name (Optional[str]): A descriptive name for the function. Returns: - A user-defined aggregate function, which can be used in either data - aggregation or window function calls. + A user-defined function that can be used in SQL expressions, + data aggregation, or window function calls. + + Example: + **Using `udf` as a function:** + ``` + def double_func(x): + return x * 2 + double_udf = udf(double_func, [pyarrow.int32()], pyarrow.int32(), + "volatile", "double_it") + ``` + + **Using `udf` as a decorator:** + ``` + @udf([pyarrow.int32()], pyarrow.int32(), "volatile", "double_it") + def double_udf(x): + return x * 2 + ``` """ - if not callable(func): - raise TypeError("`func` argument must be callable") - if name is None: - if hasattr(func, "__qualname__"): - name = func.__qualname__.lower() + + def __new__(cls, *args, **kwargs): + """Create a new UDF. 
+
+            Trigger the UDF function or decorator depending on whether the
+            first argument is callable.
+            """
+            if args and callable(args[0]):
+                # Case 1: Used as a function, require the first parameter to be callable
+                return cls._function(*args, **kwargs)
             else:
-                name = func.__class__.__name__.lower()
-        return ScalarUDF(
-            name=name,
-            func=func,
-            input_types=input_types,
-            return_type=return_type,
-            volatility=volatility,
-        )
+                # Case 2: Used as a decorator with parameters
+                return cls._decorator(*args, **kwargs)
+
+        @staticmethod
+        def _function(
+            func: Callable[..., _R],
+            input_types: list[pyarrow.DataType],
+            return_type: _R,
+            volatility: Volatility | str,
+            name: Optional[str] = None,
+        ) -> ScalarUDF:
+            if not callable(func):
+                raise TypeError("`func` argument must be callable")
+            if name is None:
+                if hasattr(func, "__qualname__"):
+                    name = func.__qualname__.lower()
+                else:
+                    name = func.__class__.__name__.lower()
+            return ScalarUDF(
+                name=name,
+                func=func,
+                input_types=input_types,
+                return_type=return_type,
+                volatility=volatility,
+            )
+
+        @staticmethod
+        def _decorator(
+            input_types: list[pyarrow.DataType],
+            return_type: _R,
+            volatility: Volatility | str,
+            name: Optional[str] = None,
+        ):
+            def decorator(func):
+                udf_caller = ScalarUDF.udf(
+                    func, input_types, return_type, volatility, name
+                )
+
+                @functools.wraps(func)
+                def wrapper(*args, **kwargs):
+                    return udf_caller(*args, **kwargs)
+
+                return wrapper
+
+            return decorator
 
 
 class Accumulator(metaclass=ABCMeta):
@@ -212,25 +272,27 @@ def __call__(self, *args: Expr) -> Expr:
         args_raw = [arg.expr for arg in args]
         return Expr(self._udaf.__call__(*args_raw))
 
-    @staticmethod
-    def udaf(
-        accum: Callable[[], Accumulator],
-        input_types: pyarrow.DataType | list[pyarrow.DataType],
-        return_type: pyarrow.DataType,
-        state_type: list[pyarrow.DataType],
-        volatility: Volatility | str,
-        name: Optional[str] = None,
-    ) -> AggregateUDF:
-        """Create a new User-Defined Aggregate Function.
+    class udaf:
+        """Create a new User-Defined Aggregate Function (UDAF).
 
-        If your :py:class:`Accumulator` can be instantiated with no arguments, you
-        can simply pass it's type as ``accum``. If you need to pass additional arguments
-        to it's constructor, you can define a lambda or a factory method. During runtime
-        the :py:class:`Accumulator` will be constructed for every instance in
-        which this UDAF is used. The following examples are all valid.
+        This class allows you to define an **aggregate function** that can be used in
+        data aggregation or window function calls.
 
-        .. code-block:: python
+        Usage:
+        - **As a function**: Call `udaf(accum, input_types, return_type, state_type,
+          volatility, name)`.
+        - **As a decorator**: Use `@udaf(input_types, return_type, state_type,
+          volatility, name)`.
+          When using `udaf` as a decorator, **do not pass `accum` explicitly**.
+        **Function example:**
+
+        If your :py:class:`Accumulator` can be instantiated with no arguments, you
+        can simply pass its type as `accum`. If you need to pass additional
+        arguments to its constructor, you can define a lambda or a factory method.
+        During runtime the :py:class:`Accumulator` will be constructed for every
+        instance in which this UDAF is used. The following examples are all valid.
+        ```
         import pyarrow as pa
         import pyarrow.compute as pc
@@ -253,12 +315,24 @@ def evaluate(self) -> pa.Scalar:
     def sum_bias_10() -> Summarize:
         return Summarize(10.0)
 
-    udaf1 = udaf(Summarize, pa.float64(), pa.float64(), [pa.float64()], "immutable")
-    udaf2 = udaf(sum_bias_10, pa.float64(), pa.float64(), [pa.float64()], "immutable")
-    udaf3 = udaf(lambda: Summarize(20.0), pa.float64(), pa.float64(), [pa.float64()], "immutable")
+    udaf1 = udaf(Summarize, pa.float64(), pa.float64(), [pa.float64()],
+        "immutable")
+    udaf2 = udaf(sum_bias_10, pa.float64(), pa.float64(), [pa.float64()],
+        "immutable")
+    udaf3 = udaf(lambda: Summarize(20.0), pa.float64(), pa.float64(),
+        [pa.float64()], "immutable")
+        ```
+
+        **Decorator example:**
+        ```
+        @udaf(pa.float64(), pa.float64(), [pa.float64()], "immutable")
+        def udf4() -> Summarize:
+            return Summarize(10.0)
+        ```
 
         Args:
-            accum: The accumulator python function.
+            accum: The accumulator python function. **Only needed when calling as a
+                function. Skip this argument when using `udaf` as a decorator.**
             input_types: The data types of the arguments to ``accum``.
             return_type: The data type of the return value.
             state_type: The data types of the intermediate accumulation.
@@ -268,26 +342,69 @@ def sum_bias_10() -> Summarize:
         Returns:
             A user-defined aggregate function, which can be used in either data
             aggregation or window function calls.
-        """  # noqa W505
-        if not callable(accum):
-            raise TypeError("`func` must be callable.")
-        if not isinstance(accum.__call__(), Accumulator):
-            raise TypeError(
-                "Accumulator must implement the abstract base class Accumulator"
+        """
+
+        def __new__(cls, *args, **kwargs):
+            """Create a new UDAF.
+
+            Trigger the UDAF function or decorator depending on whether the
+            first argument is callable.
+            """
+            if args and callable(args[0]):
+                # Case 1: Used as a function, require the first parameter to be callable
+                return cls._function(*args, **kwargs)
+            else:
+                # Case 2: Used as a decorator with parameters
+                return cls._decorator(*args, **kwargs)
+
+        @staticmethod
+        def _function(
+            accum: Callable[[], Accumulator],
+            input_types: pyarrow.DataType | list[pyarrow.DataType],
+            return_type: pyarrow.DataType,
+            state_type: list[pyarrow.DataType],
+            volatility: Volatility | str,
+            name: Optional[str] = None,
+        ) -> AggregateUDF:
+            if not callable(accum):
+                raise TypeError("`func` must be callable.")
+            if not isinstance(accum.__call__(), Accumulator):
+                raise TypeError(
+                    "Accumulator must implement the abstract base class Accumulator"
+                )
+            if name is None:
+                name = accum.__call__().__class__.__qualname__.lower()
+            if isinstance(input_types, pyarrow.DataType):
+                input_types = [input_types]
+            return AggregateUDF(
+                name=name,
+                accumulator=accum,
+                input_types=input_types,
+                return_type=return_type,
+                state_type=state_type,
+                volatility=volatility,
             )
-        if name is None:
-            name = accum.__call__().__class__.__qualname__.lower()
-        assert name is not None
-        if isinstance(input_types, pyarrow.DataType):
-            input_types = [input_types]
-        return AggregateUDF(
-            name=name,
-            accumulator=accum,
-            input_types=input_types,
-            return_type=return_type,
-            state_type=state_type,
-            volatility=volatility,
-        )
+
+        @staticmethod
+        def _decorator(
+            input_types: pyarrow.DataType | list[pyarrow.DataType],
+            return_type: pyarrow.DataType,
+            state_type: list[pyarrow.DataType],
+            volatility: Volatility | str,
+            name: Optional[str] = None,
+        ):
+            def decorator(accum: Callable[[], Accumulator]):
+                udaf_caller = AggregateUDF.udaf(
+                    accum, input_types, return_type, state_type,
+                volatility, name
+            )
+
+            @functools.wraps(accum)
+            def wrapper(*args, **kwargs):
+                return udaf_caller(*args, **kwargs)
+
+            return wrapper
+
+        return decorator
 
 
 class WindowEvaluator(metaclass=ABCMeta):
diff --git a/python/tests/test_udaf.py b/python/tests/test_udaf.py
index 0005a3da8..e69c77d3c 100644
--- a/python/tests/test_udaf.py
+++ b/python/tests/test_udaf.py
@@ -117,6 +117,26 @@ def test_udaf_aggregate(df):
     assert result.column(0) == pa.array([1.0 + 2.0 + 3.0])
 
 
+def test_udaf_decorator_aggregate(df):
+    @udaf(pa.float64(), pa.float64(), [pa.float64()], "immutable")
+    def summarize():
+        return Summarize()
+
+    df1 = df.aggregate([], [summarize(column("a"))])
+
+    # execute and collect the first (and only) batch
+    result = df1.collect()[0]
+
+    assert result.column(0) == pa.array([1.0 + 2.0 + 3.0])
+
+    df2 = df.aggregate([], [summarize(column("a"))])
+
+    # Run a second time to ensure the state is properly reset
+    result = df2.collect()[0]
+
+    assert result.column(0) == pa.array([1.0 + 2.0 + 3.0])
+
+
 def test_udaf_aggregate_with_arguments(df):
     bias = 10.0
 
@@ -143,6 +163,28 @@ def test_udaf_aggregate_with_arguments(df):
     assert result.column(0) == pa.array([bias + 1.0 + 2.0 + 3.0])
 
 
+def test_udaf_decorator_aggregate_with_arguments(df):
+    bias = 10.0
+
+    @udaf(pa.float64(), pa.float64(), [pa.float64()], "immutable")
+    def summarize():
+        return Summarize(bias)
+
+    df1 = df.aggregate([], [summarize(column("a"))])
+
+    # execute and collect the first (and only) batch
+    result = df1.collect()[0]
+
+    assert result.column(0) == pa.array([bias + 1.0 + 2.0 + 3.0])
+
+    df2 = df.aggregate([], [summarize(column("a"))])
+
+    # Run a second time to ensure the state is properly reset
+    result = df2.collect()[0]
+
+    assert result.column(0) == pa.array([bias + 1.0 + 2.0 + 3.0])
+
+
 def test_group_by(df):
     summarize = udaf(
         Summarize,
diff --git a/python/tests/test_udf.py b/python/tests/test_udf.py
index 3a5dce6d6..a6c047552 100644
--- a/python/tests/test_udf.py
+++ b/python/tests/test_udf.py
@@ -24,7 +24,7 @@ def df(ctx):
     # create a RecordBatch and a new DataFrame from it
     batch = pa.RecordBatch.from_arrays(
-        [pa.array([1, 2, 3]), pa.array([4, 4, 6])],
+        [pa.array([1, 2, 3]), pa.array([4, 4, None])],
         names=["a", "b"],
     )
     return ctx.create_dataframe([[batch]], name="test_table")
@@ -39,10 +39,20 @@ def test_udf(df):
         volatility="immutable",
     )
 
-    df = df.select(is_null(column("a")))
+    df = df.select(is_null(column("b")))
     result = df.collect()[0].column(0)
 
-    assert result == pa.array([False, False, False])
+    assert result == pa.array([False, False, True])
+
+
+def test_udf_decorator(df):
+    @udf([pa.int64()], pa.bool_(), "immutable")
+    def is_null(x: pa.Array) -> pa.Array:
+        return x.is_null()
+
+    df = df.select(is_null(column("b")))
+    result = df.collect()[0].column(0)
+    assert result == pa.array([False, False, True])
 
 
 def test_register_udf(ctx, df) -> None:
@@ -56,10 +66,10 @@ def test_register_udf(ctx, df) -> None:
 
     ctx.register_udf(is_null)
 
-    df_result = ctx.sql("select is_null(a) from test_table")
+    df_result = ctx.sql("select is_null(b) from test_table")
     result = df_result.collect()[0].column(0)
 
-    assert result == pa.array([False, False, False])
+    assert result == pa.array([False, False, True])
 
 
 class OverThresholdUDF:
@@ -70,7 +80,7 @@ def __call__(self, values: pa.Array) -> pa.Array:
         return pa.array(v.as_py() >= self.threshold for v in values)
 
 
-def test_udf_with_parameters(df) -> None:
+def test_udf_with_parameters_function(df) -> None:
     udf_no_param = udf(
         OverThresholdUDF(),
         pa.int64(),
@@ -94,3 +104,23 @@ def test_udf_with_parameters(df) -> None:
     result = df2.collect()[0].column(0)
 
     assert result == pa.array([False, True, True])
+
+
+def test_udf_with_parameters_decorator(df) -> None:
+    @udf([pa.int64()], pa.bool_(), "immutable")
+    def udf_no_param(values: pa.Array) -> pa.Array:
+        return OverThresholdUDF()(values)
+
+    df1 = df.select(udf_no_param(column("a")))
+    result = df1.collect()[0].column(0)
+
+    assert result == pa.array([True, True, True])
+
+    @udf([pa.int64()], pa.bool_(), "immutable")
+    def udf_with_param(values: pa.Array) -> pa.Array:
+        return OverThresholdUDF(2)(values)
+
+    df2 = df.select(udf_with_param(column("a")))
+    result = df2.collect()[0].column(0)
+
+    assert result == pa.array([False, True, True])

From d72f5605b3d523585d04857505793920f96242ba Mon Sep 17 00:00:00 2001
From: Tim Saucer
Date: Mon, 10 Mar 2025 06:56:12 -0400
Subject: [PATCH 014/145] Enable FA ruff lint (#1052)

---
 examples/python-udwf.py   | 2 ++
 pyproject.toml            | 2 +-
 python/datafusion/io.py   | 2 ++
 python/tests/test_udaf.py | 2 ++
 python/tests/test_udwf.py | 2 ++
 5 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/examples/python-udwf.py b/examples/python-udwf.py
index 32f8fadaa..7d39dc1b8 100644
--- a/examples/python-udwf.py
+++ b/examples/python-udwf.py
@@ -15,6 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 
+from __future__ import annotations
+
 import datafusion
 import pyarrow as pa
 from datafusion import col, lit, udwf
diff --git a/pyproject.toml b/pyproject.toml
index f416e02a5..d16a18aa6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -66,7 +66,7 @@ features = ["substrait"]
 
 # Enable docstring linting using the google style guide
 [tool.ruff.lint]
-select = ["E4", "E7", "E9", "F", "D", "W", "I"]
+select = ["E4", "E7", "E9", "F", "FA", "D", "W", "I"]
 
 [tool.ruff.lint.pydocstyle]
 convention = "google"
diff --git a/python/datafusion/io.py b/python/datafusion/io.py
index 7f3b77efa..3b6264948 100644
--- a/python/datafusion/io.py
+++ b/python/datafusion/io.py
@@ -17,6 +17,8 @@
 
 """IO read functions using global context."""
 
+from __future__ import annotations
+
 import pathlib
 
 import pyarrow
diff --git a/python/tests/test_udaf.py b/python/tests/test_udaf.py
index e69c77d3c..97cf81f3c 100644
--- a/python/tests/test_udaf.py
+++ b/python/tests/test_udaf.py
@@ -15,6 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 
+from __future__ import annotations
+
 from typing import List
 
 import pyarrow as pa
diff --git a/python/tests/test_udwf.py b/python/tests/test_udwf.py
index 0ffa04179..2fea34aa3 100644
--- a/python/tests/test_udwf.py
+++ b/python/tests/test_udwf.py
@@ -15,6 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 
+from __future__ import annotations
+
 import pyarrow as pa
 import pytest
 from datafusion import SessionContext, column, lit, udwf

From 0002372ccdb780e011631c797ec9613174cf0a94 Mon Sep 17 00:00:00 2001
From: Tim Saucer
Date: Mon, 10 Mar 2025 14:22:42 -0400
Subject: [PATCH 015/145] Enable take comments to assign issues to users
 (#1058)

---
 .github/workflows/take.yml | 41 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 .github/workflows/take.yml

diff --git a/.github/workflows/take.yml b/.github/workflows/take.yml
new file mode 100644
index 000000000..86dc190ad
--- /dev/null
+++ b/.github/workflows/take.yml
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: Assign the issue via a `take` comment
+on:
+  issue_comment:
+    types: created
+
+permissions:
+  issues: write
+
+jobs:
+  issue_assign:
+    runs-on: ubuntu-latest
+    if: (!github.event.issue.pull_request) && github.event.comment.body == 'take'
+    concurrency:
+      group: ${{ github.actor }}-issue-assign
+    steps:
+      - run: |
+          CODE=$(curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -LI https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees/${{ github.event.comment.user.login }} -o /dev/null -w '%{http_code}\n' -s)
+          if [ "$CODE" -eq "204" ]
+          then
+            echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}"
+            curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees
+          else
+            echo "Cannot assign issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}"
+          fi
\ No newline at end of file

From 9d634de6df2f8b76bd303ab1f5972f01deb2210d Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Mon, 10 Mar 2025 14:24:40 -0400
Subject: [PATCH 016/145] Update python min version to 3.9 (#1043)

* 3.8 -> 3.9

* upgrade pyo3 abi3-py38 -> abi3-py39

---
 Cargo.toml                                     |   2 +-
 .../source/contributor-guide/introduction.rst  |   2 +-
 examples/ffi-table-provider/Cargo.lock         |  75 +-
 examples/ffi-table-provider/Cargo.toml         |   2 +-
 examples/ffi-table-provider/pyproject.toml     |   2 +-
 pyproject.toml                                 |   3 +-
 uv.lock                                        | 707 ++----------------
 7 files changed, 121 insertions(+), 672 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 5358b1836..50967a219 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -35,7 +35,7 @@ substrait = ["dep:datafusion-substrait"]
 
 [dependencies]
 tokio = { version = "1.42", features = ["macros", "rt", "rt-multi-thread", "sync"] }
-pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py38"] }
+pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py39"] }
 pyo3-async-runtimes = { version = "0.23", features = ["tokio-runtime"]}
 arrow = { version = "54", features = ["pyarrow"] }
 datafusion = { version = "45.0.0", features = ["avro", "unicode_expressions"] }
diff --git a/docs/source/contributor-guide/introduction.rst b/docs/source/contributor-guide/introduction.rst
index 25f2c21a4..2fba64111 100644
--- a/docs/source/contributor-guide/introduction.rst
+++ b/docs/source/contributor-guide/introduction.rst
@@ -118,7 +118,7 @@ be ignored by ``git``.
 
 ..
code-block:: implementation=CPython - version=3.8 + version=3.9 shared=true abi3=true lib_name=python3.12 diff --git a/examples/ffi-table-provider/Cargo.lock b/examples/ffi-table-provider/Cargo.lock index 32af85180..8d0edd515 100644 --- a/examples/ffi-table-provider/Cargo.lock +++ b/examples/ffi-table-provider/Cargo.lock @@ -766,7 +766,8 @@ dependencies = [ [[package]] name = "datafusion" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eae420e7a5b0b7f1c39364cc76cbcd0f5fdc416b2514ae3847c2676bbd60702a" dependencies = [ "arrow", "arrow-array", @@ -816,7 +817,8 @@ dependencies = [ [[package]] name = "datafusion-catalog" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f27987bc22b810939e8dfecc55571e9d50355d6ea8ec1c47af8383a76a6d0e1" dependencies = [ "arrow", "async-trait", @@ -836,7 +838,8 @@ dependencies = [ [[package]] name = "datafusion-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3f6d5b8c9408cc692f7c194b8aa0c0f9b253e065a8d960ad9cdc2a13e697602" dependencies = [ "ahash", "arrow", @@ -862,7 +865,8 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d4603c8e8a4baf77660ab7074cc66fc15cc8a18f2ce9dfadb755fc6ee294e48" dependencies = [ "log", "tokio", @@ -871,12 +875,14 @@ dependencies = [ [[package]] name = "datafusion-doc" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5bf4bc68623a5cf231eed601ed6eb41f46a37c4d15d11a0bff24cbc8396cd66" [[package]] name = "datafusion-execution" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88b491c012cdf8e051053426013429a76f74ee3c2db68496c79c323ca1084d27" dependencies = [ "arrow", "dashmap", @@ -894,7 +900,8 @@ dependencies = [ [[package]] name = "datafusion-expr" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5a181408d4fc5dc22f9252781a8f39f2d0e5d1b33ec9bde242844980a2689c1" dependencies = [ "arrow", "chrono", @@ -914,7 +921,8 @@ dependencies = [ [[package]] name = "datafusion-expr-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1129b48e8534d8c03c6543bcdccef0b55c8ac0c1272a15a56c67068b6eb1885" dependencies = [ "arrow", "datafusion-common", @@ -925,7 +933,8 @@ dependencies = [ [[package]] name = "datafusion-ffi" version = "45.0.0" -source = 
"git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff47a79d442207c168c6e3e1d970c248589c148e4800e5b285ac1b2cb1a230f8" dependencies = [ "abi_stable", "arrow", @@ -945,7 +954,8 @@ dependencies = [ [[package]] name = "datafusion-functions" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6125874e4856dfb09b59886784fcb74cde5cfc5930b3a80a1a728ef7a010df6b" dependencies = [ "arrow", "arrow-buffer", @@ -974,7 +984,8 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3add7b1d3888e05e7c95f2b281af900ca69ebdcb21069ba679b33bde8b3b9d6" dependencies = [ "ahash", "arrow", @@ -996,7 +1007,8 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e18baa4cfc3d2f144f74148ed68a1f92337f5072b6dde204a0dbbdf3324989c" dependencies = [ "ahash", "arrow", @@ -1008,7 +1020,8 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ec5ee8cecb0dc370291279673097ddabec03a011f73f30d7f1096457127e03e" dependencies = [ "arrow", "arrow-array", @@ -1031,7 +1044,8 @@ dependencies = [ [[package]] name = "datafusion-functions-table" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c403ddd473bbb0952ba880008428b3c7febf0ed3ce1eec35a205db20efb2a36" dependencies = [ "arrow", "async-trait", @@ -1046,7 +1060,8 @@ dependencies = [ [[package]] name = "datafusion-functions-window" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ab18c2fb835614d06a75f24a9e09136d3a8c12a92d97c95a6af316a1787a9c5" dependencies = [ "datafusion-common", "datafusion-doc", @@ -1062,7 +1077,8 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a77b73bc15e7d1967121fdc7a55d819bfb9d6c03766a6c322247dce9094a53a4" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1071,7 +1087,8 @@ dependencies = [ [[package]] name = "datafusion-macros" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09369b8d962291e808977cf94d495fd8b5b38647232d7ef562c27ac0f495b0af" dependencies = [ "datafusion-expr", "quote", @@ 
-1081,7 +1098,8 @@ dependencies = [ [[package]] name = "datafusion-optimizer" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2403a7e4a84637f3de7d8d4d7a9ccc0cc4be92d89b0161ba3ee5be82f0531c54" dependencies = [ "arrow", "chrono", @@ -1099,7 +1117,8 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86ff72ac702b62dbf2650c4e1d715ebd3e4aab14e3885e72e8549e250307347c" dependencies = [ "ahash", "arrow", @@ -1123,7 +1142,8 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60982b7d684e25579ee29754b4333057ed62e2cc925383c5f0bd8cab7962f435" dependencies = [ "ahash", "arrow", @@ -1137,7 +1157,8 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac5e85c189d5238a5cf181a624e450c4cd4c66ac77ca551d6f3ff9080bac90bb" dependencies = [ "arrow", "arrow-schema", @@ -1158,7 +1179,8 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c36bf163956d7e2542657c78b3383fdc78f791317ef358a359feffcdb968106f" dependencies = [ "ahash", "arrow", @@ -1189,7 +1211,8 @@ dependencies = [ [[package]] name = "datafusion-proto" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2db5d79f0c974041787b899d24dc91bdab2ff112d1942dd71356a4ce3b407e6c" dependencies = [ "arrow", "chrono", @@ -1204,7 +1227,8 @@ dependencies = [ [[package]] name = "datafusion-proto-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de21bde1603aac0ff32cf478e47081be6e3583c6861fe8f57034da911efe7578" dependencies = [ "arrow", "datafusion-common", @@ -1214,7 +1238,8 @@ dependencies = [ [[package]] name = "datafusion-sql" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13caa4daede211ecec53c78b13c503b592794d125f9a3cc3afe992edf9e7f43" dependencies = [ "arrow", "arrow-array", diff --git a/examples/ffi-table-provider/Cargo.toml b/examples/ffi-table-provider/Cargo.toml index 0e558fdd0..f4e4fda79 100644 --- a/examples/ffi-table-provider/Cargo.toml +++ b/examples/ffi-table-provider/Cargo.toml @@ -23,7 +23,7 @@ edition = "2021" [dependencies] datafusion = { version = "45.0.0" } datafusion-ffi = { version = "45.0.0" } -pyo3 = { version = "0.23", features = 
["extension-module", "abi3", "abi3-py38"] } +pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py39"] } arrow = { version = "54" } arrow-array = { version = "54" } arrow-schema = { version = "54" } diff --git a/examples/ffi-table-provider/pyproject.toml b/examples/ffi-table-provider/pyproject.toml index 116efae9c..9cd25b423 100644 --- a/examples/ffi-table-provider/pyproject.toml +++ b/examples/ffi-table-provider/pyproject.toml @@ -21,7 +21,7 @@ build-backend = "maturin" [project] name = "ffi_table_provider" -requires-python = ">=3.8" +requires-python = ">=3.9" classifiers = [ "Programming Language :: Rust", "Programming Language :: Python :: Implementation :: CPython", diff --git a/pyproject.toml b/pyproject.toml index d16a18aa6..1c2733677 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ name = "datafusion" description = "Build and run queries against data" readme = "README.md" license = { file = "LICENSE.txt" } -requires-python = ">=3.8" +requires-python = ">=3.9" keywords = ["datafusion", "dataframe", "rust", "query-engine"] classifiers = [ "Development Status :: 2 - Pre-Alpha", @@ -35,7 +35,6 @@ classifiers = [ "Operating System :: Microsoft :: Windows", "Operating System :: POSIX :: Linux", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", diff --git a/uv.lock b/uv.lock index 587ddc8b7..619b92856 100644 --- a/uv.lock +++ b/uv.lock @@ -1,23 +1,10 @@ version = 1 -requires-python = ">=3.8" +requires-python = ">=3.9" resolution-markers = [ "python_full_version >= '3.12'", "python_full_version == '3.11.*'", "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", - "python_full_version < '3.9'", -] - -[[package]] -name = "alabaster" -version = "0.7.13" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/94/71/a8ee96d1fd95ca04a0d2e2d9c4081dac4c2d2b12f7ddb899c8cb9bfd1532/alabaster-0.7.13.tar.gz", hash = "sha256:a27a4a084d5e690e16e01e03ad2b2e552c61a65469419b907243193de1a84ae2", size = 11454 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/64/88/c7083fc61120ab661c5d0b82cb77079fc1429d3f913a456c1c82cf4658f7/alabaster-0.7.13-py3-none-any.whl", hash = "sha256:1ee19aca801bbabb5ba3f5f258e4422dfa86f82f3e9cefb0859b283cdd7f62a3", size = 13857 }, + "python_full_version < '3.10'", ] [[package]] @@ -25,7 +12,7 @@ name = "alabaster" version = "0.7.16" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] sdist = { url = "https://files.pythonhosted.org/packages/c9/3e/13dd8e5ed9094e734ac430b5d0eb4f2bb001708a8b7856cbf8e084e001ba/alabaster-0.7.16.tar.gz", hash = "sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65", size = 23776 } wheels = [ @@ -46,42 +33,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7e/b3/6b4067be973ae96ba0d615946e314c5ae35f9f993eca561b356540bb0c2b/alabaster-1.0.0-py3-none-any.whl", hash = "sha256:fc6786402dc3fcb2de3cabd5fe455a2db534b371124f1f21de8731783dec828b", size = 13929 }, ] -[[package]] -name = "appnope" -version = "0.1.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/35/5d/752690df9ef5b76e169e68d6a129fa6d08a7100ca7f754c89495db3c6019/appnope-0.1.4.tar.gz", 
hash = "sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee", size = 4170 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/81/29/5ecc3a15d5a33e31b26c11426c45c501e439cb865d0bff96315d86443b78/appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c", size = 4321 }, -] - -[[package]] -name = "astroid" -version = "3.2.4" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/9e/53/1067e1113ecaf58312357f2cd93063674924119d80d173adc3f6f2387aa2/astroid-3.2.4.tar.gz", hash = "sha256:0e14202810b30da1b735827f78f5157be2bbd4a7a59b7707ca0bfc2fb4c0063a", size = 397576 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/80/96/b32bbbb46170a1c8b8b1f28c794202e25cfe743565e9d3469b8eb1e0cc05/astroid-3.2.4-py3-none-any.whl", hash = "sha256:413658a61eeca6202a59231abb473f932038fbcbf1666587f66d482083413a25", size = 276348 }, -] - [[package]] name = "astroid" version = "3.3.8" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] dependencies = [ - { name = "typing-extensions", marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/80/c5/5c83c48bbf547f3dd8b587529db7cf5a265a3368b33e85e76af8ff6061d3/astroid-3.3.8.tar.gz", hash = "sha256:a88c7994f914a4ea8572fac479459f4955eeccc877be3f2d959a33273b0cf40b", size = 398196 } wheels = [ @@ -101,23 +58,11 @@ wheels = [ name = "babel" version = "2.16.0" source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pytz", marker = "python_full_version < '3.9'" }, -] sdist = { url = "https://files.pythonhosted.org/packages/2a/74/f1bc80f23eeba13393b7222b11d95ca3af2c1e28edca18af487137eefed9/babel-2.16.0.tar.gz", hash = "sha256:d1f3554ca26605fe173f3de0c65f750f5a42f924499bf134de6423582298e316", size = 9348104 } wheels = [ { url = "https://files.pythonhosted.org/packages/ed/20/bc79bc575ba2e2a7f70e8a1155618bb1301eaa5132a8271373a6903f73f8/babel-2.16.0-py3-none-any.whl", hash = "sha256:368b5b98b37c06b7daf6696391c3240c938b37767d4584413e8438c5c435fa8b", size = 9587599 }, ] -[[package]] -name = "backcall" -version = "0.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a2/40/764a663805d84deee23043e1426a9175567db89c8b3287b5c2ad9f71aa93/backcall-0.2.0.tar.gz", hash = "sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e", size = 18041 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4c/1c/ff6546b6c12603d8dd1070aa3c3d273ad4c07f5771689a7b69a550e8c951/backcall-0.2.0-py2.py3-none-any.whl", hash = "sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255", size = 11157 }, -] - [[package]] name = "beautifulsoup4" version = "4.12.3" @@ -194,14 +139,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f1/47/d7145bf2dc04684935d57d67dff9d6d795b2ba2796806bb109864be3a151/cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9", size = 488469 }, { url = 
"https://files.pythonhosted.org/packages/bf/ee/f94057fa6426481d663b88637a9a10e859e492c73d0384514a17d78ee205/cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d", size = 172475 }, { url = "https://files.pythonhosted.org/packages/7c/fc/6a8cb64e5f0324877d503c854da15d76c1e50eb722e320b15345c4d0c6de/cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a", size = 182009 }, - { url = "https://files.pythonhosted.org/packages/48/08/15bf6b43ae9bd06f6b00ad8a91f5a8fe1069d4c9fab550a866755402724e/cffi-1.17.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:636062ea65bd0195bc012fea9321aca499c0504409f413dc88af450b57ffd03b", size = 182457 }, - { url = "https://files.pythonhosted.org/packages/c2/5b/f1523dd545f92f7df468e5f653ffa4df30ac222f3c884e51e139878f1cb5/cffi-1.17.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7eac2ef9b63c79431bc4b25f1cd649d7f061a28808cbc6c47b534bd789ef964", size = 425932 }, - { url = "https://files.pythonhosted.org/packages/53/93/7e547ab4105969cc8c93b38a667b82a835dd2cc78f3a7dad6130cfd41e1d/cffi-1.17.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e221cf152cff04059d011ee126477f0d9588303eb57e88923578ace7baad17f9", size = 448585 }, - { url = "https://files.pythonhosted.org/packages/56/c4/a308f2c332006206bb511de219efeff090e9d63529ba0a77aae72e82248b/cffi-1.17.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31000ec67d4221a71bd3f67df918b1f88f676f1c3b535a7eb473255fdc0b83fc", size = 456268 }, - { url = "https://files.pythonhosted.org/packages/ca/5b/b63681518265f2f4060d2b60755c1c77ec89e5e045fc3773b72735ddaad5/cffi-1.17.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f17be4345073b0a7b8ea599688f692ac3ef23ce28e5df79c04de519dbc4912c", size = 436592 }, - { url = "https://files.pythonhosted.org/packages/bb/19/b51af9f4a4faa4a8ac5a0e5d5c2522dcd9703d07fac69da34a36c4d960d3/cffi-1.17.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2b1fac190ae3ebfe37b979cc1ce69c81f4e4fe5746bb401dca63a9062cdaf1", size = 446512 }, - { url = "https://files.pythonhosted.org/packages/e2/63/2bed8323890cb613bbecda807688a31ed11a7fe7afe31f8faaae0206a9a3/cffi-1.17.1-cp38-cp38-win32.whl", hash = "sha256:7596d6620d3fa590f677e9ee430df2958d2d6d6de2feeae5b20e82c00b76fbf8", size = 171576 }, - { url = "https://files.pythonhosted.org/packages/2f/70/80c33b044ebc79527447fd4fbc5455d514c3bb840dede4455de97da39b4d/cffi-1.17.1-cp38-cp38-win_amd64.whl", hash = "sha256:78122be759c3f8a014ce010908ae03364d00a1f81ab5c7f4a7a5120607ea56e1", size = 181229 }, { url = "https://files.pythonhosted.org/packages/b9/ea/8bb50596b8ffbc49ddd7a1ad305035daa770202a6b782fc164647c2673ad/cffi-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16", size = 182220 }, { url = "https://files.pythonhosted.org/packages/ae/11/e77c8cd24f58285a82c23af484cf5b124a376b32644e445960d1a4654c3a/cffi-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36", size = 178605 }, { url = "https://files.pythonhosted.org/packages/ed/65/25a8dc32c53bf5b7b6c2686b42ae2ad58743f7ff644844af7cdb29b49361/cffi-1.17.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8", size = 424910 }, @@ -274,19 +211,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e4/93/946a86ce20790e11312c87c75ba68d5f6ad2208cfb52b2d6a2c32840d922/charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fa88b843d6e211393a37219e6a1c1df99d35e8fd90446f1118f4216e307e48cd", size = 145732 }, { url = "https://files.pythonhosted.org/packages/cd/e5/131d2fb1b0dddafc37be4f3a2fa79aa4c037368be9423061dccadfd90091/charset_normalizer-3.4.1-cp313-cp313-win32.whl", hash = "sha256:eb8178fe3dba6450a3e024e95ac49ed3400e506fd4e9e5c32d30adda88cbd407", size = 95391 }, { url = "https://files.pythonhosted.org/packages/27/f2/4f9a69cc7712b9b5ad8fdb87039fd89abba997ad5cbe690d1835d40405b0/charset_normalizer-3.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:b1ac5992a838106edb89654e0aebfc24f5848ae2547d22c2c3f66454daa11971", size = 102702 }, - { url = "https://files.pythonhosted.org/packages/10/bd/6517ea94f2672e801011d50b5d06be2a0deaf566aea27bcdcd47e5195357/charset_normalizer-3.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ecddf25bee22fe4fe3737a399d0d177d72bc22be6913acfab364b40bce1ba83c", size = 195653 }, - { url = "https://files.pythonhosted.org/packages/e5/0d/815a2ba3f283b4eeaa5ece57acade365c5b4135f65a807a083c818716582/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c60ca7339acd497a55b0ea5d506b2a2612afb2826560416f6894e8b5770d4a9", size = 140701 }, - { url = "https://files.pythonhosted.org/packages/aa/17/c94be7ee0d142687e047fe1de72060f6d6837f40eedc26e87e6e124a3fc6/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b7b2d86dd06bfc2ade3312a83a5c364c7ec2e3498f8734282c6c3d4b07b346b8", size = 150495 }, - { url = "https://files.pythonhosted.org/packages/f7/33/557ac796c47165fc141e4fb71d7b0310f67e05cb420756f3a82e0a0068e0/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd78cfcda14a1ef52584dbb008f7ac81c1328c0f58184bf9a84c49c605002da6", size = 142946 }, - { url = "https://files.pythonhosted.org/packages/1e/0d/38ef4ae41e9248d63fc4998d933cae22473b1b2ac4122cf908d0f5eb32aa/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e27f48bcd0957c6d4cb9d6fa6b61d192d0b13d5ef563e5f2ae35feafc0d179c", size = 144737 }, - { url = "https://files.pythonhosted.org/packages/43/01/754cdb29dd0560f58290aaaa284d43eea343ad0512e6ad3b8b5c11f08592/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:01ad647cdd609225c5350561d084b42ddf732f4eeefe6e678765636791e78b9a", size = 147471 }, - { url = "https://files.pythonhosted.org/packages/ba/cd/861883ba5160c7a9bd242c30b2c71074cda2aefcc0addc91118e0d4e0765/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:619a609aa74ae43d90ed2e89bdd784765de0a25ca761b93e196d938b8fd1dbbd", size = 140801 }, - { url = "https://files.pythonhosted.org/packages/6f/7f/0c0dad447819e90b93f8ed238cc8f11b91353c23c19e70fa80483a155bed/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:89149166622f4db9b4b6a449256291dc87a99ee53151c74cbd82a53c8c2f6ccd", size = 149312 }, - { url = "https://files.pythonhosted.org/packages/8e/09/9f8abcc6fff60fb727268b63c376c8c79cc37b833c2dfe1f535dfb59523b/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = 
"sha256:7709f51f5f7c853f0fb938bcd3bc59cdfdc5203635ffd18bf354f6967ea0f824", size = 152347 }, - { url = "https://files.pythonhosted.org/packages/be/e5/3f363dad2e24378f88ccf63ecc39e817c29f32e308ef21a7a6d9c1201165/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:345b0426edd4e18138d6528aed636de7a9ed169b4aaf9d61a8c19e39d26838ca", size = 149888 }, - { url = "https://files.pythonhosted.org/packages/e4/10/a78c0e91f487b4ad0ef7480ac765e15b774f83de2597f1b6ef0eaf7a2f99/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0907f11d019260cdc3f94fbdb23ff9125f6b5d1039b76003b5b0ac9d6a6c9d5b", size = 145169 }, - { url = "https://files.pythonhosted.org/packages/d3/81/396e7d7f5d7420da8273c91175d2e9a3f569288e3611d521685e4b9ac9cc/charset_normalizer-3.4.1-cp38-cp38-win32.whl", hash = "sha256:ea0d8d539afa5eb2728aa1932a988a9a7af94f18582ffae4bc10b3fbdad0626e", size = 95094 }, - { url = "https://files.pythonhosted.org/packages/40/bb/20affbbd9ea29c71ea123769dc568a6d42052ff5089c5fe23e21e21084a6/charset_normalizer-3.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:329ce159e82018d646c7ac45b01a430369d526569ec08516081727a20e9e4af4", size = 102139 }, { url = "https://files.pythonhosted.org/packages/7f/c0/b913f8f02836ed9ab32ea643c6fe4d3325c3d8627cf6e78098671cafff86/charset_normalizer-3.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:b97e690a2118911e39b4042088092771b4ae3fc3aa86518f84b8cf6888dbdb41", size = 197867 }, { url = "https://files.pythonhosted.org/packages/0f/6c/2bee440303d705b6fb1e2ec789543edec83d32d258299b16eed28aad48e0/charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78baa6d91634dfb69ec52a463534bc0df05dbd546209b79a3880a34487f4b84f", size = 141385 }, { url = "https://files.pythonhosted.org/packages/3d/04/cb42585f07f6f9fd3219ffb6f37d5a39b4fd2db2355b23683060029c35f7/charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1a2bc9f351a75ef49d664206d51f8e5ede9da246602dc2d2726837620ea034b2", size = 151367 }, @@ -351,11 +275,9 @@ wheels = [ [[package]] name = "datafusion" -version = "44.0.0" source = { editable = "." 
} dependencies = [ - { name = "pyarrow", version = "17.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "pyarrow", version = "18.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "pyarrow" }, { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] @@ -369,20 +291,16 @@ dev = [ { name = "toml" }, ] docs = [ - { name = "ipython", version = "8.12.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "ipython", version = "8.18.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "ipython", version = "8.18.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "ipython", version = "8.31.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "jinja2" }, { name = "myst-parser", version = "3.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "myst-parser", version = "4.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "pandas", version = "2.0.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "pandas", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "pandas" }, { name = "pickleshare" }, { name = "pydata-sphinx-theme" }, - { name = "setuptools", version = "75.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "setuptools", version = "75.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "sphinx", version = "7.1.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "setuptools" }, + { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "sphinx-autoapi" }, ] @@ -435,28 +353,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl", hash = "sha256:bd5011788200372a32418f888e326a09ff80d0214bd961147cfed01b5c018eec", size = 9998 }, ] -[[package]] -name = "docutils" -version = "0.20.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/1f/53/a5da4f2c5739cf66290fac1431ee52aff6851c7c8ffd8264f13affd7bcdd/docutils-0.20.1.tar.gz", hash = "sha256:f08a4e276c3a1583a86dce3e34aba3fe04d02bba2dd51ed16106244e8a923e3b", size = 2058365 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/26/87/f238c0670b94533ac0353a4e2a1a771a0cc73277b88bff23d3ae35a256c1/docutils-0.20.1-py3-none-any.whl", hash = "sha256:96f387a2c5562db4476f09f13bbab2192e764cac08ebbf3a34a95d9b1e4a59d6", size = 572666 }, -] - [[package]] name = "docutils" version = "0.21.2" source = { registry = 
"https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/ae/ed/aefcc8cd0ba62a0560c3c18c33925362d46c6075480bfa4df87b28e169a9/docutils-0.21.2.tar.gz", hash = "sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f", size = 2204444 } wheels = [ { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408 }, @@ -503,8 +403,7 @@ name = "importlib-metadata" version = "8.5.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "zipp", version = "3.20.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "zipp", version = "3.21.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "zipp", marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/cd/12/33e59336dca5be0c398a7482335911a33aa0e20776128f038019f1a95f1b/importlib_metadata-8.5.0.tar.gz", hash = "sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7", size = 55304 } wheels = [ @@ -520,52 +419,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374", size = 5892 }, ] -[[package]] -name = "ipython" -version = "8.12.3" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "appnope", marker = "python_full_version < '3.9' and sys_platform == 'darwin'" }, - { name = "backcall", marker = "python_full_version < '3.9'" }, - { name = "colorama", marker = "python_full_version < '3.9' and sys_platform == 'win32'" }, - { name = "decorator", marker = "python_full_version < '3.9'" }, - { name = "jedi", marker = "python_full_version < '3.9'" }, - { name = "matplotlib-inline", marker = "python_full_version < '3.9'" }, - { name = "pexpect", marker = "python_full_version < '3.9' and sys_platform != 'win32'" }, - { name = "pickleshare", marker = "python_full_version < '3.9'" }, - { name = "prompt-toolkit", marker = "python_full_version < '3.9'" }, - { name = "pygments", marker = "python_full_version < '3.9'" }, - { name = "stack-data", marker = "python_full_version < '3.9'" }, - { name = "traitlets", marker = "python_full_version < '3.9'" }, - { name = "typing-extensions", marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/9e/6a/44ef299b1762f5a73841e87fae8a73a8cc8aee538d6dc8c77a5afe1fd2ce/ipython-8.12.3.tar.gz", hash = "sha256:3910c4b54543c2ad73d06579aa771041b7d5707b033bd488669b4cf544e3b363", size = 5470171 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8d/97/8fe103906cd81bc42d3b0175b5534a9f67dccae47d6451131cf8d0d70bb2/ipython-8.12.3-py3-none-any.whl", hash = "sha256:b0340d46a933d27c657b211a329d0be23793c36595acf9e6ef4164bc01a1804c", size = 798307 }, -] - [[package]] name = "ipython" version = "8.18.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", + 
"python_full_version < '3.10'", ] dependencies = [ - { name = "colorama", marker = "python_full_version == '3.9.*' and sys_platform == 'win32'" }, - { name = "decorator", marker = "python_full_version == '3.9.*'" }, - { name = "exceptiongroup", marker = "python_full_version == '3.9.*'" }, - { name = "jedi", marker = "python_full_version == '3.9.*'" }, - { name = "matplotlib-inline", marker = "python_full_version == '3.9.*'" }, - { name = "pexpect", marker = "python_full_version == '3.9.*' and sys_platform != 'win32'" }, - { name = "prompt-toolkit", marker = "python_full_version == '3.9.*'" }, - { name = "pygments", marker = "python_full_version == '3.9.*'" }, - { name = "stack-data", marker = "python_full_version == '3.9.*'" }, - { name = "traitlets", marker = "python_full_version == '3.9.*'" }, - { name = "typing-extensions", marker = "python_full_version == '3.9.*'" }, + { name = "colorama", marker = "python_full_version < '3.10' and sys_platform == 'win32'" }, + { name = "decorator", marker = "python_full_version < '3.10'" }, + { name = "exceptiongroup", marker = "python_full_version < '3.10'" }, + { name = "jedi", marker = "python_full_version < '3.10'" }, + { name = "matplotlib-inline", marker = "python_full_version < '3.10'" }, + { name = "pexpect", marker = "python_full_version < '3.10' and sys_platform != 'win32'" }, + { name = "prompt-toolkit", marker = "python_full_version < '3.10'" }, + { name = "pygments", marker = "python_full_version < '3.10'" }, + { name = "stack-data", marker = "python_full_version < '3.10'" }, + { name = "traitlets", marker = "python_full_version < '3.10'" }, + { name = "typing-extensions", marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/b1/b9/3ba6c45a6df813c09a48bac313c22ff83efa26cbb55011218d925a46e2ad/ipython-8.18.1.tar.gz", hash = "sha256:ca6f079bb33457c66e233e4580ebfc4128855b4cf6370dddd73842a9563e8a27", size = 5486330 } wheels = [ @@ -616,8 +488,7 @@ name = "jinja2" version = "3.1.5" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "markupsafe", version = "2.1.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "markupsafe", version = "3.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "markupsafe" }, ] sdist = { url = "https://files.pythonhosted.org/packages/af/92/b3130cbbf5591acf9ade8708c365f3238046ac7cb8ccba6e81abccb0ccff/jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb", size = 244674 } wheels = [ @@ -636,77 +507,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", size = 87528 }, ] -[[package]] -name = "markupsafe" -version = "2.1.5" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/87/5b/aae44c6655f3801e81aa3eef09dbbf012431987ba564d7231722f68df02d/MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b", size = 19384 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e4/54/ad5eb37bf9d51800010a74e4665425831a9db4e7c4e0fde4352e391e808e/MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_universal2.whl", hash = 
"sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc", size = 18206 }, - { url = "https://files.pythonhosted.org/packages/6a/4a/a4d49415e600bacae038c67f9fecc1d5433b9d3c71a4de6f33537b89654c/MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5", size = 14079 }, - { url = "https://files.pythonhosted.org/packages/0a/7b/85681ae3c33c385b10ac0f8dd025c30af83c78cec1c37a6aa3b55e67f5ec/MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46", size = 26620 }, - { url = "https://files.pythonhosted.org/packages/7c/52/2b1b570f6b8b803cef5ac28fdf78c0da318916c7d2fe9402a84d591b394c/MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f", size = 25818 }, - { url = "https://files.pythonhosted.org/packages/29/fe/a36ba8c7ca55621620b2d7c585313efd10729e63ef81e4e61f52330da781/MarkupSafe-2.1.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900", size = 25493 }, - { url = "https://files.pythonhosted.org/packages/60/ae/9c60231cdfda003434e8bd27282b1f4e197ad5a710c14bee8bea8a9ca4f0/MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff", size = 30630 }, - { url = "https://files.pythonhosted.org/packages/65/dc/1510be4d179869f5dafe071aecb3f1f41b45d37c02329dfba01ff59e5ac5/MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad", size = 29745 }, - { url = "https://files.pythonhosted.org/packages/30/39/8d845dd7d0b0613d86e0ef89549bfb5f61ed781f59af45fc96496e897f3a/MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd", size = 30021 }, - { url = "https://files.pythonhosted.org/packages/c7/5c/356a6f62e4f3c5fbf2602b4771376af22a3b16efa74eb8716fb4e328e01e/MarkupSafe-2.1.5-cp310-cp310-win32.whl", hash = "sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4", size = 16659 }, - { url = "https://files.pythonhosted.org/packages/69/48/acbf292615c65f0604a0c6fc402ce6d8c991276e16c80c46a8f758fbd30c/MarkupSafe-2.1.5-cp310-cp310-win_amd64.whl", hash = "sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5", size = 17213 }, - { url = "https://files.pythonhosted.org/packages/11/e7/291e55127bb2ae67c64d66cef01432b5933859dfb7d6949daa721b89d0b3/MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f", size = 18219 }, - { url = "https://files.pythonhosted.org/packages/6b/cb/aed7a284c00dfa7c0682d14df85ad4955a350a21d2e3b06d8240497359bf/MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2", size = 14098 }, - { url = "https://files.pythonhosted.org/packages/1c/cf/35fe557e53709e93feb65575c93927942087e9b97213eabc3fe9d5b25a55/MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced", size = 29014 }, - { url = 
"https://files.pythonhosted.org/packages/97/18/c30da5e7a0e7f4603abfc6780574131221d9148f323752c2755d48abad30/MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5", size = 28220 }, - { url = "https://files.pythonhosted.org/packages/0c/40/2e73e7d532d030b1e41180807a80d564eda53babaf04d65e15c1cf897e40/MarkupSafe-2.1.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c", size = 27756 }, - { url = "https://files.pythonhosted.org/packages/18/46/5dca760547e8c59c5311b332f70605d24c99d1303dd9a6e1fc3ed0d73561/MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f", size = 33988 }, - { url = "https://files.pythonhosted.org/packages/6d/c5/27febe918ac36397919cd4a67d5579cbbfa8da027fa1238af6285bb368ea/MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a", size = 32718 }, - { url = "https://files.pythonhosted.org/packages/f8/81/56e567126a2c2bc2684d6391332e357589a96a76cb9f8e5052d85cb0ead8/MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f", size = 33317 }, - { url = "https://files.pythonhosted.org/packages/00/0b/23f4b2470accb53285c613a3ab9ec19dc944eaf53592cb6d9e2af8aa24cc/MarkupSafe-2.1.5-cp311-cp311-win32.whl", hash = "sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906", size = 16670 }, - { url = "https://files.pythonhosted.org/packages/b7/a2/c78a06a9ec6d04b3445a949615c4c7ed86a0b2eb68e44e7541b9d57067cc/MarkupSafe-2.1.5-cp311-cp311-win_amd64.whl", hash = "sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617", size = 17224 }, - { url = "https://files.pythonhosted.org/packages/53/bd/583bf3e4c8d6a321938c13f49d44024dbe5ed63e0a7ba127e454a66da974/MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1", size = 18215 }, - { url = "https://files.pythonhosted.org/packages/48/d6/e7cd795fc710292c3af3a06d80868ce4b02bfbbf370b7cee11d282815a2a/MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4", size = 14069 }, - { url = "https://files.pythonhosted.org/packages/51/b5/5d8ec796e2a08fc814a2c7d2584b55f889a55cf17dd1a90f2beb70744e5c/MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee", size = 29452 }, - { url = "https://files.pythonhosted.org/packages/0a/0d/2454f072fae3b5a137c119abf15465d1771319dfe9e4acbb31722a0fff91/MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5", size = 28462 }, - { url = "https://files.pythonhosted.org/packages/2d/75/fd6cb2e68780f72d47e6671840ca517bda5ef663d30ada7616b0462ad1e3/MarkupSafe-2.1.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b", size = 27869 }, - { url = 
"https://files.pythonhosted.org/packages/b0/81/147c477391c2750e8fc7705829f7351cf1cd3be64406edcf900dc633feb2/MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a", size = 33906 }, - { url = "https://files.pythonhosted.org/packages/8b/ff/9a52b71839d7a256b563e85d11050e307121000dcebc97df120176b3ad93/MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f", size = 32296 }, - { url = "https://files.pythonhosted.org/packages/88/07/2dc76aa51b481eb96a4c3198894f38b480490e834479611a4053fbf08623/MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169", size = 33038 }, - { url = "https://files.pythonhosted.org/packages/96/0c/620c1fb3661858c0e37eb3cbffd8c6f732a67cd97296f725789679801b31/MarkupSafe-2.1.5-cp312-cp312-win32.whl", hash = "sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad", size = 16572 }, - { url = "https://files.pythonhosted.org/packages/3f/14/c3554d512d5f9100a95e737502f4a2323a1959f6d0d01e0d0997b35f7b10/MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl", hash = "sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb", size = 17127 }, - { url = "https://files.pythonhosted.org/packages/f8/ff/2c942a82c35a49df5de3a630ce0a8456ac2969691b230e530ac12314364c/MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a", size = 18192 }, - { url = "https://files.pythonhosted.org/packages/4f/14/6f294b9c4f969d0c801a4615e221c1e084722ea6114ab2114189c5b8cbe0/MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46", size = 14072 }, - { url = "https://files.pythonhosted.org/packages/81/d4/fd74714ed30a1dedd0b82427c02fa4deec64f173831ec716da11c51a50aa/MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532", size = 26928 }, - { url = "https://files.pythonhosted.org/packages/c7/bd/50319665ce81bb10e90d1cf76f9e1aa269ea6f7fa30ab4521f14d122a3df/MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab", size = 26106 }, - { url = "https://files.pythonhosted.org/packages/4c/6f/f2b0f675635b05f6afd5ea03c094557bdb8622fa8e673387444fe8d8e787/MarkupSafe-2.1.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68", size = 25781 }, - { url = "https://files.pythonhosted.org/packages/51/e0/393467cf899b34a9d3678e78961c2c8cdf49fb902a959ba54ece01273fb1/MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0", size = 30518 }, - { url = "https://files.pythonhosted.org/packages/f6/02/5437e2ad33047290dafced9df741d9efc3e716b75583bbd73a9984f1b6f7/MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4", size = 29669 }, - { url = "https://files.pythonhosted.org/packages/0e/7d/968284145ffd9d726183ed6237c77938c021abacde4e073020f920e060b2/MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = 
"sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3", size = 29933 }, - { url = "https://files.pythonhosted.org/packages/bf/f3/ecb00fc8ab02b7beae8699f34db9357ae49d9f21d4d3de6f305f34fa949e/MarkupSafe-2.1.5-cp38-cp38-win32.whl", hash = "sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff", size = 16656 }, - { url = "https://files.pythonhosted.org/packages/92/21/357205f03514a49b293e214ac39de01fadd0970a6e05e4bf1ddd0ffd0881/MarkupSafe-2.1.5-cp38-cp38-win_amd64.whl", hash = "sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029", size = 17206 }, - { url = "https://files.pythonhosted.org/packages/0f/31/780bb297db036ba7b7bbede5e1d7f1e14d704ad4beb3ce53fb495d22bc62/MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf", size = 18193 }, - { url = "https://files.pythonhosted.org/packages/6c/77/d77701bbef72892affe060cdacb7a2ed7fd68dae3b477a8642f15ad3b132/MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2", size = 14073 }, - { url = "https://files.pythonhosted.org/packages/d9/a7/1e558b4f78454c8a3a0199292d96159eb4d091f983bc35ef258314fe7269/MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8", size = 26486 }, - { url = "https://files.pythonhosted.org/packages/5f/5a/360da85076688755ea0cceb92472923086993e86b5613bbae9fbc14136b0/MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3", size = 25685 }, - { url = "https://files.pythonhosted.org/packages/6a/18/ae5a258e3401f9b8312f92b028c54d7026a97ec3ab20bfaddbdfa7d8cce8/MarkupSafe-2.1.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465", size = 25338 }, - { url = "https://files.pythonhosted.org/packages/0b/cc/48206bd61c5b9d0129f4d75243b156929b04c94c09041321456fd06a876d/MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e", size = 30439 }, - { url = "https://files.pythonhosted.org/packages/d1/06/a41c112ab9ffdeeb5f77bc3e331fdadf97fa65e52e44ba31880f4e7f983c/MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea", size = 29531 }, - { url = "https://files.pythonhosted.org/packages/02/8c/ab9a463301a50dab04d5472e998acbd4080597abc048166ded5c7aa768c8/MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6", size = 29823 }, - { url = "https://files.pythonhosted.org/packages/bc/29/9bc18da763496b055d8e98ce476c8e718dcfd78157e17f555ce6dd7d0895/MarkupSafe-2.1.5-cp39-cp39-win32.whl", hash = "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf", size = 16658 }, - { url = "https://files.pythonhosted.org/packages/f6/f8/4da07de16f10551ca1f640c92b5f316f9394088b183c6a57183df6de5ae4/MarkupSafe-2.1.5-cp39-cp39-win_amd64.whl", hash = "sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5", size = 17211 }, -] - [[package]] name = "markupsafe" version = "3.0.2" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - 
"python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/b2/97/5d42485e71dfc078108a86d6de8fa46db44a1a9295e89c5d6d4a06e23a62/markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0", size = 20537 } wheels = [ { url = "https://files.pythonhosted.org/packages/04/90/d08277ce111dd22f77149fd1a5d4653eeb3b3eaacbdfcbae5afb2600eebd/MarkupSafe-3.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8", size = 14357 }, @@ -832,18 +636,15 @@ name = "myst-parser" version = "3.0.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", - "python_full_version < '3.9'", + "python_full_version < '3.10'", ] dependencies = [ - { name = "docutils", version = "0.20.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "docutils", marker = "python_full_version < '3.10'" }, { name = "jinja2", marker = "python_full_version < '3.10'" }, { name = "markdown-it-py", marker = "python_full_version < '3.10'" }, { name = "mdit-py-plugins", marker = "python_full_version < '3.10'" }, { name = "pyyaml", marker = "python_full_version < '3.10'" }, - { name = "sphinx", version = "7.1.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/49/64/e2f13dac02f599980798c01156393b781aec983b52a6e4057ee58f07c43a/myst_parser-3.0.1.tar.gz", hash = "sha256:88f0cb406cb363b077d176b51c476f62d60604d68a8dcdf4832e080441301a87", size = 92392 } wheels = [ @@ -860,7 +661,7 @@ resolution-markers = [ "python_full_version == '3.10.*'", ] dependencies = [ - { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "docutils", marker = "python_full_version >= '3.10'" }, { name = "jinja2", marker = "python_full_version >= '3.10'" }, { name = "markdown-it-py", marker = "python_full_version >= '3.10'" }, { name = "mdit-py-plugins", marker = "python_full_version >= '3.10'" }, @@ -872,50 +673,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ca/b4/b036f8fdb667587bb37df29dc6644681dd78b7a2a6321a34684b79412b28/myst_parser-4.0.0-py3-none-any.whl", hash = "sha256:b9317997552424448c6096c2558872fdb6f81d3ecb3a40ce84a7518798f3f28d", size = 84563 }, ] -[[package]] -name = "numpy" -version = "1.24.4" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/a4/9b/027bec52c633f6556dba6b722d9a0befb40498b9ceddd29cbe67a45a127c/numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463", size = 10911229 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6b/80/6cdfb3e275d95155a34659163b83c09e3a3ff9f1456880bec6cc63d71083/numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64", size = 19789140 }, - { url = "https://files.pythonhosted.org/packages/64/5f/3f01d753e2175cfade1013eea08db99ba1ee4bdb147ebcf3623b75d12aa7/numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1", size = 13854297 }, - { url = "https://files.pythonhosted.org/packages/5a/b3/2f9c21d799fa07053ffa151faccdceeb69beec5a010576b8991f614021f7/numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4", size = 13995611 }, - { url = "https://files.pythonhosted.org/packages/10/be/ae5bf4737cb79ba437879915791f6f26d92583c738d7d960ad94e5c36adf/numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ffe43c74893dbf38c2b0a1f5428760a1a9c98285553c89e12d70a96a7f3a4d6", size = 17282357 }, - { url = "https://files.pythonhosted.org/packages/c0/64/908c1087be6285f40e4b3e79454552a701664a079321cff519d8c7051d06/numpy-1.24.4-cp310-cp310-win32.whl", hash = "sha256:4c21decb6ea94057331e111a5bed9a79d335658c27ce2adb580fb4d54f2ad9bc", size = 12429222 }, - { url = "https://files.pythonhosted.org/packages/22/55/3d5a7c1142e0d9329ad27cece17933b0e2ab4e54ddc5c1861fbfeb3f7693/numpy-1.24.4-cp310-cp310-win_amd64.whl", hash = "sha256:b4bea75e47d9586d31e892a7401f76e909712a0fd510f58f5337bea9572c571e", size = 14841514 }, - { url = "https://files.pythonhosted.org/packages/a9/cc/5ed2280a27e5dab12994c884f1f4d8c3bd4d885d02ae9e52a9d213a6a5e2/numpy-1.24.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f136bab9c2cfd8da131132c2cf6cc27331dd6fae65f95f69dcd4ae3c3639c810", size = 19775508 }, - { url = "https://files.pythonhosted.org/packages/c0/bc/77635c657a3668cf652806210b8662e1aff84b818a55ba88257abf6637a8/numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2926dac25b313635e4d6cf4dc4e51c8c0ebfed60b801c799ffc4c32bf3d1254", size = 13840033 }, - { url = "https://files.pythonhosted.org/packages/a7/4c/96cdaa34f54c05e97c1c50f39f98d608f96f0677a6589e64e53104e22904/numpy-1.24.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:222e40d0e2548690405b0b3c7b21d1169117391c2e82c378467ef9ab4c8f0da7", size = 13991951 }, - { url = "https://files.pythonhosted.org/packages/22/97/dfb1a31bb46686f09e68ea6ac5c63fdee0d22d7b23b8f3f7ea07712869ef/numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7215847ce88a85ce39baf9e89070cb860c98fdddacbaa6c0da3ffb31b3350bd5", size = 17278923 }, - { url = "https://files.pythonhosted.org/packages/35/e2/76a11e54139654a324d107da1d98f99e7aa2a7ef97cfd7c631fba7dbde71/numpy-1.24.4-cp311-cp311-win32.whl", hash = "sha256:4979217d7de511a8d57f4b4b5b2b965f707768440c17cb70fbf254c4b225238d", size = 12422446 }, - { url = "https://files.pythonhosted.org/packages/d8/ec/ebef2f7d7c28503f958f0f8b992e7ce606fb74f9e891199329d5f5f87404/numpy-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:b7b1fc9864d7d39e28f41d089bfd6353cb5f27ecd9905348c24187a768c79694", size = 14834466 }, - { url = "https://files.pythonhosted.org/packages/11/10/943cfb579f1a02909ff96464c69893b1d25be3731b5d3652c2e0cf1281ea/numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1452241c290f3e2a312c137a9999cdbf63f78864d63c79039bda65ee86943f61", size = 19780722 }, - { url = "https://files.pythonhosted.org/packages/a7/ae/f53b7b265fdc701e663fbb322a8e9d4b14d9cb7b2385f45ddfabfc4327e4/numpy-1.24.4-cp38-cp38-macosx_11_0_arm64.whl", hash = 
"sha256:04640dab83f7c6c85abf9cd729c5b65f1ebd0ccf9de90b270cd61935eef0197f", size = 13843102 }, - { url = "https://files.pythonhosted.org/packages/25/6f/2586a50ad72e8dbb1d8381f837008a0321a3516dfd7cb57fc8cf7e4bb06b/numpy-1.24.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5425b114831d1e77e4b5d812b69d11d962e104095a5b9c3b641a218abcc050e", size = 14039616 }, - { url = "https://files.pythonhosted.org/packages/98/5d/5738903efe0ecb73e51eb44feafba32bdba2081263d40c5043568ff60faf/numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd80e219fd4c71fc3699fc1dadac5dcf4fd882bfc6f7ec53d30fa197b8ee22dc", size = 17316263 }, - { url = "https://files.pythonhosted.org/packages/d1/57/8d328f0b91c733aa9aa7ee540dbc49b58796c862b4fbcb1146c701e888da/numpy-1.24.4-cp38-cp38-win32.whl", hash = "sha256:4602244f345453db537be5314d3983dbf5834a9701b7723ec28923e2889e0bb2", size = 12455660 }, - { url = "https://files.pythonhosted.org/packages/69/65/0d47953afa0ad569d12de5f65d964321c208492064c38fe3b0b9744f8d44/numpy-1.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:692f2e0f55794943c5bfff12b3f56f99af76f902fc47487bdfe97856de51a706", size = 14868112 }, - { url = "https://files.pythonhosted.org/packages/9a/cd/d5b0402b801c8a8b56b04c1e85c6165efab298d2f0ab741c2406516ede3a/numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2541312fbf09977f3b3ad449c4e5f4bb55d0dbf79226d7724211acc905049400", size = 19816549 }, - { url = "https://files.pythonhosted.org/packages/14/27/638aaa446f39113a3ed38b37a66243e21b38110d021bfcb940c383e120f2/numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9667575fb6d13c95f1b36aca12c5ee3356bf001b714fc354eb5465ce1609e62f", size = 13879950 }, - { url = "https://files.pythonhosted.org/packages/8f/27/91894916e50627476cff1a4e4363ab6179d01077d71b9afed41d9e1f18bf/numpy-1.24.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a86ed21e4f87050382c7bc96571755193c4c1392490744ac73d660e8f564a9", size = 14030228 }, - { url = "https://files.pythonhosted.org/packages/7a/7c/d7b2a0417af6428440c0ad7cb9799073e507b1a465f827d058b826236964/numpy-1.24.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d11efb4dbecbdf22508d55e48d9c8384db795e1b7b51ea735289ff96613ff74d", size = 17311170 }, - { url = "https://files.pythonhosted.org/packages/18/9d/e02ace5d7dfccee796c37b995c63322674daf88ae2f4a4724c5dd0afcc91/numpy-1.24.4-cp39-cp39-win32.whl", hash = "sha256:6620c0acd41dbcb368610bb2f4d83145674040025e5536954782467100aa8835", size = 12454918 }, - { url = "https://files.pythonhosted.org/packages/63/38/6cc19d6b8bfa1d1a459daf2b3fe325453153ca7019976274b6f33d8b5663/numpy-1.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:befe2bf740fd8373cf56149a5c23a0f601e82869598d41f8e188a0e9869926f8", size = 14867441 }, - { url = "https://files.pythonhosted.org/packages/a4/fd/8dff40e25e937c94257455c237b9b6bf5a30d42dd1cc11555533be099492/numpy-1.24.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:31f13e25b4e304632a4619d0e0777662c2ffea99fcae2029556b17d8ff958aef", size = 19156590 }, - { url = "https://files.pythonhosted.org/packages/42/e7/4bf953c6e05df90c6d351af69966384fed8e988d0e8c54dad7103b59f3ba/numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a", size = 16705744 }, - { url = "https://files.pythonhosted.org/packages/fc/dd/9106005eb477d022b60b3817ed5937a43dad8fd1f20b0610ea8a32fcb407/numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", 
hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2", size = 14734290 }, -] - [[package]] name = "numpy" version = "2.0.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] sdist = { url = "https://files.pythonhosted.org/packages/a9/75/10dd1f8116a8b796cb2c737b674e02d02e80454bda953fa7e65d8c12b016/numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78", size = 18902015 } wheels = [ @@ -1041,63 +804,16 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759", size = 65451 }, ] -[[package]] -name = "pandas" -version = "2.0.3" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "numpy", version = "1.24.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "python-dateutil", marker = "python_full_version < '3.9'" }, - { name = "pytz", marker = "python_full_version < '3.9'" }, - { name = "tzdata", marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b1/a7/824332581e258b5aa4f3763ecb2a797e5f9a54269044ba2e50ac19936b32/pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c", size = 5284455 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3c/b2/0d4a5729ce1ce11630c4fc5d5522a33b967b3ca146c210f58efde7c40e99/pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8", size = 11760908 }, - { url = "https://files.pythonhosted.org/packages/4a/f6/f620ca62365d83e663a255a41b08d2fc2eaf304e0b8b21bb6d62a7390fe3/pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f", size = 10823486 }, - { url = "https://files.pythonhosted.org/packages/c2/59/cb4234bc9b968c57e81861b306b10cd8170272c57b098b724d3de5eda124/pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183", size = 11571897 }, - { url = "https://files.pythonhosted.org/packages/e3/59/35a2892bf09ded9c1bf3804461efe772836a5261ef5dfb4e264ce813ff99/pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0", size = 12306421 }, - { url = "https://files.pythonhosted.org/packages/94/71/3a0c25433c54bb29b48e3155b959ac78f4c4f2f06f94d8318aac612cb80f/pandas-2.0.3-cp310-cp310-win32.whl", hash = "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210", size = 9540792 }, - { url = "https://files.pythonhosted.org/packages/ed/30/b97456e7063edac0e5a405128065f0cd2033adfe3716fb2256c186bd41d0/pandas-2.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e", size = 10664333 }, - { url = "https://files.pythonhosted.org/packages/b3/92/a5e5133421b49e901a12e02a6a7ef3a0130e10d13db8cb657fdd0cba3b90/pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8", size = 11645672 }, - { url = 
"https://files.pythonhosted.org/packages/8f/bb/aea1fbeed5b474cb8634364718abe9030d7cc7a30bf51f40bd494bbc89a2/pandas-2.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26", size = 10693229 }, - { url = "https://files.pythonhosted.org/packages/d6/90/e7d387f1a416b14e59290baa7a454a90d719baebbf77433ff1bdcc727800/pandas-2.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d", size = 11581591 }, - { url = "https://files.pythonhosted.org/packages/d0/28/88b81881c056376254618fad622a5e94b5126db8c61157ea1910cd1c040a/pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df", size = 12219370 }, - { url = "https://files.pythonhosted.org/packages/e4/a5/212b9039e25bf8ebb97e417a96660e3dc925dacd3f8653d531b8f7fd9be4/pandas-2.0.3-cp311-cp311-win32.whl", hash = "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd", size = 9482935 }, - { url = "https://files.pythonhosted.org/packages/9e/71/756a1be6bee0209d8c0d8c5e3b9fc72c00373f384a4017095ec404aec3ad/pandas-2.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b", size = 10607692 }, - { url = "https://files.pythonhosted.org/packages/78/a8/07dd10f90ca915ed914853cd57f79bfc22e1ef4384ab56cb4336d2fc1f2a/pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061", size = 11653303 }, - { url = "https://files.pythonhosted.org/packages/53/c3/f8e87361f7fdf42012def602bfa2a593423c729f5cb7c97aed7f51be66ac/pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5", size = 10710932 }, - { url = "https://files.pythonhosted.org/packages/a7/87/828d50c81ce0f434163bf70b925a0eec6076808e0bca312a79322b141f66/pandas-2.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089", size = 11684018 }, - { url = "https://files.pythonhosted.org/packages/f8/7f/5b047effafbdd34e52c9e2d7e44f729a0655efafb22198c45cf692cdc157/pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0", size = 12353723 }, - { url = "https://files.pythonhosted.org/packages/ea/ae/26a2eda7fa581347d69e51f93892493b2074ef3352ac71033c9f32c52389/pandas-2.0.3-cp38-cp38-win32.whl", hash = "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02", size = 9646403 }, - { url = "https://files.pythonhosted.org/packages/c3/6c/ea362eef61f05553aaf1a24b3e96b2d0603f5dc71a3bd35688a24ed88843/pandas-2.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78", size = 10777638 }, - { url = "https://files.pythonhosted.org/packages/f8/c7/cfef920b7b457dff6928e824896cb82367650ea127d048ee0b820026db4f/pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b", size = 11834160 }, - { url = "https://files.pythonhosted.org/packages/6c/1c/689c9d99bc4e5d366a5fd871f0bcdee98a6581e240f96b78d2d08f103774/pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e", size = 10862752 }, - { url = 
"https://files.pythonhosted.org/packages/cc/b8/4d082f41c27c95bf90485d1447b647cc7e5680fea75e315669dc6e4cb398/pandas-2.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b", size = 11715852 }, - { url = "https://files.pythonhosted.org/packages/9e/0d/91a9fd2c202f2b1d97a38ab591890f86480ecbb596cbc56d035f6f23fdcc/pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641", size = 12398496 }, - { url = "https://files.pythonhosted.org/packages/26/7d/d8aa0a2c4f3f5f8ea59fb946c8eafe8f508090ca73e2b08a9af853c1103e/pandas-2.0.3-cp39-cp39-win32.whl", hash = "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682", size = 9630766 }, - { url = "https://files.pythonhosted.org/packages/9a/f2/0ad053856debbe90c83de1b4f05915f85fd2146f20faf9daa3b320d36df3/pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc", size = 10755902 }, -] - [[package]] name = "pandas" version = "2.2.3" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] dependencies = [ - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "python-dateutil", marker = "python_full_version >= '3.9'" }, - { name = "pytz", marker = "python_full_version >= '3.9'" }, - { name = "tzdata", marker = "python_full_version >= '3.9'" }, + { name = "python-dateutil" }, + { name = "pytz" }, + { name = "tzdata" }, ] sdist = { url = "https://files.pythonhosted.org/packages/9c/d6/9f8431bacc2e19dca897724cd097b1bb224a6ad5433784a44b587c7c13af/pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667", size = 4399213 } wheels = [ @@ -1213,65 +929,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842 }, ] -[[package]] -name = "pyarrow" -version = "17.0.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "numpy", version = "1.24.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/27/4e/ea6d43f324169f8aec0e57569443a38bab4b398d09769ca64f7b4d467de3/pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28", size = 1112479 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/39/5d/78d4b040bc5ff2fc6c3d03e80fca396b742f6c125b8af06bcf7427f931bc/pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07", size = 28994846 }, - { url = 
"https://files.pythonhosted.org/packages/3b/73/8ed168db7642e91180330e4ea9f3ff8bab404678f00d32d7df0871a4933b/pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655", size = 27165908 }, - { url = "https://files.pythonhosted.org/packages/81/36/e78c24be99242063f6d0590ef68c857ea07bdea470242c361e9a15bd57a4/pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545", size = 39264209 }, - { url = "https://files.pythonhosted.org/packages/18/4c/3db637d7578f683b0a8fb8999b436bdbedd6e3517bd4f90c70853cf3ad20/pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2", size = 39862883 }, - { url = "https://files.pythonhosted.org/packages/81/3c/0580626896c842614a523e66b351181ed5bb14e5dfc263cd68cea2c46d90/pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8", size = 38723009 }, - { url = "https://files.pythonhosted.org/packages/ee/fb/c1b47f0ada36d856a352da261a44d7344d8f22e2f7db3945f8c3b81be5dd/pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047", size = 39855626 }, - { url = "https://files.pythonhosted.org/packages/19/09/b0a02908180a25d57312ab5919069c39fddf30602568980419f4b02393f6/pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087", size = 25147242 }, - { url = "https://files.pythonhosted.org/packages/f9/46/ce89f87c2936f5bb9d879473b9663ce7a4b1f4359acc2f0eb39865eaa1af/pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977", size = 29028748 }, - { url = "https://files.pythonhosted.org/packages/8d/8e/ce2e9b2146de422f6638333c01903140e9ada244a2a477918a368306c64c/pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3", size = 27190965 }, - { url = "https://files.pythonhosted.org/packages/3b/c8/5675719570eb1acd809481c6d64e2136ffb340bc387f4ca62dce79516cea/pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15", size = 39269081 }, - { url = "https://files.pythonhosted.org/packages/5e/78/3931194f16ab681ebb87ad252e7b8d2c8b23dad49706cadc865dff4a1dd3/pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597", size = 39864921 }, - { url = "https://files.pythonhosted.org/packages/d8/81/69b6606093363f55a2a574c018901c40952d4e902e670656d18213c71ad7/pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420", size = 38740798 }, - { url = "https://files.pythonhosted.org/packages/4c/21/9ca93b84b92ef927814cb7ba37f0774a484c849d58f0b692b16af8eebcfb/pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4", size = 39871877 }, - { url = "https://files.pythonhosted.org/packages/30/d1/63a7c248432c71c7d3ee803e706590a0b81ce1a8d2b2ae49677774b813bb/pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = 
"sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03", size = 25151089 }, - { url = "https://files.pythonhosted.org/packages/d4/62/ce6ac1275a432b4a27c55fe96c58147f111d8ba1ad800a112d31859fae2f/pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22", size = 29019418 }, - { url = "https://files.pythonhosted.org/packages/8e/0a/dbd0c134e7a0c30bea439675cc120012337202e5fac7163ba839aa3691d2/pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053", size = 27152197 }, - { url = "https://files.pythonhosted.org/packages/cb/05/3f4a16498349db79090767620d6dc23c1ec0c658a668d61d76b87706c65d/pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a", size = 39263026 }, - { url = "https://files.pythonhosted.org/packages/c2/0c/ea2107236740be8fa0e0d4a293a095c9f43546a2465bb7df34eee9126b09/pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc", size = 39880798 }, - { url = "https://files.pythonhosted.org/packages/f6/b0/b9164a8bc495083c10c281cc65064553ec87b7537d6f742a89d5953a2a3e/pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a", size = 38715172 }, - { url = "https://files.pythonhosted.org/packages/f1/c4/9625418a1413005e486c006e56675334929fad864347c5ae7c1b2e7fe639/pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b", size = 39874508 }, - { url = "https://files.pythonhosted.org/packages/ae/49/baafe2a964f663413be3bd1cf5c45ed98c5e42e804e2328e18f4570027c1/pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7", size = 25099235 }, - { url = "https://files.pythonhosted.org/packages/8d/bd/8f52c1d7b430260f80a349cffa2df351750a737b5336313d56dcadeb9ae1/pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204", size = 28999345 }, - { url = "https://files.pythonhosted.org/packages/64/d9/51e35550f2f18b8815a2ab25948f735434db32000c0e91eba3a32634782a/pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8", size = 27168441 }, - { url = "https://files.pythonhosted.org/packages/18/d8/7161d87d07ea51be70c49f615004c1446d5723622a18b2681f7e4b71bf6e/pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155", size = 39363163 }, - { url = "https://files.pythonhosted.org/packages/3f/08/bc497130789833de09e345e3ce4647e3ce86517c4f70f2144f0367ca378b/pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145", size = 39965253 }, - { url = "https://files.pythonhosted.org/packages/d3/2e/493dd7db889402b4c7871ca7dfdd20f2c5deedbff802d3eb8576359930f9/pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c", size = 38805378 }, - { url = 
"https://files.pythonhosted.org/packages/e6/c1/4c6bcdf7a820034aa91a8b4d25fef38809be79b42ca7aaa16d4680b0bbac/pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c", size = 39958364 }, - { url = "https://files.pythonhosted.org/packages/d1/db/42ac644453cfdfc60fe002b46d647fe7a6dfad753ef7b28e99b4c936ad5d/pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca", size = 25229211 }, - { url = "https://files.pythonhosted.org/packages/43/e0/a898096d35be240aa61fb2d54db58b86d664b10e1e51256f9300f47565e8/pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb", size = 29007881 }, - { url = "https://files.pythonhosted.org/packages/59/22/f7d14907ed0697b5dd488d393129f2738629fa5bcba863e00931b7975946/pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df", size = 27178117 }, - { url = "https://files.pythonhosted.org/packages/bf/ee/661211feac0ed48467b1d5c57298c91403809ec3ab78b1d175e1d6ad03cf/pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687", size = 39273896 }, - { url = "https://files.pythonhosted.org/packages/af/61/bcd9b58e38ead6ad42b9ed00da33a3f862bc1d445e3d3164799c25550ac2/pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b", size = 39875438 }, - { url = "https://files.pythonhosted.org/packages/75/63/29d1bfcc57af73cde3fc3baccab2f37548de512dbe0ab294b033cd203516/pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5", size = 38735092 }, - { url = "https://files.pythonhosted.org/packages/39/f4/90258b4de753df7cc61cefb0312f8abcf226672e96cc64996e66afce817a/pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda", size = 39867610 }, - { url = "https://files.pythonhosted.org/packages/e7/f6/b75d4816c32f1618ed31a005ee635dd1d91d8164495d94f2ea092f594661/pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204", size = 25148611 }, -] - [[package]] name = "pyarrow" version = "18.1.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/7f/7b/640785a9062bb00314caa8a387abce547d2a420cf09bd6c715fe659ccffb/pyarrow-18.1.0.tar.gz", hash = "sha256:9386d3ca9c145b5539a1cfc75df07757dff870168c959b473a0bccbc3abc8c73", size = 1118671 } wheels = [ { url = "https://files.pythonhosted.org/packages/1a/bb/8d4a1573f66e0684f190dd2b55fd0b97a7214de8882d58a3867e777bf640/pyarrow-18.1.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e21488d5cfd3d8b500b3238a6c4b075efabc18f0f6d80b29239737ebd69caa6c", size = 29531620 }, @@ -1332,10 +993,8 @@ version = "0.8.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "beautifulsoup4" }, - { name = "docutils", version = "0.20.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "docutils", 
version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "sphinx", version = "7.1.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "docutils" }, + { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/fc/d6/3921de802cf1ee771f0e76c9068b52498aeb8eeec6b830ff931c81c7ecf3/pydata_sphinx_theme-0.8.0.tar.gz", hash = "sha256:9f72015d9c572ea92e3007ab221a8325767c426783b6b9941813e65fa988dc90", size = 1123746 } @@ -1349,13 +1008,11 @@ version = "2.5.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "deprecated" }, - { name = "pyjwt", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, extra = ["crypto"], marker = "python_full_version < '3.9'" }, - { name = "pyjwt", version = "2.10.1", source = { registry = "https://pypi.org/simple" }, extra = ["crypto"], marker = "python_full_version >= '3.9'" }, + { name = "pyjwt", extra = ["crypto"] }, { name = "pynacl" }, { name = "requests" }, { name = "typing-extensions" }, - { name = "urllib3", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "urllib3", version = "2.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "urllib3" }, ] sdist = { url = "https://files.pythonhosted.org/packages/16/ce/aa91d30040d9552c274e7ea8bd10a977600d508d579a4bb262b95eccf961/pygithub-2.5.0.tar.gz", hash = "sha256:e1613ac508a9be710920d26eb18b1905ebd9926aa49398e88151c1b526aad3cf", size = 3552804 } wheels = [ @@ -1371,33 +1028,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 }, ] -[[package]] -name = "pyjwt" -version = "2.9.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/fb/68/ce067f09fca4abeca8771fe667d89cc347d1e99da3e093112ac329c6020e/pyjwt-2.9.0.tar.gz", hash = "sha256:7e1e5b56cc735432a7369cbfa0efe50fa113ebecdc04ae6922deba8b84582d0c", size = 78825 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/79/84/0fdf9b18ba31d69877bd39c9cd6052b47f3761e9910c15de788e519f079f/PyJWT-2.9.0-py3-none-any.whl", hash = "sha256:3b02fb0f44517787776cf48f2ae25d8e14f300e6d7545a4315cee571a415e850", size = 22344 }, -] - -[package.optional-dependencies] -crypto = [ - { name = "cryptography", marker = "python_full_version < '3.9'" }, -] - [[package]] name = "pyjwt" version = "2.10.1" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/e7/46/bd74733ff231675599650d3e47f361794b22ef3e3770998dda30d3b63726/pyjwt-2.10.1.tar.gz", hash = 
"sha256:3cc5772eb20009233caf06e9d8a0577824723b44e6648ee0a2aedb6cf9381953", size = 87785 } wheels = [ { url = "https://files.pythonhosted.org/packages/61/ad/689f02752eeec26aed679477e80e632ef1b682313be70793d798c1d5fc8f/PyJWT-2.10.1-py3-none-any.whl", hash = "sha256:dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb", size = 22997 }, @@ -1405,7 +1039,7 @@ wheels = [ [package.optional-dependencies] crypto = [ - { name = "cryptography", marker = "python_full_version >= '3.9'" }, + { name = "cryptography" }, ] [[package]] @@ -1508,13 +1142,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fe/0f/25911a9f080464c59fab9027482f822b86bf0608957a5fcc6eaac85aa515/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652", size = 751597 }, { url = "https://files.pythonhosted.org/packages/14/0d/e2c3b43bbce3cf6bd97c840b46088a3031085179e596d4929729d8d68270/PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183", size = 140527 }, { url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446 }, - { url = "https://files.pythonhosted.org/packages/74/d9/323a59d506f12f498c2097488d80d16f4cf965cee1791eab58b56b19f47a/PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a", size = 183218 }, - { url = "https://files.pythonhosted.org/packages/74/cc/20c34d00f04d785f2028737e2e2a8254e1425102e730fee1d6396f832577/PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5", size = 728067 }, - { url = "https://files.pythonhosted.org/packages/20/52/551c69ca1501d21c0de51ddafa8c23a0191ef296ff098e98358f69080577/PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d", size = 757812 }, - { url = "https://files.pythonhosted.org/packages/fd/7f/2c3697bba5d4aa5cc2afe81826d73dfae5f049458e44732c7a0938baa673/PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083", size = 746531 }, - { url = "https://files.pythonhosted.org/packages/8c/ab/6226d3df99900e580091bb44258fde77a8433511a86883bd4681ea19a858/PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706", size = 800820 }, - { url = "https://files.pythonhosted.org/packages/a0/99/a9eb0f3e710c06c5d922026f6736e920d431812ace24aae38228d0d64b04/PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a", size = 145514 }, - { url = "https://files.pythonhosted.org/packages/75/8a/ee831ad5fafa4431099aa4e078d4c8efd43cd5e48fbc774641d233b683a9/PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff", size = 162702 }, { url = "https://files.pythonhosted.org/packages/65/d8/b7a1db13636d7fb7d4ff431593c510c8b8fca920ade06ca8ef20015493c5/PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d", size = 184777 }, { url = 
"https://files.pythonhosted.org/packages/0a/02/6ec546cd45143fdf9840b2c6be8d875116a64076218b61d68e12548e5839/PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f", size = 172318 }, { url = "https://files.pythonhosted.org/packages/0e/9a/8cc68be846c972bda34f6c2a93abb644fb2476f4dcc924d52175786932c9/PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290", size = 720891 }, @@ -1534,8 +1161,7 @@ dependencies = [ { name = "certifi" }, { name = "charset-normalizer" }, { name = "idna" }, - { name = "urllib3", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "urllib3", version = "2.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "urllib3" }, ] sdist = { url = "https://files.pythonhosted.org/packages/63/70/2bf7780ad2d390a8d301ad0b550f1581eadbd9a20f896afe06353c2a2913/requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760", size = 131218 } wheels = [ @@ -1567,28 +1193,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b2/94/0498cdb7316ed67a1928300dd87d659c933479f44dec51b4f62bfd1f8028/ruff-0.9.1-py3-none-win_arm64.whl", hash = "sha256:1cd76c7f9c679e6e8f2af8f778367dca82b95009bc7b1a85a47f1521ae524fa7", size = 9145708 }, ] -[[package]] -name = "setuptools" -version = "75.3.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/ed/22/a438e0caa4576f8c383fa4d35f1cc01655a46c75be358960d815bfbb12bd/setuptools-75.3.0.tar.gz", hash = "sha256:fba5dd4d766e97be1b1681d98712680ae8f2f26d7881245f2ce9e40714f1a686", size = 1351577 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/90/12/282ee9bce8b58130cb762fbc9beabd531549952cac11fc56add11dcb7ea0/setuptools-75.3.0-py3-none-any.whl", hash = "sha256:f2504966861356aa38616760c0f66568e535562374995367b4e69c7143cf6bcd", size = 1251070 }, -] - [[package]] name = "setuptools" version = "75.8.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/92/ec/089608b791d210aec4e7f97488e67ab0d33add3efccb83a056cbafe3a2a6/setuptools-75.8.0.tar.gz", hash = "sha256:c5afc8f407c626b8313a86e10311dd3f661c6cd9c09d4bf8c15c0e11f9f2b0e6", size = 1343222 } wheels = [ { url = "https://files.pythonhosted.org/packages/69/8a/b9dc7678803429e4a3bc9ba462fa3dd9066824d3c607490235c6a796be5a/setuptools-75.8.0-py3-none-any.whl", hash = "sha256:e3982f444617239225d675215d51f6ba05f845d4eec313da4418fdbb56fb27e3", size = 1228782 }, @@ -1621,63 +1229,32 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/c2/fe97d779f3ef3b15f05c94a2f1e3d21732574ed441687474db9d342a7315/soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9", size = 36186 }, ] -[[package]] -name = "sphinx" -version = "7.1.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "alabaster", version = "0.7.13", source = { registry = "https://pypi.org/simple" }, marker = 
"python_full_version < '3.9'" }, - { name = "babel", marker = "python_full_version < '3.9'" }, - { name = "colorama", marker = "python_full_version < '3.9' and sys_platform == 'win32'" }, - { name = "docutils", version = "0.20.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "imagesize", marker = "python_full_version < '3.9'" }, - { name = "importlib-metadata", marker = "python_full_version < '3.9'" }, - { name = "jinja2", marker = "python_full_version < '3.9'" }, - { name = "packaging", marker = "python_full_version < '3.9'" }, - { name = "pygments", marker = "python_full_version < '3.9'" }, - { name = "requests", marker = "python_full_version < '3.9'" }, - { name = "snowballstemmer", marker = "python_full_version < '3.9'" }, - { name = "sphinxcontrib-applehelp", version = "1.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinxcontrib-devhelp", version = "1.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinxcontrib-htmlhelp", version = "2.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinxcontrib-jsmath", marker = "python_full_version < '3.9'" }, - { name = "sphinxcontrib-qthelp", version = "1.0.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinxcontrib-serializinghtml", version = "1.1.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/dc/01/688bdf9282241dca09fe6e3a1110eda399fa9b10d0672db609e37c2e7a39/sphinx-7.1.2.tar.gz", hash = "sha256:780f4d32f1d7d1126576e0e5ecc19dc32ab76cd24e950228dcf7b1f6d3d9e22f", size = 6828258 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/48/17/325cf6a257d84751a48ae90752b3d8fe0be8f9535b6253add61c49d0d9bc/sphinx-7.1.2-py3-none-any.whl", hash = "sha256:d170a81825b2fcacb6dfd5a0d7f578a053e45d3f2b153fecc948c37344eb4cbe", size = 3169543 }, -] - [[package]] name = "sphinx" version = "7.4.7" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] dependencies = [ - { name = "alabaster", version = "0.7.16", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "babel", marker = "python_full_version == '3.9.*'" }, - { name = "colorama", marker = "python_full_version == '3.9.*' and sys_platform == 'win32'" }, - { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "imagesize", marker = "python_full_version == '3.9.*'" }, - { name = "importlib-metadata", marker = "python_full_version == '3.9.*'" }, - { name = "jinja2", marker = "python_full_version == '3.9.*'" }, - { name = "packaging", marker = "python_full_version == '3.9.*'" }, - { name = "pygments", marker = "python_full_version == '3.9.*'" }, - { name = "requests", marker = "python_full_version == '3.9.*'" }, - { name = "snowballstemmer", marker = "python_full_version == '3.9.*'" }, - { name = "sphinxcontrib-applehelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "sphinxcontrib-devhelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = 
"python_full_version == '3.9.*'" }, - { name = "sphinxcontrib-htmlhelp", version = "2.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "sphinxcontrib-jsmath", marker = "python_full_version == '3.9.*'" }, - { name = "sphinxcontrib-qthelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "sphinxcontrib-serializinghtml", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "tomli", marker = "python_full_version == '3.9.*'" }, + { name = "alabaster", version = "0.7.16", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "babel", marker = "python_full_version < '3.10'" }, + { name = "colorama", marker = "python_full_version < '3.10' and sys_platform == 'win32'" }, + { name = "docutils", marker = "python_full_version < '3.10'" }, + { name = "imagesize", marker = "python_full_version < '3.10'" }, + { name = "importlib-metadata", marker = "python_full_version < '3.10'" }, + { name = "jinja2", marker = "python_full_version < '3.10'" }, + { name = "packaging", marker = "python_full_version < '3.10'" }, + { name = "pygments", marker = "python_full_version < '3.10'" }, + { name = "requests", marker = "python_full_version < '3.10'" }, + { name = "snowballstemmer", marker = "python_full_version < '3.10'" }, + { name = "sphinxcontrib-applehelp", marker = "python_full_version < '3.10'" }, + { name = "sphinxcontrib-devhelp", marker = "python_full_version < '3.10'" }, + { name = "sphinxcontrib-htmlhelp", marker = "python_full_version < '3.10'" }, + { name = "sphinxcontrib-jsmath", marker = "python_full_version < '3.10'" }, + { name = "sphinxcontrib-qthelp", marker = "python_full_version < '3.10'" }, + { name = "sphinxcontrib-serializinghtml", marker = "python_full_version < '3.10'" }, + { name = "tomli", marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/5b/be/50e50cb4f2eff47df05673d361095cafd95521d2a22521b920c67a372dcb/sphinx-7.4.7.tar.gz", hash = "sha256:242f92a7ea7e6c5b406fdc2615413890ba9f699114a9c09192d7dfead2ee9cfe", size = 8067911 } wheels = [ @@ -1697,19 +1274,19 @@ dependencies = [ { name = "alabaster", version = "1.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "babel", marker = "python_full_version >= '3.10'" }, { name = "colorama", marker = "python_full_version >= '3.10' and sys_platform == 'win32'" }, - { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "docutils", marker = "python_full_version >= '3.10'" }, { name = "imagesize", marker = "python_full_version >= '3.10'" }, { name = "jinja2", marker = "python_full_version >= '3.10'" }, { name = "packaging", marker = "python_full_version >= '3.10'" }, { name = "pygments", marker = "python_full_version >= '3.10'" }, { name = "requests", marker = "python_full_version >= '3.10'" }, { name = "snowballstemmer", marker = "python_full_version >= '3.10'" }, - { name = "sphinxcontrib-applehelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "sphinxcontrib-devhelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = 
"sphinxcontrib-htmlhelp", version = "2.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "sphinxcontrib-applehelp", marker = "python_full_version >= '3.10'" }, + { name = "sphinxcontrib-devhelp", marker = "python_full_version >= '3.10'" }, + { name = "sphinxcontrib-htmlhelp", marker = "python_full_version >= '3.10'" }, { name = "sphinxcontrib-jsmath", marker = "python_full_version >= '3.10'" }, - { name = "sphinxcontrib-qthelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "sphinxcontrib-serializinghtml", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "sphinxcontrib-qthelp", marker = "python_full_version >= '3.10'" }, + { name = "sphinxcontrib-serializinghtml", marker = "python_full_version >= '3.10'" }, { name = "tomli", marker = "python_full_version == '3.10.*'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/be0b61178fe2cdcb67e2a92fc9ebb488e3c51c4f74a36a7824c0adf23425/sphinx-8.1.3.tar.gz", hash = "sha256:43c1911eecb0d3e161ad78611bc905d1ad0e523e4ddc202a58a821773dc4c927", size = 8184611 } @@ -1722,97 +1299,40 @@ name = "sphinx-autoapi" version = "3.4.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "astroid", version = "3.2.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "astroid", version = "3.3.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "astroid" }, { name = "jinja2" }, { name = "pyyaml" }, - { name = "sphinx", version = "7.1.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "stdlib-list", version = "0.10.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "stdlib-list", version = "0.11.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "stdlib-list", marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/4a/eb/cc243583bb1d518ca3b10998c203d919a8ed90affd4831f2b61ad09043d2/sphinx_autoapi-3.4.0.tar.gz", hash = "sha256:e6d5371f9411bbb9fca358c00a9e57aef3ac94cbfc5df4bab285946462f69e0c", size = 29292 } wheels = [ { url = "https://files.pythonhosted.org/packages/de/d6/f2acdc2567337fd5f5dc091a4e58d8a0fb14927b9779fc1e5ecee96d9824/sphinx_autoapi-3.4.0-py3-none-any.whl", hash = "sha256:4027fef2875a22c5f2a57107c71641d82f6166bf55beb407a47aaf3ef14e7b92", size = 34095 }, ] -[[package]] -name = "sphinxcontrib-applehelp" -version = "1.0.4" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/32/df/45e827f4d7e7fcc84e853bcef1d836effd762d63ccb86f43ede4e98b478c/sphinxcontrib-applehelp-1.0.4.tar.gz", hash = "sha256:828f867945bbe39817c210a1abfd1bc4895c8b73fcaade56d45357a348a07d7e", size = 24766 
} -wheels = [ - { url = "https://files.pythonhosted.org/packages/06/c1/5e2cafbd03105ce50d8500f9b4e8a6e8d02e22d0475b574c3b3e9451a15f/sphinxcontrib_applehelp-1.0.4-py3-none-any.whl", hash = "sha256:29d341f67fb0f6f586b23ad80e072c8e6ad0b48417db2bde114a4c9746feb228", size = 120601 }, -] - [[package]] name = "sphinxcontrib-applehelp" version = "2.0.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/ba/6e/b837e84a1a704953c62ef8776d45c3e8d759876b4a84fe14eba2859106fe/sphinxcontrib_applehelp-2.0.0.tar.gz", hash = "sha256:2f29ef331735ce958efa4734873f084941970894c6090408b079c61b2e1c06d1", size = 20053 } wheels = [ { url = "https://files.pythonhosted.org/packages/5d/85/9ebeae2f76e9e77b952f4b274c27238156eae7979c5421fba91a28f4970d/sphinxcontrib_applehelp-2.0.0-py3-none-any.whl", hash = "sha256:4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5", size = 119300 }, ] -[[package]] -name = "sphinxcontrib-devhelp" -version = "1.0.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/98/33/dc28393f16385f722c893cb55539c641c9aaec8d1bc1c15b69ce0ac2dbb3/sphinxcontrib-devhelp-1.0.2.tar.gz", hash = "sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4", size = 17398 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c5/09/5de5ed43a521387f18bdf5f5af31d099605c992fd25372b2b9b825ce48ee/sphinxcontrib_devhelp-1.0.2-py2.py3-none-any.whl", hash = "sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e", size = 84690 }, -] - [[package]] name = "sphinxcontrib-devhelp" version = "2.0.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/f6/d2/5beee64d3e4e747f316bae86b55943f51e82bb86ecd325883ef65741e7da/sphinxcontrib_devhelp-2.0.0.tar.gz", hash = "sha256:411f5d96d445d1d73bb5d52133377b4248ec79db5c793ce7dbe59e074b4dd1ad", size = 12967 } wheels = [ { url = "https://files.pythonhosted.org/packages/35/7a/987e583882f985fe4d7323774889ec58049171828b58c2217e7f79cdf44e/sphinxcontrib_devhelp-2.0.0-py3-none-any.whl", hash = "sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2", size = 82530 }, ] -[[package]] -name = "sphinxcontrib-htmlhelp" -version = "2.0.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/b3/47/64cff68ea3aa450c373301e5bebfbb9fce0a3e70aca245fcadd4af06cd75/sphinxcontrib-htmlhelp-2.0.1.tar.gz", hash = "sha256:0cbdd302815330058422b98a113195c9249825d681e18f11e8b1f78a2f11efff", size = 27967 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6e/ee/a1f5e39046cbb5f8bc8fba87d1ddf1c6643fbc9194e58d26e606de4b9074/sphinxcontrib_htmlhelp-2.0.1-py3-none-any.whl", hash = "sha256:c38cb46dccf316c79de6e5515e1770414b797162b23cd3d06e67020e1d2a6903", size = 99833 }, -] - [[package]] name = "sphinxcontrib-htmlhelp" version = "2.1.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == 
'3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/43/93/983afd9aa001e5201eab16b5a444ed5b9b0a7a010541e0ddfbbfd0b2470c/sphinxcontrib_htmlhelp-2.1.0.tar.gz", hash = "sha256:c9e2916ace8aad64cc13a0d233ee22317f2b9025b9cf3295249fa985cc7082e9", size = 22617 } wheels = [ { url = "https://files.pythonhosted.org/packages/0a/7b/18a8c0bcec9182c05a0b3ec2a776bba4ead82750a55ff798e8d406dae604/sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl", hash = "sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8", size = 98705 }, @@ -1827,55 +1347,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c2/42/4c8646762ee83602e3fb3fbe774c2fac12f317deb0b5dbeeedd2d3ba4b77/sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", size = 5071 }, ] -[[package]] -name = "sphinxcontrib-qthelp" -version = "1.0.3" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/b1/8e/c4846e59f38a5f2b4a0e3b27af38f2fcf904d4bfd82095bf92de0b114ebd/sphinxcontrib-qthelp-1.0.3.tar.gz", hash = "sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72", size = 21658 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2b/14/05f9206cf4e9cfca1afb5fd224c7cd434dcc3a433d6d9e4e0264d29c6cdb/sphinxcontrib_qthelp-1.0.3-py2.py3-none-any.whl", hash = "sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6", size = 90609 }, -] - [[package]] name = "sphinxcontrib-qthelp" version = "2.0.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/68/bc/9104308fc285eb3e0b31b67688235db556cd5b0ef31d96f30e45f2e51cae/sphinxcontrib_qthelp-2.0.0.tar.gz", hash = "sha256:4fe7d0ac8fc171045be623aba3e2a8f613f8682731f9153bb2e40ece16b9bbab", size = 17165 } wheels = [ { url = "https://files.pythonhosted.org/packages/27/83/859ecdd180cacc13b1f7e857abf8582a64552ea7a061057a6c716e790fce/sphinxcontrib_qthelp-2.0.0-py3-none-any.whl", hash = "sha256:b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb", size = 88743 }, ] -[[package]] -name = "sphinxcontrib-serializinghtml" -version = "1.1.5" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/b5/72/835d6fadb9e5d02304cf39b18f93d227cd93abd3c41ebf58e6853eeb1455/sphinxcontrib-serializinghtml-1.1.5.tar.gz", hash = "sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952", size = 21019 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c6/77/5464ec50dd0f1c1037e3c93249b040c8fc8078fdda97530eeb02424b6eea/sphinxcontrib_serializinghtml-1.1.5-py2.py3-none-any.whl", hash = "sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd", size = 94021 }, -] - [[package]] name = "sphinxcontrib-serializinghtml" version = "2.0.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = 
"https://files.pythonhosted.org/packages/3b/44/6716b257b0aa6bfd51a1b31665d1c205fb12cb5ad56de752dfa15657de2f/sphinxcontrib_serializinghtml-2.0.0.tar.gz", hash = "sha256:e9d912827f872c029017a53f0ef2180b327c3f7fd23c87229f7a8e8b70031d4d", size = 16080 } wheels = [ { url = "https://files.pythonhosted.org/packages/52/a7/d2782e4e3f77c8450f727ba74a8f12756d5ba823d81b941f1b04da9d033a/sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331", size = 92072 }, @@ -1895,25 +1379,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521 }, ] -[[package]] -name = "stdlib-list" -version = "0.10.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/39/bb/1cdbc326a5ab0026602e0489cbf02357e78140253c4b57cd866d380eb355/stdlib_list-0.10.0.tar.gz", hash = "sha256:6519c50d645513ed287657bfe856d527f277331540691ddeaf77b25459964a14", size = 59447 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/13/d9/9085375f0d23a4896b307bf14dcc61b49ec8cc67cb33e06cf95bf3af3966/stdlib_list-0.10.0-py3-none-any.whl", hash = "sha256:b3a911bc441d03e0332dd1a9e7d0870ba3bb0a542a74d7524f54fb431256e214", size = 79814 }, -] - [[package]] name = "stdlib-list" version = "0.11.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/5d/04/6b37a71e92ddca16b190b7df62494ac4779d58ced4787f73584eb32c8f03/stdlib_list-0.11.0.tar.gz", hash = "sha256:b74a7b643a77a12637e907f3f62f0ab9f67300bce4014f6b2d3c8b4c8fd63c66", size = 60335 } wheels = [ { url = "https://files.pythonhosted.org/packages/16/fe/e07300c027a868d32d8ed7a425503401e91a03ff90e7ca525c115c634ffb/stdlib_list-0.11.0-py3-none-any.whl", hash = "sha256:8bf8decfffaaf273d4cfeb5bd852b910a00dec1037dcf163576803622bccf597", size = 83617 }, @@ -1994,28 +1463,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a6/ab/7e5f53c3b9d14972843a647d8d7a853969a58aecc7559cb3267302c94774/tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd", size = 346586 }, ] -[[package]] -name = "urllib3" -version = "2.2.3" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/ed/63/22ba4ebfe7430b76388e7cd448d5478814d3032121827c12a2cc287e2260/urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9", size = 300677 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ce/d9/5f4c13cecde62396b0d3fe530a50ccea91e7dfc1ccf0e09c228841bb5ba8/urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac", size = 126338 }, -] - [[package]] name = "urllib3" version = "2.3.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = 
"https://files.pythonhosted.org/packages/aa/63/e53da845320b757bf29ef6a9062f5c669fe997973f966045cb019c3f4b66/urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d", size = 307268 } wheels = [ { url = "https://files.pythonhosted.org/packages/c8/19/4ec628951a74043532ca2cf5d97b7b14863931476d117c471e8e2b1eb39f/urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df", size = 128369 }, @@ -2091,17 +1542,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a7/b1/0bb11e29aa5139d90b770ebbfa167267b1fc548d2302c30c8f7572851738/wrapt-1.17.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4c82b8785d98cdd9fed4cac84d765d234ed3251bd6afe34cb7ac523cb93e8b4f", size = 106377 }, { url = "https://files.pythonhosted.org/packages/6a/e1/0122853035b40b3f333bbb25f1939fc1045e21dd518f7f0922b60c156f7c/wrapt-1.17.2-cp313-cp313t-win32.whl", hash = "sha256:13e6afb7fe71fe7485a4550a8844cc9ffbe263c0f1a1eea569bc7091d4898555", size = 37986 }, { url = "https://files.pythonhosted.org/packages/09/5e/1655cf481e079c1f22d0cabdd4e51733679932718dc23bf2db175f329b76/wrapt-1.17.2-cp313-cp313t-win_amd64.whl", hash = "sha256:eaf675418ed6b3b31c7a989fd007fa7c3be66ce14e5c3b27336383604c9da85c", size = 40750 }, - { url = "https://files.pythonhosted.org/packages/0c/66/95b9e90e6e1274999b183c9c3f984996d870e933ca9560115bd1cd1d6f77/wrapt-1.17.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5c803c401ea1c1c18de70a06a6f79fcc9c5acfc79133e9869e730ad7f8ad8ef9", size = 53234 }, - { url = "https://files.pythonhosted.org/packages/a4/b6/6eced5e2db5924bf6d9223d2bb96b62e00395aae77058e6a9e11bf16b3bd/wrapt-1.17.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f917c1180fdb8623c2b75a99192f4025e412597c50b2ac870f156de8fb101119", size = 38462 }, - { url = "https://files.pythonhosted.org/packages/5d/a4/c8472fe2568978b5532df84273c53ddf713f689d408a4335717ab89547e0/wrapt-1.17.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ecc840861360ba9d176d413a5489b9a0aff6d6303d7e733e2c4623cfa26904a6", size = 38730 }, - { url = "https://files.pythonhosted.org/packages/3c/70/1d259c6b1ad164eb23ff70e3e452dd1950f96e6473f72b7207891d0fd1f0/wrapt-1.17.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb87745b2e6dc56361bfde481d5a378dc314b252a98d7dd19a651a3fa58f24a9", size = 86225 }, - { url = "https://files.pythonhosted.org/packages/a9/68/6b83367e1afb8de91cbea4ef8e85b58acdf62f034f05d78c7b82afaa23d8/wrapt-1.17.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:58455b79ec2661c3600e65c0a716955adc2410f7383755d537584b0de41b1d8a", size = 78055 }, - { url = "https://files.pythonhosted.org/packages/0d/21/09573d2443916705c57fdab85d508f592c0a58d57becc53e15755d67fba2/wrapt-1.17.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4e42a40a5e164cbfdb7b386c966a588b1047558a990981ace551ed7e12ca9c2", size = 85592 }, - { url = "https://files.pythonhosted.org/packages/45/ce/700e17a852dd5dec894e241c72973ea82363486bcc1fb05d47b4fbd1d683/wrapt-1.17.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:91bd7d1773e64019f9288b7a5101f3ae50d3d8e6b1de7edee9c2ccc1d32f0c0a", size = 83906 }, - { url = "https://files.pythonhosted.org/packages/37/14/bd210faf0a66faeb8529d42b6b45a25d6aa6ce25ddfc19168e4161aed227/wrapt-1.17.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:bb90fb8bda722a1b9d48ac1e6c38f923ea757b3baf8ebd0c82e09c5c1a0e7a04", size = 
76763 }, - { url = "https://files.pythonhosted.org/packages/34/0c/85af70d291f44659c422416f0272046109e785bf6db8c081cfeeae5715c5/wrapt-1.17.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:08e7ce672e35efa54c5024936e559469436f8b8096253404faeb54d2a878416f", size = 83573 }, - { url = "https://files.pythonhosted.org/packages/f8/1e/b215068e824878f69ea945804fa26c176f7c2735a3ad5367d78930bd076a/wrapt-1.17.2-cp38-cp38-win32.whl", hash = "sha256:410a92fefd2e0e10d26210e1dfb4a876ddaf8439ef60d6434f21ef8d87efc5b7", size = 36408 }, - { url = "https://files.pythonhosted.org/packages/52/27/3dd9ad5f1097b33c95d05929e409cc86d7c765cb5437b86694dc8f8e9af0/wrapt-1.17.2-cp38-cp38-win_amd64.whl", hash = "sha256:95c658736ec15602da0ed73f312d410117723914a5c91a14ee4cdd72f1d790b3", size = 38737 }, { url = "https://files.pythonhosted.org/packages/8a/f4/6ed2b8f6f1c832933283974839b88ec7c983fd12905e01e97889dadf7559/wrapt-1.17.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:99039fa9e6306880572915728d7f6c24a86ec57b0a83f6b2491e1d8ab0235b9a", size = 53308 }, { url = "https://files.pythonhosted.org/packages/a2/a9/712a53f8f4f4545768ac532619f6e56d5d0364a87b2212531685e89aeef8/wrapt-1.17.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2696993ee1eebd20b8e4ee4356483c4cb696066ddc24bd70bcbb80fa56ff9061", size = 38489 }, { url = "https://files.pythonhosted.org/packages/fa/9b/e172c8f28a489a2888df18f953e2f6cb8d33b1a2e78c9dfc52d8bf6a5ead/wrapt-1.17.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:612dff5db80beef9e649c6d803a8d50c409082f1fedc9dbcdfde2983b2025b82", size = 38776 }, @@ -2116,25 +1556,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2d/82/f56956041adef78f849db6b289b282e72b55ab8045a75abad81898c28d19/wrapt-1.17.2-py3-none-any.whl", hash = "sha256:b18f2d1533a71f069c7f82d524a52599053d4c7166e9dd374ae2136b7f40f7c8", size = 23594 }, ] -[[package]] -name = "zipp" -version = "3.20.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/54/bf/5c0000c44ebc80123ecbdddba1f5dcd94a5ada602a9c225d84b5aaa55e86/zipp-3.20.2.tar.gz", hash = "sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29", size = 24199 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/62/8b/5ba542fa83c90e09eac972fc9baca7a88e7e7ca4b221a89251954019308b/zipp-3.20.2-py3-none-any.whl", hash = "sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350", size = 9200 }, -] - [[package]] name = "zipp" version = "3.21.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/3f/50/bad581df71744867e9468ebd0bcd6505de3b275e06f202c2cb016e3ff56f/zipp-3.21.0.tar.gz", hash = "sha256:2c9958f6430a2040341a52eb608ed6dd93ef4392e02ffe219417c1b28b5dd1f4", size = 24545 } wheels = [ { url = "https://files.pythonhosted.org/packages/b7/1a/7e4798e9339adc931158c9d69ecc34f5e6791489d469f5e50ec15e35f458/zipp-3.21.0-py3-none-any.whl", hash = "sha256:ac1bbe05fd2991f160ebce24ffbac5f6d11d83dc90891255885223d42b3cd931", size = 9630 }, From b194a8772e58ccefc697e11671113127a8038716 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 12 Mar 2025 14:25:32 -0400 Subject: [PATCH 017/145] feat/improve ruff test coverage (#1055) * Run python tests on all currently supported python versions * Update ruff checks to select all * Ruff auto fix * Applying ruff suggestions * noqa rules updates per ruff checks * Working 
through more ruff suggestions * Working through more ruff suggestions * update timestamps on tests * More ruff updates * More ruff updates * Instead of importing udf static functions as variables, import * More ruff formatting suggestions * more ruff formatting suggestions * More ruff formatting * More ruff formatting * Cut off lint errors for this PR * Working through more ruff checks and disabling a bunch for now * Address CI difference from local ruff * UDWF isn't a proper abstract base class right now since users can opt in to all methods * Update pre-commit to match the version of ruff used in CI * To enable testing in python 3.9 we need numpy. Also going to the current minimal supported version * Update min required version of python to 3.9 in pyproject.toml. The other changes will come in #1043 that is soon to be merged. * Suppress UP035 * ruff format --- .github/workflows/test.yaml | 2 + .pre-commit-config.yaml | 2 +- benchmarks/tpch/tpch.py | 14 +- dev/release/check-rat-report.py | 2 +- dev/release/generate-changelog.py | 10 +- docs/source/conf.py | 2 +- examples/python-udwf.py | 2 +- examples/tpch/_tests.py | 15 +- pyproject.toml | 76 +++++- python/datafusion/__init__.py | 50 ++-- python/datafusion/common.py | 14 +- python/datafusion/context.py | 4 +- python/datafusion/dataframe.py | 15 +- python/datafusion/expr.py | 94 +++---- python/datafusion/functions.py | 46 ++-- python/datafusion/input/__init__.py | 2 +- python/datafusion/input/base.py | 6 +- python/datafusion/input/location.py | 40 +-- python/datafusion/io.py | 20 +- python/datafusion/object_store.py | 2 +- python/datafusion/plan.py | 8 +- python/datafusion/record_batch.py | 8 +- python/datafusion/substrait.py | 21 +- python/datafusion/udf.py | 236 +++++++++-------- python/tests/generic.py | 19 +- python/tests/test_aggregation.py | 16 +- python/tests/test_catalog.py | 9 +- python/tests/test_context.py | 53 ++-- python/tests/test_dataframe.py | 38 ++- python/tests/test_expr.py | 11 +- python/tests/test_functions.py | 358 ++++++++++++++------------ python/tests/test_imports.py | 7 +- python/tests/test_input.py | 12 +- python/tests/test_io.py | 13 +- python/tests/test_sql.py | 35 +-- python/tests/test_store.py | 13 +- python/tests/test_substrait.py | 2 +- python/tests/test_udaf.py | 10 +- python/tests/test_udwf.py | 2 +- python/tests/test_wrapper_coverage.py | 7 +- 40 files changed, 697 insertions(+), 599 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index c1d9ac838..da3582766 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -33,9 +33,11 @@ jobs: fail-fast: false matrix: python-version: + - "3.9" - "3.10" - "3.11" - "3.12" + - "3.13" toolchain: - "stable" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b548ff18f..abcfcf321 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: - id: actionlint-docker - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.3.0 + rev: v0.9.10 hooks: # Run the linter.
- id: ruff diff --git a/benchmarks/tpch/tpch.py b/benchmarks/tpch/tpch.py index fb86b12b6..bfb9ac398 100644 --- a/benchmarks/tpch/tpch.py +++ b/benchmarks/tpch/tpch.py @@ -59,13 +59,13 @@ def bench(data_path, query_path): end = time.time() time_millis = (end - start) * 1000 total_time_millis += time_millis - print("setup,{}".format(round(time_millis, 1))) - results.write("setup,{}\n".format(round(time_millis, 1))) + print(f"setup,{round(time_millis, 1)}") + results.write(f"setup,{round(time_millis, 1)}\n") results.flush() # run queries for query in range(1, 23): - with open("{}/q{}.sql".format(query_path, query)) as f: + with open(f"{query_path}/q{query}.sql") as f: text = f.read() tmp = text.split(";") queries = [] @@ -83,14 +83,14 @@ def bench(data_path, query_path): end = time.time() time_millis = (end - start) * 1000 total_time_millis += time_millis - print("q{},{}".format(query, round(time_millis, 1))) - results.write("q{},{}\n".format(query, round(time_millis, 1))) + print(f"q{query},{round(time_millis, 1)}") + results.write(f"q{query},{round(time_millis, 1)}\n") results.flush() except Exception as e: print("query", query, "failed", e) - print("total,{}".format(round(total_time_millis, 1))) - results.write("total,{}\n".format(round(total_time_millis, 1))) + print(f"total,{round(total_time_millis, 1)}") + results.write(f"total,{round(total_time_millis, 1)}\n") if __name__ == "__main__": diff --git a/dev/release/check-rat-report.py b/dev/release/check-rat-report.py index d3dd7c5dd..0c9f4c326 100644 --- a/dev/release/check-rat-report.py +++ b/dev/release/check-rat-report.py @@ -29,7 +29,7 @@ exclude_globs_filename = sys.argv[1] xml_filename = sys.argv[2] -globs = [line.strip() for line in open(exclude_globs_filename, "r")] +globs = [line.strip() for line in open(exclude_globs_filename)] tree = ET.parse(xml_filename) root = tree.getroot() diff --git a/dev/release/generate-changelog.py b/dev/release/generate-changelog.py index 2564eea86..e30e2def2 100755 --- a/dev/release/generate-changelog.py +++ b/dev/release/generate-changelog.py @@ -26,15 +26,11 @@ def print_pulls(repo_name, title, pulls): if len(pulls) > 0: - print("**{}:**".format(title)) + print(f"**{title}:**") print() for pull, commit in pulls: - url = "https://github.com/{}/pull/{}".format(repo_name, pull.number) - print( - "- {} [#{}]({}) ({})".format( - pull.title, pull.number, url, commit.author.login - ) - ) + url = f"https://github.com/{repo_name}/pull/{pull.number}" + print(f"- {pull.title} [#{pull.number}]({url}) ({commit.author.login})") print() diff --git a/docs/source/conf.py b/docs/source/conf.py index 2e5a41339..c82a189e0 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -73,7 +73,7 @@ autoapi_python_class_content = "both" -def autoapi_skip_member_fn(app, what, name, obj, skip, options): +def autoapi_skip_member_fn(app, what, name, obj, skip, options): # noqa: ARG001 skip_contents = [ # Re-exports ("class", "datafusion.DataFrame"), diff --git a/examples/python-udwf.py b/examples/python-udwf.py index 7d39dc1b8..98d118bf2 100644 --- a/examples/python-udwf.py +++ b/examples/python-udwf.py @@ -59,7 +59,7 @@ def __init__(self, alpha: float) -> None: def supports_bounded_execution(self) -> bool: return True - def get_range(self, idx: int, num_rows: int) -> tuple[int, int]: + def get_range(self, idx: int, num_rows: int) -> tuple[int, int]: # noqa: ARG002 # Override the default range of current row since uses_window_frame is False # So for the purpose of this test we just smooth from the previous row to # 
current. diff --git a/examples/tpch/_tests.py b/examples/tpch/_tests.py index c4d872085..2be4dfabd 100644 --- a/examples/tpch/_tests.py +++ b/examples/tpch/_tests.py @@ -27,28 +27,25 @@ def df_selection(col_name, col_type): if col_type == pa.float64() or isinstance(col_type, pa.Decimal128Type): return F.round(col(col_name), lit(2)).alias(col_name) - elif col_type == pa.string() or col_type == pa.string_view(): + if col_type == pa.string() or col_type == pa.string_view(): return F.trim(col(col_name)).alias(col_name) - else: - return col(col_name) + return col(col_name) def load_schema(col_name, col_type): if col_type == pa.int64() or col_type == pa.int32(): return col_name, pa.string() - elif isinstance(col_type, pa.Decimal128Type): + if isinstance(col_type, pa.Decimal128Type): return col_name, pa.float64() - else: - return col_name, col_type + return col_name, col_type def expected_selection(col_name, col_type): if col_type == pa.int64() or col_type == pa.int32(): return F.trim(col(col_name)).cast(col_type).alias(col_name) - elif col_type == pa.string() or col_type == pa.string_view(): + if col_type == pa.string() or col_type == pa.string_view(): return F.trim(col(col_name)).alias(col_name) - else: - return col(col_name) + return col(col_name) def selections_and_schema(original_schema): diff --git a/pyproject.toml b/pyproject.toml index 1c2733677..060e3b80a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,7 +65,57 @@ features = ["substrait"] # Enable docstring linting using the google style guide [tool.ruff.lint] -select = ["E4", "E7", "E9", "F", "FA", "D", "W", "I"] +select = ["ALL" ] +ignore = [ + "A001", # Allow using words like min as variable names + "A002", # Allow using words like filter as variable names + "ANN401", # Allow Any for wrapper classes + "COM812", # Recommended to ignore these rules when using with ruff-format + "FIX002", # Allow TODO lines - consider removing at some point + "FBT001", # Allow boolean positional args + "FBT002", # Allow boolean positional args + "ISC001", # Recommended to ignore these rules when using with ruff-format + "SLF001", # Allow accessing private members + "TD002", + "TD003", # Allow TODO lines + "UP007", # Disallowing Union is pedantic + # TODO: Enable all of the following, but this PR is getting too large already + "PT001", + "ANN204", + "B008", + "EM101", + "PLR0913", + "PLR1714", + "ANN201", + "C400", + "TRY003", + "B904", + "UP006", + "RUF012", + "FBT003", + "C416", + "SIM102", + "PGH003", + "PLR2004", + "PERF401", + "PD901", + "EM102", + "ERA001", + "SIM108", + "ICN001", + "ANN001", + "ANN202", + "PTH", + "N812", + "INP001", + "DTZ007", + "PLW2901", + "RET503", + "RUF015", + "A005", + "TC001", + "UP035", +] [tool.ruff.lint.pydocstyle] convention = "google" @@ -75,16 +125,30 @@ max-doc-length = 88 # Disable docstring checking for these directories [tool.ruff.lint.per-file-ignores] -"python/tests/*" = ["D"] -"examples/*" = ["D", "W505"] -"dev/*" = ["D"] -"benchmarks/*" = ["D", "F"] +"python/tests/*" = [ + "ANN", + "ARG", + "BLE001", + "D", + "S101", + "SLF", + "PD", + "PLR2004", + "PT011", + "RUF015", + "S608", + "PLR0913", + "PT004", +] +"examples/*" = ["D", "W505", "E501", "T201", "S101"] +"dev/*" = ["D", "E", "T", "S", "PLR", "C", "SIM", "UP", "EXE", "N817"] +"benchmarks/*" = ["D", "F", "T", "BLE", "FURB", "PLR", "E", "TD", "TRY", "S", "SIM", "EXE", "UP"] "docs/*" = ["D"] [dependency-groups] dev = [ "maturin>=1.8.1", - "numpy>1.24.4 ; python_full_version >= '3.10'", + "numpy>1.25.0", "pytest>=7.4.4", "ruff>=0.9.1", "toml>=0.10.2", 
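For readers tracking the lint rules at work in this patch: the `_tests.py` hunks above are driven by ruff's superfluous-else checks, which flag `elif`/`else` branches that follow a `return`. A minimal before/after sketch of that rewrite, using hypothetical names rather than code from this repository:

```
# Before: ruff flags the elif/else chain because every branch already returns.
def sign_before(x: int) -> str:
    if x > 0:
        return "positive"
    elif x < 0:
        return "negative"
    else:
        return "zero"


# After: dropping elif/else flattens the function without changing behavior.
def sign_after(x: int) -> str:
    if x > 0:
        return "positive"
    if x < 0:
        return "negative"
    return "zero"
```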
diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index f11ce54a6..286e5dc31 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -48,44 +48,47 @@ from .io import read_avro, read_csv, read_json, read_parquet from .plan import ExecutionPlan, LogicalPlan from .record_batch import RecordBatch, RecordBatchStream -from .udf import Accumulator, AggregateUDF, ScalarUDF, WindowUDF +from .udf import Accumulator, AggregateUDF, ScalarUDF, WindowUDF, udaf, udf, udwf __version__ = importlib_metadata.version(__name__) __all__ = [ "Accumulator", + "AggregateUDF", + "Catalog", "Config", - "DataFrame", - "SessionContext", - "SessionConfig", - "SQLOptions", - "RuntimeEnvBuilder", - "Expr", - "ScalarUDF", - "WindowFrame", - "column", - "col", - "literal", - "lit", "DFSchema", - "Catalog", + "DataFrame", "Database", - "Table", - "AggregateUDF", - "WindowUDF", - "LogicalPlan", "ExecutionPlan", + "Expr", + "LogicalPlan", "RecordBatch", "RecordBatchStream", + "RuntimeEnvBuilder", + "SQLOptions", + "ScalarUDF", + "SessionConfig", + "SessionContext", + "Table", + "WindowFrame", + "WindowUDF", + "col", + "column", "common", "expr", "functions", + "lit", + "literal", "object_store", - "substrait", - "read_parquet", "read_avro", "read_csv", "read_json", + "read_parquet", + "substrait", + "udaf", + "udf", + "udwf", ] @@ -120,10 +123,3 @@ def str_lit(value): def lit(value): """Create a literal expression.""" return Expr.literal(value) - - -udf = ScalarUDF.udf - -udaf = AggregateUDF.udaf - -udwf = WindowUDF.udwf diff --git a/python/datafusion/common.py b/python/datafusion/common.py index a2298c634..e762a993b 100644 --- a/python/datafusion/common.py +++ b/python/datafusion/common.py @@ -20,7 +20,7 @@ from ._internal import common as common_internal -# TODO these should all have proper wrapper classes +# TODO: these should all have proper wrapper classes DFSchema = common_internal.DFSchema DataType = common_internal.DataType @@ -38,15 +38,15 @@ "DFSchema", "DataType", "DataTypeMap", - "RexType", - "PythonType", - "SqlType", "NullTreatment", - "SqlTable", + "PythonType", + "RexType", + "SqlFunction", "SqlSchema", - "SqlView", "SqlStatistics", - "SqlFunction", + "SqlTable", + "SqlType", + "SqlView", ] diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 282b2a477..0ab1a908a 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -393,8 +393,6 @@ def with_temp_file_path(self, path: str | pathlib.Path) -> RuntimeEnvBuilder: class RuntimeConfig(RuntimeEnvBuilder): """See `RuntimeEnvBuilder`.""" - pass - class SQLOptions: """Options to be used when performing SQL queries.""" @@ -498,7 +496,7 @@ def __init__( self.ctx = SessionContextInternal(config, runtime) - def enable_url_table(self) -> "SessionContext": + def enable_url_table(self) -> SessionContext: """Control if local files can be queried as tables. 
Returns: diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index de5d8376e..d1c71c2bb 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -29,6 +29,7 @@ List, Literal, Optional, + Type, Union, overload, ) @@ -49,10 +50,11 @@ import polars as pl import pyarrow as pa + from datafusion._internal import DataFrame as DataFrameInternal + from datafusion._internal import expr as expr_internal + from enum import Enum -from datafusion._internal import DataFrame as DataFrameInternal -from datafusion._internal import expr as expr_internal from datafusion.expr import Expr, SortExpr, sort_or_default @@ -73,7 +75,7 @@ class Compression(Enum): LZ4_RAW = "lz4_raw" @classmethod - def from_str(cls, value: str) -> "Compression": + def from_str(cls: Type[Compression], value: str) -> Compression: """Convert a string to a Compression enum value. Args: @@ -88,8 +90,9 @@ def from_str(cls, value: str) -> "Compression": try: return cls(value.lower()) except ValueError: + valid_values = str([item.value for item in Compression]) raise ValueError( - f"{value} is not a valid Compression. Valid values are: {[item.value for item in Compression]}" + f"{value} is not a valid Compression. Valid values are: {valid_values}" ) def get_default_level(self) -> Optional[int]: @@ -104,9 +107,9 @@ def get_default_level(self) -> Optional[int]: # https://github.com/apache/datafusion-python/pull/981#discussion_r1904789223 if self == Compression.GZIP: return 6 - elif self == Compression.BROTLI: + if self == Compression.BROTLI: return 1 - elif self == Compression.ZSTD: + if self == Compression.ZSTD: return 4 return None diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 3639abec6..702f75aed 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -101,63 +101,63 @@ WindowExpr = expr_internal.WindowExpr __all__ = [ - "Expr", - "Column", - "Literal", - "BinaryExpr", - "Literal", + "Aggregate", "AggregateFunction", - "Not", - "IsNotNull", - "IsNull", - "IsTrue", - "IsFalse", - "IsUnknown", - "IsNotTrue", - "IsNotFalse", - "IsNotUnknown", - "Negative", - "Like", - "ILike", - "SimilarTo", - "ScalarVariable", "Alias", - "InList", - "Exists", - "Subquery", - "InSubquery", - "ScalarSubquery", - "Placeholder", - "GroupingSet", + "Analyze", + "Between", + "BinaryExpr", "Case", "CaseBuilder", "Cast", - "TryCast", - "Between", + "Column", + "CreateMemoryTable", + "CreateView", + "Distinct", + "DropTable", + "EmptyRelation", + "Exists", "Explain", + "Expr", + "Extension", + "Filter", + "GroupingSet", + "ILike", + "InList", + "InSubquery", + "IsFalse", + "IsNotFalse", + "IsNotNull", + "IsNotTrue", + "IsNotUnknown", + "IsNull", + "IsTrue", + "IsUnknown", + "Join", + "JoinConstraint", + "JoinType", + "Like", "Limit", - "Aggregate", + "Literal", + "Literal", + "Negative", + "Not", + "Partitioning", + "Placeholder", + "Projection", + "Repartition", + "ScalarSubquery", + "ScalarVariable", + "SimilarTo", "Sort", "SortExpr", - "Analyze", - "EmptyRelation", - "Join", - "JoinType", - "JoinConstraint", + "Subquery", + "SubqueryAlias", + "TableScan", + "TryCast", "Union", "Unnest", "UnnestExpr", - "Extension", - "Filter", - "Projection", - "TableScan", - "CreateMemoryTable", - "CreateView", - "Distinct", - "SubqueryAlias", - "DropTable", - "Partitioning", - "Repartition", "Window", "WindowExpr", "WindowFrame", @@ -311,7 +311,7 @@ def __getitem__(self, key: str | int) -> Expr: ) return Expr(self.expr.__getitem__(key)) - def __eq__(self, rhs: Any) -> Expr: + def 
__eq__(self, rhs: object) -> Expr: """Equal to. Accepts either an expression or any valid PyArrow scalar literal value. @@ -320,7 +320,7 @@ def __eq__(self, rhs: Any) -> Expr: rhs = Expr.literal(rhs) return Expr(self.expr.__eq__(rhs.expr)) - def __ne__(self, rhs: Any) -> Expr: + def __ne__(self, rhs: object) -> Expr: """Not equal to. Accepts either an expression or any valid PyArrow scalar literal value. diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index b449c4868..0cc7434cf 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -18,13 +18,12 @@ from __future__ import annotations -from typing import Any, Optional +from typing import TYPE_CHECKING, Any, Optional import pyarrow as pa from datafusion._internal import functions as f from datafusion.common import NullTreatment -from datafusion.context import SessionContext from datafusion.expr import ( CaseBuilder, Expr, @@ -34,6 +33,9 @@ sort_list_to_raw_sort_list, ) +if TYPE_CHECKING: + from datafusion.context import SessionContext + __all__ = [ "abs", "acos", @@ -81,8 +83,8 @@ "array_sort", "array_to_string", "array_union", - "arrow_typeof", "arrow_cast", + "arrow_typeof", "ascii", "asin", "asinh", @@ -97,6 +99,7 @@ "bool_and", "bool_or", "btrim", + "cardinality", "case", "cbrt", "ceil", @@ -116,6 +119,7 @@ "covar", "covar_pop", "covar_samp", + "cume_dist", "current_date", "current_time", "date_bin", @@ -125,17 +129,17 @@ "datetrunc", "decode", "degrees", + "dense_rank", "digest", "empty", "encode", "ends_with", - "extract", "exp", + "extract", "factorial", "find_in_set", "first_value", "flatten", - "cardinality", "floor", "from_unixtime", "gcd", @@ -143,8 +147,10 @@ "initcap", "isnan", "iszero", + "lag", "last_value", "lcm", + "lead", "left", "length", "levenshtein", @@ -166,10 +172,10 @@ "list_prepend", "list_push_back", "list_push_front", - "list_repeat", "list_remove", "list_remove_all", "list_remove_n", + "list_repeat", "list_replace", "list_replace_all", "list_replace_n", @@ -180,14 +186,14 @@ "list_union", "ln", "log", - "log10", "log2", + "log10", "lower", "lpad", "ltrim", "make_array", - "make_list", "make_date", + "make_list", "max", "md5", "mean", @@ -195,19 +201,22 @@ "min", "named_struct", "nanvl", - "nvl", "now", "nth_value", + "ntile", "nullif", + "nvl", "octet_length", "order_by", "overlay", + "percent_rank", "pi", "pow", "power", "radians", "random", "range", + "rank", "regexp_like", "regexp_match", "regexp_replace", @@ -225,6 +234,7 @@ "reverse", "right", "round", + "row_number", "rpad", "rtrim", "sha224", @@ -252,8 +262,8 @@ "to_hex", "to_timestamp", "to_timestamp_micros", - "to_timestamp_nanos", "to_timestamp_millis", + "to_timestamp_nanos", "to_timestamp_seconds", "to_unixtime", "translate", @@ -268,14 +278,6 @@ "when", # Window Functions "window", - "lead", - "lag", - "row_number", - "rank", - "dense_rank", - "percent_rank", - "cume_dist", - "ntile", ] @@ -292,14 +294,14 @@ def nullif(expr1: Expr, expr2: Expr) -> Expr: return Expr(f.nullif(expr1.expr, expr2.expr)) -def encode(input: Expr, encoding: Expr) -> Expr: +def encode(expr: Expr, encoding: Expr) -> Expr: """Encode the ``input``, using the ``encoding``. encoding can be base64 or hex.""" - return Expr(f.encode(input.expr, encoding.expr)) + return Expr(f.encode(expr.expr, encoding.expr)) -def decode(input: Expr, encoding: Expr) -> Expr: +def decode(expr: Expr, encoding: Expr) -> Expr: """Decode the ``input``, using the ``encoding``. 
encoding can be base64 or hex.""" - return Expr(f.decode(input.expr, encoding.expr)) + return Expr(f.decode(expr.expr, encoding.expr)) def array_to_string(expr: Expr, delimiter: Expr) -> Expr: diff --git a/python/datafusion/input/__init__.py b/python/datafusion/input/__init__.py index f85ce21f0..f0c1f42b4 100644 --- a/python/datafusion/input/__init__.py +++ b/python/datafusion/input/__init__.py @@ -23,5 +23,5 @@ from .location import LocationInputPlugin __all__ = [ - LocationInputPlugin, + "LocationInputPlugin", ] diff --git a/python/datafusion/input/base.py b/python/datafusion/input/base.py index 4eba19784..f67dde2a1 100644 --- a/python/datafusion/input/base.py +++ b/python/datafusion/input/base.py @@ -38,11 +38,9 @@ class BaseInputSource(ABC): """ @abstractmethod - def is_correct_input(self, input_item: Any, table_name: str, **kwargs) -> bool: + def is_correct_input(self, input_item: Any, table_name: str, **kwargs: Any) -> bool: """Returns `True` if the input is valid.""" - pass @abstractmethod - def build_table(self, input_item: Any, table_name: str, **kwarg) -> SqlTable: + def build_table(self, input_item: Any, table_name: str, **kwarg: Any) -> SqlTable: # type: ignore[invalid-type-form] """Create a table from the input source.""" - pass diff --git a/python/datafusion/input/location.py b/python/datafusion/input/location.py index 517cd1578..08d98d115 100644 --- a/python/datafusion/input/location.py +++ b/python/datafusion/input/location.py @@ -18,7 +18,7 @@ """The default input source for DataFusion.""" import glob -import os +from pathlib import Path from typing import Any from datafusion.common import DataTypeMap, SqlTable @@ -31,7 +31,7 @@ class LocationInputPlugin(BaseInputSource): This can be read in from a file (on disk, remote etc.). """ - def is_correct_input(self, input_item: Any, table_name: str, **kwargs): + def is_correct_input(self, input_item: Any, table_name: str, **kwargs: Any) -> bool: # noqa: ARG002 """Returns `True` if the input is valid.""" return isinstance(input_item, str) @@ -39,27 +39,28 @@ def build_table( self, input_item: str, table_name: str, - **kwargs, - ) -> SqlTable: + **kwargs: Any, # noqa: ARG002 + ) -> SqlTable: # type: ignore[invalid-type-form] """Create a table from the input source.""" - _, extension = os.path.splitext(input_item) - format = extension.lstrip(".").lower() + extension = Path(input_item).suffix + file_format = extension.lstrip(".").lower() num_rows = 0 # Total number of rows in the file. Used for statistics columns = [] - if format == "parquet": + if file_format == "parquet": import pyarrow.parquet as pq # Read the Parquet metadata metadata = pq.read_metadata(input_item) num_rows = metadata.num_rows # Iterate through the schema and build the SqlTable - for col in metadata.schema: - columns.append( - ( - col.name, - DataTypeMap.from_parquet_type_str(col.physical_type), - ) + columns = [ + ( + col.name, + DataTypeMap.from_parquet_type_str(col.physical_type), ) + for col in metadata.schema + ] + elif file_format == "csv": import csv @@ -69,19 +70,18 @@ def build_table( # Consume header row. Used to create the schema # It is expensive to read the whole file # to get that information. However, this should only be occurring # at table creation time and therefore shouldn't # slow down query performance.
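# An aside on the pathlib rewrites in this hunk (ruff's PTH rules; the builtin
# open() check is PTH123): the two forms below are equivalent, and the patch
# standardizes on the Path-based one. Illustrative sketch only; "sample.csv"
# is a hypothetical file, not part of the repository.
from pathlib import Path

sample = Path("sample.csv")
sample.write_text("a,b\n1,2\n")  # create a file so the snippet is runnable

with open(sample) as file:  # builtin open(), the form being removed below
    header = file.readline()

with sample.open() as file:  # pathlib equivalent the patch moves to
    header = file.readline()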
- with open(input_item, "r") as file: + with Path(input_item).open() as file: reader = csv.reader(file) - header_row = next(reader) - print(header_row) + _header_row = next(reader) for _ in reader: num_rows += 1 # TODO: Need to actually consume this row into reasonable columns - raise RuntimeError("TODO: Currently unable to support CSV input files.") + msg = "TODO: Currently unable to support CSV input files." + raise RuntimeError(msg) else: - raise RuntimeError( - f"Input of format: `{format}` is currently not supported.\ + msg = f"Input of format: `{file_format}` is currently not supported.\ Only Parquet and CSV." - ) + raise RuntimeError(msg) # Input could possibly be multiple files. Create a list if so input_files = glob.glob(input_item) diff --git a/python/datafusion/io.py b/python/datafusion/io.py index 3b6264948..3e39703e3 100644 --- a/python/datafusion/io.py +++ b/python/datafusion/io.py @@ -19,15 +19,19 @@ from __future__ import annotations -import pathlib - -import pyarrow +from typing import TYPE_CHECKING from datafusion.dataframe import DataFrame -from datafusion.expr import Expr from ._internal import SessionContext as SessionContextInternal +if TYPE_CHECKING: + import pathlib + + import pyarrow as pa + + from datafusion.expr import Expr + def read_parquet( path: str | pathlib.Path, @@ -35,7 +39,7 @@ def read_parquet( parquet_pruning: bool = True, file_extension: str = ".parquet", skip_metadata: bool = True, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, file_sort_order: list[list[Expr]] | None = None, ) -> DataFrame: """Read a Parquet source into a :py:class:`~datafusion.dataframe.Dataframe`. @@ -79,7 +83,7 @@ def read_parquet( def read_json( path: str | pathlib.Path, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, schema_infer_max_records: int = 1000, file_extension: str = ".json", table_partition_cols: list[tuple[str, str]] | None = None, @@ -120,7 +124,7 @@ def read_json( def read_csv( path: str | pathlib.Path | list[str] | list[pathlib.Path], - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, has_header: bool = True, delimiter: str = ",", schema_infer_max_records: int = 1000, @@ -173,7 +177,7 @@ def read_csv( def read_avro( path: str | pathlib.Path, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, file_partition_cols: list[tuple[str, str]] | None = None, file_extension: str = ".avro", ) -> DataFrame: diff --git a/python/datafusion/object_store.py b/python/datafusion/object_store.py index 7cc17506f..6298526f5 100644 --- a/python/datafusion/object_store.py +++ b/python/datafusion/object_store.py @@ -24,4 +24,4 @@ MicrosoftAzure = object_store.MicrosoftAzure Http = object_store.Http -__all__ = ["AmazonS3", "GoogleCloud", "LocalFileSystem", "MicrosoftAzure", "Http"] +__all__ = ["AmazonS3", "GoogleCloud", "Http", "LocalFileSystem", "MicrosoftAzure"] diff --git a/python/datafusion/plan.py b/python/datafusion/plan.py index 133fc446d..0b7bebcb3 100644 --- a/python/datafusion/plan.py +++ b/python/datafusion/plan.py @@ -19,7 +19,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, List +from typing import TYPE_CHECKING, Any import datafusion._internal as df_internal @@ -27,8 +27,8 @@ from datafusion.context import SessionContext __all__ = [ - "LogicalPlan", "ExecutionPlan", + "LogicalPlan", ] @@ -54,7 +54,7 @@ def to_variant(self) -> Any: """Convert the logical plan into its specific variant.""" return self._raw_plan.to_variant() - def inputs(self) ->
List[LogicalPlan]: + def inputs(self) -> list[LogicalPlan]: """Returns the list of inputs to the logical plan.""" return [LogicalPlan(p) for p in self._raw_plan.inputs()] @@ -106,7 +106,7 @@ def __init__(self, plan: df_internal.ExecutionPlan) -> None: """This constructor should not be called by the end user.""" self._raw_plan = plan - def children(self) -> List[ExecutionPlan]: + def children(self) -> list[ExecutionPlan]: """Get a list of children `ExecutionPlan` that act as inputs to this plan. The returned list will be empty for leaf nodes such as scans, will contain a diff --git a/python/datafusion/record_batch.py b/python/datafusion/record_batch.py index 772cd9089..556eaa786 100644 --- a/python/datafusion/record_batch.py +++ b/python/datafusion/record_batch.py @@ -26,14 +26,14 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - import pyarrow + import pyarrow as pa import typing_extensions import datafusion._internal as df_internal class RecordBatch: - """This class is essentially a wrapper for :py:class:`pyarrow.RecordBatch`.""" + """This class is essentially a wrapper for :py:class:`pa.RecordBatch`.""" def __init__(self, record_batch: df_internal.RecordBatch) -> None: """This constructor is generally not called by the end user. @@ -42,8 +42,8 @@ def __init__(self, record_batch: df_internal.RecordBatch) -> None: """ self.record_batch = record_batch - def to_pyarrow(self) -> pyarrow.RecordBatch: - """Convert to :py:class:`pyarrow.RecordBatch`.""" + def to_pyarrow(self) -> pa.RecordBatch: + """Convert to :py:class:`pa.RecordBatch`.""" return self.record_batch.to_pyarrow() diff --git a/python/datafusion/substrait.py b/python/datafusion/substrait.py index 06302fe38..f10adfb0c 100644 --- a/python/datafusion/substrait.py +++ b/python/datafusion/substrait.py @@ -23,7 +23,6 @@ from __future__ import annotations -import pathlib from typing import TYPE_CHECKING try: @@ -36,11 +35,13 @@ from ._internal import substrait as substrait_internal if TYPE_CHECKING: + import pathlib + from datafusion.context import SessionContext __all__ = [ - "Plan", "Consumer", + "Plan", "Producer", "Serde", ] @@ -68,11 +69,9 @@ def encode(self) -> bytes: @deprecated("Use `Plan` instead.") -class plan(Plan): +class plan(Plan): # noqa: N801 """See `Plan`.""" - pass - class Serde: """Provides the ``Substrait`` serialization and deserialization.""" @@ -140,11 +139,9 @@ def deserialize_bytes(proto_bytes: bytes) -> Plan: @deprecated("Use `Serde` instead.") -class serde(Serde): +class serde(Serde): # noqa: N801 """See `Serde` instead.""" - pass - class Producer: """Generates substrait plans from a logical plan.""" @@ -168,11 +165,9 @@ def to_substrait_plan(logical_plan: LogicalPlan, ctx: SessionContext) -> Plan: @deprecated("Use `Producer` instead.") -class producer(Producer): +class producer(Producer): # noqa: N801 """Use `Producer` instead.""" - pass - class Consumer: """Generates a logical plan from a substrait plan.""" @@ -194,7 +189,5 @@ def from_substrait_plan(ctx: SessionContext, plan: Plan) -> LogicalPlan: @deprecated("Use `Consumer` instead.") -class consumer(Consumer): +class consumer(Consumer): # noqa: N801 """Use `Consumer` instead.""" - - pass diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py index af7bcf2ed..603b7063d 100644 --- a/python/datafusion/udf.py +++ b/python/datafusion/udf.py @@ -22,15 +22,15 @@ import functools from abc import ABCMeta, abstractmethod from enum import Enum -from typing import TYPE_CHECKING, Callable, List, Optional, TypeVar +from typing import TYPE_CHECKING, Any, 
Callable, Optional, TypeVar, overload -import pyarrow +import pyarrow as pa import datafusion._internal as df_internal from datafusion.expr import Expr if TYPE_CHECKING: - _R = TypeVar("_R", bound=pyarrow.DataType) + _R = TypeVar("_R", bound=pa.DataType) class Volatility(Enum): @@ -72,7 +72,7 @@ class Volatility(Enum): for each output row, resulting in a unique random value for each row. """ - def __str__(self): + def __str__(self) -> str: """Returns the string equivalent.""" return self.name.lower() @@ -88,7 +88,7 @@ def __init__( self, name: str, func: Callable[..., _R], - input_types: pyarrow.DataType | list[pyarrow.DataType], + input_types: pa.DataType | list[pa.DataType], return_type: _R, volatility: Volatility | str, ) -> None: @@ -96,7 +96,7 @@ def __init__( See helper method :py:func:`udf` for argument details. """ - if isinstance(input_types, pyarrow.DataType): + if isinstance(input_types, pa.DataType): input_types = [input_types] self._udf = df_internal.ScalarUDF( name, func, input_types, return_type, str(volatility) @@ -111,7 +111,27 @@ def __call__(self, *args: Expr) -> Expr: args_raw = [arg.expr for arg in args] return Expr(self._udf.__call__(*args_raw)) - class udf: + @overload + @staticmethod + def udf( + input_types: list[pa.DataType], + return_type: _R, + volatility: Volatility | str, + name: Optional[str] = None, + ) -> Callable[..., ScalarUDF]: ... + + @overload + @staticmethod + def udf( + func: Callable[..., _R], + input_types: list[pa.DataType], + return_type: _R, + volatility: Volatility | str, + name: Optional[str] = None, + ) -> ScalarUDF: ... + + @staticmethod + def udf(*args: Any, **kwargs: Any): # noqa: D417 """Create a new User-Defined Function (UDF). This class can be used both as a **function** and as a **decorator**. @@ -125,7 +145,7 @@ class udf: Args: func (Callable, optional): **Only needed when calling as a function.** Skip this argument when using `udf` as a decorator. - input_types (list[pyarrow.DataType]): The data types of the arguments + input_types (list[pa.DataType]): The data types of the arguments to `func`. This list must be of the same length as the number of arguments. return_type (_R): The data type of the return value from the function. @@ -141,40 +161,28 @@ class udf: ``` def double_func(x): return x * 2 - double_udf = udf(double_func, [pyarrow.int32()], pyarrow.int32(), + double_udf = udf(double_func, [pa.int32()], pa.int32(), "volatile", "double_it") ``` **Using `udf` as a decorator:** ``` - @udf([pyarrow.int32()], pyarrow.int32(), "volatile", "double_it") + @udf([pa.int32()], pa.int32(), "volatile", "double_it") def double_udf(x): return x * 2 ``` """ - def __new__(cls, *args, **kwargs): - """Create a new UDF. 
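# The @overload-and-dispatch shape introduced above replaces the __new__-based
# trick removed just below. The underlying idea, reduced to a standalone toy
# (`register` and `tag` are hypothetical names, not datafusion APIs): branch
# on whether the first positional argument is callable, so one entry point
# serves both the plain-function and the decorator call styles.
from typing import Any, Callable


def register(*args: Any, **kwargs: Any) -> Any:
    def _function(func: Callable[..., Any], tag: str) -> Callable[..., Any]:
        func.tag = tag  # attach metadata when used as register(func, "t")
        return func

    def _decorator(tag: str) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
        def wrap(func: Callable[..., Any]) -> Callable[..., Any]:
            return _function(func, tag)

        return wrap

    if args and callable(args[0]):
        return _function(*args, **kwargs)  # called as register(func, "t")
    return _decorator(*args, **kwargs)  # called as @register("t")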
- - Trigger UDF function or decorator depending on if the first args is callable - """ - if args and callable(args[0]): - # Case 1: Used as a function, require the first parameter to be callable - return cls._function(*args, **kwargs) - else: - # Case 2: Used as a decorator with parameters - return cls._decorator(*args, **kwargs) - - @staticmethod def _function( func: Callable[..., _R], - input_types: list[pyarrow.DataType], + input_types: list[pa.DataType], return_type: _R, volatility: Volatility | str, name: Optional[str] = None, ) -> ScalarUDF: if not callable(func): - raise TypeError("`func` argument must be callable") + msg = "`func` argument must be callable" + raise TypeError(msg) if name is None: if hasattr(func, "__qualname__"): name = func.__qualname__.lower() @@ -188,49 +196,50 @@ def _function( volatility=volatility, ) - @staticmethod def _decorator( - input_types: list[pyarrow.DataType], + input_types: list[pa.DataType], return_type: _R, volatility: Volatility | str, name: Optional[str] = None, - ): - def decorator(func): + ) -> Callable: + def decorator(func: Callable): udf_caller = ScalarUDF.udf( func, input_types, return_type, volatility, name ) @functools.wraps(func) - def wrapper(*args, **kwargs): + def wrapper(*args: Any, **kwargs: Any): return udf_caller(*args, **kwargs) return wrapper return decorator + if args and callable(args[0]): + # Case 1: Used as a function, require the first parameter to be callable + return _function(*args, **kwargs) + # Case 2: Used as a decorator with parameters + return _decorator(*args, **kwargs) + class Accumulator(metaclass=ABCMeta): """Defines how an :py:class:`AggregateUDF` accumulates values.""" @abstractmethod - def state(self) -> List[pyarrow.Scalar]: + def state(self) -> list[pa.Scalar]: """Return the current state.""" - pass @abstractmethod - def update(self, *values: pyarrow.Array) -> None: + def update(self, *values: pa.Array) -> None: """Evaluate an array of values and update state.""" - pass @abstractmethod - def merge(self, states: List[pyarrow.Array]) -> None: + def merge(self, states: list[pa.Array]) -> None: """Merge a set of states.""" - pass @abstractmethod - def evaluate(self) -> pyarrow.Scalar: + def evaluate(self) -> pa.Scalar: """Return the resultant value.""" - pass class AggregateUDF: @@ -244,9 +253,9 @@ def __init__( self, name: str, accumulator: Callable[[], Accumulator], - input_types: list[pyarrow.DataType], - return_type: pyarrow.DataType, - state_type: list[pyarrow.DataType], + input_types: list[pa.DataType], + return_type: pa.DataType, + state_type: list[pa.DataType], volatility: Volatility | str, ) -> None: """Instantiate a user-defined aggregate function (UDAF). @@ -272,7 +281,29 @@ def __call__(self, *args: Expr) -> Expr: args_raw = [arg.expr for arg in args] return Expr(self._udaf.__call__(*args_raw)) - class udaf: + @overload + @staticmethod + def udaf( + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + state_type: list[pa.DataType], + volatility: Volatility | str, + name: Optional[str] = None, + ) -> Callable[..., AggregateUDF]: ... + + @overload + @staticmethod + def udaf( + accum: Callable[[], Accumulator], + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + state_type: list[pa.DataType], + volatility: Volatility | str, + name: Optional[str] = None, + ) -> AggregateUDF: ... + + @staticmethod + def udaf(*args: Any, **kwargs: Any): # noqa: D417 """Create a new User-Defined Aggregate Function (UDAF). 
This class allows you to define an **aggregate function** that can be used in @@ -300,13 +331,13 @@ class Summarize(Accumulator): def __init__(self, bias: float = 0.0): self._sum = pa.scalar(bias) - def state(self) -> List[pa.Scalar]: + def state(self) -> list[pa.Scalar]: return [self._sum] def update(self, values: pa.Array) -> None: self._sum = pa.scalar(self._sum.as_py() + pc.sum(values).as_py()) - def merge(self, states: List[pa.Array]) -> None: + def merge(self, states: list[pa.Array]) -> None: self._sum = pa.scalar(self._sum.as_py() + pc.sum(states[0]).as_py()) def evaluate(self) -> pa.Scalar: @@ -344,37 +375,23 @@ def udf4() -> Summarize: aggregation or window function calls. """ - def __new__(cls, *args, **kwargs): - """Create a new UDAF. - - Trigger UDAF function or decorator depending on if the first args is - callable - """ - if args and callable(args[0]): - # Case 1: Used as a function, require the first parameter to be callable - return cls._function(*args, **kwargs) - else: - # Case 2: Used as a decorator with parameters - return cls._decorator(*args, **kwargs) - - @staticmethod def _function( accum: Callable[[], Accumulator], - input_types: pyarrow.DataType | list[pyarrow.DataType], - return_type: pyarrow.DataType, - state_type: list[pyarrow.DataType], + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + state_type: list[pa.DataType], volatility: Volatility | str, name: Optional[str] = None, ) -> AggregateUDF: if not callable(accum): - raise TypeError("`func` must be callable.") - if not isinstance(accum.__call__(), Accumulator): - raise TypeError( - "Accumulator must implement the abstract base class Accumulator" - ) + msg = "`func` must be callable." + raise TypeError(msg) + if not isinstance(accum(), Accumulator): + msg = "Accumulator must implement the abstract base class Accumulator" + raise TypeError(msg) if name is None: - name = accum.__call__().__class__.__qualname__.lower() - if isinstance(input_types, pyarrow.DataType): + name = accum().__class__.__qualname__.lower() + if isinstance(input_types, pa.DataType): input_types = [input_types] return AggregateUDF( name=name, @@ -385,29 +402,34 @@ def _function( volatility=volatility, ) - @staticmethod def _decorator( - input_types: pyarrow.DataType | list[pyarrow.DataType], - return_type: pyarrow.DataType, - state_type: list[pyarrow.DataType], + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + state_type: list[pa.DataType], volatility: Volatility | str, name: Optional[str] = None, - ): - def decorator(accum: Callable[[], Accumulator]): + ) -> Callable[..., Callable[..., Expr]]: + def decorator(accum: Callable[[], Accumulator]) -> Callable[..., Expr]: udaf_caller = AggregateUDF.udaf( accum, input_types, return_type, state_type, volatility, name ) @functools.wraps(accum) - def wrapper(*args, **kwargs): + def wrapper(*args: Any, **kwargs: Any) -> Expr: return udaf_caller(*args, **kwargs) return wrapper return decorator + if args and callable(args[0]): + # Case 1: Used as a function, require the first parameter to be callable + return _function(*args, **kwargs) + # Case 2: Used as a decorator with parameters + return _decorator(*args, **kwargs) + -class WindowEvaluator(metaclass=ABCMeta): +class WindowEvaluator: """Evaluator class for user-defined window functions (UDWF). It is up to the user to decide which evaluate function is appropriate. 
@@ -423,7 +445,7 @@ class WindowEvaluator: +------------------------+--------------------------------+------------------+---------------------------+ | True | True/False | True/False | ``evaluate`` | +------------------------+--------------------------------+------------------+---------------------------+ - """ # noqa: W505 + """ # noqa: W505, E501 def memoize(self) -> None: """Perform a memoize operation to improve performance. @@ -436,9 +458,8 @@ def memoize(self) -> None: `memoize` is called after each input batch is processed, and such functions can save whatever they need """ - pass - def get_range(self, idx: int, num_rows: int) -> tuple[int, int]: + def get_range(self, idx: int, num_rows: int) -> tuple[int, int]: # noqa: ARG002 """Return the range for the window function. If `uses_window_frame` flag is `false`. This method is used to @@ -460,14 +481,17 @@ def is_causal(self) -> bool: """Get whether evaluator needs future data for its result.""" return False - def evaluate_all(self, values: list[pyarrow.Array], num_rows: int) -> pyarrow.Array: + def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: """Evaluate a window function on an entire input partition. This function is called once per input *partition* for window functions that *do not use* values from the window frame, such as - :py:func:`~datafusion.functions.row_number`, :py:func:`~datafusion.functions.rank`, - :py:func:`~datafusion.functions.dense_rank`, :py:func:`~datafusion.functions.percent_rank`, - :py:func:`~datafusion.functions.cume_dist`, :py:func:`~datafusion.functions.lead`, + :py:func:`~datafusion.functions.row_number`, + :py:func:`~datafusion.functions.rank`, + :py:func:`~datafusion.functions.dense_rank`, + :py:func:`~datafusion.functions.percent_rank`, + :py:func:`~datafusion.functions.cume_dist`, + :py:func:`~datafusion.functions.lead`, and :py:func:`~datafusion.functions.lag`. It produces the result of all rows in a single pass. It @@ -499,12 +523,11 @@ def evaluate_all(self, values: list[pyarrow.Ar .. code-block:: text avg(x) OVER (PARTITION BY y ORDER BY z ROWS BETWEEN 2 PRECEDING AND 3 FOLLOWING) - """ # noqa: W505 - pass + """ # noqa: W505, E501 def evaluate( - self, values: list[pyarrow.Array], eval_range: tuple[int, int] - ) -> pyarrow.Scalar: + self, values: list[pa.Array], eval_range: tuple[int, int] + ) -> pa.Scalar: """Evaluate window function on a range of rows in an input partition. This is the simplest and most general function to implement @@ -519,11 +542,10 @@ def evaluate( and evaluation results of ORDER BY expressions. If function has a single argument, `values[1..]` will contain ORDER BY expression results. """ - pass def evaluate_all_with_rank( self, num_rows: int, ranks_in_partition: list[tuple[int, int]] - ) -> pyarrow.Array: + ) -> pa.Array: """Called for window functions that only need the rank of a row. Evaluate the partition evaluator against the partition using @@ -552,7 +574,6 @@ def evaluate_all_with_rank( The user must implement this method if ``include_rank`` returns True. """ - pass def supports_bounded_execution(self) -> bool: """Can the window function be incrementally computed using bounded memory?""" @@ -567,10 +588,6 @@ def include_rank(self) -> bool: return False -if TYPE_CHECKING: - _W = TypeVar("_W", bound=WindowEvaluator) - - class WindowUDF: """Class for performing window user-defined functions (UDF).
@@ -582,8 +599,8 @@ def __init__( self, name: str, func: Callable[[], WindowEvaluator], - input_types: list[pyarrow.DataType], - return_type: pyarrow.DataType, + input_types: list[pa.DataType], + return_type: pa.DataType, volatility: Volatility | str, ) -> None: """Instantiate a user-defined window function (UDWF). @@ -607,8 +624,8 @@ def __call__(self, *args: Expr) -> Expr: @staticmethod def udwf( func: Callable[[], WindowEvaluator], - input_types: pyarrow.DataType | list[pyarrow.DataType], - return_type: pyarrow.DataType, + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, volatility: Volatility | str, name: Optional[str] = None, ) -> WindowUDF: @@ -648,16 +665,16 @@ def bias_10() -> BiasedNumbers: Returns: A user-defined window function. - """ # noqa W505 + """ # noqa: W505, E501 if not callable(func): - raise TypeError("`func` must be callable.") - if not isinstance(func.__call__(), WindowEvaluator): - raise TypeError( - "`func` must implement the abstract base class WindowEvaluator" - ) + msg = "`func` must be callable." + raise TypeError(msg) + if not isinstance(func(), WindowEvaluator): + msg = "`func` must implement the abstract base class WindowEvaluator" + raise TypeError(msg) if name is None: - name = func.__call__().__class__.__qualname__.lower() - if isinstance(input_types, pyarrow.DataType): + name = func().__class__.__qualname__.lower() + if isinstance(input_types, pa.DataType): input_types = [input_types] return WindowUDF( name=name, @@ -666,3 +683,10 @@ def bias_10() -> BiasedNumbers: return_type=return_type, volatility=volatility, ) + + +# Convenience exports so we can import instead of treating as +# variables at the package root +udf = ScalarUDF.udf +udaf = AggregateUDF.udaf +udwf = WindowUDF.udwf diff --git a/python/tests/generic.py b/python/tests/generic.py index 0177e2df0..1b98fdf9e 100644 --- a/python/tests/generic.py +++ b/python/tests/generic.py @@ -16,6 +16,7 @@ # under the License. 
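# A usage sketch tying the convenience exports added at the end of udf.py
# above to the udwf() signature documented there: define a trivial evaluator
# and wrap it as a window UDF. Hedged example, not repository code;
# "SmoothTwo" and "smooth_two" are hypothetical names, and the imports assume
# the datafusion package built by this repository is installed.
import pyarrow as pa
from datafusion import udwf
from datafusion.udf import WindowEvaluator


class SmoothTwo(WindowEvaluator):
    """Average each value with its predecessor."""

    def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array:
        vals = values[0].to_pylist()
        out = [vals[0]] + [(vals[i - 1] + vals[i]) / 2 for i in range(1, num_rows)]
        return pa.array(out)


smooth_two = udwf(SmoothTwo, [pa.float64()], pa.float64(), "immutable")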
import datetime +from datetime import timezone import numpy as np import pyarrow as pa @@ -26,29 +27,29 @@ def data(): - np.random.seed(1) + rng = np.random.default_rng(1) data = np.concatenate( [ - np.random.normal(0, 0.01, size=50), - np.random.normal(50, 0.01, size=50), + rng.normal(0, 0.01, size=50), + rng.normal(50, 0.01, size=50), ] ) return pa.array(data) def data_with_nans(): - np.random.seed(0) - data = np.random.normal(0, 0.01, size=50) - mask = np.random.randint(0, 2, size=50) + rng = np.random.default_rng(0) + data = rng.normal(0, 0.01, size=50) + mask = rng.integers(0, 2, size=50) data[mask == 0] = np.nan return data def data_datetime(f): data = [ - datetime.datetime.now(), - datetime.datetime.now() - datetime.timedelta(days=1), - datetime.datetime.now() + datetime.timedelta(days=1), + datetime.datetime.now(tz=timezone.utc), + datetime.datetime.now(tz=timezone.utc) - datetime.timedelta(days=1), + datetime.datetime.now(tz=timezone.utc) + datetime.timedelta(days=1), ] return pa.array(data, type=pa.timestamp(f), mask=np.array([False, True, False])) diff --git a/python/tests/test_aggregation.py b/python/tests/test_aggregation.py index 5ef46131b..61b1c7d80 100644 --- a/python/tests/test_aggregation.py +++ b/python/tests/test_aggregation.py @@ -66,7 +66,7 @@ def df_aggregate_100(): @pytest.mark.parametrize( - "agg_expr, calc_expected", + ("agg_expr", "calc_expected"), [ (f.avg(column("a")), lambda a, b, c, d: np.array(np.average(a))), ( @@ -114,7 +114,7 @@ def test_aggregation_stats(df, agg_expr, calc_expected): @pytest.mark.parametrize( - "agg_expr, expected, array_sort", + ("agg_expr", "expected", "array_sort"), [ (f.approx_distinct(column("b")), pa.array([2], type=pa.uint64()), False), ( @@ -182,12 +182,11 @@ def test_aggregation(df, agg_expr, expected, array_sort): agg_df.show() result = agg_df.collect()[0] - print(result) assert result.column(0) == expected @pytest.mark.parametrize( - "name,expr,expected", + ("name", "expr", "expected"), [ ( "approx_percentile_cont", @@ -299,7 +298,9 @@ def test_aggregate_100(df_aggregate_100, name, expr, expected): ] -@pytest.mark.parametrize("name,expr,result", data_test_bitwise_and_boolean_functions) +@pytest.mark.parametrize( + ("name", "expr", "result"), data_test_bitwise_and_boolean_functions +) def test_bit_and_bool_fns(df, name, expr, result): df = df.aggregate([], [expr.alias(name)]) @@ -311,7 +312,7 @@ @pytest.mark.parametrize( - "name,expr,result", + ("name", "expr", "result"), [ ("first_value", f.first_value(column("a")), [0, 4]), ( @@ -361,7 +362,6 @@ def test_bit_and_bool_fns(df, name, expr, result): ), [8, 9], ), - ("first_value", f.first_value(column("a")), [0, 4]), ( "nth_value_ordered", f.nth_value(column("a"), 2, order_by=[column("a").sort(ascending=False)]), @@ -401,7 +401,7 @@ def test_first_last_value(df_partitioned, name, expr, result) -> None: @pytest.mark.parametrize( - "name,expr,result", + ("name", "expr", "result"), [ ("string_agg", f.string_agg(column("a"), ","), "one,two,three,two"), ("string_agg", f.string_agg(column("b"), ""), "03124"), diff --git a/python/tests/test_catalog.py b/python/tests/test_catalog.py index 214f6b165..23b328458 100644 --- a/python/tests/test_catalog.py +++ b/python/tests/test_catalog.py @@ -19,6 +19,9 @@ import pytest +# Note we take in `database` as a variable even though we don't use +# it because that will cause the fixture to set up the context with +# the tables we need.
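# A quick reference for the numpy migration above (ruff's NPY002 rule): the
# legacy global-state API next to the Generator API. Values here are
# illustrative only. Note that the Generator counterpart of randint() is
# integers(), not normal(), which is why the 0/1 mask in generic.py above is
# built with rng.integers().
import numpy as np

np.random.seed(0)  # legacy API: seeds hidden global state
legacy_mask = np.random.randint(0, 2, size=5)  # ints drawn from {0, 1}

rng = np.random.default_rng(0)  # preferred API: explicit Generator object
mask = rng.integers(0, 2, size=5)  # the direct replacement for randint()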
def test_basic(ctx, database): with pytest.raises(KeyError): ctx.catalog("non-existent") @@ -26,10 +29,10 @@ def test_basic(ctx, database): default = ctx.catalog() assert default.names() == ["public"] - for database in [default.database("public"), default.database()]: - assert database.names() == {"csv1", "csv", "csv2"} + for db in [default.database("public"), default.database()]: + assert db.names() == {"csv1", "csv", "csv2"} - table = database.table("csv") + table = db.table("csv") assert table.kind == "physical" assert table.schema == pa.schema( [ diff --git a/python/tests/test_context.py b/python/tests/test_context.py index 91046e6b8..7a0a7aa08 100644 --- a/python/tests/test_context.py +++ b/python/tests/test_context.py @@ -16,7 +16,6 @@ # under the License. import datetime as dt import gzip -import os import pathlib import pyarrow as pa @@ -45,7 +44,7 @@ def test_create_context_runtime_config_only(): SessionContext(runtime=RuntimeEnvBuilder()) -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_runtime_configs(tmp_path, path_to_str): path1 = tmp_path / "dir1" path2 = tmp_path / "dir2" @@ -62,7 +61,7 @@ def test_runtime_configs(tmp_path, path_to_str): assert db is not None -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_temporary_files(tmp_path, path_to_str): path = str(tmp_path) if path_to_str else tmp_path @@ -79,14 +78,14 @@ def test_create_context_with_all_valid_args(): runtime = RuntimeEnvBuilder().with_disk_manager_os().with_fair_spill_pool(10000000) config = ( SessionConfig() - .with_create_default_catalog_and_schema(True) + .with_create_default_catalog_and_schema(enabled=True) .with_default_catalog_and_schema("foo", "bar") .with_target_partitions(1) - .with_information_schema(True) - .with_repartition_joins(False) - .with_repartition_aggregations(False) - .with_repartition_windows(False) - .with_parquet_pruning(False) + .with_information_schema(enabled=True) + .with_repartition_joins(enabled=False) + .with_repartition_aggregations(enabled=False) + .with_repartition_windows(enabled=False) + .with_parquet_pruning(enabled=False) ) ctx = SessionContext(config, runtime) @@ -167,7 +166,7 @@ def test_from_arrow_table(ctx): def record_batch_generator(num_batches: int): schema = pa.schema([("a", pa.int64()), ("b", pa.int64())]) - for i in range(num_batches): + for _i in range(num_batches): yield pa.RecordBatch.from_arrays( [pa.array([1, 2, 3]), pa.array([4, 5, 6])], schema=schema ) @@ -492,10 +491,10 @@ def test_table_not_found(ctx): def test_read_json(ctx): - path = os.path.dirname(os.path.abspath(__file__)) + path = pathlib.Path(__file__).parent.resolve() # Default - test_data_path = os.path.join(path, "data_test_context", "data.json") + test_data_path = path / "data_test_context" / "data.json" df = ctx.read_json(test_data_path) result = df.collect() @@ -515,7 +514,7 @@ def test_read_json(ctx): assert result[0].schema == schema # File extension - test_data_path = os.path.join(path, "data_test_context", "data.json") + test_data_path = path / "data_test_context" / "data.json" df = ctx.read_json(test_data_path, file_extension=".json") result = df.collect() @@ -524,15 +523,17 @@ def test_read_json(ctx): def test_read_json_compressed(ctx, tmp_path): - path = os.path.dirname(os.path.abspath(__file__)) - test_data_path = os.path.join(path, "data_test_context", "data.json") + path = pathlib.Path(__file__).parent.resolve() + test_data_path = path / 
"data_test_context" / "data.json" # File compression type gzip_path = tmp_path / "data.json.gz" - with open(test_data_path, "rb") as csv_file: - with gzip.open(gzip_path, "wb") as gzipped_file: - gzipped_file.writelines(csv_file) + with ( + pathlib.Path.open(test_data_path, "rb") as csv_file, + gzip.open(gzip_path, "wb") as gzipped_file, + ): + gzipped_file.writelines(csv_file) df = ctx.read_json(gzip_path, file_extension=".gz", file_compression_type="gz") result = df.collect() @@ -563,14 +564,16 @@ def test_read_csv_list(ctx): def test_read_csv_compressed(ctx, tmp_path): - test_data_path = "testing/data/csv/aggregate_test_100.csv" + test_data_path = pathlib.Path("testing/data/csv/aggregate_test_100.csv") # File compression type gzip_path = tmp_path / "aggregate_test_100.csv.gz" - with open(test_data_path, "rb") as csv_file: - with gzip.open(gzip_path, "wb") as gzipped_file: - gzipped_file.writelines(csv_file) + with ( + pathlib.Path.open(test_data_path, "rb") as csv_file, + gzip.open(gzip_path, "wb") as gzipped_file, + ): + gzipped_file.writelines(csv_file) csv_df = ctx.read_csv(gzip_path, file_extension=".gz", file_compression_type="gz") csv_df.select(column("c1")).show() @@ -603,7 +606,7 @@ def test_create_sql_options(): def test_sql_with_options_no_ddl(ctx): sql = "CREATE TABLE IF NOT EXISTS valuetable AS VALUES(1,'HELLO'),(12,'DATAFUSION')" ctx.sql(sql) - options = SQLOptions().with_allow_ddl(False) + options = SQLOptions().with_allow_ddl(allow=False) with pytest.raises(Exception, match="DDL"): ctx.sql_with_options(sql, options=options) @@ -618,7 +621,7 @@ def test_sql_with_options_no_dml(ctx): ctx.register_dataset(table_name, dataset) sql = f'INSERT INTO "{table_name}" VALUES (1, 2), (2, 3);' ctx.sql(sql) - options = SQLOptions().with_allow_dml(False) + options = SQLOptions().with_allow_dml(allow=False) with pytest.raises(Exception, match="DML"): ctx.sql_with_options(sql, options=options) @@ -626,6 +629,6 @@ def test_sql_with_options_no_dml(ctx): def test_sql_with_options_no_statements(ctx): sql = "SET time zone = 1;" ctx.sql(sql) - options = SQLOptions().with_allow_statements(False) + options = SQLOptions().with_allow_statements(allow=False) with pytest.raises(Exception, match="SetVariable"): ctx.sql_with_options(sql, options=options) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index c636e896a..d084f12dd 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -339,7 +339,7 @@ def test_join(): # Verify we don't make a breaking change to pre-43.0.0 # where users would pass join_keys as a positional argument - df2 = df.join(df1, (["a"], ["a"]), how="inner") # type: ignore + df2 = df.join(df1, (["a"], ["a"]), how="inner") df2.show() df2 = df2.sort(column("l.a")) table = pa.Table.from_batches(df2.collect()) @@ -375,17 +375,17 @@ def test_join_invalid_params(): with pytest.raises( ValueError, match=r"`left_on` or `right_on` should not provided with `on`" ): - df2 = df.join(df1, on="a", how="inner", right_on="test") # type: ignore + df2 = df.join(df1, on="a", how="inner", right_on="test") with pytest.raises( ValueError, match=r"`left_on` and `right_on` should both be provided." ): - df2 = df.join(df1, left_on="a", how="inner") # type: ignore + df2 = df.join(df1, left_on="a", how="inner") with pytest.raises( ValueError, match=r"either `on` or `left_on` and `right_on` should be provided." 
): - df2 = df.join(df1, how="inner") # type: ignore + df2 = df.join(df1, how="inner") def test_join_on(): @@ -567,7 +567,7 @@ def test_distinct(): ] -@pytest.mark.parametrize("name,expr,result", data_test_window_functions) +@pytest.mark.parametrize(("name", "expr", "result"), data_test_window_functions) def test_window_functions(partitioned_df, name, expr, result): df = partitioned_df.select( column("a"), column("b"), column("c"), f.alias(expr, name) @@ -731,7 +731,7 @@ def test_execution_plan(aggregate_df): plan = aggregate_df.execution_plan() expected = ( - "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[sum(test.c2)]\n" # noqa: E501 + "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[sum(test.c2)]\n" ) assert expected == plan.display() @@ -756,7 +756,7 @@ def test_execution_plan(aggregate_df): ctx = SessionContext() rows_returned = 0 - for idx in range(0, plan.partition_count): + for idx in range(plan.partition_count): stream = ctx.execute(plan, idx) try: batch = stream.next() @@ -885,7 +885,7 @@ def test_union_distinct(ctx): ) df_c = ctx.create_dataframe([[batch]]).sort(column("a")) - df_a_u_b = df_a.union(df_b, True).sort(column("a")) + df_a_u_b = df_a.union(df_b, distinct=True).sort(column("a")) assert df_c.collect() == df_a_u_b.collect() assert df_c.collect() == df_a_u_b.collect() @@ -954,8 +954,6 @@ def test_to_arrow_table(df): def test_execute_stream(df): stream = df.execute_stream() - for s in stream: - print(type(s)) assert all(batch is not None for batch in stream) assert not list(stream) # after one iteration the generator must be exhausted @@ -969,7 +967,7 @@ def test_execute_stream_to_arrow_table(df, schema): (batch.to_pyarrow() for batch in stream), schema=df.schema() ) else: - pyarrow_table = pa.Table.from_batches((batch.to_pyarrow() for batch in stream)) + pyarrow_table = pa.Table.from_batches(batch.to_pyarrow() for batch in stream) assert isinstance(pyarrow_table, pa.Table) assert pyarrow_table.shape == (3, 3) @@ -1033,7 +1031,7 @@ def test_describe(df): } -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_write_csv(ctx, df, tmp_path, path_to_str): path = str(tmp_path) if path_to_str else tmp_path @@ -1046,7 +1044,7 @@ def test_write_csv(ctx, df, tmp_path, path_to_str): assert result == expected -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_write_json(ctx, df, tmp_path, path_to_str): path = str(tmp_path) if path_to_str else tmp_path @@ -1059,7 +1057,7 @@ def test_write_json(ctx, df, tmp_path, path_to_str): assert result == expected -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_write_parquet(df, tmp_path, path_to_str): path = str(tmp_path) if path_to_str else tmp_path @@ -1071,7 +1069,7 @@ def test_write_parquet(df, tmp_path, path_to_str): @pytest.mark.parametrize( - "compression, compression_level", + ("compression", "compression_level"), [("gzip", 6), ("brotli", 7), ("zstd", 15)], ) def test_write_compressed_parquet(df, tmp_path, compression, compression_level): @@ -1082,7 +1080,7 @@ def test_write_compressed_parquet(df, tmp_path, compression, compression_level): ) # test that the actual compression scheme is the one written - for root, dirs, files in os.walk(path): + for _root, _dirs, files in os.walk(path): for file in files: if file.endswith(".parquet"): metadata = pq.ParquetFile(tmp_path / file).metadata.to_dict() @@ 
-1097,7 +1095,7 @@ def test_write_compressed_parquet(df, tmp_path, compression, compression_level): @pytest.mark.parametrize( - "compression, compression_level", + ("compression", "compression_level"), [("gzip", 12), ("brotli", 15), ("zstd", 23), ("wrong", 12)], ) def test_write_compressed_parquet_wrong_compression_level( @@ -1152,7 +1150,7 @@ def test_dataframe_export(df) -> None: table = pa.table(df, schema=desired_schema) assert table.num_columns == 1 assert table.num_rows == 3 - for i in range(0, 3): + for i in range(3): assert table[0][i].as_py() is None # Expect an error when we cannot convert schema @@ -1186,8 +1184,8 @@ def add_with_parameter(df_internal, value: Any) -> DataFrame: result = df.to_pydict() assert result["a"] == [1, 2, 3] - assert result["string_col"] == ["string data" for _i in range(0, 3)] - assert result["new_col"] == [3 for _i in range(0, 3)] + assert result["string_col"] == ["string data" for _i in range(3)] + assert result["new_col"] == [3 for _i in range(3)] def test_dataframe_repr_html(df) -> None: diff --git a/python/tests/test_expr.py b/python/tests/test_expr.py index 354c7e180..926e69845 100644 --- a/python/tests/test_expr.py +++ b/python/tests/test_expr.py @@ -85,18 +85,14 @@ def test_limit(test_ctx): plan = plan.to_variant() assert isinstance(plan, Limit) - # TODO: Upstream now has expressions for skip and fetch - # REF: https://github.com/apache/datafusion/pull/12836 - # assert plan.skip() == 0 + assert "Skip: None" in str(plan) df = test_ctx.sql("select c1 from test LIMIT 10 OFFSET 5") plan = df.logical_plan() plan = plan.to_variant() assert isinstance(plan, Limit) - # TODO: Upstream now has expressions for skip and fetch - # REF: https://github.com/apache/datafusion/pull/12836 - # assert plan.skip() == 5 + assert "Skip: Some(Literal(Int64(5)))" in str(plan) def test_aggregate_query(test_ctx): @@ -165,6 +161,7 @@ def traverse_logical_plan(plan): res = traverse_logical_plan(input_plan) if res is not None: return res + return None ctx = SessionContext() data = {"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]} @@ -176,7 +173,7 @@ def traverse_logical_plan(plan): assert variant.expr().to_variant().qualified_name() == "table1.name" assert ( str(variant.list()) - == '[Expr(Utf8("dfa")), Expr(Utf8("ad")), Expr(Utf8("dfre")), Expr(Utf8("vsa"))]' + == '[Expr(Utf8("dfa")), Expr(Utf8("ad")), Expr(Utf8("dfre")), Expr(Utf8("vsa"))]' # noqa: E501 ) assert not variant.negated() diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index fca05bb8f..ed88a16e3 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
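The `traverse_logical_plan` fix above (adding the explicit trailing `return None`) belongs to a generic pattern for searching a DataFusion logical plan. A condensed sketch of that pattern, assuming the `to_variant()` and `inputs()` methods already used in these tests:

```
def find_first_variant(plan, variant_type):
    # Depth-first search over the plan tree: return the first node whose
    # variant matches `variant_type`, or None when the subtree has no match.
    variant = plan.to_variant()
    if isinstance(variant, variant_type):
        return variant
    for child in plan.inputs():
        found = find_first_variant(child, variant_type)
        if found is not None:
            return found
    return None
```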
import math -from datetime import datetime +from datetime import datetime, timezone import numpy as np import pyarrow as pa @@ -25,6 +25,8 @@ np.seterr(invalid="ignore") +DEFAULT_TZ = timezone.utc + @pytest.fixture def df(): @@ -37,9 +39,9 @@ def df(): pa.array(["hello ", " world ", " !"], type=pa.string_view()), pa.array( [ - datetime(2022, 12, 31), - datetime(2027, 6, 26), - datetime(2020, 7, 2), + datetime(2022, 12, 31, tzinfo=DEFAULT_TZ), + datetime(2027, 6, 26, tzinfo=DEFAULT_TZ), + datetime(2020, 7, 2, tzinfo=DEFAULT_TZ), ] ), pa.array([False, True, True]), @@ -221,12 +223,12 @@ def py_indexof(arr, v): def py_arr_remove(arr, v, n=None): new_arr = arr[:] found = 0 - while found != n: - try: + try: + while found != n: new_arr.remove(v) found += 1 - except ValueError: - break + except ValueError: + pass return new_arr @@ -234,13 +236,13 @@ def py_arr_remove(arr, v, n=None): def py_arr_replace(arr, from_, to, n=None): new_arr = arr[:] found = 0 - while found != n: - try: + try: + while found != n: idx = new_arr.index(from_) new_arr[idx] = to found += 1 - except ValueError: - break + except ValueError: + pass return new_arr @@ -268,266 +270,266 @@ def py_flatten(arr): @pytest.mark.parametrize( ("stmt", "py_expr"), [ - [ + ( lambda col: f.array_append(col, literal(99.0)), lambda data: [np.append(arr, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.array_push_back(col, literal(99.0)), lambda data: [np.append(arr, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.list_append(col, literal(99.0)), lambda data: [np.append(arr, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.list_push_back(col, literal(99.0)), lambda data: [np.append(arr, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.array_concat(col, col), lambda data: [np.concatenate([arr, arr]) for arr in data], - ], - [ + ), + ( lambda col: f.array_cat(col, col), lambda data: [np.concatenate([arr, arr]) for arr in data], - ], - [ + ), + ( lambda col: f.list_cat(col, col), lambda data: [np.concatenate([arr, arr]) for arr in data], - ], - [ + ), + ( lambda col: f.list_concat(col, col), lambda data: [np.concatenate([arr, arr]) for arr in data], - ], - [ + ), + ( lambda col: f.array_dims(col), lambda data: [[len(r)] for r in data], - ], - [ + ), + ( lambda col: f.array_distinct(col), lambda data: [list(set(r)) for r in data], - ], - [ + ), + ( lambda col: f.list_distinct(col), lambda data: [list(set(r)) for r in data], - ], - [ + ), + ( lambda col: f.list_dims(col), lambda data: [[len(r)] for r in data], - ], - [ + ), + ( lambda col: f.array_element(col, literal(1)), lambda data: [r[0] for r in data], - ], - [ + ), + ( lambda col: f.array_empty(col), lambda data: [len(r) == 0 for r in data], - ], - [ + ), + ( lambda col: f.empty(col), lambda data: [len(r) == 0 for r in data], - ], - [ + ), + ( lambda col: f.array_extract(col, literal(1)), lambda data: [r[0] for r in data], - ], - [ + ), + ( lambda col: f.list_element(col, literal(1)), lambda data: [r[0] for r in data], - ], - [ + ), + ( lambda col: f.list_extract(col, literal(1)), lambda data: [r[0] for r in data], - ], - [ + ), + ( lambda col: f.array_length(col), lambda data: [len(r) for r in data], - ], - [ + ), + ( lambda col: f.list_length(col), lambda data: [len(r) for r in data], - ], - [ + ), + ( lambda col: f.array_has(col, literal(1.0)), lambda data: [1.0 in r for r in data], - ], - [ + ), + ( lambda col: f.array_has_all( col, f.make_array(*[literal(v) for v in [1.0, 3.0, 5.0]]) ), lambda data: [np.all([v in r for v in [1.0, 3.0, 5.0]]) for r in data], - 
], - [ + ), + ( lambda col: f.array_has_any( col, f.make_array(*[literal(v) for v in [1.0, 3.0, 5.0]]) ), lambda data: [np.any([v in r for v in [1.0, 3.0, 5.0]]) for r in data], - ], - [ + ), + ( lambda col: f.array_position(col, literal(1.0)), lambda data: [py_indexof(r, 1.0) for r in data], - ], - [ + ), + ( lambda col: f.array_indexof(col, literal(1.0)), lambda data: [py_indexof(r, 1.0) for r in data], - ], - [ + ), + ( lambda col: f.list_position(col, literal(1.0)), lambda data: [py_indexof(r, 1.0) for r in data], - ], - [ + ), + ( lambda col: f.list_indexof(col, literal(1.0)), lambda data: [py_indexof(r, 1.0) for r in data], - ], - [ + ), + ( lambda col: f.array_positions(col, literal(1.0)), lambda data: [[i + 1 for i, _v in enumerate(r) if _v == 1.0] for r in data], - ], - [ + ), + ( lambda col: f.list_positions(col, literal(1.0)), lambda data: [[i + 1 for i, _v in enumerate(r) if _v == 1.0] for r in data], - ], - [ + ), + ( lambda col: f.array_ndims(col), lambda data: [np.array(r).ndim for r in data], - ], - [ + ), + ( lambda col: f.list_ndims(col), lambda data: [np.array(r).ndim for r in data], - ], - [ + ), + ( lambda col: f.array_prepend(literal(99.0), col), lambda data: [np.insert(arr, 0, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.array_push_front(literal(99.0), col), lambda data: [np.insert(arr, 0, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.list_prepend(literal(99.0), col), lambda data: [np.insert(arr, 0, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.list_push_front(literal(99.0), col), lambda data: [np.insert(arr, 0, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.array_pop_back(col), lambda data: [arr[:-1] for arr in data], - ], - [ + ), + ( lambda col: f.array_pop_front(col), lambda data: [arr[1:] for arr in data], - ], - [ + ), + ( lambda col: f.array_remove(col, literal(3.0)), lambda data: [py_arr_remove(arr, 3.0, 1) for arr in data], - ], - [ + ), + ( lambda col: f.list_remove(col, literal(3.0)), lambda data: [py_arr_remove(arr, 3.0, 1) for arr in data], - ], - [ + ), + ( lambda col: f.array_remove_n(col, literal(3.0), literal(2)), lambda data: [py_arr_remove(arr, 3.0, 2) for arr in data], - ], - [ + ), + ( lambda col: f.list_remove_n(col, literal(3.0), literal(2)), lambda data: [py_arr_remove(arr, 3.0, 2) for arr in data], - ], - [ + ), + ( lambda col: f.array_remove_all(col, literal(3.0)), lambda data: [py_arr_remove(arr, 3.0) for arr in data], - ], - [ + ), + ( lambda col: f.list_remove_all(col, literal(3.0)), lambda data: [py_arr_remove(arr, 3.0) for arr in data], - ], - [ + ), + ( lambda col: f.array_repeat(col, literal(2)), lambda data: [[arr] * 2 for arr in data], - ], - [ + ), + ( lambda col: f.list_repeat(col, literal(2)), lambda data: [[arr] * 2 for arr in data], - ], - [ + ), + ( lambda col: f.array_replace(col, literal(3.0), literal(4.0)), lambda data: [py_arr_replace(arr, 3.0, 4.0, 1) for arr in data], - ], - [ + ), + ( lambda col: f.list_replace(col, literal(3.0), literal(4.0)), lambda data: [py_arr_replace(arr, 3.0, 4.0, 1) for arr in data], - ], - [ + ), + ( lambda col: f.array_replace_n(col, literal(3.0), literal(4.0), literal(1)), lambda data: [py_arr_replace(arr, 3.0, 4.0, 1) for arr in data], - ], - [ + ), + ( lambda col: f.list_replace_n(col, literal(3.0), literal(4.0), literal(2)), lambda data: [py_arr_replace(arr, 3.0, 4.0, 2) for arr in data], - ], - [ + ), + ( lambda col: f.array_replace_all(col, literal(3.0), literal(4.0)), lambda data: [py_arr_replace(arr, 3.0, 4.0) for arr in data], - ], - [ + ), + ( 
lambda col: f.list_replace_all(col, literal(3.0), literal(4.0)), lambda data: [py_arr_replace(arr, 3.0, 4.0) for arr in data], - ], - [ + ), + ( lambda col: f.array_sort(col, descending=True, null_first=True), lambda data: [np.sort(arr)[::-1] for arr in data], - ], - [ + ), + ( lambda col: f.list_sort(col, descending=False, null_first=False), lambda data: [np.sort(arr) for arr in data], - ], - [ + ), + ( lambda col: f.array_slice(col, literal(2), literal(4)), lambda data: [arr[1:4] for arr in data], - ], + ), pytest.param( lambda col: f.list_slice(col, literal(-1), literal(2)), lambda data: [arr[-1:2] for arr in data], ), - [ + ( lambda col: f.array_intersect(col, literal([3.0, 4.0])), lambda data: [np.intersect1d(arr, [3.0, 4.0]) for arr in data], - ], - [ + ), + ( lambda col: f.list_intersect(col, literal([3.0, 4.0])), lambda data: [np.intersect1d(arr, [3.0, 4.0]) for arr in data], - ], - [ + ), + ( lambda col: f.array_union(col, literal([12.0, 999.0])), lambda data: [np.union1d(arr, [12.0, 999.0]) for arr in data], - ], - [ + ), + ( lambda col: f.list_union(col, literal([12.0, 999.0])), lambda data: [np.union1d(arr, [12.0, 999.0]) for arr in data], - ], - [ + ), + ( lambda col: f.array_except(col, literal([3.0])), lambda data: [np.setdiff1d(arr, [3.0]) for arr in data], - ], - [ + ), + ( lambda col: f.list_except(col, literal([3.0])), lambda data: [np.setdiff1d(arr, [3.0]) for arr in data], - ], - [ + ), + ( lambda col: f.array_resize(col, literal(10), literal(0.0)), lambda data: [py_arr_resize(arr, 10, 0.0) for arr in data], - ], - [ + ), + ( lambda col: f.list_resize(col, literal(10), literal(0.0)), lambda data: [py_arr_resize(arr, 10, 0.0) for arr in data], - ], - [ + ), + ( lambda col: f.range(literal(1), literal(5), literal(2)), lambda data: [np.arange(1, 5, 2)], - ], + ), ], ) def test_array_functions(stmt, py_expr): @@ -611,22 +613,22 @@ def test_make_array_functions(make_func): @pytest.mark.parametrize( ("stmt", "py_expr"), [ - [ + ( f.array_to_string(column("arr"), literal(",")), lambda data: [",".join([str(int(v)) for v in r]) for r in data], - ], - [ + ), + ( f.array_join(column("arr"), literal(",")), lambda data: [",".join([str(int(v)) for v in r]) for r in data], - ], - [ + ), + ( f.list_to_string(column("arr"), literal(",")), lambda data: [",".join([str(int(v)) for v in r]) for r in data], - ], - [ + ), + ( f.list_join(column("arr"), literal(",")), lambda data: [",".join([str(int(v)) for v in r]) for r in data], - ], + ), ], ) def test_array_function_obj_tests(stmt, py_expr): @@ -640,7 +642,7 @@ def test_array_function_obj_tests(stmt, py_expr): @pytest.mark.parametrize( - "function, expected_result", + ("function", "expected_result"), [ ( f.ascii(column("a")), @@ -894,54 +896,72 @@ def test_temporal_functions(df): assert result.column(0) == pa.array([12, 6, 7], type=pa.int32()) assert result.column(1) == pa.array([2022, 2027, 2020], type=pa.int32()) assert result.column(2) == pa.array( - [datetime(2022, 12, 1), datetime(2027, 6, 1), datetime(2020, 7, 1)], - type=pa.timestamp("us"), + [ + datetime(2022, 12, 1, tzinfo=DEFAULT_TZ), + datetime(2027, 6, 1, tzinfo=DEFAULT_TZ), + datetime(2020, 7, 1, tzinfo=DEFAULT_TZ), + ], + type=pa.timestamp("ns", tz=DEFAULT_TZ), ) assert result.column(3) == pa.array( - [datetime(2022, 12, 31), datetime(2027, 6, 26), datetime(2020, 7, 2)], - type=pa.timestamp("us"), + [ + datetime(2022, 12, 31, tzinfo=DEFAULT_TZ), + datetime(2027, 6, 26, tzinfo=DEFAULT_TZ), + datetime(2020, 7, 2, tzinfo=DEFAULT_TZ), + ], + type=pa.timestamp("ns", 
tz=DEFAULT_TZ), ) assert result.column(4) == pa.array( [ - datetime(2022, 12, 30, 23, 47, 30), - datetime(2027, 6, 25, 23, 47, 30), - datetime(2020, 7, 1, 23, 47, 30), + datetime(2022, 12, 30, 23, 47, 30, tzinfo=DEFAULT_TZ), + datetime(2027, 6, 25, 23, 47, 30, tzinfo=DEFAULT_TZ), + datetime(2020, 7, 1, 23, 47, 30, tzinfo=DEFAULT_TZ), ], - type=pa.timestamp("ns"), + type=pa.timestamp("ns", tz=DEFAULT_TZ), ) assert result.column(5) == pa.array( - [datetime(2023, 1, 10, 20, 52, 54)] * 3, type=pa.timestamp("s") + [datetime(2023, 1, 10, 20, 52, 54, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("s"), ) assert result.column(6) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") + [datetime(2023, 9, 7, 5, 6, 14, 523952, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("ns"), ) assert result.column(7) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14)] * 3, type=pa.timestamp("s") + [datetime(2023, 9, 7, 5, 6, 14, tzinfo=DEFAULT_TZ)] * 3, type=pa.timestamp("s") ) assert result.column(8) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523000)] * 3, type=pa.timestamp("ms") + [datetime(2023, 9, 7, 5, 6, 14, 523000, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("ms"), ) assert result.column(9) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("us") + [datetime(2023, 9, 7, 5, 6, 14, 523952, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("us"), ) assert result.column(10) == pa.array([31, 26, 2], type=pa.int32()) assert result.column(11) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") + [datetime(2023, 9, 7, 5, 6, 14, 523952, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("ns"), ) assert result.column(12) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14)] * 3, type=pa.timestamp("s") + [datetime(2023, 9, 7, 5, 6, 14, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("s"), ) assert result.column(13) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523000)] * 3, type=pa.timestamp("ms") + [datetime(2023, 9, 7, 5, 6, 14, 523000, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("ms"), ) assert result.column(14) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("us") + [datetime(2023, 9, 7, 5, 6, 14, 523952, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("us"), ) assert result.column(15) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") + [datetime(2023, 9, 7, 5, 6, 14, 523952, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("ns"), ) assert result.column(16) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") + [datetime(2023, 9, 7, 5, 6, 14, 523952, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("ns"), ) @@ -1057,7 +1077,7 @@ def test_regr_funcs_sql_2(): @pytest.mark.parametrize( - "func, expected", + ("func", "expected"), [ pytest.param(f.regr_slope(column("c2"), column("c1")), [4.6], id="regr_slope"), pytest.param( @@ -1160,7 +1180,7 @@ def test_binary_string_functions(df): @pytest.mark.parametrize( - "python_datatype, name, expected", + ("python_datatype", "name", "expected"), [ pytest.param(bool, "e", pa.bool_(), id="bool"), pytest.param(int, "b", pa.int64(), id="int"), @@ -1179,7 +1199,7 @@ def test_cast(df, python_datatype, name: str, expected): @pytest.mark.parametrize( - "negated, low, high, expected", + ("negated", "low", "high", "expected"), [ pytest.param(False, 3, 5, {"filtered": [4, 5]}), pytest.param(False, 4, 5, {"filtered": [4, 5]}), diff --git a/python/tests/test_imports.py b/python/tests/test_imports.py index 0c155cbde..9ef7ed89a 100644 
--- a/python/tests/test_imports.py +++ b/python/tests/test_imports.py @@ -169,14 +169,15 @@ def test_class_module_is_datafusion(): def test_import_from_functions_submodule(): - from datafusion.functions import abs, sin # noqa + from datafusion.functions import abs as df_abs + from datafusion.functions import sin - assert functions.abs is abs + assert functions.abs is df_abs assert functions.sin is sin msg = "cannot import name 'foobar' from 'datafusion.functions'" with pytest.raises(ImportError, match=msg): - from datafusion.functions import foobar # noqa + from datafusion.functions import foobar # noqa: F401 def test_classes_are_inheritable(): diff --git a/python/tests/test_input.py b/python/tests/test_input.py index 806471357..4663f6148 100644 --- a/python/tests/test_input.py +++ b/python/tests/test_input.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -import os +import pathlib from datafusion.input.location import LocationInputPlugin @@ -23,10 +23,10 @@ def test_location_input(): location_input = LocationInputPlugin() - cwd = os.getcwd() - input_file = cwd + "/testing/data/parquet/generated_simple_numerics/blogs.parquet" + cwd = pathlib.Path.cwd() + input_file = cwd / "testing/data/parquet/generated_simple_numerics/blogs.parquet" table_name = "blog" - tbl = location_input.build_table(input_file, table_name) - assert "blog" == tbl.name - assert 3 == len(tbl.columns) + tbl = location_input.build_table(str(input_file), table_name) + assert tbl.name == "blog" + assert len(tbl.columns) == 3 assert "blogs.parquet" in tbl.filepaths[0] diff --git a/python/tests/test_io.py b/python/tests/test_io.py index 21ad188ee..7ca509689 100644 --- a/python/tests/test_io.py +++ b/python/tests/test_io.py @@ -14,8 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import os -import pathlib +from pathlib import Path import pyarrow as pa from datafusion import column @@ -23,10 +22,10 @@ def test_read_json_global_ctx(ctx): - path = os.path.dirname(os.path.abspath(__file__)) + path = Path(__file__).parent.resolve() # Default - test_data_path = os.path.join(path, "data_test_context", "data.json") + test_data_path = Path(path) / "data_test_context" / "data.json" df = read_json(test_data_path) result = df.collect() @@ -46,7 +45,7 @@ def test_read_json_global_ctx(ctx): assert result[0].schema == schema # File extension - test_data_path = os.path.join(path, "data_test_context", "data.json") + test_data_path = Path(path) / "data_test_context" / "data.json" df = read_json(test_data_path, file_extension=".json") result = df.collect() @@ -59,7 +58,7 @@ def test_read_parquet_global(): parquet_df.show() assert parquet_df is not None - path = pathlib.Path.cwd() / "parquet/data/alltypes_plain.parquet" + path = Path.cwd() / "parquet/data/alltypes_plain.parquet" parquet_df = read_parquet(path=path) assert parquet_df is not None @@ -90,6 +89,6 @@ def test_read_avro(): avro_df.show() assert avro_df is not None - path = pathlib.Path.cwd() / "testing/data/avro/alltypes_plain.avro" + path = Path.cwd() / "testing/data/avro/alltypes_plain.avro" avro_df = read_avro(path=path) assert avro_df is not None diff --git a/python/tests/test_sql.py b/python/tests/test_sql.py index 862f745bf..b6348e3a0 100644 --- a/python/tests/test_sql.py +++ b/python/tests/test_sql.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
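The test_input.py and test_io.py edits above apply the same os.path-to-pathlib translation that runs through this whole patch; the correspondence in isolation (file names illustrative, and note that Path.resolve() also follows symlinks, which os.path.abspath does not):

```
import os
from pathlib import Path

# Before: string-based composition
here = os.path.dirname(os.path.abspath(__file__))
data = os.path.join(here, "data_test_context", "data.json")

# After: the pathlib equivalent used in these tests
here_p = Path(__file__).parent.resolve()
data_p = here_p / "data_test_context" / "data.json"
```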
import gzip -import os +from pathlib import Path import numpy as np import pyarrow as pa @@ -47,9 +47,8 @@ def test_register_csv(ctx, tmp_path): ) write_csv(table, path) - with open(path, "rb") as csv_file: - with gzip.open(gzip_path, "wb") as gzipped_file: - gzipped_file.writelines(csv_file) + with Path.open(path, "rb") as csv_file, gzip.open(gzip_path, "wb") as gzipped_file: + gzipped_file.writelines(csv_file) ctx.register_csv("csv", path) ctx.register_csv("csv1", str(path)) @@ -158,7 +157,7 @@ def test_register_parquet(ctx, tmp_path): assert result.to_pydict() == {"cnt": [100]} -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_register_parquet_partitioned(ctx, tmp_path, path_to_str): dir_root = tmp_path / "dataset_parquet_partitioned" dir_root.mkdir(exist_ok=False) @@ -194,7 +193,7 @@ def test_register_parquet_partitioned(ctx, tmp_path, path_to_str): assert dict(zip(rd["grp"], rd["cnt"])) == {"a": 3, "b": 1} -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_register_dataset(ctx, tmp_path, path_to_str): path = helpers.write_parquet(tmp_path / "a.parquet", helpers.data()) path = str(path) if path_to_str else path @@ -209,13 +208,15 @@ def test_register_dataset(ctx, tmp_path, path_to_str): def test_register_json(ctx, tmp_path): - path = os.path.dirname(os.path.abspath(__file__)) - test_data_path = os.path.join(path, "data_test_context", "data.json") + path = Path(__file__).parent.resolve() + test_data_path = Path(path) / "data_test_context" / "data.json" gzip_path = tmp_path / "data.json.gz" - with open(test_data_path, "rb") as json_file: - with gzip.open(gzip_path, "wb") as gzipped_file: - gzipped_file.writelines(json_file) + with ( + Path.open(test_data_path, "rb") as json_file, + gzip.open(gzip_path, "wb") as gzipped_file, + ): + gzipped_file.writelines(json_file) ctx.register_json("json", test_data_path) ctx.register_json("json1", str(test_data_path)) @@ -470,16 +471,18 @@ def test_simple_select(ctx, tmp_path, arr): # In DF 43.0.0 we now default to having BinaryView and StringView # so the array that is saved to the parquet is slightly different # than the array read. Convert to values for comparison. 
- if isinstance(result, pa.BinaryViewArray) or isinstance(result, pa.StringViewArray): + if isinstance(result, (pa.BinaryViewArray, pa.StringViewArray)): arr = arr.tolist() result = result.tolist() np.testing.assert_equal(result, arr) -@pytest.mark.parametrize("file_sort_order", (None, [[col("int").sort(True, True)]])) -@pytest.mark.parametrize("pass_schema", (True, False)) -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize( + "file_sort_order", [None, [[col("int").sort(ascending=True, nulls_first=True)]]] +) +@pytest.mark.parametrize("pass_schema", [True, False]) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_register_listing_table( ctx, tmp_path, pass_schema, file_sort_order, path_to_str ): @@ -528,7 +531,7 @@ def test_register_listing_table( assert dict(zip(rd["grp"], rd["count"])) == {"a": 5, "b": 2} result = ctx.sql( - "SELECT grp, COUNT(*) AS count FROM my_table WHERE date_id=20201005 GROUP BY grp" + "SELECT grp, COUNT(*) AS count FROM my_table WHERE date_id=20201005 GROUP BY grp" # noqa: E501 ).collect() result = pa.Table.from_batches(result) diff --git a/python/tests/test_store.py b/python/tests/test_store.py index 53ffc3acf..ac9af98f3 100644 --- a/python/tests/test_store.py +++ b/python/tests/test_store.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -import os +from pathlib import Path import pytest from datafusion import SessionContext @@ -23,17 +23,16 @@ @pytest.fixture def ctx(): - ctx = SessionContext() - return ctx + return SessionContext() def test_read_parquet(ctx): ctx.register_parquet( "test", - f"file://{os.getcwd()}/parquet/data/alltypes_plain.parquet", - [], - True, - ".parquet", + f"file://{Path.cwd()}/parquet/data/alltypes_plain.parquet", + table_partition_cols=[], + parquet_pruning=True, + file_extension=".parquet", ) df = ctx.sql("SELECT * FROM test") assert isinstance(df.collect(), list) diff --git a/python/tests/test_substrait.py b/python/tests/test_substrait.py index feada7cde..f367a447d 100644 --- a/python/tests/test_substrait.py +++ b/python/tests/test_substrait.py @@ -50,7 +50,7 @@ def test_substrait_serialization(ctx): substrait_plan = ss.Producer.to_substrait_plan(df.logical_plan(), ctx) -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_substrait_file_serialization(ctx, tmp_path, path_to_str): batch = pa.RecordBatch.from_arrays( [pa.array([1, 2, 3]), pa.array([4, 5, 6])], diff --git a/python/tests/test_udaf.py b/python/tests/test_udaf.py index 97cf81f3c..453ff6f4f 100644 --- a/python/tests/test_udaf.py +++ b/python/tests/test_udaf.py @@ -17,8 +17,6 @@ from __future__ import annotations -from typing import List - import pyarrow as pa import pyarrow.compute as pc import pytest @@ -31,7 +29,7 @@ class Summarize(Accumulator): def __init__(self, initial_value: float = 0.0): self._sum = pa.scalar(initial_value) - def state(self) -> List[pa.Scalar]: + def state(self) -> list[pa.Scalar]: return [self._sum] def update(self, values: pa.Array) -> None: @@ -39,7 +37,7 @@ def update(self, values: pa.Array) -> None: # This breaks on `None` self._sum = pa.scalar(self._sum.as_py() + pc.sum(values).as_py()) - def merge(self, states: List[pa.Array]) -> None: + def merge(self, states: list[pa.Array]) -> None: # Not nice since pyarrow scalars can't be summed yet. 
# This breaks on `None` self._sum = pa.scalar(self._sum.as_py() + pc.sum(states[0]).as_py()) @@ -56,7 +54,7 @@ class MissingMethods(Accumulator): def __init__(self): self._sum = pa.scalar(0) - def state(self) -> List[pa.Scalar]: + def state(self) -> list[pa.Scalar]: return [self._sum] @@ -86,7 +84,7 @@ def test_errors(df): "evaluate, merge, update)" ) with pytest.raises(Exception, match=msg): - accum = udaf( # noqa F841 + accum = udaf( # noqa: F841 MissingMethods, pa.int64(), pa.int64(), diff --git a/python/tests/test_udwf.py b/python/tests/test_udwf.py index 2fea34aa3..3d6dcf9d8 100644 --- a/python/tests/test_udwf.py +++ b/python/tests/test_udwf.py @@ -298,7 +298,7 @@ def test_udwf_errors(df): ] -@pytest.mark.parametrize("name,expr,expected", data_test_udwf_functions) +@pytest.mark.parametrize(("name", "expr", "expected"), data_test_udwf_functions) def test_udwf_functions(df, name, expr, expected): df = df.select("a", "b", f.round(expr, lit(3)).alias(name)) diff --git a/python/tests/test_wrapper_coverage.py b/python/tests/test_wrapper_coverage.py index ac064ba95..d7f6f6e35 100644 --- a/python/tests/test_wrapper_coverage.py +++ b/python/tests/test_wrapper_coverage.py @@ -19,6 +19,7 @@ import datafusion.functions import datafusion.object_store import datafusion.substrait +import pytest # EnumType introduced in 3.11. 3.10 and prior it was called EnumMeta. try: @@ -41,10 +42,8 @@ def missing_exports(internal_obj, wrapped_obj) -> None: internal_attr = getattr(internal_obj, attr) wrapped_attr = getattr(wrapped_obj, attr) - if internal_attr is not None: - if wrapped_attr is None: - print("Missing attribute: ", attr) - assert False + if internal_attr is not None and wrapped_attr is None: + pytest.fail(f"Missing attribute: {attr}") if attr in ["__self__", "__class__"]: continue From 3dcf7c7e5c0af0eb3c5e3bdf9c6e33fd4541b070 Mon Sep 17 00:00:00 2001 From: jsai28 <54253219+jsai28@users.noreply.github.com> Date: Thu, 13 Mar 2025 04:09:03 -0600 Subject: [PATCH 018/145] feat/making global context accessible for users (#1060) * Rename _global_ctx to global_ctx * Add global context to python wrapper code * Update context.py * singleton for global context * formatting * remove udf from import * remove _global_instance * formatting * formatting * unnecessary test * fix test_io.py * ran ruff * ran ruff format --- python/datafusion/context.py | 12 +++++++ python/datafusion/io.py | 63 ++++++++++++++++-------------------- python/tests/test_context.py | 18 +++++++++++ src/context.rs | 2 +- 4 files changed, 58 insertions(+), 37 deletions(-) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 0ab1a908a..58ad9a943 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -496,6 +496,18 @@ def __init__( self.ctx = SessionContextInternal(config, runtime) + @classmethod + def global_ctx(cls) -> SessionContext: + """Retrieve the global context as a `SessionContext` wrapper. + + Returns: + A `SessionContext` object that wraps the global `SessionContextInternal`. + """ + internal_ctx = SessionContextInternal.global_ctx() + wrapper = cls() + wrapper.ctx = internal_ctx + return wrapper + def enable_url_table(self) -> SessionContext: """Control if local files can be queried as tables. 
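Before the io.py changes below, a short usage sketch of the `global_ctx` classmethod added above: the wrapper shares the one global context that the module-level read_* helpers use, so a DataFrame created through it behaves exactly like one from a locally constructed SessionContext (this mirrors the test added at the end of this patch):

```
import pyarrow as pa

from datafusion import SessionContext

ctx = SessionContext.global_ctx()
batch = pa.RecordBatch.from_arrays([pa.array([4, 5, 6])], names=["a"])
df = ctx.create_dataframe([[batch]])
assert df.collect()[0].column(0) == pa.array([4, 5, 6])
```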
diff --git a/python/datafusion/io.py b/python/datafusion/io.py index 3e39703e3..ef5ebf96f 100644 --- a/python/datafusion/io.py +++ b/python/datafusion/io.py @@ -21,10 +21,9 @@ from typing import TYPE_CHECKING +from datafusion.context import SessionContext from datafusion.dataframe import DataFrame -from ._internal import SessionContext as SessionContextInternal - if TYPE_CHECKING: import pathlib @@ -68,16 +67,14 @@ def read_parquet( """ if table_partition_cols is None: table_partition_cols = [] - return DataFrame( - SessionContextInternal._global_ctx().read_parquet( - str(path), - table_partition_cols, - parquet_pruning, - file_extension, - skip_metadata, - schema, - file_sort_order, - ) + return SessionContext.global_ctx().read_parquet( + str(path), + table_partition_cols, + parquet_pruning, + file_extension, + skip_metadata, + schema, + file_sort_order, ) @@ -110,15 +107,13 @@ def read_json( """ if table_partition_cols is None: table_partition_cols = [] - return DataFrame( - SessionContextInternal._global_ctx().read_json( - str(path), - schema, - schema_infer_max_records, - file_extension, - table_partition_cols, - file_compression_type, - ) + return SessionContext.global_ctx().read_json( + str(path), + schema, + schema_infer_max_records, + file_extension, + table_partition_cols, + file_compression_type, ) @@ -161,17 +156,15 @@ def read_csv( path = [str(p) for p in path] if isinstance(path, list) else str(path) - return DataFrame( - SessionContextInternal._global_ctx().read_csv( - path, - schema, - has_header, - delimiter, - schema_infer_max_records, - file_extension, - table_partition_cols, - file_compression_type, - ) + return SessionContext.global_ctx().read_csv( + path, + schema, + has_header, + delimiter, + schema_infer_max_records, + file_extension, + table_partition_cols, + file_compression_type, ) @@ -198,8 +191,6 @@ def read_avro( """ if file_partition_cols is None: file_partition_cols = [] - return DataFrame( - SessionContextInternal._global_ctx().read_avro( - str(path), schema, file_partition_cols, file_extension - ) + return SessionContext.global_ctx().read_avro( + str(path), schema, file_partition_cols, file_extension ) diff --git a/python/tests/test_context.py b/python/tests/test_context.py index 7a0a7aa08..4a15ac9cf 100644 --- a/python/tests/test_context.py +++ b/python/tests/test_context.py @@ -632,3 +632,21 @@ def test_sql_with_options_no_statements(ctx): options = SQLOptions().with_allow_statements(allow=False) with pytest.raises(Exception, match="SetVariable"): ctx.sql_with_options(sql, options=options) + + +@pytest.fixture +def batch(): + return pa.RecordBatch.from_arrays( + [pa.array([4, 5, 6])], + names=["a"], + ) + + +def test_create_dataframe_with_global_ctx(batch): + ctx = SessionContext.global_ctx() + + df = ctx.create_dataframe([[batch]]) + + result = df.collect()[0].column(0) + + assert result == pa.array([4, 5, 6]) diff --git a/src/context.rs b/src/context.rs index 9ba87eb8a..0db0f4d7e 100644 --- a/src/context.rs +++ b/src/context.rs @@ -308,7 +308,7 @@ impl PySessionContext { #[classmethod] #[pyo3(signature = ())] - fn _global_ctx(_cls: &Bound<'_, PyType>) -> PyResult { + fn global_ctx(_cls: &Bound<'_, PyType>) -> PyResult { Ok(Self { ctx: get_global_ctx().clone(), }) From 55141bad7c2270c14742e962d8bab1d4f1be27f5 Mon Sep 17 00:00:00 2001 From: Spaarsh <67336892+Spaarsh@users.noreply.github.com> Date: Fri, 14 Mar 2025 18:26:31 +0530 Subject: [PATCH 019/145] Renaming Internal Structs (#1059) * Renamed Expr to RawExpr * Fixed CI test for exported classes to 
include RawExpr as well

* Fixed CI test for exported classes to check if Expr class covers RawExpr

* Generalized Raw* class checking

* fixes

* fixes

* fixed the CI test to not look for Raw classes in the datafusion module

* Add additional text to unit test describing operation and ensure wrapped Raw classes are checked

* New ruff rule on main

* Resolve ruff errors

---------

Co-authored-by: Tim Saucer
---
 python/datafusion/expr.py             |  8 ++--
 python/tests/test_wrapper_coverage.py | 55 +++++++++++++++++++--------
 src/expr.rs                           |  2 +-
 3 files changed, 45 insertions(+), 20 deletions(-)

diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py
index 702f75aed..77b6c272d 100644
--- a/python/datafusion/expr.py
+++ b/python/datafusion/expr.py
@@ -193,7 +193,7 @@ class Expr:
     :ref:`Expressions` in the online documentation for more information.
     """

-    def __init__(self, expr: expr_internal.Expr) -> None:
+    def __init__(self, expr: expr_internal.RawExpr) -> None:
         """This constructor should not be called by the end user."""
         self.expr = expr

@@ -383,7 +383,7 @@ def literal(value: Any) -> Expr:
             value = pa.scalar(value, type=pa.string_view())
         if not isinstance(value, pa.Scalar):
             value = pa.scalar(value)
-        return Expr(expr_internal.Expr.literal(value))
+        return Expr(expr_internal.RawExpr.literal(value))

     @staticmethod
     def string_literal(value: str) -> Expr:
@@ -398,13 +398,13 @@ def string_literal(value: str) -> Expr:
         """
         if isinstance(value, str):
             value = pa.scalar(value, type=pa.string())
-            return Expr(expr_internal.Expr.literal(value))
+            return Expr(expr_internal.RawExpr.literal(value))
         return Expr.literal(value)

     @staticmethod
     def column(value: str) -> Expr:
         """Creates a new expression representing a column."""
-        return Expr(expr_internal.Expr.column(value))
+        return Expr(expr_internal.RawExpr.column(value))

     def alias(self, name: str) -> Expr:
         """Assign a name to the expression."""
diff --git a/python/tests/test_wrapper_coverage.py b/python/tests/test_wrapper_coverage.py
index d7f6f6e35..a2de2d32b 100644
--- a/python/tests/test_wrapper_coverage.py
+++ b/python/tests/test_wrapper_coverage.py
@@ -28,37 +28,62 @@
 from enum import EnumMeta as EnumType


-def missing_exports(internal_obj, wrapped_obj) -> None:
-    # Special case enums - just make sure they exist since dir()
-    # and other functions get overridden.
+def missing_exports(internal_obj, wrapped_obj) -> None:  # noqa: C901
+    """
+    Identify if any of the Rust-exposed structs or functions do not have wrappers.
+
+    Special handling for:
+    - Raw* classes: Internal implementation details that shouldn't be exposed
+    - _global_ctx: Internal implementation detail
+    - __self__, __class__: Python special attributes
+    """
+    # Special case enums - EnumType overrides some of the internal functions,
+    # so check that all of the values exist and move on
     if isinstance(wrapped_obj, EnumType):
+        expected_values = [v for v in dir(internal_obj) if not v.startswith("__")]
+        for value in expected_values:
+            assert value in dir(wrapped_obj)
         return

-    for attr in dir(internal_obj):
-        if attr in ["_global_ctx"]:
-            continue
-        assert attr in dir(wrapped_obj)
+    for internal_attr_name in dir(internal_obj):
+        wrapped_attr_name = internal_attr_name.removeprefix("Raw")
+        assert wrapped_attr_name in dir(wrapped_obj)

-        internal_attr = getattr(internal_obj, attr)
-        wrapped_attr = getattr(wrapped_obj, attr)
+        internal_attr = getattr(internal_obj, internal_attr_name)
+        wrapped_attr = getattr(wrapped_obj, wrapped_attr_name)

-        if internal_attr is not None and wrapped_attr is None:
-            pytest.fail(f"Missing attribute: {attr}")
+        # There are some auto-generated attributes that can be None, such as
+        # __kwdefaults__ and __doc__. As long as these are None on the internal
+        # object, it's okay to skip them. However, if they do exist on the
+        # internal object, they must also exist on the wrapped object.
+        if internal_attr is not None:
+            if wrapped_attr is None:
+                pytest.fail(f"Missing attribute: {internal_attr_name}")

-        if attr in ["__self__", "__class__"]:
+        if internal_attr_name in ["__self__", "__class__"]:
             continue
+
         if isinstance(internal_attr, list):
             assert isinstance(wrapped_attr, list)
+
+            # We have cases like __all__ that are a list, and we want to be
+            # certain that every value in the internal object's list is also
+            # in the wrapper's list
             for val in internal_attr:
-                assert val in wrapped_attr
+                if isinstance(val, str) and val.startswith("Raw"):
+                    assert val[3:] in wrapped_attr
+                else:
+                    assert val in wrapped_attr
         elif hasattr(internal_attr, "__dict__"):
+            # Check all submodules recursively
            missing_exports(internal_attr, wrapped_attr)


 def test_datafusion_missing_exports() -> None:
     """Check for any missing python exports.

-    This test verifies that every exposed class, attribute, and function in
-    the internal (pyo3) module is also exposed in our python wrappers.
+    This test verifies that every class, attribute, and function exposed in
+    the internal (pyo3) module, datafusion._internal, is also exposed in our
+    public python wrappers, datafusion.
""" missing_exports(datafusion._internal, datafusion) diff --git a/src/expr.rs b/src/expr.rs index e750be6a4..d3c528eb4 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -101,7 +101,7 @@ pub mod window; use sort_expr::{to_sort_expressions, PySortExpr}; /// A PyExpr that can be used on a DataFrame -#[pyclass(name = "Expr", module = "datafusion.expr", subclass)] +#[pyclass(name = "RawExpr", module = "datafusion.expr", subclass)] #[derive(Debug, Clone)] pub struct PyExpr { pub expr: Expr, From 4f457030f171a26d0c4cce4d55cf541519956fcc Mon Sep 17 00:00:00 2001 From: jsai28 <54253219+jsai28@users.noreply.github.com> Date: Sat, 15 Mar 2025 04:57:38 -0600 Subject: [PATCH 020/145] added pytest asyncio tests (#1063) --- pyproject.toml | 1 + python/tests/test_dataframe.py | 54 ++++++++++++++++++++++++++++++++++ uv.lock | 17 ++++++++++- 3 files changed, 71 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 060e3b80a..a4ed18c4c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -150,6 +150,7 @@ dev = [ "maturin>=1.8.1", "numpy>1.25.0", "pytest>=7.4.4", + "pytest-asyncio>=0.23.3", "ruff>=0.9.1", "toml>=0.10.2", "pygithub==2.5.0", diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index d084f12dd..384b17878 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -771,6 +771,16 @@ def test_execution_plan(aggregate_df): assert rows_returned == 5 +@pytest.mark.asyncio +async def test_async_iteration_of_df(aggregate_df): + rows_returned = 0 + async for batch in aggregate_df.execute_stream(): + assert batch is not None + rows_returned += len(batch.to_pyarrow()[0]) + + assert rows_returned == 5 + + def test_repartition(df): df.repartition(2) @@ -958,6 +968,18 @@ def test_execute_stream(df): assert not list(stream) # after one iteration the generator must be exhausted +@pytest.mark.asyncio +async def test_execute_stream_async(df): + stream = df.execute_stream() + batches = [batch async for batch in stream] + + assert all(batch is not None for batch in batches) + + # After consuming all batches, the stream should be exhausted + remaining_batches = [batch async for batch in stream] + assert not remaining_batches + + @pytest.mark.parametrize("schema", [True, False]) def test_execute_stream_to_arrow_table(df, schema): stream = df.execute_stream() @@ -974,6 +996,25 @@ def test_execute_stream_to_arrow_table(df, schema): assert set(pyarrow_table.column_names) == {"a", "b", "c"} +@pytest.mark.asyncio +@pytest.mark.parametrize("schema", [True, False]) +async def test_execute_stream_to_arrow_table_async(df, schema): + stream = df.execute_stream() + + if schema: + pyarrow_table = pa.Table.from_batches( + [batch.to_pyarrow() async for batch in stream], schema=df.schema() + ) + else: + pyarrow_table = pa.Table.from_batches( + [batch.to_pyarrow() async for batch in stream] + ) + + assert isinstance(pyarrow_table, pa.Table) + assert pyarrow_table.shape == (3, 3) + assert set(pyarrow_table.column_names) == {"a", "b", "c"} + + def test_execute_stream_partitioned(df): streams = df.execute_stream_partitioned() assert all(batch is not None for stream in streams for batch in stream) @@ -982,6 +1023,19 @@ def test_execute_stream_partitioned(df): ) # after one iteration all generators must be exhausted +@pytest.mark.asyncio +async def test_execute_stream_partitioned_async(df): + streams = df.execute_stream_partitioned() + + for stream in streams: + batches = [batch async for batch in stream] + assert all(batch is not None for batch in batches) + + # 
Ensure the stream is exhausted after iteration + remaining_batches = [batch async for batch in stream] + assert not remaining_batches + + def test_empty_to_arrow_table(df): # Convert empty datafusion dataframe to pyarrow Table pyarrow_table = df.limit(0).to_arrow_table() diff --git a/uv.lock b/uv.lock index 619b92856..7e4bc4c6b 100644 --- a/uv.lock +++ b/uv.lock @@ -284,9 +284,11 @@ dependencies = [ [package.dev-dependencies] dev = [ { name = "maturin" }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "pygithub" }, { name = "pytest" }, + { name = "pytest-asyncio" }, { name = "ruff" }, { name = "toml" }, ] @@ -314,9 +316,10 @@ requires-dist = [ [package.metadata.requires-dev] dev = [ { name = "maturin", specifier = ">=1.8.1" }, - { name = "numpy", marker = "python_full_version >= '3.10'", specifier = ">1.24.4" }, + { name = "numpy", specifier = ">1.25.0" }, { name = "pygithub", specifier = "==2.5.0" }, { name = "pytest", specifier = ">=7.4.4" }, + { name = "pytest-asyncio", specifier = ">=0.23.3" }, { name = "ruff", specifier = ">=0.9.1" }, { name = "toml", specifier = ">=0.10.2" }, ] @@ -1079,6 +1082,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/11/92/76a1c94d3afee238333bc0a42b82935dd8f9cf8ce9e336ff87ee14d9e1cf/pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6", size = 343083 }, ] +[[package]] +name = "pytest-asyncio" +version = "0.25.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f2/a8/ecbc8ede70921dd2f544ab1cadd3ff3bf842af27f87bbdea774c7baa1d38/pytest_asyncio-0.25.3.tar.gz", hash = "sha256:fc1da2cf9f125ada7e710b4ddad05518d4cee187ae9412e9ac9271003497f07a", size = 54239 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/67/17/3493c5624e48fd97156ebaec380dcaafee9506d7e2c46218ceebbb57d7de/pytest_asyncio-0.25.3-py3-none-any.whl", hash = "sha256:9e89518e0f9bd08928f97a3482fdc4e244df17529460bc038291ccaf8f85c7c3", size = 19467 }, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0" From 2f52688d76e84794343c17ffaf3002534ecfd716 Mon Sep 17 00:00:00 2001 From: kosiew Date: Sat, 15 Mar 2025 19:00:50 +0800 Subject: [PATCH 021/145] Add decorator for udwf (#1061) * feat: Introduce create_udwf method for User-Defined Window Functions - Added `create_udwf` static method to `WindowUDF` class, allowing users to create User-Defined Window Functions (UDWF) as both a function and a decorator. - Updated type hinting for `_R` using `TypeAlias` for better clarity. - Enhanced documentation with usage examples for both function and decorator styles, improving usability and understanding. * refactor: Simplify UDWF test suite and introduce SimpleWindowCount evaluator - Removed multiple exponential smoothing classes to streamline the code. - Introduced SimpleWindowCount class for basic row counting functionality. - Updated test cases to validate the new SimpleWindowCount evaluator. - Refactored fixture and test functions for clarity and consistency. - Enhanced error handling in UDWF creation tests. 
* fix: Update type alias import to use typing_extensions for compatibility * Add udwf tests for multiple input types and decorator syntax * replace old def udwf * refactor: Simplify df fixture by passing ctx as an argument * refactor: Rename DataFrame fixtures and update test functions - Renamed `df` fixture to `complex_window_df` for clarity. - Renamed `simple_df` fixture to `count_window_df` to better reflect its purpose. - Updated test functions to use the new fixture names, enhancing readability and maintainability. * refactor: Update udwf calls in WindowUDF to use BiasedNumbers directly - Changed udwf1 to use BiasedNumbers instead of bias_10. - Added udwf2 to call udwf with bias_10. - Introduced udwf3 to demonstrate a lambda function returning BiasedNumbers(20). * feat: Add overloads for udwf function to support multiple input types and decorator syntax * refactor: Simplify udwf method signature by removing redundant type hints * refactor: Remove state_type from udwf method signature and update return type handling - Eliminated the state_type parameter from the udwf method to simplify the function signature. - Updated return type handling in the _function and _decorator methods to use a generic type _R for better type flexibility. - Enhanced the decorator to wrap the original function, allowing for improved argument handling and expression return. * refactor: Update volatility parameter type in udwf method signature to support Volatility enum * Fix ruff errors * fix C901 for def udwf * refactor: Update udwf method signature and simplify input handling - Changed the type hint for the return type in the _create_window_udf_decorator method to use pa.DataType directly instead of a TypeVar. - Simplified the handling of input types by removing redundant checks and directly using the input types list. - Removed unnecessary comments and cleaned up the code for better readability. - Updated the test for udwf to use parameterized tests for better coverage and maintainability. * refactor: Rename input_type to input_types in udwf method signature for clarity * refactor: Enhance typing in udf.py by introducing Protocol for WindowEvaluator and improving import organization * Revert "refactor: Enhance typing in udf.py by introducing Protocol for WindowEvaluator and improving import organization" This reverts commit 16dbe5f3fd88f42d0a304384b162009bd9e49a35. --- python/datafusion/udf.py | 123 +++++++++++++++++++++------ python/tests/test_udwf.py | 170 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 264 insertions(+), 29 deletions(-) diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py index 603b7063d..e93a34ca5 100644 --- a/python/datafusion/udf.py +++ b/python/datafusion/udf.py @@ -621,6 +621,16 @@ def __call__(self, *args: Expr) -> Expr: args_raw = [arg.expr for arg in args] return Expr(self._udwf.__call__(*args_raw)) + @overload + @staticmethod + def udwf( + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + volatility: Volatility | str, + name: Optional[str] = None, + ) -> Callable[..., WindowUDF]: ... + + @overload @staticmethod def udwf( func: Callable[[], WindowEvaluator], @@ -628,24 +638,31 @@ def udwf( return_type: pa.DataType, volatility: Volatility | str, name: Optional[str] = None, - ) -> WindowUDF: - """Create a new User-Defined Window Function. + ) -> WindowUDF: ... - If your :py:class:`WindowEvaluator` can be instantiated with no arguments, you - can simply pass it's type as ``func``. 
If you need to pass additional arguments - to it's constructor, you can define a lambda or a factory method. During runtime - the :py:class:`WindowEvaluator` will be constructed for every instance in - which this UDWF is used. The following examples are all valid. + @staticmethod + def udwf(*args: Any, **kwargs: Any): # noqa: D417 + """Create a new User-Defined Window Function (UDWF). - .. code-block:: python + This class can be used both as a **function** and as a **decorator**. + + Usage: + - **As a function**: Call `udwf(func, input_types, return_type, volatility, + name)`. + - **As a decorator**: Use `@udwf(input_types, return_type, volatility, + name)`. When using `udwf` as a decorator, **do not pass `func` + explicitly**. + **Function example:** + ``` import pyarrow as pa class BiasedNumbers(WindowEvaluator): def __init__(self, start: int = 0) -> None: self.start = start - def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: + def evaluate_all(self, values: list[pa.Array], + num_rows: int) -> pa.Array: return pa.array([self.start + i for i in range(num_rows)]) def bias_10() -> BiasedNumbers: @@ -655,35 +672,93 @@ def bias_10() -> BiasedNumbers: udwf2 = udwf(bias_10, pa.int64(), pa.int64(), "immutable") udwf3 = udwf(lambda: BiasedNumbers(20), pa.int64(), pa.int64(), "immutable") + ``` + + **Decorator example:** + ``` + @udwf(pa.int64(), pa.int64(), "immutable") + def biased_numbers() -> BiasedNumbers: + return BiasedNumbers(10) + ``` + Args: - func: A callable to create the window function. - input_types: The data types of the arguments to ``func``. + func: **Only needed when calling as a function. Skip this argument when + using `udwf` as a decorator.** + input_types: The data types of the arguments. return_type: The data type of the return value. volatility: See :py:class:`Volatility` for allowed values. - arguments: A list of arguments to pass in to the __init__ method for accum. name: A descriptive name for the function. Returns: - A user-defined window function. - """ # noqa: W505, E501 + A user-defined window function that can be used in window function calls. + """ + if args and callable(args[0]): + # Case 1: Used as a function, require the first parameter to be callable + return WindowUDF._create_window_udf(*args, **kwargs) + # Case 2: Used as a decorator with parameters + return WindowUDF._create_window_udf_decorator(*args, **kwargs) + + @staticmethod + def _create_window_udf( + func: Callable[[], WindowEvaluator], + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + volatility: Volatility | str, + name: Optional[str] = None, + ) -> WindowUDF: + """Create a WindowUDF instance from function arguments.""" if not callable(func): msg = "`func` must be callable." 
raise TypeError(msg) if not isinstance(func(), WindowEvaluator): msg = "`func` must implement the abstract base class WindowEvaluator" raise TypeError(msg) - if name is None: - name = func().__class__.__qualname__.lower() - if isinstance(input_types, pa.DataType): - input_types = [input_types] - return WindowUDF( - name=name, - func=func, - input_types=input_types, - return_type=return_type, - volatility=volatility, + + name = name or func.__qualname__.lower() + input_types = ( + [input_types] if isinstance(input_types, pa.DataType) else input_types ) + return WindowUDF(name, func, input_types, return_type, volatility) + + @staticmethod + def _get_default_name(func: Callable) -> str: + """Get the default name for a function based on its attributes.""" + if hasattr(func, "__qualname__"): + return func.__qualname__.lower() + return func.__class__.__name__.lower() + + @staticmethod + def _normalize_input_types( + input_types: pa.DataType | list[pa.DataType], + ) -> list[pa.DataType]: + """Convert a single DataType to a list if needed.""" + if isinstance(input_types, pa.DataType): + return [input_types] + return input_types + + @staticmethod + def _create_window_udf_decorator( + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + volatility: Volatility | str, + name: Optional[str] = None, + ) -> Callable[[Callable[[], WindowEvaluator]], Callable[..., Expr]]: + """Create a decorator for a WindowUDF.""" + + def decorator(func: Callable[[], WindowEvaluator]) -> Callable[..., Expr]: + udwf_caller = WindowUDF._create_window_udf( + func, input_types, return_type, volatility, name + ) + + @functools.wraps(func) + def wrapper(*args: Any, **kwargs: Any) -> Expr: + return udwf_caller(*args, **kwargs) + + return wrapper + + return decorator + # Convenience exports so we can import instead of treating as # variables at the package root diff --git a/python/tests/test_udwf.py b/python/tests/test_udwf.py index 3d6dcf9d8..4190e7d64 100644 --- a/python/tests/test_udwf.py +++ b/python/tests/test_udwf.py @@ -162,14 +162,27 @@ def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: return pa.array(results) +class SimpleWindowCount(WindowEvaluator): + """A simple window evaluator that counts rows.""" + + def __init__(self, base: int = 0) -> None: + self.base = base + + def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: + return pa.array([self.base + i for i in range(num_rows)]) + + class NotSubclassOfWindowEvaluator: pass @pytest.fixture -def df(): - ctx = SessionContext() +def ctx(): + return SessionContext() + +@pytest.fixture +def complex_window_df(ctx): # create a RecordBatch and a new DataFrame from it batch = pa.RecordBatch.from_arrays( [ @@ -182,7 +195,17 @@ def df(): return ctx.create_dataframe([[batch]]) -def test_udwf_errors(df): +@pytest.fixture +def count_window_df(ctx): + # create a RecordBatch and a new DataFrame from it + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 4, 6])], + names=["a", "b"], + ) + return ctx.create_dataframe([[batch]], name="test_table") + + +def test_udwf_errors(complex_window_df): with pytest.raises(TypeError): udwf( NotSubclassOfWindowEvaluator, @@ -192,6 +215,103 @@ def test_udwf_errors(df): ) +def test_udwf_errors_with_message(): + """Test error cases for UDWF creation.""" + with pytest.raises( + TypeError, match="`func` must implement the abstract base class WindowEvaluator" + ): + udwf( + NotSubclassOfWindowEvaluator, pa.int64(), pa.int64(), volatility="immutable" + ) + + +def 
test_udwf_basic_usage(count_window_df): + """Test basic UDWF usage with a simple counting window function.""" + simple_count = udwf( + SimpleWindowCount, pa.int64(), pa.int64(), volatility="immutable" + ) + + df = count_window_df.select( + simple_count(column("a")) + .window_frame(WindowFrame("rows", None, None)) + .build() + .alias("count") + ) + result = df.collect()[0] + assert result.column(0) == pa.array([0, 1, 2]) + + +def test_udwf_with_args(count_window_df): + """Test UDWF with constructor arguments.""" + count_base10 = udwf( + lambda: SimpleWindowCount(10), pa.int64(), pa.int64(), volatility="immutable" + ) + + df = count_window_df.select( + count_base10(column("a")) + .window_frame(WindowFrame("rows", None, None)) + .build() + .alias("count") + ) + result = df.collect()[0] + assert result.column(0) == pa.array([10, 11, 12]) + + +def test_udwf_decorator_basic(count_window_df): + """Test UDWF used as a decorator.""" + + @udwf([pa.int64()], pa.int64(), "immutable") + def window_count() -> WindowEvaluator: + return SimpleWindowCount() + + df = count_window_df.select( + window_count(column("a")) + .window_frame(WindowFrame("rows", None, None)) + .build() + .alias("count") + ) + result = df.collect()[0] + assert result.column(0) == pa.array([0, 1, 2]) + + +def test_udwf_decorator_with_args(count_window_df): + """Test UDWF decorator with constructor arguments.""" + + @udwf([pa.int64()], pa.int64(), "immutable") + def window_count_base10() -> WindowEvaluator: + return SimpleWindowCount(10) + + df = count_window_df.select( + window_count_base10(column("a")) + .window_frame(WindowFrame("rows", None, None)) + .build() + .alias("count") + ) + result = df.collect()[0] + assert result.column(0) == pa.array([10, 11, 12]) + + +def test_register_udwf(ctx, count_window_df): + """Test registering and using UDWF in SQL context.""" + window_count = udwf( + SimpleWindowCount, + [pa.int64()], + pa.int64(), + volatility="immutable", + name="window_count", + ) + + ctx.register_udwf(window_count) + result = ctx.sql( + """ + SELECT window_count(a) + OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED + FOLLOWING) FROM test_table + """ + ).collect()[0] + assert result.column(0) == pa.array([0, 1, 2]) + + smooth_default = udwf( ExponentialSmoothDefault, pa.float64(), @@ -299,10 +419,50 @@ def test_udwf_errors(df): @pytest.mark.parametrize(("name", "expr", "expected"), data_test_udwf_functions) -def test_udwf_functions(df, name, expr, expected): - df = df.select("a", "b", f.round(expr, lit(3)).alias(name)) +def test_udwf_functions(complex_window_df, name, expr, expected): + df = complex_window_df.select("a", "b", f.round(expr, lit(3)).alias(name)) # execute and collect the first (and only) batch result = df.sort(column("a")).select(column(name)).collect()[0] assert result.column(0) == pa.array(expected) + + +@pytest.mark.parametrize( + "udwf_func", + [ + udwf(SimpleWindowCount, pa.int64(), pa.int64(), "immutable"), + udwf(SimpleWindowCount, [pa.int64()], pa.int64(), "immutable"), + udwf([pa.int64()], pa.int64(), "immutable")(lambda: SimpleWindowCount()), + udwf(pa.int64(), pa.int64(), "immutable")(lambda: SimpleWindowCount()), + ], +) +def test_udwf_overloads(udwf_func, count_window_df): + df = count_window_df.select( + udwf_func(column("a")) + .window_frame(WindowFrame("rows", None, None)) + .build() + .alias("count") + ) + result = df.collect()[0] + assert result.column(0) == pa.array([0, 1, 2]) + + +def test_udwf_named_function(ctx, count_window_df): + """Test UDWF with explicit name parameter.""" + 
window_count = udwf( + SimpleWindowCount, + pa.int64(), + pa.int64(), + volatility="immutable", + name="my_custom_counter", + ) + + ctx.register_udwf(window_count) + result = ctx.sql( + """ + SELECT my_custom_counter(a) + OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED + FOLLOWING) FROM test_table""" + ).collect()[0] + assert result.column(0) == pa.array([0, 1, 2]) From 7c1c08f8617ac97a2568eb0664e9d4ee30fceba9 Mon Sep 17 00:00:00 2001 From: Nirnay Roy <32942494+nirnayroy@users.noreply.github.com> Date: Sat, 15 Mar 2025 17:05:05 +0530 Subject: [PATCH 022/145] feat: expose regex_count function (#1066) * Added wrapper for regex_count function * fix comment --------- Co-authored-by: Nirnay Roy --- python/datafusion/functions.py | 18 ++++++++++++++++++ python/tests/test_functions.py | 4 ++++ src/functions.rs | 20 ++++++++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 0cc7434cf..26bac149c 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -217,6 +217,7 @@ "random", "range", "rank", + "regexp_count", "regexp_like", "regexp_match", "regexp_replace", @@ -779,6 +780,23 @@ def regexp_replace( return Expr(f.regexp_replace(string.expr, pattern.expr, replacement.expr, flags)) +def regexp_count( + string: Expr, pattern: Expr, start: Expr, flags: Expr | None = None +) -> Expr: + """Returns the number of matches in a string. + + Optional start position (the first position is 1) to search for the regular + expression. + """ + if flags is not None: + flags = flags.expr + if start is not None: + start = start.expr + else: + start = Expr.expr + return Expr(f.regexp_count(string.expr, pattern.expr, start, flags)) + + def repeat(string: Expr, n: Expr) -> Expr: """Repeats the ``string`` to ``n`` times.""" return Expr(f.repeat(string.expr, n.expr)) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index ed88a16e3..161e1e3bb 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -740,6 +740,10 @@ def test_array_function_obj_tests(stmt, py_expr): f.regexp_replace(column("a"), literal("(ell|orl)"), literal("-")), pa.array(["H-o", "W-d", "!"]), ), + ( + f.regexp_count(column("a"), literal("(ell|orl)"), literal(1)), + pa.array([1, 1, 0], type=pa.int64()), + ), ], ) def test_string_functions(df, function, expected_result): diff --git a/src/functions.rs b/src/functions.rs index 6a8abb18d..8fac239b4 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -173,6 +173,25 @@ fn regexp_replace( ) .into()) } + +#[pyfunction] +#[pyo3(signature = (string, pattern, start, flags=None))] +/// Returns the number of matches found in the string. 
+fn regexp_count(
+    string: PyExpr,
+    pattern: PyExpr,
+    start: Option<PyExpr>,
+    flags: Option<PyExpr>,
+) -> PyResult<PyExpr> {
+    Ok(functions::expr_fn::regexp_count(
+        string.expr,
+        pattern.expr,
+        start.map(|x| x.expr),
+        flags.map(|x| x.expr),
+    )
+    .into())
+}
+
 /// Creates a new Sort Expr
 #[pyfunction]
 fn order_by(expr: PyExpr, asc: bool, nulls_first: bool) -> PyResult<PyExpr> {
@@ -943,6 +962,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_wrapped(wrap_pyfunction!(power))?;
     m.add_wrapped(wrap_pyfunction!(radians))?;
     m.add_wrapped(wrap_pyfunction!(random))?;
+    m.add_wrapped(wrap_pyfunction!(regexp_count))?;
     m.add_wrapped(wrap_pyfunction!(regexp_like))?;
     m.add_wrapped(wrap_pyfunction!(regexp_match))?;
     m.add_wrapped(wrap_pyfunction!(regexp_replace))?;

From b8dd97bc8eefcfecfa8dcc864c4898c654b236a9 Mon Sep 17 00:00:00 2001
From: Spaarsh <67336892+Spaarsh@users.noreply.github.com>
Date: Mon, 17 Mar 2025 20:08:16 +0530
Subject: [PATCH 023/145] Add additional ruff suggestions (#1062)

* Enabled ruff rule PT001 and ANN204
* Enabled ruff rule B008
* Enabled ruff rule EM101
* Enabled ruff rule PLR1714
* Enabled ruff rule ANN201
* Enabled ruff rule C400
* Enabled ruff rule B904
* Enabled ruff rule UP006
* Enabled ruff rule RUF012
* Enabled ruff rule FBT003
* Enabled ruff rule C416
* Enabled ruff rule SIM102
* Enabled ruff rule PGH003
* Enabled ruff rule PERF401
* Enabled ruff rule EM102
* Enabled ruff rule SIM108
* Enabled ruff rule ICN001
* Enabled ruff rule ICN001
* implemented reviews
* Update pyproject.toml to ignore `SIM102`
* Enabled ruff rule PLW2901
* Enabled ruff rule RET503
* Fixed failing ruff tests
---
 benchmarks/db-benchmark/groupby-datafusion.py |  24 ++--
 benchmarks/db-benchmark/join-datafusion.py    |   5 +-
 benchmarks/tpch/tpch.py                       |   7 +-
 dev/release/generate-changelog.py             |   6 +-
 docs/source/conf.py                           |   4 +-
 examples/create-context.py                    |  12 +-
 examples/python-udaf.py                       |  36 +++---
 examples/python-udf-comparisons.py            |   9 +-
 examples/python-udf.py                        |  12 +-
 examples/query-pyarrow-data.py                |  10 +-
 examples/sql-using-python-udaf.py             |   2 +-
 examples/tpch/_tests.py                       |   4 +-
 examples/tpch/convert_data_to_parquet.py      | 134 +++++++++---------
 examples/tpch/q08_market_share.py             |   2 +-
 examples/tpch/q19_discounted_revenue.py       |   4 +-
 .../tpch/q21_suppliers_kept_orders_waiting.py |   2 +-
 pyproject.toml                                |  20 ---
 python/datafusion/__init__.py                 |   8 +-
 python/datafusion/catalog.py                  |   4 +-
 python/datafusion/context.py                  |  51 +++----
 python/datafusion/dataframe.py                |  55 +++-----
 python/datafusion/expr.py                     |  31 ++---
 python/datafusion/functions.py                |   9 +-
 python/tests/test_functions.py                |   2 +-
 python/tests/test_wrapper_coverage.py         |   7 +-
 25 files changed, 213 insertions(+), 247 deletions(-)

diff --git a/benchmarks/db-benchmark/groupby-datafusion.py b/benchmarks/db-benchmark/groupby-datafusion.py
index 04bf7a149..f9e8d638b 100644
--- a/benchmarks/db-benchmark/groupby-datafusion.py
+++ b/benchmarks/db-benchmark/groupby-datafusion.py
@@ -20,7 +20,7 @@
 import timeit
 
 import datafusion as df
-import pyarrow
+import pyarrow as pa
 from datafusion import (
     RuntimeEnvBuilder,
     SessionConfig,
@@ -37,7 +37,7 @@
 exec(open("./_helpers/helpers.py").read())
 
 
-def ans_shape(batches):
+def ans_shape(batches) -> tuple[int, int]:
     rows, cols = 0, 0
     for batch in batches:
         rows += batch.num_rows
@@ -48,7 +48,7 @@ def ans_shape(batches):
     return rows, cols
 
 
-def execute(df):
+def execute(df) -> list:
     print(df.execution_plan().display_indent())
     return df.collect()
 
@@ -68,14 +68,14 @@ def execute(df):
 
 src_grp = os.path.join("data", data_name + ".csv")
print("loading dataset %s" % src_grp, flush=True) -schema = pyarrow.schema( +schema = pa.schema( [ - ("id4", pyarrow.int32()), - ("id5", pyarrow.int32()), - ("id6", pyarrow.int32()), - ("v1", pyarrow.int32()), - ("v2", pyarrow.int32()), - ("v3", pyarrow.float64()), + ("id4", pa.int32()), + ("id5", pa.int32()), + ("id6", pa.int32()), + ("v1", pa.int32()), + ("v2", pa.int32()), + ("v3", pa.float64()), ] ) @@ -93,8 +93,8 @@ def execute(df): ) config = ( SessionConfig() - .with_repartition_joins(False) - .with_repartition_aggregations(False) + .with_repartition_joins(enabled=False) + .with_repartition_aggregations(enabled=False) .set("datafusion.execution.coalesce_batches", "false") ) ctx = SessionContext(config, runtime) diff --git a/benchmarks/db-benchmark/join-datafusion.py b/benchmarks/db-benchmark/join-datafusion.py index b45ebf632..039868031 100755 --- a/benchmarks/db-benchmark/join-datafusion.py +++ b/benchmarks/db-benchmark/join-datafusion.py @@ -29,7 +29,7 @@ exec(open("./_helpers/helpers.py").read()) -def ans_shape(batches): +def ans_shape(batches) -> tuple[int, int]: rows, cols = 0, 0 for batch in batches: rows += batch.num_rows @@ -57,7 +57,8 @@ def ans_shape(batches): os.path.join("data", y_data_name[2] + ".csv"), ] if len(src_jn_y) != 3: - raise Exception("Something went wrong in preparing files used for join") + error_msg = "Something went wrong in preparing files used for join" + raise Exception(error_msg) print( "loading datasets " diff --git a/benchmarks/tpch/tpch.py b/benchmarks/tpch/tpch.py index bfb9ac398..2d1bbae5b 100644 --- a/benchmarks/tpch/tpch.py +++ b/benchmarks/tpch/tpch.py @@ -21,7 +21,7 @@ from datafusion import SessionContext -def bench(data_path, query_path): +def bench(data_path, query_path) -> None: with open("results.csv", "w") as results: # register tables start = time.time() @@ -68,10 +68,7 @@ def bench(data_path, query_path): with open(f"{query_path}/q{query}.sql") as f: text = f.read() tmp = text.split(";") - queries = [] - for str in tmp: - if len(str.strip()) > 0: - queries.append(str.strip()) + queries = [s.strip() for s in tmp if len(s.strip()) > 0] try: start = time.time() diff --git a/dev/release/generate-changelog.py b/dev/release/generate-changelog.py index e30e2def2..d86736773 100755 --- a/dev/release/generate-changelog.py +++ b/dev/release/generate-changelog.py @@ -24,7 +24,7 @@ from github import Github -def print_pulls(repo_name, title, pulls): +def print_pulls(repo_name, title, pulls) -> None: if len(pulls) > 0: print(f"**{title}:**") print() @@ -34,7 +34,7 @@ def print_pulls(repo_name, title, pulls): print() -def generate_changelog(repo, repo_name, tag1, tag2, version): +def generate_changelog(repo, repo_name, tag1, tag2, version) -> None: # get a list of commits between two tags print(f"Fetching list of commits between {tag1} and {tag2}", file=sys.stderr) comparison = repo.compare(tag1, tag2) @@ -154,7 +154,7 @@ def generate_changelog(repo, repo_name, tag1, tag2, version): ) -def cli(args=None): +def cli(args=None) -> None: """Process command line arguments.""" if not args: args = sys.argv[1:] diff --git a/docs/source/conf.py b/docs/source/conf.py index c82a189e0..0be03d81d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -73,7 +73,7 @@ autoapi_python_class_content = "both" -def autoapi_skip_member_fn(app, what, name, obj, skip, options): # noqa: ARG001 +def autoapi_skip_member_fn(app, what, name, obj, skip, options) -> bool: # noqa: ARG001 skip_contents = [ # Re-exports ("class", "datafusion.DataFrame"), @@ -93,7 +93,7 @@ def 
autoapi_skip_member_fn(app, what, name, obj, skip, options): # noqa: ARG001 return skip -def setup(sphinx): +def setup(sphinx) -> None: sphinx.connect("autoapi-skip-member", autoapi_skip_member_fn) diff --git a/examples/create-context.py b/examples/create-context.py index 760c8513e..0026d6162 100644 --- a/examples/create-context.py +++ b/examples/create-context.py @@ -25,14 +25,14 @@ runtime = RuntimeEnvBuilder().with_disk_manager_os().with_fair_spill_pool(10000000) config = ( SessionConfig() - .with_create_default_catalog_and_schema(True) + .with_create_default_catalog_and_schema(enabled=True) .with_default_catalog_and_schema("foo", "bar") .with_target_partitions(8) - .with_information_schema(True) - .with_repartition_joins(False) - .with_repartition_aggregations(False) - .with_repartition_windows(False) - .with_parquet_pruning(False) + .with_information_schema(enabled=True) + .with_repartition_joins(enabled=False) + .with_repartition_aggregations(enabled=False) + .with_repartition_windows(enabled=False) + .with_parquet_pruning(enabled=False) .set("datafusion.execution.parquet.pushdown_filters", "true") ) ctx = SessionContext(config, runtime) diff --git a/examples/python-udaf.py b/examples/python-udaf.py index 538f69571..6655edb0a 100644 --- a/examples/python-udaf.py +++ b/examples/python-udaf.py @@ -16,7 +16,7 @@ # under the License. import datafusion -import pyarrow +import pyarrow as pa import pyarrow.compute from datafusion import Accumulator, col, udaf @@ -26,25 +26,21 @@ class MyAccumulator(Accumulator): Interface of a user-defined accumulation. """ - def __init__(self): - self._sum = pyarrow.scalar(0.0) + def __init__(self) -> None: + self._sum = pa.scalar(0.0) - def update(self, values: pyarrow.Array) -> None: + def update(self, values: pa.Array) -> None: # not nice since pyarrow scalars can't be summed yet. This breaks on `None` - self._sum = pyarrow.scalar( - self._sum.as_py() + pyarrow.compute.sum(values).as_py() - ) + self._sum = pa.scalar(self._sum.as_py() + pa.compute.sum(values).as_py()) - def merge(self, states: pyarrow.Array) -> None: + def merge(self, states: pa.Array) -> None: # not nice since pyarrow scalars can't be summed yet. 
This breaks on `None` - self._sum = pyarrow.scalar( - self._sum.as_py() + pyarrow.compute.sum(states).as_py() - ) + self._sum = pa.scalar(self._sum.as_py() + pa.compute.sum(states).as_py()) - def state(self) -> pyarrow.Array: - return pyarrow.array([self._sum.as_py()]) + def state(self) -> pa.Array: + return pa.array([self._sum.as_py()]) - def evaluate(self) -> pyarrow.Scalar: + def evaluate(self) -> pa.Scalar: return self._sum @@ -52,17 +48,17 @@ def evaluate(self) -> pyarrow.Scalar: ctx = datafusion.SessionContext() # create a RecordBatch and a new DataFrame from it -batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], +batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], names=["a", "b"], ) df = ctx.create_dataframe([[batch]]) my_udaf = udaf( MyAccumulator, - pyarrow.float64(), - pyarrow.float64(), - [pyarrow.float64()], + pa.float64(), + pa.float64(), + [pa.float64()], "stable", ) @@ -70,4 +66,4 @@ def evaluate(self) -> pyarrow.Scalar: result = df.collect()[0] -assert result.column(0) == pyarrow.array([6.0]) +assert result.column(0) == pa.array([6.0]) diff --git a/examples/python-udf-comparisons.py b/examples/python-udf-comparisons.py index c5d5ec8dd..eb0825011 100644 --- a/examples/python-udf-comparisons.py +++ b/examples/python-udf-comparisons.py @@ -112,8 +112,8 @@ def is_of_interest_impl( returnflag_arr: pa.Array, ) -> pa.Array: result = [] - for idx, partkey in enumerate(partkey_arr): - partkey = partkey.as_py() + for idx, partkey_val in enumerate(partkey_arr): + partkey = partkey_val.as_py() suppkey = suppkey_arr[idx].as_py() returnflag = returnflag_arr[idx].as_py() value = (partkey, suppkey, returnflag) @@ -162,10 +162,7 @@ def udf_using_pyarrow_compute_impl( resultant_arr = pc.and_(filtered_partkey_arr, filtered_suppkey_arr) resultant_arr = pc.and_(resultant_arr, filtered_returnflag_arr) - if results is None: - results = resultant_arr - else: - results = pc.or_(results, resultant_arr) + results = resultant_arr if results is None else pc.or_(results, resultant_arr) return results diff --git a/examples/python-udf.py b/examples/python-udf.py index fb2bc253e..1c08acd1a 100644 --- a/examples/python-udf.py +++ b/examples/python-udf.py @@ -15,23 +15,23 @@ # specific language governing permissions and limitations # under the License. -import pyarrow +import pyarrow as pa from datafusion import SessionContext, udf from datafusion import functions as f -def is_null(array: pyarrow.Array) -> pyarrow.Array: +def is_null(array: pa.Array) -> pa.Array: return array.is_null() -is_null_arr = udf(is_null, [pyarrow.int64()], pyarrow.bool_(), "stable") +is_null_arr = udf(is_null, [pa.int64()], pa.bool_(), "stable") # create a context ctx = SessionContext() # create a RecordBatch and a new DataFrame from it -batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], +batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], names=["a", "b"], ) df = ctx.create_dataframe([[batch]]) @@ -40,4 +40,4 @@ def is_null(array: pyarrow.Array) -> pyarrow.Array: result = df.collect()[0] -assert result.column(0) == pyarrow.array([False] * 3) +assert result.column(0) == pa.array([False] * 3) diff --git a/examples/query-pyarrow-data.py b/examples/query-pyarrow-data.py index e3456fb5b..9cfe8a62b 100644 --- a/examples/query-pyarrow-data.py +++ b/examples/query-pyarrow-data.py @@ -16,15 +16,15 @@ # under the License. 
import datafusion -import pyarrow +import pyarrow as pa from datafusion import col # create a context ctx = datafusion.SessionContext() # create a RecordBatch and a new DataFrame from it -batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], +batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], names=["a", "b"], ) df = ctx.create_dataframe([[batch]]) @@ -38,5 +38,5 @@ # execute and collect the first (and only) batch result = df.collect()[0] -assert result.column(0) == pyarrow.array([5, 7, 9]) -assert result.column(1) == pyarrow.array([-3, -3, -3]) +assert result.column(0) == pa.array([5, 7, 9]) +assert result.column(1) == pa.array([-3, -3, -3]) diff --git a/examples/sql-using-python-udaf.py b/examples/sql-using-python-udaf.py index 60ab8d134..32ce38900 100644 --- a/examples/sql-using-python-udaf.py +++ b/examples/sql-using-python-udaf.py @@ -25,7 +25,7 @@ class MyAccumulator(Accumulator): Interface of a user-defined accumulation. """ - def __init__(self): + def __init__(self) -> None: self._sum = pa.scalar(0.0) def update(self, values: pa.Array) -> None: diff --git a/examples/tpch/_tests.py b/examples/tpch/_tests.py index 2be4dfabd..80ff80244 100644 --- a/examples/tpch/_tests.py +++ b/examples/tpch/_tests.py @@ -91,7 +91,7 @@ def check_q17(df): ("q22_global_sales_opportunity", "q22"), ], ) -def test_tpch_query_vs_answer_file(query_code: str, answer_file: str): +def test_tpch_query_vs_answer_file(query_code: str, answer_file: str) -> None: module = import_module(query_code) df: DataFrame = module.df @@ -122,3 +122,5 @@ def test_tpch_query_vs_answer_file(query_code: str, answer_file: str): assert df.join(df_expected, on=cols, how="anti").count() == 0 assert df.count() == df_expected.count() + + return None diff --git a/examples/tpch/convert_data_to_parquet.py b/examples/tpch/convert_data_to_parquet.py index 73097fac5..fd0fcca49 100644 --- a/examples/tpch/convert_data_to_parquet.py +++ b/examples/tpch/convert_data_to_parquet.py @@ -25,112 +25,112 @@ import os import datafusion -import pyarrow +import pyarrow as pa ctx = datafusion.SessionContext() all_schemas = {} all_schemas["customer"] = [ - ("C_CUSTKEY", pyarrow.int64()), - ("C_NAME", pyarrow.string()), - ("C_ADDRESS", pyarrow.string()), - ("C_NATIONKEY", pyarrow.int64()), - ("C_PHONE", pyarrow.string()), - ("C_ACCTBAL", pyarrow.decimal128(15, 2)), - ("C_MKTSEGMENT", pyarrow.string()), - ("C_COMMENT", pyarrow.string()), + ("C_CUSTKEY", pa.int64()), + ("C_NAME", pa.string()), + ("C_ADDRESS", pa.string()), + ("C_NATIONKEY", pa.int64()), + ("C_PHONE", pa.string()), + ("C_ACCTBAL", pa.decimal128(15, 2)), + ("C_MKTSEGMENT", pa.string()), + ("C_COMMENT", pa.string()), ] all_schemas["lineitem"] = [ - ("L_ORDERKEY", pyarrow.int64()), - ("L_PARTKEY", pyarrow.int64()), - ("L_SUPPKEY", pyarrow.int64()), - ("L_LINENUMBER", pyarrow.int32()), - ("L_QUANTITY", pyarrow.decimal128(15, 2)), - ("L_EXTENDEDPRICE", pyarrow.decimal128(15, 2)), - ("L_DISCOUNT", pyarrow.decimal128(15, 2)), - ("L_TAX", pyarrow.decimal128(15, 2)), - ("L_RETURNFLAG", pyarrow.string()), - ("L_LINESTATUS", pyarrow.string()), - ("L_SHIPDATE", pyarrow.date32()), - ("L_COMMITDATE", pyarrow.date32()), - ("L_RECEIPTDATE", pyarrow.date32()), - ("L_SHIPINSTRUCT", pyarrow.string()), - ("L_SHIPMODE", pyarrow.string()), - ("L_COMMENT", pyarrow.string()), + ("L_ORDERKEY", pa.int64()), + ("L_PARTKEY", pa.int64()), + ("L_SUPPKEY", pa.int64()), + ("L_LINENUMBER", pa.int32()), + ("L_QUANTITY", pa.decimal128(15, 2)), + 
("L_EXTENDEDPRICE", pa.decimal128(15, 2)), + ("L_DISCOUNT", pa.decimal128(15, 2)), + ("L_TAX", pa.decimal128(15, 2)), + ("L_RETURNFLAG", pa.string()), + ("L_LINESTATUS", pa.string()), + ("L_SHIPDATE", pa.date32()), + ("L_COMMITDATE", pa.date32()), + ("L_RECEIPTDATE", pa.date32()), + ("L_SHIPINSTRUCT", pa.string()), + ("L_SHIPMODE", pa.string()), + ("L_COMMENT", pa.string()), ] all_schemas["nation"] = [ - ("N_NATIONKEY", pyarrow.int64()), - ("N_NAME", pyarrow.string()), - ("N_REGIONKEY", pyarrow.int64()), - ("N_COMMENT", pyarrow.string()), + ("N_NATIONKEY", pa.int64()), + ("N_NAME", pa.string()), + ("N_REGIONKEY", pa.int64()), + ("N_COMMENT", pa.string()), ] all_schemas["orders"] = [ - ("O_ORDERKEY", pyarrow.int64()), - ("O_CUSTKEY", pyarrow.int64()), - ("O_ORDERSTATUS", pyarrow.string()), - ("O_TOTALPRICE", pyarrow.decimal128(15, 2)), - ("O_ORDERDATE", pyarrow.date32()), - ("O_ORDERPRIORITY", pyarrow.string()), - ("O_CLERK", pyarrow.string()), - ("O_SHIPPRIORITY", pyarrow.int32()), - ("O_COMMENT", pyarrow.string()), + ("O_ORDERKEY", pa.int64()), + ("O_CUSTKEY", pa.int64()), + ("O_ORDERSTATUS", pa.string()), + ("O_TOTALPRICE", pa.decimal128(15, 2)), + ("O_ORDERDATE", pa.date32()), + ("O_ORDERPRIORITY", pa.string()), + ("O_CLERK", pa.string()), + ("O_SHIPPRIORITY", pa.int32()), + ("O_COMMENT", pa.string()), ] all_schemas["part"] = [ - ("P_PARTKEY", pyarrow.int64()), - ("P_NAME", pyarrow.string()), - ("P_MFGR", pyarrow.string()), - ("P_BRAND", pyarrow.string()), - ("P_TYPE", pyarrow.string()), - ("P_SIZE", pyarrow.int32()), - ("P_CONTAINER", pyarrow.string()), - ("P_RETAILPRICE", pyarrow.decimal128(15, 2)), - ("P_COMMENT", pyarrow.string()), + ("P_PARTKEY", pa.int64()), + ("P_NAME", pa.string()), + ("P_MFGR", pa.string()), + ("P_BRAND", pa.string()), + ("P_TYPE", pa.string()), + ("P_SIZE", pa.int32()), + ("P_CONTAINER", pa.string()), + ("P_RETAILPRICE", pa.decimal128(15, 2)), + ("P_COMMENT", pa.string()), ] all_schemas["partsupp"] = [ - ("PS_PARTKEY", pyarrow.int64()), - ("PS_SUPPKEY", pyarrow.int64()), - ("PS_AVAILQTY", pyarrow.int32()), - ("PS_SUPPLYCOST", pyarrow.decimal128(15, 2)), - ("PS_COMMENT", pyarrow.string()), + ("PS_PARTKEY", pa.int64()), + ("PS_SUPPKEY", pa.int64()), + ("PS_AVAILQTY", pa.int32()), + ("PS_SUPPLYCOST", pa.decimal128(15, 2)), + ("PS_COMMENT", pa.string()), ] all_schemas["region"] = [ - ("r_REGIONKEY", pyarrow.int64()), - ("r_NAME", pyarrow.string()), - ("r_COMMENT", pyarrow.string()), + ("r_REGIONKEY", pa.int64()), + ("r_NAME", pa.string()), + ("r_COMMENT", pa.string()), ] all_schemas["supplier"] = [ - ("S_SUPPKEY", pyarrow.int64()), - ("S_NAME", pyarrow.string()), - ("S_ADDRESS", pyarrow.string()), - ("S_NATIONKEY", pyarrow.int32()), - ("S_PHONE", pyarrow.string()), - ("S_ACCTBAL", pyarrow.decimal128(15, 2)), - ("S_COMMENT", pyarrow.string()), + ("S_SUPPKEY", pa.int64()), + ("S_NAME", pa.string()), + ("S_ADDRESS", pa.string()), + ("S_NATIONKEY", pa.int32()), + ("S_PHONE", pa.string()), + ("S_ACCTBAL", pa.decimal128(15, 2)), + ("S_COMMENT", pa.string()), ] curr_dir = os.path.dirname(os.path.abspath(__file__)) -for filename, curr_schema in all_schemas.items(): +for filename, curr_schema_val in all_schemas.items(): # For convenience, go ahead and convert the schema column names to lowercase - curr_schema = [(s[0].lower(), s[1]) for s in curr_schema] + curr_schema = [(s[0].lower(), s[1]) for s in curr_schema_val] # Pre-collect the output columns so we can ignore the null field we add # in to handle the trailing | in the file output_cols = [r[0] for r in curr_schema] - 
curr_schema = [pyarrow.field(r[0], r[1], nullable=False) for r in curr_schema] + curr_schema = [pa.field(r[0], r[1], nullable=False) for r in curr_schema] # Trailing | requires extra field for in processing - curr_schema.append(("some_null", pyarrow.null())) + curr_schema.append(("some_null", pa.null())) - schema = pyarrow.schema(curr_schema) + schema = pa.schema(curr_schema) source_file = os.path.abspath( os.path.join(curr_dir, f"../../benchmarks/tpch/data/{filename}.csv") diff --git a/examples/tpch/q08_market_share.py b/examples/tpch/q08_market_share.py index d46df30f2..4bf50efba 100644 --- a/examples/tpch/q08_market_share.py +++ b/examples/tpch/q08_market_share.py @@ -150,7 +150,7 @@ df = df.with_column( "national_volume", F.case(col("s_suppkey").is_null()) - .when(lit(False), col("volume")) + .when(lit(value=False), col("volume")) .otherwise(lit(0.0)), ) diff --git a/examples/tpch/q19_discounted_revenue.py b/examples/tpch/q19_discounted_revenue.py index 2b87e1120..bd492aac0 100644 --- a/examples/tpch/q19_discounted_revenue.py +++ b/examples/tpch/q19_discounted_revenue.py @@ -89,8 +89,8 @@ def is_of_interest( same number of rows in the output. """ result = [] - for idx, brand in enumerate(brand_arr): - brand = brand.as_py() + for idx, brand_val in enumerate(brand_arr): + brand = brand_val.as_py() if brand in items_of_interest: values_of_interest = items_of_interest[brand] diff --git a/examples/tpch/q21_suppliers_kept_orders_waiting.py b/examples/tpch/q21_suppliers_kept_orders_waiting.py index 9bbaad779..619c4406b 100644 --- a/examples/tpch/q21_suppliers_kept_orders_waiting.py +++ b/examples/tpch/q21_suppliers_kept_orders_waiting.py @@ -65,7 +65,7 @@ df = df.with_column( "failed_supp", F.case(col("l_receiptdate") > col("l_commitdate")) - .when(lit(True), col("l_suppkey")) + .when(lit(value=True), col("l_suppkey")) .end(), ) diff --git a/pyproject.toml b/pyproject.toml index a4ed18c4c..d86b657ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,37 +80,17 @@ ignore = [ "TD003", # Allow TODO lines "UP007", # Disallowing Union is pedantic # TODO: Enable all of the following, but this PR is getting too large already - "PT001", - "ANN204", - "B008", - "EM101", "PLR0913", - "PLR1714", - "ANN201", - "C400", "TRY003", - "B904", - "UP006", - "RUF012", - "FBT003", - "C416", - "SIM102", - "PGH003", "PLR2004", - "PERF401", "PD901", - "EM102", "ERA001", - "SIM108", - "ICN001", "ANN001", "ANN202", "PTH", "N812", "INP001", "DTZ007", - "PLW2901", - "RET503", "RUF015", "A005", "TC001", diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 286e5dc31..d871fdb71 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -92,17 +92,17 @@ ] -def column(value: str): +def column(value: str) -> Expr: """Create a column expression.""" return Expr.column(value) -def col(value: str): +def col(value: str) -> Expr: """Create a column expression.""" return Expr.column(value) -def literal(value): +def literal(value) -> Expr: """Create a literal expression.""" return Expr.literal(value) @@ -120,6 +120,6 @@ def str_lit(value): return string_literal(value) -def lit(value): +def lit(value) -> Expr: """Create a literal expression.""" return Expr.literal(value) diff --git a/python/datafusion/catalog.py b/python/datafusion/catalog.py index 0560f4704..6c3f188cc 100644 --- a/python/datafusion/catalog.py +++ b/python/datafusion/catalog.py @@ -24,7 +24,7 @@ import datafusion._internal as df_internal if TYPE_CHECKING: - import pyarrow + import pyarrow as pa class Catalog: @@ 
-67,7 +67,7 @@ def __init__(self, table: df_internal.Table) -> None: self.table = table @property - def schema(self) -> pyarrow.Schema: + def schema(self) -> pa.Schema: """Returns the schema associated with this table.""" return self.table.schema diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 58ad9a943..1429a4975 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -40,9 +40,9 @@ if TYPE_CHECKING: import pathlib - import pandas - import polars - import pyarrow + import pandas as pd + import polars as pl + import pyarrow as pa from datafusion.plan import ExecutionPlan, LogicalPlan @@ -537,7 +537,7 @@ def register_listing_table( path: str | pathlib.Path, table_partition_cols: list[tuple[str, str]] | None = None, file_extension: str = ".parquet", - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, file_sort_order: list[list[Expr | SortExpr]] | None = None, ) -> None: """Register multiple files as a single table. @@ -606,14 +606,14 @@ def sql_with_options(self, query: str, options: SQLOptions) -> DataFrame: def create_dataframe( self, - partitions: list[list[pyarrow.RecordBatch]], + partitions: list[list[pa.RecordBatch]], name: str | None = None, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, ) -> DataFrame: """Create and return a dataframe using the provided partitions. Args: - partitions: :py:class:`pyarrow.RecordBatch` partitions to register. + partitions: :py:class:`pa.RecordBatch` partitions to register. name: Resultant dataframe name. schema: Schema for the partitions. @@ -684,16 +684,14 @@ def from_arrow( return DataFrame(self.ctx.from_arrow(data, name)) @deprecated("Use ``from_arrow`` instead.") - def from_arrow_table( - self, data: pyarrow.Table, name: str | None = None - ) -> DataFrame: + def from_arrow_table(self, data: pa.Table, name: str | None = None) -> DataFrame: """Create a :py:class:`~datafusion.dataframe.DataFrame` from an Arrow table. This is an alias for :py:func:`from_arrow`. """ return self.from_arrow(data, name) - def from_pandas(self, data: pandas.DataFrame, name: str | None = None) -> DataFrame: + def from_pandas(self, data: pd.DataFrame, name: str | None = None) -> DataFrame: """Create a :py:class:`~datafusion.dataframe.DataFrame` from a Pandas DataFrame. Args: @@ -705,7 +703,7 @@ def from_pandas(self, data: pandas.DataFrame, name: str | None = None) -> DataFr """ return DataFrame(self.ctx.from_pandas(data, name)) - def from_polars(self, data: polars.DataFrame, name: str | None = None) -> DataFrame: + def from_polars(self, data: pl.DataFrame, name: str | None = None) -> DataFrame: """Create a :py:class:`~datafusion.dataframe.DataFrame` from a Polars DataFrame. Args: @@ -719,7 +717,7 @@ def from_polars(self, data: polars.DataFrame, name: str | None = None) -> DataFr # https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116 # is the discussion on how we arrived at adding register_view - def register_view(self, name: str, df: DataFrame): + def register_view(self, name: str, df: DataFrame) -> None: """Register a :py:class: `~datafusion.detaframe.DataFrame` as a view. Args: @@ -755,7 +753,7 @@ def register_table_provider( self.ctx.register_table_provider(name, provider) def register_record_batches( - self, name: str, partitions: list[list[pyarrow.RecordBatch]] + self, name: str, partitions: list[list[pa.RecordBatch]] ) -> None: """Register record batches as a table. 
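For reference, the registration API touched by the hunk above can be exercised as in this minimal sketch; it is not part of the patch, and the table name and data are made up for illustration:

```
import pyarrow as pa
from datafusion import SessionContext

ctx = SessionContext()
batch = pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], names=["a"])
# Partitions are given as a list of lists of record batches; "t" is an
# illustrative table name.
ctx.register_record_batches("t", [[batch]])
assert ctx.sql("SELECT SUM(a) FROM t").collect()[0].column(0) == pa.array([6])
```
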
@@ -776,7 +774,7 @@ def register_parquet( parquet_pruning: bool = True, file_extension: str = ".parquet", skip_metadata: bool = True, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, file_sort_order: list[list[SortExpr]] | None = None, ) -> None: """Register a Parquet file as a table. @@ -817,7 +815,7 @@ def register_csv( self, name: str, path: str | pathlib.Path | list[str | pathlib.Path], - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, has_header: bool = True, delimiter: str = ",", schema_infer_max_records: int = 1000, @@ -843,10 +841,7 @@ def register_csv( selected for data input. file_compression_type: File compression type. """ - if isinstance(path, list): - path = [str(p) for p in path] - else: - path = str(path) + path = [str(p) for p in path] if isinstance(path, list) else str(path) self.ctx.register_csv( name, @@ -863,7 +858,7 @@ def register_json( self, name: str, path: str | pathlib.Path, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, schema_infer_max_records: int = 1000, file_extension: str = ".json", table_partition_cols: list[tuple[str, str]] | None = None, @@ -901,7 +896,7 @@ def register_avro( self, name: str, path: str | pathlib.Path, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, file_extension: str = ".avro", table_partition_cols: list[tuple[str, str]] | None = None, ) -> None: @@ -923,8 +918,8 @@ def register_avro( name, str(path), schema, file_extension, table_partition_cols ) - def register_dataset(self, name: str, dataset: pyarrow.dataset.Dataset) -> None: - """Register a :py:class:`pyarrow.dataset.Dataset` as a table. + def register_dataset(self, name: str, dataset: pa.dataset.Dataset) -> None: + """Register a :py:class:`pa.dataset.Dataset` as a table. Args: name: Name of the table to register. @@ -975,7 +970,7 @@ def session_id(self) -> str: def read_json( self, path: str | pathlib.Path, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, schema_infer_max_records: int = 1000, file_extension: str = ".json", table_partition_cols: list[tuple[str, str]] | None = None, @@ -1012,7 +1007,7 @@ def read_json( def read_csv( self, path: str | pathlib.Path | list[str] | list[pathlib.Path], - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, has_header: bool = True, delimiter: str = ",", schema_infer_max_records: int = 1000, @@ -1065,7 +1060,7 @@ def read_parquet( parquet_pruning: bool = True, file_extension: str = ".parquet", skip_metadata: bool = True, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, file_sort_order: list[list[Expr | SortExpr]] | None = None, ) -> DataFrame: """Read a Parquet source into a :py:class:`~datafusion.dataframe.Dataframe`. 
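As a usage sketch for the `read_parquet` signature shown above (illustrative only; the file path and schema here are hypothetical), an explicit schema can be supplied instead of relying on inference:

```
import pyarrow as pa
from datafusion import SessionContext

ctx = SessionContext()
schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
# Providing a schema avoids inferring it from the file metadata.
df = ctx.read_parquet("data/example.parquet", schema=schema)
```
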
@@ -1110,7 +1105,7 @@ def read_parquet( def read_avro( self, path: str | pathlib.Path, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, file_partition_cols: list[tuple[str, str]] | None = None, file_extension: str = ".avro", ) -> DataFrame: diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index d1c71c2bb..26fe8f453 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -26,10 +26,8 @@ TYPE_CHECKING, Any, Iterable, - List, Literal, Optional, - Type, Union, overload, ) @@ -75,7 +73,7 @@ class Compression(Enum): LZ4_RAW = "lz4_raw" @classmethod - def from_str(cls: Type[Compression], value: str) -> Compression: + def from_str(cls: type[Compression], value: str) -> Compression: """Convert a string to a Compression enum value. Args: @@ -89,11 +87,13 @@ def from_str(cls: Type[Compression], value: str) -> Compression: """ try: return cls(value.lower()) - except ValueError: + except ValueError as err: valid_values = str([item.value for item in Compression]) - raise ValueError( - f"{value} is not a valid Compression. Valid values are: {valid_values}" - ) + error_msg = f""" + {value} is not a valid Compression. + Valid values are: {valid_values} + """ + raise ValueError(error_msg) from err def get_default_level(self) -> Optional[int]: """Get the default compression level for the compression type. @@ -132,7 +132,7 @@ def into_view(self) -> pa.Table: """Convert DataFrame as a ViewTable which can be used in register_table.""" return self.df.into_view() - def __getitem__(self, key: str | List[str]) -> DataFrame: + def __getitem__(self, key: str | list[str]) -> DataFrame: """Return a new :py:class`DataFrame` with the specified column or columns. Args: @@ -287,8 +287,7 @@ def _simplify_expression( if isinstance(expr, Expr): expr_list.append(expr.expr) elif isinstance(expr, Iterable): - for inner_expr in expr: - expr_list.append(inner_expr.expr) + expr_list.extend(inner_expr.expr for inner_expr in expr) else: raise NotImplementedError if named_exprs: @@ -513,10 +512,15 @@ def join( # This check is to prevent breaking API changes where users prior to # DF 43.0.0 would pass the join_keys as a positional argument instead # of a keyword argument. - if isinstance(on, tuple) and len(on) == 2: - if isinstance(on[0], list) and isinstance(on[1], list): - join_keys = on # type: ignore - on = None + if ( + isinstance(on, tuple) + and len(on) == 2 + and isinstance(on[0], list) + and isinstance(on[1], list) + ): + # We know this is safe because we've checked the types + join_keys = on # type: ignore[assignment] + on = None if join_keys is not None: warnings.warn( @@ -529,18 +533,17 @@ def join( if on is not None: if left_on is not None or right_on is not None: - raise ValueError( - "`left_on` or `right_on` should not provided with `on`" - ) + error_msg = "`left_on` or `right_on` should not provided with `on`" + raise ValueError(error_msg) left_on = on right_on = on elif left_on is not None or right_on is not None: if left_on is None or right_on is None: - raise ValueError("`left_on` and `right_on` should both be provided.") + error_msg = "`left_on` and `right_on` should both be provided." + raise ValueError(error_msg) else: - raise ValueError( - "either `on` or `left_on` and `right_on` should be provided." - ) + error_msg = "either `on` or `left_on` and `right_on` should be provided." 
+ raise ValueError(error_msg) if isinstance(left_on, str): left_on = [left_on] if isinstance(right_on, str): @@ -726,9 +729,11 @@ def write_parquet( if isinstance(compression, str): compression = Compression.from_str(compression) - if compression in {Compression.GZIP, Compression.BROTLI, Compression.ZSTD}: - if compression_level is None: - compression_level = compression.get_default_level() + if ( + compression in {Compression.GZIP, Compression.BROTLI, Compression.ZSTD} + and compression_level is None + ): + compression_level = compression.get_default_level() self.df.write_parquet(str(path), compression.value, compression_level) @@ -824,7 +829,7 @@ def unnest_columns(self, *columns: str, preserve_nulls: bool = True) -> DataFram Returns: A DataFrame with the columns expanded. """ - columns = [c for c in columns] + columns = list(columns) return DataFrame(self.df.unnest_columns(columns, preserve_nulls=preserve_nulls)) def __arrow_c_stream__(self, requested_schema: pa.Schema) -> Any: diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 77b6c272d..2697d8143 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -22,7 +22,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Optional, Type +from typing import TYPE_CHECKING, Any, ClassVar, Optional import pyarrow as pa @@ -176,7 +176,7 @@ def sort_or_default(e: Expr | SortExpr) -> expr_internal.SortExpr: """Helper function to return a default Sort if an Expr is provided.""" if isinstance(e, SortExpr): return e.raw_sort - return SortExpr(e, True, True).raw_sort + return SortExpr(e, ascending=True, nulls_first=True).raw_sort def sort_list_to_raw_sort_list( @@ -439,24 +439,21 @@ def fill_null(self, value: Any | Expr | None = None) -> Expr: value = Expr.literal(value) return Expr(functions_internal.nvl(self.expr, value.expr)) - _to_pyarrow_types = { + _to_pyarrow_types: ClassVar[dict[type, pa.DataType]] = { float: pa.float64(), int: pa.int64(), str: pa.string(), bool: pa.bool_(), } - def cast( - self, to: pa.DataType[Any] | Type[float] | Type[int] | Type[str] | Type[bool] - ) -> Expr: + def cast(self, to: pa.DataType[Any] | type[float | int | str | bool]) -> Expr: """Cast to a new data type.""" if not isinstance(to, pa.DataType): try: to = self._to_pyarrow_types[to] - except KeyError: - raise TypeError( - "Expected instance of pyarrow.DataType or builtins.type" - ) + except KeyError as err: + error_msg = "Expected instance of pyarrow.DataType or builtins.type" + raise TypeError(error_msg) from err return Expr(self.expr.cast(to)) @@ -565,9 +562,7 @@ def partition_by(self, *partition_by: Expr) -> ExprFuncBuilder: set parameters for either window or aggregate functions. If used on any other type of expression, an error will be generated when ``build()`` is called. """ - return ExprFuncBuilder( - self.expr.partition_by(list(e.expr for e in partition_by)) - ) + return ExprFuncBuilder(self.expr.partition_by([e.expr for e in partition_by])) def window_frame(self, window_frame: WindowFrame) -> ExprFuncBuilder: """Set the frame fora window function. 
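A minimal sketch of the builtin-type `cast` path that the expr.py hunks above retype; the data is illustrative and not part of the patch:

```
import pyarrow as pa
from datafusion import SessionContext, col

ctx = SessionContext()
df = ctx.from_arrow(pa.table({"v": [1.9, 2.1]}))
# Builtin Python types map through _to_pyarrow_types, so cast(int) is
# equivalent to cast(pa.int64()).
df = df.select(col("v").cast(int).alias("v_int"))
```
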
@@ -610,7 +605,7 @@ def over(self, window: Window) -> Expr: class ExprFuncBuilder: - def __init__(self, builder: expr_internal.ExprFuncBuilder): + def __init__(self, builder: expr_internal.ExprFuncBuilder) -> None: self.builder = builder def order_by(self, *exprs: Expr) -> ExprFuncBuilder: @@ -638,7 +633,7 @@ def null_treatment(self, null_treatment: NullTreatment) -> ExprFuncBuilder: def partition_by(self, *partition_by: Expr) -> ExprFuncBuilder: """Set partitioning for window functions.""" return ExprFuncBuilder( - self.builder.partition_by(list(e.expr for e in partition_by)) + self.builder.partition_by([e.expr for e in partition_by]) ) def window_frame(self, window_frame: WindowFrame) -> ExprFuncBuilder: @@ -693,11 +688,11 @@ def __init__( """ if not isinstance(start_bound, pa.Scalar) and start_bound is not None: start_bound = pa.scalar(start_bound) - if units == "rows" or units == "groups": + if units in ("rows", "groups"): start_bound = start_bound.cast(pa.uint64()) if not isinstance(end_bound, pa.Scalar) and end_bound is not None: end_bound = pa.scalar(end_bound) - if units == "rows" or units == "groups": + if units in ("rows", "groups"): end_bound = end_bound.cast(pa.uint64()) self.window_frame = expr_internal.WindowFrame(units, start_bound, end_bound) @@ -709,7 +704,7 @@ def get_lower_bound(self) -> WindowFrameBound: """Returns starting bound.""" return WindowFrameBound(self.window_frame.get_lower_bound()) - def get_upper_bound(self): + def get_upper_bound(self) -> WindowFrameBound: """Returns end bound.""" return WindowFrameBound(self.window_frame.get_upper_bound()) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 26bac149c..5cf914e16 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -790,10 +790,7 @@ def regexp_count( """ if flags is not None: flags = flags.expr - if start is not None: - start = start.expr - else: - start = Expr.expr + start = start.expr if start is not None else Expr.expr return Expr(f.regexp_count(string.expr, pattern.expr, start, flags)) @@ -817,13 +814,15 @@ def right(string: Expr, n: Expr) -> Expr: return Expr(f.right(string.expr, n.expr)) -def round(value: Expr, decimal_places: Expr = Expr.literal(0)) -> Expr: +def round(value: Expr, decimal_places: Expr | None = None) -> Expr: """Round the argument to the nearest integer. If the optional ``decimal_places`` is specified, round to the nearest number of decimal places. You can specify a negative number of decimal places. For example ``round(lit(125.2345), lit(-2))`` would yield a value of ``100.0``. """ + if decimal_places is None: + decimal_places = Expr.literal(0) return Expr(f.round(value.expr, decimal_places.expr)) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 161e1e3bb..37f2075f5 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -81,7 +81,7 @@ def test_literal(df): literal("1"), literal("OK"), literal(3.14), - literal(True), + literal(value=True), literal(b"hello world"), ) result = df.collect() diff --git a/python/tests/test_wrapper_coverage.py b/python/tests/test_wrapper_coverage.py index a2de2d32b..926a65961 100644 --- a/python/tests/test_wrapper_coverage.py +++ b/python/tests/test_wrapper_coverage.py @@ -28,7 +28,7 @@ from enum import EnumMeta as EnumType -def missing_exports(internal_obj, wrapped_obj) -> None: # noqa: C901 +def missing_exports(internal_obj, wrapped_obj) -> None: """ Identify if any of the rust exposted structs or functions do not have wrappers. 
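The wrapper-coverage test above walks the Rust extension module and the Python package in parallel; the following is a much-simplified sketch of the idea, not the actual test logic:

```
import datafusion
import datafusion._internal as internal

# Every public name exposed by the Rust extension should have a Python wrapper.
missing = [
    name
    for name in dir(internal)
    if not name.startswith("_") and not hasattr(datafusion, name)
]
assert not missing, f"Missing wrappers: {missing}"
```
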
@@ -56,9 +56,8 @@ def missing_exports(internal_obj, wrapped_obj) -> None:  # noqa: C901
     # __kwdefaults__ and __doc__. As long as these are None on the internal
     # object, it's okay to skip them. However if they do exist on the internal
     # object they must also exist on the wrapped object.
-    if internal_attr is not None:
-        if wrapped_attr is None:
-            pytest.fail(f"Missing attribute: {internal_attr_name}")
+    if internal_attr is not None and wrapped_attr is None:
+        pytest.fail(f"Missing attribute: {internal_attr_name}")
 
     if internal_attr_name in ["__self__", "__class__"]:
         continue

From 42982dad27ad03e7e9395d4c3ae3064c2b489434 Mon Sep 17 00:00:00 2001
From: Tim Saucer
Date: Sat, 22 Mar 2025 10:14:55 -0400
Subject: [PATCH 024/145] Improve collection during repr and repr_html (#1036)

* Improve table readout of a dataframe in jupyter notebooks by making the
table scrollable and displaying the first record batch up to 2MB

* Add option to only display a portion of a cell data and the user can click
on a button to toggle showing more or less

* We cannot expect that the first non-empty batch is sufficient for our 2MB
limit, so switch over to collecting until we run out or use up the size

* Update python unit test to allow the additional formatting data to exist
and only check the table contents

* Combining collection for repr and repr_html into one function

* Small clippy suggestion

* Collect was occurring twice on repr

* Switch to execute_stream_partitioned
---
 python/tests/test_dataframe.py |  23 ++--
 src/dataframe.rs               | 240 ++++++++++++++++++++++++++++-----
 src/utils.rs                   |   2 +-
 3 files changed, 225 insertions(+), 40 deletions(-)

diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
index 384b17878..718ebf69d 100644
--- a/python/tests/test_dataframe.py
+++ b/python/tests/test_dataframe.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 import os
+import re
 from typing import Any
 
 import pyarrow as pa
@@ -1245,13 +1246,17 @@ def add_with_parameter(df_internal, value: Any) -> DataFrame:
 
 def test_dataframe_repr_html(df) -> None:
     output = df._repr_html_()
 
-    ref_html = """<table border='1'>
-<tr><th>a</td><th>b</td><th>c</td></tr>
-<tr><td>1</td><td>4</td><td>8</td></tr>
-<tr><td>2</td><td>5</td><td>5</td></tr>
-<tr><td>3</td><td>6</td><td>8</td></tr>
-</table>
- """ + # Since we've added a fair bit of processing to the html output, lets just verify + # the values we are expecting in the table exist. Use regex and ignore everything + # between the and . We also don't want the closing > on the + # td and th segments because that is where the formatting data is written. - # Ignore whitespace just to make this test look cleaner - assert output.replace(" ", "") == ref_html.replace(" ", "") + headers = ["a", "b", "c"] + headers = [f"{v}" for v in headers] + header_pattern = "(.*?)".join(headers) + assert len(re.findall(header_pattern, output, re.DOTALL)) == 1 + + body_data = [[1, 4, 8], [2, 5, 5], [3, 6, 8]] + body_lines = [f"{v}" for inner in body_data for v in inner] + body_pattern = "(.*?)".join(body_lines) + assert len(re.findall(body_pattern, output, re.DOTALL)) == 1 diff --git a/src/dataframe.rs b/src/dataframe.rs index 243e2e14f..be10b8c28 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -31,9 +31,11 @@ use datafusion::common::UnnestOptions; use datafusion::config::{CsvOptions, TableParquetOptions}; use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; use datafusion::datasource::TableProvider; +use datafusion::error::DataFusionError; use datafusion::execution::SendableRecordBatchStream; use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; use datafusion::prelude::*; +use futures::{StreamExt, TryStreamExt}; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::pybacked::PyBackedStr; @@ -70,6 +72,9 @@ impl PyTableProvider { PyTable::new(table_provider) } } +const MAX_TABLE_BYTES_TO_DISPLAY: usize = 2 * 1024 * 1024; // 2 MB +const MIN_TABLE_ROWS_TO_DISPLAY: usize = 20; +const MAX_LENGTH_CELL_WITHOUT_MINIMIZE: usize = 25; /// A PyDataFrame is a representation of a logical plan and an API to compose statements. /// Use it to build a plan and `.collect()` to execute the plan and collect the result. @@ -111,56 +116,151 @@ impl PyDataFrame { } fn __repr__(&self, py: Python) -> PyDataFusionResult { - let df = self.df.as_ref().clone().limit(0, Some(10))?; - let batches = wait_for_future(py, df.collect())?; - let batches_as_string = pretty::pretty_format_batches(&batches); - match batches_as_string { - Ok(batch) => Ok(format!("DataFrame()\n{batch}")), - Err(err) => Ok(format!("Error: {:?}", err.to_string())), + let (batches, has_more) = wait_for_future( + py, + collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10), + )?; + if batches.is_empty() { + // This should not be reached, but do it for safety since we index into the vector below + return Ok("No data to display".to_string()); } - } - fn _repr_html_(&self, py: Python) -> PyDataFusionResult { - let mut html_str = "\n".to_string(); + let batches_as_displ = + pretty::pretty_format_batches(&batches).map_err(py_datafusion_err)?; + + let additional_str = match has_more { + true => "\nData truncated.", + false => "", + }; - let df = self.df.as_ref().clone().limit(0, Some(10))?; - let batches = wait_for_future(py, df.collect())?; + Ok(format!("DataFrame()\n{batches_as_displ}{additional_str}")) + } + fn _repr_html_(&self, py: Python) -> PyDataFusionResult { + let (batches, has_more) = wait_for_future( + py, + collect_record_batches_to_display( + self.df.as_ref().clone(), + MIN_TABLE_ROWS_TO_DISPLAY, + usize::MAX, + ), + )?; if batches.is_empty() { - html_str.push_str("
\n"); - return Ok(html_str); + // This should not be reached, but do it for safety since we index into the vector below + return Ok("No data to display".to_string()); } + let table_uuid = uuid::Uuid::new_v4().to_string(); + + let mut html_str = " + + +
+ + \n".to_string(); + let schema = batches[0].schema(); let mut header = Vec::new(); for field in schema.fields() { - header.push(format!("", field.name())); } let header_str = header.join(""); - html_str.push_str(&format!("{}\n", header_str)); - - for batch in batches { - let formatters = batch - .columns() - .iter() - .map(|c| ArrayFormatter::try_new(c.as_ref(), &FormatOptions::default())) - .map(|c| { - c.map_err(|e| PyValueError::new_err(format!("Error: {:?}", e.to_string()))) - }) - .collect::, _>>()?; - - for row in 0..batch.num_rows() { + html_str.push_str(&format!("{}\n", header_str)); + + let batch_formatters = batches + .iter() + .map(|batch| { + batch + .columns() + .iter() + .map(|c| ArrayFormatter::try_new(c.as_ref(), &FormatOptions::default())) + .map(|c| { + c.map_err(|e| PyValueError::new_err(format!("Error: {:?}", e.to_string()))) + }) + .collect::, _>>() + }) + .collect::, _>>()?; + + let rows_per_batch = batches.iter().map(|batch| batch.num_rows()); + + // We need to build up row by row for html + let mut table_row = 0; + for (batch_formatter, num_rows_in_batch) in batch_formatters.iter().zip(rows_per_batch) { + for batch_row in 0..num_rows_in_batch { + table_row += 1; let mut cells = Vec::new(); - for formatter in &formatters { - cells.push(format!("", formatter.value(row))); + for (col, formatter) in batch_formatter.iter().enumerate() { + let cell_data = formatter.value(batch_row).to_string(); + // From testing, primitive data types do not typically get larger than 21 characters + if cell_data.len() > MAX_LENGTH_CELL_WITHOUT_MINIMIZE { + let short_cell_data = &cell_data[0..MAX_LENGTH_CELL_WITHOUT_MINIMIZE]; + cells.push(format!(" + ")); + } else { + cells.push(format!("", formatter.value(batch_row))); + } } let row_str = cells.join(""); html_str.push_str(&format!("{}\n", row_str)); } } + html_str.push_str("
{}", field.name())); + header.push(format!("{}
{} +
+ {short_cell_data} + {cell_data} + +
+
{}
\n"); + + html_str.push_str(" + + "); - html_str.push_str("\n"); + if has_more { + html_str.push_str("Data truncated due to size."); + } Ok(html_str) } @@ -771,3 +871,83 @@ fn record_batch_into_schema( RecordBatch::try_new(schema, data_arrays) } + +/// This is a helper function to return the first non-empty record batch from executing a DataFrame. +/// It additionally returns a bool, which indicates if there are more record batches available. +/// We do this so we can determine if we should indicate to the user that the data has been +/// truncated. This collects until we have achived both of these two conditions +/// +/// - We have collected our minimum number of rows +/// - We have reached our limit, either data size or maximum number of rows +/// +/// Otherwise it will return when the stream has exhausted. If you want a specific number of +/// rows, set min_rows == max_rows. +async fn collect_record_batches_to_display( + df: DataFrame, + min_rows: usize, + max_rows: usize, +) -> Result<(Vec, bool), DataFusionError> { + let partitioned_stream = df.execute_stream_partitioned().await?; + let mut stream = futures::stream::iter(partitioned_stream).flatten(); + let mut size_estimate_so_far = 0; + let mut rows_so_far = 0; + let mut record_batches = Vec::default(); + let mut has_more = false; + + while (size_estimate_so_far < MAX_TABLE_BYTES_TO_DISPLAY && rows_so_far < max_rows) + || rows_so_far < min_rows + { + let mut rb = match stream.next().await { + None => { + break; + } + Some(Ok(r)) => r, + Some(Err(e)) => return Err(e), + }; + + let mut rows_in_rb = rb.num_rows(); + if rows_in_rb > 0 { + size_estimate_so_far += rb.get_array_memory_size(); + + if size_estimate_so_far > MAX_TABLE_BYTES_TO_DISPLAY { + let ratio = MAX_TABLE_BYTES_TO_DISPLAY as f32 / size_estimate_so_far as f32; + let total_rows = rows_in_rb + rows_so_far; + + let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; + if reduced_row_num < min_rows { + reduced_row_num = min_rows.min(total_rows); + } + + let limited_rows_this_rb = reduced_row_num - rows_so_far; + if limited_rows_this_rb < rows_in_rb { + rows_in_rb = limited_rows_this_rb; + rb = rb.slice(0, limited_rows_this_rb); + has_more = true; + } + } + + if rows_in_rb + rows_so_far > max_rows { + rb = rb.slice(0, max_rows - rows_so_far); + has_more = true; + } + + rows_so_far += rb.num_rows(); + record_batches.push(rb); + } + } + + if record_batches.is_empty() { + return Ok((Vec::default(), false)); + } + + if !has_more { + // Data was not already truncated, so check to see if more record batches remain + has_more = match stream.try_next().await { + Ok(None) => false, // reached end + Ok(Some(_)) => true, + Err(_) => false, // Stream disconnected + }; + } + + Ok((record_batches, has_more)) +} diff --git a/src/utils.rs b/src/utils.rs index 999aad755..3487de21b 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -42,7 +42,7 @@ pub(crate) fn get_tokio_runtime() -> &'static TokioRuntime { #[inline] pub(crate) fn get_global_ctx() -> &'static SessionContext { static CTX: OnceLock = OnceLock::new(); - CTX.get_or_init(|| SessionContext::new()) + CTX.get_or_init(SessionContext::new) } /// Utility to collect rust futures with GIL released From d0315ffa704aba467f769f444208b7ce26d83037 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 22 Mar 2025 14:37:24 -0400 Subject: [PATCH 025/145] feat: Update DataFusion dependency to 46 (#1079) * Update DataFusion dependency to 46 * There was an update upstream in the exec but it is not a breaking change and only needs unit test 
updates --- Cargo.lock | 296 +++++++++++++++++++-------------- Cargo.toml | 18 +- python/tests/test_dataframe.py | 3 +- src/expr.rs | 39 +++-- src/expr/aggregate.rs | 10 +- src/expr/aggregate_expr.rs | 11 +- src/expr/window.rs | 24 ++- src/functions.rs | 34 ++-- 8 files changed, 252 insertions(+), 183 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5c7f2bf3c..3a4915f23 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -179,9 +179,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "755b6da235ac356a869393c23668c663720b8749dd6f15e52b6c214b4b964cc7" +checksum = "84ef243634a39fb6e9d1710737e7a5ef96c9bacabd2326859ff889bc9ef755e5" dependencies = [ "arrow-arith", "arrow-array", @@ -201,9 +201,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64656a1e0b13ca766f8440752e9a93e11014eec7b67909986f83ed0ab1fe37b8" +checksum = "8f420c6aef51dad2e4a96ce29c0ec90ad84880bdb60b321c74c652a6be07b93f" dependencies = [ "arrow-array", "arrow-buffer", @@ -215,9 +215,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57a4a6d2896083cfbdf84a71a863b22460d0708f8206a8373c52e326cc72ea1a" +checksum = "24bda5ff6461a4ff9739959b3d57b377f45e3f878f7be1a4f28137c0a8f339fa" dependencies = [ "ahash", "arrow-buffer", @@ -232,9 +232,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cef870583ce5e4f3b123c181706f2002fb134960f9a911900f64ba4830c7a43a" +checksum = "bc6ed265c73f134a583d02c3cab5e16afab9446d8048ede8707e31f85fad58a0" dependencies = [ "bytes", "half", @@ -243,9 +243,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ac7eba5a987f8b4a7d9629206ba48e19a1991762795bbe5d08497b7736017ee" +checksum = "01c648572391edcef10e5fd458db70ba27ed6f71bcaee04397d0cfb100b34f8b" dependencies = [ "arrow-array", "arrow-buffer", @@ -264,9 +264,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90f12542b8164398fc9ec595ff783c4cf6044daa89622c5a7201be920e4c0d4c" +checksum = "a02fb265a6d8011a7d3ad1a36f25816ad0a3bb04cb8e9fe7929c165b98c0cbcd" dependencies = [ "arrow-array", "arrow-cast", @@ -280,9 +280,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b095e8a4f3c309544935d53e04c3bfe4eea4e71c3de6fe0416d1f08bb4441a83" +checksum = "5f2cebf504bb6a92a134a87fff98f01b14fbb3a93ecf7aef90cd0f888c5fffa4" dependencies = [ "arrow-buffer", "arrow-schema", @@ -292,9 +292,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65c63da4afedde2b25ef69825cd4663ca76f78f79ffe2d057695742099130ff6" +checksum = "8e6405b287671c88846e7751f7291f717b164911474cabac6d3d8614d5aa7374" dependencies = [ "arrow-array", "arrow-buffer", @@ -306,9 +306,9 @@ dependencies = [ [[package]] name = 
"arrow-json" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9551d9400532f23a370cabbea1dc5a53c49230397d41f96c4c8eedf306199305" +checksum = "5329bf9e7390cbb6b117ddd4d82e94c5362ea4cab5095697139429f36a38350c" dependencies = [ "arrow-array", "arrow-buffer", @@ -319,16 +319,18 @@ dependencies = [ "half", "indexmap", "lexical-core", + "memchr", "num", "serde", "serde_json", + "simdutf8", ] [[package]] name = "arrow-ord" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c07223476f8219d1ace8cd8d85fa18c4ebd8d945013f25ef5c72e85085ca4ee" +checksum = "e103c13d4b80da28339c1d7aa23dd85bd59f42158acc45d39eeb6770627909ce" dependencies = [ "arrow-array", "arrow-buffer", @@ -339,9 +341,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91b194b38bfd89feabc23e798238989c6648b2506ad639be42ec8eb1658d82c4" +checksum = "170549a11b8534f3097a0619cfe89c42812345dc998bcf81128fc700b84345b8" dependencies = [ "arrow-array", "arrow-buffer", @@ -352,18 +354,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f40f6be8f78af1ab610db7d9b236e21d587b7168e368a36275d2e5670096735" +checksum = "a5c53775bba63f319189f366d2b86e9a8889373eb198f07d8544938fc9f8ed9a" dependencies = [ "bitflags 2.8.0", ] [[package]] name = "arrow-select" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac265273864a820c4a179fc67182ccc41ea9151b97024e1be956f0f2369c2539" +checksum = "0a99003b2eb562b8d9c99dfb672306f15e94b20d3734179d596895703e821dcf" dependencies = [ "ahash", "arrow-array", @@ -375,9 +377,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d44c8eed43be4ead49128370f7131f054839d3d6003e52aebf64322470b8fbd0" +checksum = "90fdb130ee8325f4cd8262e19bb6baa3cbcef2b2573c4bee8c6fda7ea08199d7" dependencies = [ "arrow-array", "arrow-buffer", @@ -535,9 +537,9 @@ dependencies = [ [[package]] name = "blake3" -version = "1.5.5" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8ee0c1824c4dea5b5f81736aff91bae041d2c07ee1192bec91054e10e3e601e" +checksum = "b17679a8d69b6d7fd9cd9801a536cec9fa5e5970b69f9d4747f70b39b031f5e7" dependencies = [ "arrayref", "arrayvec", @@ -649,15 +651,15 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chrono" -version = "0.4.39" +version = "0.4.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e36cc9d416881d2e24f9a963be5fb1cd90966419ac844274161d10488b3e825" +checksum = "1a7964611d71df112cb1730f2ee67324fcf4d0fc6606acbbe9bfe06df124637c" dependencies = [ "android-tzdata", "iana-time-zone", "num-traits", "serde", - "windows-targets", + "windows-link", ] [[package]] @@ -864,30 +866,32 @@ dependencies = [ [[package]] name = "datafusion" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eae420e7a5b0b7f1c39364cc76cbcd0f5fdc416b2514ae3847c2676bbd60702a" +checksum = "914e6f9525599579abbd90b0f7a55afcaaaa40350b9e9ed52563f126dfe45fd3" dependencies = [ "apache-avro", "arrow", - "arrow-array", 
"arrow-ipc", "arrow-schema", - "async-compression", "async-trait", "bytes", "bzip2 0.5.1", "chrono", "datafusion-catalog", + "datafusion-catalog-listing", "datafusion-common", "datafusion-common-runtime", + "datafusion-datasource", "datafusion-execution", "datafusion-expr", + "datafusion-expr-common", "datafusion-functions", "datafusion-functions-aggregate", "datafusion-functions-nested", "datafusion-functions-table", "datafusion-functions-window", + "datafusion-macros", "datafusion-optimizer", "datafusion-physical-expr", "datafusion-physical-expr-common", @@ -896,7 +900,6 @@ dependencies = [ "datafusion-sql", "flate2", "futures", - "glob", "itertools 0.14.0", "log", "num-traits", @@ -908,7 +911,6 @@ dependencies = [ "sqlparser", "tempfile", "tokio", - "tokio-util", "url", "uuid", "xz2", @@ -917,9 +919,9 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f27987bc22b810939e8dfecc55571e9d50355d6ea8ec1c47af8383a76a6d0e1" +checksum = "998a6549e6ee4ee3980e05590b2960446a56b343ea30199ef38acd0e0b9036e2" dependencies = [ "arrow", "async-trait", @@ -933,22 +935,40 @@ dependencies = [ "itertools 0.14.0", "log", "parking_lot", - "sqlparser", +] + +[[package]] +name = "datafusion-catalog-listing" +version = "46.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5ac10096a5b3c0d8a227176c0e543606860842e943594ccddb45cf42a526e43" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "futures", + "log", + "object_store", + "tokio", ] [[package]] name = "datafusion-common" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3f6d5b8c9408cc692f7c194b8aa0c0f9b253e065a8d960ad9cdc2a13e697602" +checksum = "1f53d7ec508e1b3f68bd301cee3f649834fad51eff9240d898a4b2614cfd0a7a" dependencies = [ "ahash", "apache-avro", "arrow", - "arrow-array", - "arrow-buffer", "arrow-ipc", - "arrow-schema", "base64 0.22.1", "half", "hashbrown 0.14.5", @@ -966,25 +986,59 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d4603c8e8a4baf77660ab7074cc66fc15cc8a18f2ce9dfadb755fc6ee294e48" +checksum = "e0fcf41523b22e14cc349b01526e8b9f59206653037f2949a4adbfde5f8cb668" dependencies = [ "log", "tokio", ] +[[package]] +name = "datafusion-datasource" +version = "46.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf7f37ad8b6e88b46c7eeab3236147d32ea64b823544f498455a8d9042839c92" +dependencies = [ + "arrow", + "async-compression", + "async-trait", + "bytes", + "bzip2 0.5.1", + "chrono", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "flate2", + "futures", + "glob", + "itertools 0.14.0", + "log", + "object_store", + "rand", + "tokio", + "tokio-util", + "url", + "xz2", + "zstd", +] + [[package]] name = "datafusion-doc" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e5bf4bc68623a5cf231eed601ed6eb41f46a37c4d15d11a0bff24cbc8396cd66" +checksum = "7db7a0239fd060f359dc56c6e7db726abaa92babaed2fb2e91c3a8b2fff8b256" [[package]] name = "datafusion-execution" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88b491c012cdf8e051053426013429a76f74ee3c2db68496c79c323ca1084d27" +checksum = "0938f9e5b6bc5782be4111cdfb70c02b7b5451bf34fd57e4de062a7f7c4e31f1" dependencies = [ "arrow", "dashmap", @@ -1001,9 +1055,9 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5a181408d4fc5dc22f9252781a8f39f2d0e5d1b33ec9bde242844980a2689c1" +checksum = "b36c28b00b00019a8695ad7f1a53ee1673487b90322ecbd604e2cf32894eb14f" dependencies = [ "arrow", "chrono", @@ -1022,26 +1076,25 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1129b48e8534d8c03c6543bcdccef0b55c8ac0c1272a15a56c67068b6eb1885" +checksum = "18f0a851a436c5a2139189eb4617a54e6a9ccb9edc96c4b3c83b3bb7c58b950e" dependencies = [ "arrow", "datafusion-common", + "indexmap", "itertools 0.14.0", "paste", ] [[package]] name = "datafusion-ffi" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff47a79d442207c168c6e3e1d970c248589c148e4800e5b285ac1b2cb1a230f8" +checksum = "d740dd9f32a4f4ed1b907e6934201bb059efe6c877532512c661771d973c7b21" dependencies = [ "abi_stable", "arrow", - "arrow-array", - "arrow-schema", "async-ffi", "async-trait", "datafusion", @@ -1055,9 +1108,9 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6125874e4856dfb09b59886784fcb74cde5cfc5930b3a80a1a728ef7a010df6b" +checksum = "e3196e37d7b65469fb79fee4f05e5bb58a456831035f9a38aa5919aeb3298d40" dependencies = [ "arrow", "arrow-buffer", @@ -1071,7 +1124,6 @@ dependencies = [ "datafusion-expr", "datafusion-expr-common", "datafusion-macros", - "hashbrown 0.14.5", "hex", "itertools 0.14.0", "log", @@ -1085,14 +1137,12 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3add7b1d3888e05e7c95f2b281af900ca69ebdcb21069ba679b33bde8b3b9d6" +checksum = "adfc2d074d5ee4d9354fdcc9283d5b2b9037849237ddecb8942a29144b77ca05" dependencies = [ "ahash", "arrow", - "arrow-buffer", - "arrow-schema", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -1108,9 +1158,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e18baa4cfc3d2f144f74148ed68a1f92337f5072b6dde204a0dbbdf3324989c" +checksum = "1cbceba0f98d921309a9121b702bcd49289d383684cccabf9a92cda1602f3bbb" dependencies = [ "ahash", "arrow", @@ -1121,15 +1171,12 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ec5ee8cecb0dc370291279673097ddabec03a011f73f30d7f1096457127e03e" +checksum = "170e27ce4baa27113ddf5f77f1a7ec484b0dbeda0c7abbd4bad3fc609c8ab71a" dependencies = [ "arrow", - "arrow-array", - 
"arrow-buffer", "arrow-ord", - "arrow-schema", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -1145,9 +1192,9 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c403ddd473bbb0952ba880008428b3c7febf0ed3ce1eec35a205db20efb2a36" +checksum = "7d3a06a7f0817ded87b026a437e7e51de7f59d48173b0a4e803aa896a7bd6bb5" dependencies = [ "arrow", "async-trait", @@ -1161,9 +1208,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ab18c2fb835614d06a75f24a9e09136d3a8c12a92d97c95a6af316a1787a9c5" +checksum = "d6c608b66496a1e05e3d196131eb9bebea579eed1f59e88d962baf3dda853bc6" dependencies = [ "datafusion-common", "datafusion-doc", @@ -1178,9 +1225,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a77b73bc15e7d1967121fdc7a55d819bfb9d6c03766a6c322247dce9094a53a4" +checksum = "da2f9d83348957b4ad0cd87b5cb9445f2651863a36592fe5484d43b49a5f8d82" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1188,9 +1235,9 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09369b8d962291e808977cf94d495fd8b5b38647232d7ef562c27ac0f495b0af" +checksum = "4800e1ff7ecf8f310887e9b54c9c444b8e215ccbc7b21c2f244cfae373b1ece7" dependencies = [ "datafusion-expr", "quote", @@ -1199,9 +1246,9 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2403a7e4a84637f3de7d8d4d7a9ccc0cc4be92d89b0161ba3ee5be82f0531c54" +checksum = "971c51c54cd309001376fae752fb15a6b41750b6d1552345c46afbfb6458801b" dependencies = [ "arrow", "chrono", @@ -1218,15 +1265,12 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86ff72ac702b62dbf2650c4e1d715ebd3e4aab14e3885e72e8549e250307347c" +checksum = "e1447c2c6bc8674a16be4786b4abf528c302803fafa186aa6275692570e64d85" dependencies = [ "ahash", "arrow", - "arrow-array", - "arrow-buffer", - "arrow-schema", "datafusion-common", "datafusion-expr", "datafusion-expr-common", @@ -1243,13 +1287,12 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60982b7d684e25579ee29754b4333057ed62e2cc925383c5f0bd8cab7962f435" +checksum = "69f8c25dcd069073a75b3d2840a79d0f81e64bdd2c05f2d3d18939afb36a7dcb" dependencies = [ "ahash", "arrow", - "arrow-buffer", "datafusion-common", "datafusion-expr-common", "hashbrown 0.14.5", @@ -1258,12 +1301,11 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac5e85c189d5238a5cf181a624e450c4cd4c66ac77ca551d6f3ff9080bac90bb" +checksum = "68da5266b5b9847c11d1b3404ee96b1d423814e1973e1ad3789131e5ec912763" dependencies = [ "arrow", - "arrow-schema", "datafusion-common", "datafusion-execution", "datafusion-expr", @@ 
-1271,23 +1313,19 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", - "futures", "itertools 0.14.0", "log", "recursive", - "url", ] [[package]] name = "datafusion-physical-plan" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c36bf163956d7e2542657c78b3383fdc78f791317ef358a359feffcdb968106f" +checksum = "88cc160df00e413e370b3b259c8ea7bfbebc134d32de16325950e9e923846b7f" dependencies = [ "ahash", "arrow", - "arrow-array", - "arrow-buffer", "arrow-ord", "arrow-schema", "async-trait", @@ -1312,9 +1350,9 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2db5d79f0c974041787b899d24dc91bdab2ff112d1942dd71356a4ce3b407e6c" +checksum = "6f6ef4c6eb52370cb48639e25e2331a415aac0b2b0a0a472b36e26603bdf184f" dependencies = [ "arrow", "chrono", @@ -1328,9 +1366,9 @@ dependencies = [ [[package]] name = "datafusion-proto-common" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de21bde1603aac0ff32cf478e47081be6e3583c6861fe8f57034da911efe7578" +checksum = "5faf4a9bbb0d0a305fea8a6db21ba863286b53e53a212e687d2774028dd6f03f" dependencies = [ "arrow", "datafusion-common", @@ -1362,13 +1400,11 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13caa4daede211ecec53c78b13c503b592794d125f9a3cc3afe992edf9e7f43" +checksum = "325a212b67b677c0eb91447bf9a11b630f9fc4f62d8e5d145bf859f5a6b29e64" dependencies = [ "arrow", - "arrow-array", - "arrow-schema", "bigdecimal", "datafusion-common", "datafusion-expr", @@ -1381,11 +1417,10 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1634405abd8bd3c64c352f2da2f2aec6d80a815930257e0db0ce4ff5daf00944" +checksum = "2c2be3226a683e02cff65181e66e62eba9f812ed0e9b7ec8fe11ac8dabf1a73f" dependencies = [ - "arrow-buffer", "async-recursion", "async-trait", "chrono", @@ -1395,6 +1430,7 @@ dependencies = [ "pbjson-types", "prost", "substrait", + "tokio", "url", ] @@ -1472,9 +1508,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.35" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" +checksum = "11faaf5a5236997af9848be0bef4db95824b1d534ebc64d0f0c6cf3e67bd38dc" dependencies = [ "crc32fast", "miniz_oxide", @@ -2117,9 +2153,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.169" +version = "0.2.171" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" +checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6" [[package]] name = "libflate" @@ -2447,9 +2483,9 @@ dependencies = [ [[package]] name = "parquet" -version = "54.1.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a01a0efa30bbd601ae85b375c728efdb211ade54390281628a7b16708beb235" +checksum = "94243778210509a5a5e9e012872127180c155d73a9cd6e2df9243d213e81e100" dependencies = [ "ahash", "arrow-array", @@ -2479,7 +2515,6 @@ dependencies = [ "tokio", "twox-hash", "zstd", - "zstd-sys", ] 
[[package]] @@ -3401,11 +3436,12 @@ dependencies = [ [[package]] name = "sqlparser" -version = "0.53.0" +version = "0.54.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05a528114c392209b3264855ad491fcce534b94a38771b0a0b97a79379275ce8" +checksum = "c66e3b7374ad4a6af849b08b3e7a6eda0edbd82f0fd59b57e22671bf16979899" dependencies = [ "log", + "recursive", "sqlparser_derive", ] @@ -3466,9 +3502,9 @@ dependencies = [ [[package]] name = "substrait" -version = "0.52.3" +version = "0.53.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5db15789cecbfdf6b1fcf2db807e767c92273bdc407ac057c2194b070c597756" +checksum = "6fac3d70185423235f37b889764e184b81a5af4bb7c95833396ee9bd92577e1b" dependencies = [ "heck", "pbjson", @@ -3922,12 +3958,14 @@ checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" [[package]] name = "uuid" -version = "1.13.1" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ced87ca4be083373936a67f8de945faa23b6b42384bd5b64434850802c6dccd0" +checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9" dependencies = [ "getrandom 0.3.1", + "js-sys", "serde", + "wasm-bindgen", ] [[package]] @@ -4114,6 +4152,12 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "windows-link" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76840935b766e1b0a05c0066835fb9ec80071d4c09a16f6bd5f7e655e3c14c38" + [[package]] name = "windows-registry" version = "0.2.0" diff --git a/Cargo.toml b/Cargo.toml index 50967a219..8afabdd82 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,24 +34,24 @@ protoc = [ "datafusion-substrait/protoc" ] substrait = ["dep:datafusion-substrait"] [dependencies] -tokio = { version = "1.42", features = ["macros", "rt", "rt-multi-thread", "sync"] } +tokio = { version = "1.43", features = ["macros", "rt", "rt-multi-thread", "sync"] } pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py39"] } pyo3-async-runtimes = { version = "0.23", features = ["tokio-runtime"]} -arrow = { version = "54", features = ["pyarrow"] } -datafusion = { version = "45.0.0", features = ["avro", "unicode_expressions"] } -datafusion-substrait = { version = "45.0.0", optional = true } -datafusion-proto = { version = "45.0.0" } -datafusion-ffi = { version = "45.0.0" } -prost = "0.13" # keep in line with `datafusion-substrait` +arrow = { version = "54.2.1", features = ["pyarrow"] } +datafusion = { version = "46.0.1", features = ["avro", "unicode_expressions"] } +datafusion-substrait = { version = "46.0.1", optional = true } +datafusion-proto = { version = "46.0.1" } +datafusion-ffi = { version = "46.0.1" } +prost = "0.13.1" # keep in line with `datafusion-substrait` uuid = { version = "1.12", features = ["v4"] } mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] } -async-trait = "0.1" +async-trait = "0.1.73" futures = "0.3" object_store = { version = "0.11.0", features = ["aws", "gcp", "azure", "http"] } url = "2" [build-dependencies] -prost-types = "0.13" # keep in line with `datafusion-substrait` +prost-types = "0.13.1" # keep in line with `datafusion-substrait` pyo3-build-config = "0.23" [lib] diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 718ebf69d..eda13930d 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -753,7 +753,8 @@ def test_execution_plan(aggregate_df): 
assert "AggregateExec:" in indent assert "CoalesceBatchesExec:" in indent assert "RepartitionExec:" in indent - assert "CsvExec:" in indent + assert "DataSourceExec:" in indent + assert "file_type=csv" in indent ctx = SessionContext() rows_returned = 0 diff --git a/src/expr.rs b/src/expr.rs index d3c528eb4..561170289 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use datafusion::logical_expr::expr::{AggregateFunctionParams, WindowFunctionParams}; use datafusion::logical_expr::utils::exprlist_to_fields; use datafusion::logical_expr::{ ExprFuncBuilder, ExprFunctionExt, LogicalPlan, WindowFunctionDefinition, @@ -172,6 +173,7 @@ impl PyExpr { Expr::ScalarSubquery(value) => { Ok(scalar_subquery::PyScalarSubquery::from(value.clone()).into_bound_py_any(py)?) } + #[allow(deprecated)] Expr::Wildcard { qualifier, options } => Err(py_unsupported_variant_err(format!( "Converting Expr::Wildcard to a Python object is not implemented : {:?} {:?}", qualifier, options @@ -332,7 +334,6 @@ impl PyExpr { | Expr::AggregateFunction { .. } | Expr::WindowFunction { .. } | Expr::InList { .. } - | Expr::Wildcard { .. } | Expr::Exists { .. } | Expr::InSubquery { .. } | Expr::GroupingSet(..) @@ -346,6 +347,10 @@ impl PyExpr { | Expr::Unnest(_) | Expr::IsNotUnknown(_) => RexType::Call, Expr::ScalarSubquery(..) => RexType::ScalarSubquery, + #[allow(deprecated)] + Expr::Wildcard { .. } => { + return Err(py_unsupported_variant_err("Expr::Wildcard is unsupported")) + } }) } @@ -394,11 +399,15 @@ impl PyExpr { | Expr::InSubquery(InSubquery { expr, .. }) => Ok(vec![PyExpr::from(*expr.clone())]), // Expr variants containing a collection of Expr(s) for operands - Expr::AggregateFunction(AggregateFunction { args, .. }) + Expr::AggregateFunction(AggregateFunction { + params: AggregateFunctionParams { args, .. }, + .. + }) | Expr::ScalarFunction(ScalarFunction { args, .. }) - | Expr::WindowFunction(WindowFunction { args, .. }) => { - Ok(args.iter().map(|arg| PyExpr::from(arg.clone())).collect()) - } + | Expr::WindowFunction(WindowFunction { + params: WindowFunctionParams { args, .. }, + .. + }) => Ok(args.iter().map(|arg| PyExpr::from(arg.clone())).collect()), // Expr(s) that require more specific processing Expr::Case(Case { @@ -465,13 +474,17 @@ impl PyExpr { Expr::GroupingSet(..) | Expr::Unnest(_) | Expr::OuterReferenceColumn(_, _) - | Expr::Wildcard { .. } | Expr::ScalarSubquery(..) | Expr::Placeholder { .. } | Expr::Exists { .. } => Err(py_runtime_err(format!( "Unimplemented Expr type: {}", self.expr ))), + + #[allow(deprecated)] + Expr::Wildcard { .. } => { + Err(py_unsupported_variant_err("Expr::Wildcard is unsupported")) + } } } @@ -575,7 +588,7 @@ impl PyExpr { Expr::AggregateFunction(agg_fn) => { let window_fn = Expr::WindowFunction(WindowFunction::new( WindowFunctionDefinition::AggregateUDF(agg_fn.func.clone()), - agg_fn.args.clone(), + agg_fn.params.args.clone(), )); add_builder_fns_to_window( @@ -663,16 +676,8 @@ impl PyExpr { /// Create a [Field] representing an [Expr], given an input [LogicalPlan] to resolve against pub fn expr_to_field(expr: &Expr, input_plan: &LogicalPlan) -> PyDataFusionResult> { - match expr { - Expr::Wildcard { .. 
} => { - // Since * could be any of the valid column names just return the first one - Ok(Arc::new(input_plan.schema().field(0).clone())) - } - _ => { - let fields = exprlist_to_fields(&[expr.clone()], input_plan)?; - Ok(fields[0].1.clone()) - } - } + let fields = exprlist_to_fields(&[expr.clone()], input_plan)?; + Ok(fields[0].1.clone()) } fn _types(expr: &Expr) -> PyResult { match expr { diff --git a/src/expr/aggregate.rs b/src/expr/aggregate.rs index 8fc9da5b0..a99d83d23 100644 --- a/src/expr/aggregate.rs +++ b/src/expr/aggregate.rs @@ -16,7 +16,7 @@ // under the License. use datafusion::common::DataFusionError; -use datafusion::logical_expr::expr::{AggregateFunction, Alias}; +use datafusion::logical_expr::expr::{AggregateFunction, AggregateFunctionParams, Alias}; use datafusion::logical_expr::logical_plan::Aggregate; use datafusion::logical_expr::Expr; use pyo3::{prelude::*, IntoPyObjectExt}; @@ -126,9 +126,11 @@ impl PyAggregate { match expr { // TODO: This Alias logic seems to be returning some strange results that we should investigate Expr::Alias(Alias { expr, .. }) => self._aggregation_arguments(expr.as_ref()), - Expr::AggregateFunction(AggregateFunction { func: _, args, .. }) => { - Ok(args.iter().map(|e| PyExpr::from(e.clone())).collect()) - } + Expr::AggregateFunction(AggregateFunction { + func: _, + params: AggregateFunctionParams { args, .. }, + .. + }) => Ok(args.iter().map(|e| PyExpr::from(e.clone())).collect()), _ => Err(py_type_err( "Encountered a non Aggregate type in aggregation_arguments", )), diff --git a/src/expr/aggregate_expr.rs b/src/expr/aggregate_expr.rs index 09471097f..c09f116e3 100644 --- a/src/expr/aggregate_expr.rs +++ b/src/expr/aggregate_expr.rs @@ -40,7 +40,13 @@ impl From for PyAggregateFunction { impl Display for PyAggregateFunction { fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - let args: Vec = self.aggr.args.iter().map(|expr| expr.to_string()).collect(); + let args: Vec = self + .aggr + .params + .args + .iter() + .map(|expr| expr.to_string()) + .collect(); write!(f, "{}({})", self.aggr.func.name(), args.join(", ")) } } @@ -54,12 +60,13 @@ impl PyAggregateFunction { /// is this a distinct aggregate such as `COUNT(DISTINCT expr)` fn is_distinct(&self) -> bool { - self.aggr.distinct + self.aggr.params.distinct } /// Get the arguments to the aggregate function fn args(&self) -> Vec { self.aggr + .params .args .iter() .map(|expr| PyExpr::from(expr.clone())) diff --git a/src/expr/window.rs b/src/expr/window.rs index 13deaec25..c5467bf94 100644 --- a/src/expr/window.rs +++ b/src/expr/window.rs @@ -16,7 +16,7 @@ // under the License. use datafusion::common::{DataFusionError, ScalarValue}; -use datafusion::logical_expr::expr::WindowFunction; +use datafusion::logical_expr::expr::{WindowFunction, WindowFunctionParams}; use datafusion::logical_expr::{Expr, Window, WindowFrame, WindowFrameBound, WindowFrameUnits}; use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; @@ -118,7 +118,10 @@ impl PyWindowExpr { /// Returns order by columns in a window function expression pub fn get_sort_exprs(&self, expr: PyExpr) -> PyResult> { match expr.expr.unalias() { - Expr::WindowFunction(WindowFunction { order_by, .. }) => py_sort_expr_list(&order_by), + Expr::WindowFunction(WindowFunction { + params: WindowFunctionParams { order_by, .. }, + .. 
+ }) => py_sort_expr_list(&order_by), other => Err(not_window_function_err(other)), } } @@ -126,9 +129,10 @@ impl PyWindowExpr { /// Return partition by columns in a window function expression pub fn get_partition_exprs(&self, expr: PyExpr) -> PyResult> { match expr.expr.unalias() { - Expr::WindowFunction(WindowFunction { partition_by, .. }) => { - py_expr_list(&partition_by) - } + Expr::WindowFunction(WindowFunction { + params: WindowFunctionParams { partition_by, .. }, + .. + }) => py_expr_list(&partition_by), other => Err(not_window_function_err(other)), } } @@ -136,7 +140,10 @@ impl PyWindowExpr { /// Return input args for window function pub fn get_args(&self, expr: PyExpr) -> PyResult> { match expr.expr.unalias() { - Expr::WindowFunction(WindowFunction { args, .. }) => py_expr_list(&args), + Expr::WindowFunction(WindowFunction { + params: WindowFunctionParams { args, .. }, + .. + }) => py_expr_list(&args), other => Err(not_window_function_err(other)), } } @@ -152,7 +159,10 @@ impl PyWindowExpr { /// Returns a Pywindow frame for a given window function expression pub fn get_frame(&self, expr: PyExpr) -> Option { match expr.expr.unalias() { - Expr::WindowFunction(WindowFunction { window_frame, .. }) => Some(window_frame.into()), + Expr::WindowFunction(WindowFunction { + params: WindowFunctionParams { window_frame, .. }, + .. + }) => Some(window_frame.into()), _ => None, } } diff --git a/src/functions.rs b/src/functions.rs index 8fac239b4..9c406b95a 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -17,6 +17,7 @@ use datafusion::functions_aggregate::all_default_aggregate_functions; use datafusion::functions_window::all_default_window_functions; +use datafusion::logical_expr::expr::WindowFunctionParams; use datafusion::logical_expr::ExprFunctionExt; use datafusion::logical_expr::WindowFrame; use pyo3::{prelude::*, wrap_pyfunction}; @@ -215,10 +216,7 @@ fn alias(expr: PyExpr, name: &str) -> PyResult { #[pyfunction] fn col(name: &str) -> PyResult { Ok(PyExpr { - expr: datafusion::logical_expr::Expr::Column(Column { - relation: None, - name: name.to_string(), - }), + expr: datafusion::logical_expr::Expr::Column(Column::new_unqualified(name)), }) } @@ -333,19 +331,21 @@ fn window( Ok(PyExpr { expr: datafusion::logical_expr::Expr::WindowFunction(WindowFunction { fun, - args: args.into_iter().map(|x| x.expr).collect::>(), - partition_by: partition_by - .unwrap_or_default() - .into_iter() - .map(|x| x.expr) - .collect::>(), - order_by: order_by - .unwrap_or_default() - .into_iter() - .map(|x| x.into()) - .collect::>(), - window_frame, - null_treatment: None, + params: WindowFunctionParams { + args: args.into_iter().map(|x| x.expr).collect::>(), + partition_by: partition_by + .unwrap_or_default() + .into_iter() + .map(|x| x.expr) + .collect::>(), + order_by: order_by + .unwrap_or_default() + .into_iter() + .map(|x| x.into()) + .collect::>(), + window_frame, + null_treatment: None, + }, }), }) } From 583e1e9420906c99b1fbdf57c0138f1e67548008 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sun, 30 Mar 2025 08:44:55 -0400 Subject: [PATCH 026/145] Update changelog and version number (#1089) --- Cargo.lock | 2 +- Cargo.toml | 2 +- dev/changelog/46.0.0.md | 73 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 2 deletions(-) create mode 100644 dev/changelog/46.0.0.md diff --git a/Cargo.lock b/Cargo.lock index 3a4915f23..f90038c50 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1377,7 +1377,7 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "45.2.0" 
+version = "46.0.0" dependencies = [ "arrow", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index 8afabdd82..bc8639d4c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "datafusion-python" -version = "45.2.0" +version = "46.0.0" homepage = "https://datafusion.apache.org/python" repository = "https://github.com/apache/datafusion-python" authors = ["Apache DataFusion "] diff --git a/dev/changelog/46.0.0.md b/dev/changelog/46.0.0.md new file mode 100644 index 000000000..3e5768099 --- /dev/null +++ b/dev/changelog/46.0.0.md @@ -0,0 +1,73 @@ + + +# Apache DataFusion Python 46.0.0 Changelog + +This release consists of 21 commits from 11 contributors. See credits at the end of this changelog for more information. + +**Implemented enhancements:** + +- feat: reads using global ctx [#982](https://github.com/apache/datafusion-python/pull/982) (ion-elgreco) +- feat: Implementation of udf and udaf decorator [#1040](https://github.com/apache/datafusion-python/pull/1040) (CrystalZhou0529) +- feat: expose regex_count function [#1066](https://github.com/apache/datafusion-python/pull/1066) (nirnayroy) +- feat: Update DataFusion dependency to 46 [#1079](https://github.com/apache/datafusion-python/pull/1079) (timsaucer) + +**Fixed bugs:** + +- fix: add to_timestamp_nanos [#1020](https://github.com/apache/datafusion-python/pull/1020) (chenkovsky) +- fix: type checking [#993](https://github.com/apache/datafusion-python/pull/993) (chenkovsky) + +**Other:** + +- [infra] Fail Clippy on rust build warnings [#1029](https://github.com/apache/datafusion-python/pull/1029) (kevinjqliu) +- Add user documentation for the FFI approach [#1031](https://github.com/apache/datafusion-python/pull/1031) (timsaucer) +- build(deps): bump arrow from 54.1.0 to 54.2.0 [#1035](https://github.com/apache/datafusion-python/pull/1035) (dependabot[bot]) +- Chore: Release datafusion-python 45 [#1024](https://github.com/apache/datafusion-python/pull/1024) (timsaucer) +- Enable Dataframe to be converted into views which can be used in register_table [#1016](https://github.com/apache/datafusion-python/pull/1016) (kosiew) +- Add ruff check for missing futures import [#1052](https://github.com/apache/datafusion-python/pull/1052) (timsaucer) +- Enable take comments to assign issues to users [#1058](https://github.com/apache/datafusion-python/pull/1058) (timsaucer) +- Update python min version to 3.9 [#1043](https://github.com/apache/datafusion-python/pull/1043) (kevinjqliu) +- feat/improve ruff test coverage [#1055](https://github.com/apache/datafusion-python/pull/1055) (timsaucer) +- feat/making global context accessible for users [#1060](https://github.com/apache/datafusion-python/pull/1060) (jsai28) +- Renaming Internal Structs [#1059](https://github.com/apache/datafusion-python/pull/1059) (Spaarsh) +- test: add pytest asyncio tests [#1063](https://github.com/apache/datafusion-python/pull/1063) (jsai28) +- Add decorator for udwf [#1061](https://github.com/apache/datafusion-python/pull/1061) (kosiew) +- Add additional ruff suggestions [#1062](https://github.com/apache/datafusion-python/pull/1062) (Spaarsh) +- Improve collection during repr and repr_html [#1036](https://github.com/apache/datafusion-python/pull/1036) (timsaucer) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. 
+ +``` + 7 Tim Saucer + 2 Kevin Liu + 2 Spaarsh + 2 jsai28 + 2 kosiew + 1 Chen Chongchen + 1 Chongchen Chen + 1 Crystal Zhou + 1 Ion Koutsouris + 1 Nirnay Roy + 1 dependabot[bot] +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. + From ffafb59e1b1b7f49f4ba4507b28ba1cecfb0225a Mon Sep 17 00:00:00 2001 From: Chen Chongchen Date: Sun, 30 Mar 2025 20:45:15 +0800 Subject: [PATCH 027/145] feat: support unparser (#1088) * support unparser * add license * add export * format * format --- python/datafusion/__init__.py | 3 +- python/datafusion/unparser.py | 80 +++++++++++++++++++++++++++++++++++ python/tests/test_unparser.py | 33 +++++++++++++++ src/lib.rs | 5 +++ src/unparser/dialect.rs | 63 +++++++++++++++++++++++++++ src/unparser/mod.rs | 66 +++++++++++++++++++++++++++++ 6 files changed, 249 insertions(+), 1 deletion(-) create mode 100644 python/datafusion/unparser.py create mode 100644 python/tests/test_unparser.py create mode 100644 src/unparser/dialect.rs create mode 100644 src/unparser/mod.rs diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index d871fdb71..ecf5545bc 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -26,7 +26,7 @@ except ImportError: import importlib_metadata -from . import functions, object_store, substrait +from . import functions, object_store, substrait, unparser # The following imports are okay to remain as opaque to the user. from ._internal import Config @@ -89,6 +89,7 @@ "udaf", "udf", "udwf", + "unparser", ] diff --git a/python/datafusion/unparser.py b/python/datafusion/unparser.py new file mode 100644 index 000000000..7ca5b9190 --- /dev/null +++ b/python/datafusion/unparser.py @@ -0,0 +1,80 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""This module provides support for unparsing datafusion plans to SQL. 
+ +For additional information about unparsing, see https://docs.rs/datafusion-sql/latest/datafusion_sql/unparser/index.html +""" + +from ._internal import unparser as unparser_internal +from .plan import LogicalPlan + + +class Dialect: + """DataFusion data catalog.""" + + def __init__(self, dialect: unparser_internal.Dialect) -> None: + """This constructor is not typically called by the end user.""" + self.dialect = dialect + + @staticmethod + def default() -> "Dialect": + """Create a new default dialect.""" + return Dialect(unparser_internal.Dialect.default()) + + @staticmethod + def mysql() -> "Dialect": + """Create a new MySQL dialect.""" + return Dialect(unparser_internal.Dialect.mysql()) + + @staticmethod + def postgres() -> "Dialect": + """Create a new PostgreSQL dialect.""" + return Dialect(unparser_internal.Dialect.postgres()) + + @staticmethod + def sqlite() -> "Dialect": + """Create a new SQLite dialect.""" + return Dialect(unparser_internal.Dialect.sqlite()) + + @staticmethod + def duckdb() -> "Dialect": + """Create a new DuckDB dialect.""" + return Dialect(unparser_internal.Dialect.duckdb()) + + +class Unparser: + """DataFusion unparser.""" + + def __init__(self, dialect: Dialect) -> None: + """This constructor is not typically called by the end user.""" + self.unparser = unparser_internal.Unparser(dialect.dialect) + + def plan_to_sql(self, plan: LogicalPlan) -> str: + """Convert a logical plan to a SQL string.""" + return self.unparser.plan_to_sql(plan._raw_plan) + + def with_pretty(self, pretty: bool) -> "Unparser": + """Set the pretty flag.""" + self.unparser = self.unparser.with_pretty(pretty) + return self + + +__all__ = [ + "Dialect", + "Unparser", +] diff --git a/python/tests/test_unparser.py b/python/tests/test_unparser.py new file mode 100644 index 000000000..c4e05780c --- /dev/null +++ b/python/tests/test_unparser.py @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from datafusion.context import SessionContext +from datafusion.unparser import Dialect, Unparser + + +def test_unparser(): + ctx = SessionContext() + df = ctx.sql("SELECT 1") + for dialect in [ + Dialect.mysql(), + Dialect.postgres(), + Dialect.sqlite(), + Dialect.duckdb(), + ]: + unparser = Unparser(dialect) + sql = unparser.plan_to_sql(df.logical_plan()) + assert sql == "SELECT 1" diff --git a/src/lib.rs b/src/lib.rs index ce93ff0c3..6eeda0878 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -52,6 +52,7 @@ pub mod pyarrow_util; mod record_batch; pub mod sql; pub mod store; +pub mod unparser; #[cfg(feature = "substrait")] pub mod substrait; @@ -103,6 +104,10 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { expr::init_module(&expr)?; m.add_submodule(&expr)?; + let unparser = PyModule::new(py, "unparser")?; + unparser::init_module(&unparser)?; + m.add_submodule(&unparser)?; + // Register the functions as a submodule let funcs = PyModule::new(py, "functions")?; functions::init_module(&funcs)?; diff --git a/src/unparser/dialect.rs b/src/unparser/dialect.rs new file mode 100644 index 000000000..caeef9949 --- /dev/null +++ b/src/unparser/dialect.rs @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use datafusion::sql::unparser::dialect::{ + DefaultDialect, Dialect, DuckDBDialect, MySqlDialect, PostgreSqlDialect, SqliteDialect, +}; +use pyo3::prelude::*; + +#[pyclass(name = "Dialect", module = "datafusion.unparser", subclass)] +#[derive(Clone)] +pub struct PyDialect { + pub dialect: Arc, +} + +#[pymethods] +impl PyDialect { + #[staticmethod] + pub fn default() -> Self { + Self { + dialect: Arc::new(DefaultDialect {}), + } + } + #[staticmethod] + pub fn postgres() -> Self { + Self { + dialect: Arc::new(PostgreSqlDialect {}), + } + } + #[staticmethod] + pub fn mysql() -> Self { + Self { + dialect: Arc::new(MySqlDialect {}), + } + } + #[staticmethod] + pub fn sqlite() -> Self { + Self { + dialect: Arc::new(SqliteDialect {}), + } + } + #[staticmethod] + pub fn duckdb() -> Self { + Self { + dialect: Arc::new(DuckDBDialect::new()), + } + } +} diff --git a/src/unparser/mod.rs b/src/unparser/mod.rs new file mode 100644 index 000000000..b4b0fed10 --- /dev/null +++ b/src/unparser/mod.rs @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod dialect; + +use std::sync::Arc; + +use datafusion::sql::unparser::{dialect::Dialect, Unparser}; +use dialect::PyDialect; +use pyo3::{exceptions::PyValueError, prelude::*}; + +use crate::sql::logical::PyLogicalPlan; + +#[pyclass(name = "Unparser", module = "datafusion.unparser", subclass)] +#[derive(Clone)] +pub struct PyUnparser { + dialect: Arc, + pretty: bool, +} + +#[pymethods] +impl PyUnparser { + #[new] + pub fn new(dialect: PyDialect) -> Self { + Self { + dialect: dialect.dialect.clone(), + pretty: false, + } + } + + pub fn plan_to_sql(&self, plan: &PyLogicalPlan) -> PyResult { + let mut unparser = Unparser::new(self.dialect.as_ref()); + unparser = unparser.with_pretty(self.pretty); + let sql = unparser + .plan_to_sql(&plan.plan()) + .map_err(|e| PyValueError::new_err(e.to_string()))?; + Ok(sql.to_string()) + } + + pub fn with_pretty(&self, pretty: bool) -> Self { + Self { + dialect: self.dialect.clone(), + pretty, + } + } +} + +pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_class::()?; + m.add_class::()?; + Ok(()) +} From 09b929a65c27ce8c58563d4def8d79b426ae47e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20Sch=C3=A4fer?= <33159547+floscha@users.noreply.github.com> Date: Sun, 30 Mar 2025 14:45:49 +0200 Subject: [PATCH 028/145] Documentation updates: mention correct dataset on basics page (#1081) * Documentation updates: mention correct dataset on basics page * Update docs/source/user-guide/basics.rst Co-authored-by: Kevin Liu * Make download hint more concise --------- Co-authored-by: Kevin Liu --- docs/source/user-guide/basics.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/user-guide/basics.rst b/docs/source/user-guide/basics.rst index f37378a41..6636c0c6a 100644 --- a/docs/source/user-guide/basics.rst +++ b/docs/source/user-guide/basics.rst @@ -20,8 +20,8 @@ Concepts ======== -In this section, we will cover a basic example to introduce a few key concepts. We will use the same -source file as described in the :ref:`Introduction `, the Pokemon data set. +In this section, we will cover a basic example to introduce a few key concepts. We will use the +2021 Yellow Taxi Trip Records ([download](https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet)), from the [TLC Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page). .. ipython:: python From 818975b5c43021fed109ebba3cb99d744e8f036a Mon Sep 17 00:00:00 2001 From: kosiew Date: Mon, 21 Apr 2025 19:51:25 +0800 Subject: [PATCH 029/145] Add Configurable HTML Table Formatter for DataFusion DataFrames in Python (#1100) * feat: add configurable HTML formatter for DataFrames * fix: update schema iteration in DataFrameHtmlFormatter to use correct format * refactor: remove unused constant MAX_LENGTH_CELL_WITHOUT_MINIMIZE in PyTableProvider * refactor: improve HTML rendering structure in DataFrameHtmlFormatter - Added List import to typing for type hints. - Refactored format_html method to modularize HTML component generation. 
- Created separate methods for building HTML header, table container, header, body, expandable cells, regular cells, and footer for better readability and maintainability. - Updated table_uuid generation to use f-string for consistency. - Ensured all HTML components are returned as lists for efficient joining. * doc: enhance docstrings for DataFrameHtmlFormatter methods to clarify usage * refactor: enhance DataFrameHtmlFormatter with customizable cell and header styles - Added methods `get_cell_style()` and `get_header_style()` to allow subclasses to customize the CSS styles for table cells and headers. - Updated `_build_table_header()` and `_build_regular_cell()` methods to utilize the new styling methods for improved maintainability. - Introduced a registry for custom type formatters in `DataFrameHtmlFormatter` to enable flexible formatting of cell values based on their types. - Enhanced `_format_cell_value()` to check for registered formatters before defaulting to string conversion, improving extensibility. * refactor: enhance DataFrameHtmlFormatter with custom cell and header builders - Introduced CellFormatter and StyleProvider protocols for better extensibility. - Added DefaultStyleProvider class with default CSS styles for cells and headers. - Updated DataFrameHtmlFormatter to support custom cell and header builders. - Refactored methods to utilize the new style provider for consistent styling. - Improved documentation for methods and classes to clarify usage and customization options. * doc: expand module docstring for DataFrameHtmlFormatter with usage examples and customization options * refactor: streamline HTML formatter by removing extensive docstring examples and enhancing cell formatting methods - Removed lengthy examples from the docstring of DataFrameHtmlFormatter to improve readability. - Added methods for extracting and formatting cell values, enhancing the clarity and maintainability of the code. - Updated cell building methods to utilize the new formatting logic, ensuring consistent application of styles and behaviors. - Introduced a reset fixture for tests to ensure the formatter is returned to default settings after each test case. - Added tests for HTML formatter configuration, custom style providers, type formatters, custom cell builders, and complex customizations to ensure robust functionality. 
* refactor: improve cell rendering logic in DataFrameHtmlFormatter by utilizing raw values for custom cell builders and optimizing expandable cell creation * refactor: enhance HTML representation in DataFrame by integrating the latest formatter and improving cell value formatting logic * refactor: improve HTML formatting logic in DataFrame by separating data collection and schema retrieval for clarity refactor: enhance reset_formatter fixture to preserve original formatter configuration during tests * refactor: add debug utilities for HTML formatter integration testing and enhance debugging output in DataFrameHtmlFormatter * refactor: implement HTML formatter patch for DataFrame and enhance value retrieval in cell formatting * fix: correct typo in file extension check for parquet files in test_write_compressed_parquet * test: add test for DataFrame._repr_html_ to validate HTML output structure * refactor: remove monkeypatch for DataFrame._repr_html_ and associated logic * refactor: simplify _repr_html_ method in DataFrame to directly call internal representation * refactor: remove debug utilities for HTML formatter integration in DataFrame * refactor: remove debug print statements from DataFrameHtmlFormatter and add HTML formatter integration tests - Removed debug print statements from format_html, _build_table_body, and get_formatter methods in DataFrameHtmlFormatter to clean up the code. - Introduced a new debug_utils.py file containing a function to check HTML formatter integration. - Updated __init__.py to include configure_formatter for easier access. - Enhanced DataFrame class to include a docstring for the _repr_html_ method. - Added comprehensive tests for HTML formatter configuration, custom style providers, type formatters, and cell/header builders in test_dataframe.py. * refactor: streamline imports and enhance HTML formatter integration in tests - Removed redundant import of `configure_formatter` in `__init__.py`. - Added `configure_formatter` to `__all__` in `__init__.py` for better module exposure. - Cleaned up import statements in `html_formatter.py` for clarity. - Consolidated import statements in `test_dataframe.py` for improved readability. - Simplified the `reset_formatter` fixture by removing unnecessary imports and comments. * refactor: remove redundant imports and debug print statements in HTML formatter tests * refactor: add reset_formatter function to reset global HTML formatter state - Implemented reset_formatter to create a new default DataFrame HTML formatter and update the global reference. - Added clean_formatter_state fixture in tests to ensure a fresh formatter state for each test case. - Updated test cases to use clean_formatter_state instead of the previous reset_formatter implementation. * refactor: enhance DataFrameHtmlFormatter initialization with parameter validation * test: add custom cell builder test for HTML formatter with value-based styling * test: enhance DataFrame HTML representation tests for structure and values * feat: enhance DataFrameHtmlFormatter with shared styles support and reset functionality (see the sketch below) - Added `use_shared_styles` parameter to control loading of styles/scripts. - Implemented logic to conditionally include styles based on `use_shared_styles`. - Updated the constructor to validate `use_shared_styles` as a boolean. - Introduced `reset_styles_loaded_state` function to reset the styles loaded state. - Modified `reset_formatter` to reset the `_styles_loaded` flag.
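Since the bullets above describe both the `StyleProvider` protocol and the shared-styles flag, here is a compact sketch of the two together; the styles themselves are made up, while the protocol methods and functions are the ones defined in this patch:

```python
from datafusion.html_formatter import (
    configure_formatter,
    reset_styles_loaded_state,
)

class HighContrastStyles:
    # No inheritance needed: StyleProvider is a runtime-checkable Protocol,
    # so matching method signatures are enough.
    def get_cell_style(self) -> str:
        return "border: 1px solid #000; padding: 8px;"

    def get_header_style(self) -> str:
        return "border: 1px solid #000; padding: 8px; font-weight: bold;"

# With use_shared_styles=True the shared style/script block is emitted only
# for the first table rendered in a session...
configure_formatter(style_provider=HighContrastStyles(), use_shared_styles=True)

# ...and can be forced to re-emit, e.g. after a notebook kernel restart.
reset_styles_loaded_state()
```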
* refactor: update footer comment in DataFrameHtmlFormatter to clarify content * test: enhance HTML representation test to accommodate span-wrapped values * docs: add usage examples to formatter functions in html_formatter.py * test: add HTML formatter tests for shared styles functionality * feat: add method to check if styles are loaded and enhance schema validation in DataFrameHtmlFormatter * refactor: streamline custom cell builder in HTML formatter tests for clarity and maintainability * fix ruff errors * chore: update license header in html_formatter.py for compliance * refactor: improve HTML formatter tests by updating import statements and enhancing regex patterns for body data * fix clippy errors --- python/datafusion/__init__.py | 2 + python/datafusion/html_formatter.py | 647 ++++++++++++++++++++++++++++ python/tests/test_dataframe.py | 396 ++++++++++++++++- src/dataframe.rs | 130 +----- 4 files changed, 1061 insertions(+), 114 deletions(-) create mode 100644 python/datafusion/html_formatter.py diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index ecf5545bc..60d0d61b4 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -45,6 +45,7 @@ Expr, WindowFrame, ) +from .html_formatter import configure_formatter from .io import read_avro, read_csv, read_json, read_parquet from .plan import ExecutionPlan, LogicalPlan from .record_batch import RecordBatch, RecordBatchStream @@ -76,6 +77,7 @@ "col", "column", "common", + "configure_formatter", "expr", "functions", "lit", diff --git a/python/datafusion/html_formatter.py b/python/datafusion/html_formatter.py new file mode 100644 index 000000000..a50e14fd5 --- /dev/null +++ b/python/datafusion/html_formatter.py @@ -0,0 +1,647 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""HTML formatting utilities for DataFusion DataFrames.""" + +from __future__ import annotations + +from typing import ( + Any, + Callable, + Optional, + Protocol, + runtime_checkable, +) + + +@runtime_checkable +class CellFormatter(Protocol): + """Protocol for cell value formatters.""" + + def __call__(self, value: Any) -> str: + """Format a cell value to string representation.""" + ... + + +@runtime_checkable +class StyleProvider(Protocol): + """Protocol for HTML style providers.""" + + def get_cell_style(self) -> str: + """Get the CSS style for table cells.""" + ... + + def get_header_style(self) -> str: + """Get the CSS style for header cells.""" + ... + + +class DefaultStyleProvider: + """Default implementation of StyleProvider.""" + + def get_cell_style(self) -> str: + """Get the CSS style for table cells. 
+ + Returns: + CSS style string + """ + return ( + "border: 1px solid black; padding: 8px; text-align: left; " + "white-space: nowrap;" + ) + + def get_header_style(self) -> str: + """Get the CSS style for header cells. + + Returns: + CSS style string + """ + return ( + "border: 1px solid black; padding: 8px; text-align: left; " + "background-color: #f2f2f2; white-space: nowrap; min-width: fit-content; " + "max-width: fit-content;" + ) + + +class DataFrameHtmlFormatter: + """Configurable HTML formatter for DataFusion DataFrames. + + This class handles the HTML rendering of DataFrames for display in + Jupyter notebooks and other rich display contexts. + + This class supports extension through composition. Key extension points: + - Provide a custom StyleProvider for styling cells and headers + - Register custom formatters for specific types + - Provide custom cell builders for specialized cell rendering + + Args: + max_cell_length: Maximum characters to display in a cell before truncation + max_width: Maximum width of the HTML table in pixels + max_height: Maximum height of the HTML table in pixels + enable_cell_expansion: Whether to add expand/collapse buttons for long cell + values + custom_css: Additional CSS to include in the HTML output + show_truncation_message: Whether to display a message when data is truncated + style_provider: Custom provider for cell and header styles + use_shared_styles: Whether to load styles and scripts only once per notebook + session + """ + + # Class variable to track if styles have been loaded in the notebook + _styles_loaded = False + + def __init__( + self, + max_cell_length: int = 25, + max_width: int = 1000, + max_height: int = 300, + enable_cell_expansion: bool = True, + custom_css: Optional[str] = None, + show_truncation_message: bool = True, + style_provider: Optional[StyleProvider] = None, + use_shared_styles: bool = True, + ) -> None: + """Initialize the HTML formatter. + + Parameters + ---------- + max_cell_length : int, default 25 + Maximum length of cell content before truncation. + max_width : int, default 1000 + Maximum width of the displayed table in pixels. + max_height : int, default 300 + Maximum height of the displayed table in pixels. + enable_cell_expansion : bool, default True + Whether to allow cells to expand when clicked. + custom_css : str, optional + Custom CSS to apply to the HTML table. + show_truncation_message : bool, default True + Whether to show a message indicating that content has been truncated. + style_provider : StyleProvider, optional + Provider of CSS styles for the HTML table. If None, DefaultStyleProvider + is used. + use_shared_styles : bool, default True + Whether to use shared styles across multiple tables. + + Raises: + ------ + ValueError + If max_cell_length, max_width, or max_height is not a positive integer. + TypeError + If enable_cell_expansion, show_truncation_message, or use_shared_styles is + not a boolean, + or if custom_css is provided but is not a string, + or if style_provider is provided but does not implement the StyleProvider + protocol. 
+ """ + # Validate numeric parameters + + if not isinstance(max_cell_length, int) or max_cell_length <= 0: + msg = "max_cell_length must be a positive integer" + raise ValueError(msg) + if not isinstance(max_width, int) or max_width <= 0: + msg = "max_width must be a positive integer" + raise ValueError(msg) + if not isinstance(max_height, int) or max_height <= 0: + msg = "max_height must be a positive integer" + raise ValueError(msg) + + # Validate boolean parameters + if not isinstance(enable_cell_expansion, bool): + msg = "enable_cell_expansion must be a boolean" + raise TypeError(msg) + if not isinstance(show_truncation_message, bool): + msg = "show_truncation_message must be a boolean" + raise TypeError(msg) + if not isinstance(use_shared_styles, bool): + msg = "use_shared_styles must be a boolean" + raise TypeError(msg) + + # Validate custom_css + if custom_css is not None and not isinstance(custom_css, str): + msg = "custom_css must be None or a string" + raise TypeError(msg) + + # Validate style_provider + if style_provider is not None and not isinstance(style_provider, StyleProvider): + msg = "style_provider must implement the StyleProvider protocol" + raise TypeError(msg) + + self.max_cell_length = max_cell_length + self.max_width = max_width + self.max_height = max_height + self.enable_cell_expansion = enable_cell_expansion + self.custom_css = custom_css + self.show_truncation_message = show_truncation_message + self.style_provider = style_provider or DefaultStyleProvider() + self.use_shared_styles = use_shared_styles + # Registry for custom type formatters + self._type_formatters: dict[type, CellFormatter] = {} + # Custom cell builders + self._custom_cell_builder: Optional[Callable[[Any, int, int, str], str]] = None + self._custom_header_builder: Optional[Callable[[Any], str]] = None + + def register_formatter(self, type_class: type, formatter: CellFormatter) -> None: + """Register a custom formatter for a specific data type. + + Args: + type_class: The type to register a formatter for + formatter: Function that takes a value of the given type and returns + a formatted string + """ + self._type_formatters[type_class] = formatter + + def set_custom_cell_builder( + self, builder: Callable[[Any, int, int, str], str] + ) -> None: + """Set a custom cell builder function. + + Args: + builder: Function that takes (value, row, col, table_id) and returns HTML + """ + self._custom_cell_builder = builder + + def set_custom_header_builder(self, builder: Callable[[Any], str]) -> None: + """Set a custom header builder function. + + Args: + builder: Function that takes a field and returns HTML + """ + self._custom_header_builder = builder + + @classmethod + def is_styles_loaded(cls) -> bool: + """Check if HTML styles have been loaded in the current session. + + This method is primarily intended for debugging UI rendering issues + related to style loading. + + Returns: + True if styles have been loaded, False otherwise + + Example: + >>> from datafusion.html_formatter import DataFrameHtmlFormatter + >>> DataFrameHtmlFormatter.is_styles_loaded() + False + """ + return cls._styles_loaded + + def format_html( + self, + batches: list, + schema: Any, + has_more: bool = False, + table_uuid: str | None = None, + ) -> str: + """Format record batches as HTML. + + This method is used by DataFrame's _repr_html_ implementation and can be + called directly when custom HTML rendering is needed. 
+ + Args: + batches: List of Arrow RecordBatch objects + schema: Arrow Schema object + has_more: Whether there are more batches not shown + table_uuid: Unique ID for the table, used for JavaScript interactions + + Returns: + HTML string representation of the data + + Raises: + TypeError: If schema is invalid and no batches are provided + """ + if not batches: + return "No data to display" + + # Validate schema + if schema is None or not hasattr(schema, "__iter__"): + msg = "Schema must be provided" + raise TypeError(msg) + + # Generate a unique ID if none provided + table_uuid = table_uuid or f"df-{id(batches)}" + + # Build HTML components + html = [] + + # Only include styles and scripts if: + # 1. Not using shared styles, OR + # 2. Using shared styles but they haven't been loaded yet + include_styles = ( + not self.use_shared_styles or not DataFrameHtmlFormatter._styles_loaded + ) + + if include_styles: + html.extend(self._build_html_header()) + # If we're using shared styles, mark them as loaded + if self.use_shared_styles: + DataFrameHtmlFormatter._styles_loaded = True + + html.extend(self._build_table_container_start()) + + # Add table header and body + html.extend(self._build_table_header(schema)) + html.extend(self._build_table_body(batches, table_uuid)) + + html.append("") + html.append("") + + # Add footer (JavaScript and messages) + if include_styles and self.enable_cell_expansion: + html.append(self._get_javascript()) + + # Always add truncation message if needed (independent of styles) + if has_more and self.show_truncation_message: + html.append("
<div>Data truncated due to size.</div>
") + + return "\n".join(html) + + def _build_html_header(self) -> list[str]: + """Build the HTML header with CSS styles.""" + html = [] + html.append("") + return html + + def _build_table_container_start(self) -> list[str]: + """Build the opening tags for the table container.""" + html = [] + html.append( + f'
' + ) + html.append('') + return html + + def _build_table_header(self, schema: Any) -> list[str]: + """Build the HTML table header with column names.""" + html = [] + html.append("") + html.append("") + for field in schema: + if self._custom_header_builder: + html.append(self._custom_header_builder(field)) + else: + html.append( + f"" + ) + html.append("") + html.append("") + return html + + def _build_table_body(self, batches: list, table_uuid: str) -> list[str]: + """Build the HTML table body with data rows.""" + html = [] + html.append("") + + row_count = 0 + for batch in batches: + for row_idx in range(batch.num_rows): + row_count += 1 + html.append("") + + for col_idx, column in enumerate(batch.columns): + # Get the raw value from the column + raw_value = self._get_cell_value(column, row_idx) + + # Always check for type formatters first to format the value + formatted_value = self._format_cell_value(raw_value) + + # Then apply either custom cell builder or standard cell formatting + if self._custom_cell_builder: + # Pass both the raw value and formatted value to let the + # builder decide + cell_html = self._custom_cell_builder( + raw_value, row_count, col_idx, table_uuid + ) + html.append(cell_html) + else: + # Standard cell formatting with formatted value + if ( + len(str(raw_value)) > self.max_cell_length + and self.enable_cell_expansion + ): + cell_html = self._build_expandable_cell( + formatted_value, row_count, col_idx, table_uuid + ) + else: + cell_html = self._build_regular_cell(formatted_value) + html.append(cell_html) + + html.append("") + + html.append("") + return html + + def _get_cell_value(self, column: Any, row_idx: int) -> Any: + """Extract a cell value from a column. + + Args: + column: Arrow array + row_idx: Row index + + Returns: + The raw cell value + """ + try: + value = column[row_idx] + + if hasattr(value, "as_py"): + return value.as_py() + except (AttributeError, TypeError): + pass + else: + return value + + def _format_cell_value(self, value: Any) -> str: + """Format a cell value for display. + + Uses registered type formatters if available. + + Args: + value: The cell value to format + + Returns: + Formatted cell value as string + """ + # Check for custom type formatters + for type_cls, formatter in self._type_formatters.items(): + if isinstance(value, type_cls): + return formatter(value) + + # If no formatter matched, return string representation + return str(value) + + def _build_expandable_cell( + self, formatted_value: str, row_count: int, col_idx: int, table_uuid: str + ) -> str: + """Build an expandable cell for long content.""" + short_value = str(formatted_value)[: self.max_cell_length] + return ( + f"" + ) + + def _build_regular_cell(self, formatted_value: str) -> str: + """Build a regular table cell.""" + return ( + f"" + ) + + def _build_html_footer(self, has_more: bool) -> list[str]: + """Build the HTML footer with JavaScript and messages.""" + html = [] + + # Add JavaScript for interactivity only if cell expansion is enabled + # and we're not using the shared styles approach + if self.enable_cell_expansion and not self.use_shared_styles: + html.append(self._get_javascript()) + + # Add truncation message if needed + if has_more and self.show_truncation_message: + html.append("
<div>Data truncated due to size.</div>
") + + return html + + def _get_default_css(self) -> str: + """Get default CSS styles for the HTML table.""" + return """ + .expandable-container { + display: inline-block; + max-width: 200px; + } + .expandable { + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; + display: block; + } + .full-text { + display: none; + white-space: normal; + } + .expand-btn { + cursor: pointer; + color: blue; + text-decoration: underline; + border: none; + background: none; + font-size: inherit; + display: block; + margin-top: 5px; + } + """ + + def _get_javascript(self) -> str: + """Get JavaScript code for interactive elements.""" + return """ + + """ + + +class FormatterManager: + """Manager class for the global DataFrame HTML formatter instance.""" + + _default_formatter: DataFrameHtmlFormatter = DataFrameHtmlFormatter() + + @classmethod + def set_formatter(cls, formatter: DataFrameHtmlFormatter) -> None: + """Set the global DataFrame HTML formatter. + + Args: + formatter: The formatter instance to use globally + """ + cls._default_formatter = formatter + _refresh_formatter_reference() + + @classmethod + def get_formatter(cls) -> DataFrameHtmlFormatter: + """Get the current global DataFrame HTML formatter. + + Returns: + The global HTML formatter instance + """ + return cls._default_formatter + + +def get_formatter() -> DataFrameHtmlFormatter: + """Get the current global DataFrame HTML formatter. + + This function is used by the DataFrame._repr_html_ implementation to access + the shared formatter instance. It can also be used directly when custom + HTML rendering is needed. + + Returns: + The global HTML formatter instance + + Example: + >>> from datafusion.html_formatter import get_formatter + >>> formatter = get_formatter() + >>> formatter.max_cell_length = 50 # Increase cell length + """ + return FormatterManager.get_formatter() + + +def set_formatter(formatter: DataFrameHtmlFormatter) -> None: + """Set the global DataFrame HTML formatter. + + Args: + formatter: The formatter instance to use globally + + Example: + >>> from datafusion.html_formatter import get_formatter, set_formatter + >>> custom_formatter = DataFrameHtmlFormatter(max_cell_length=100) + >>> set_formatter(custom_formatter) + """ + FormatterManager.set_formatter(formatter) + + +def configure_formatter(**kwargs: Any) -> None: + """Configure the global DataFrame HTML formatter. + + This function creates a new formatter with the provided configuration + and sets it as the global formatter for all DataFrames. + + Args: + **kwargs: Formatter configuration parameters like max_cell_length, + max_width, max_height, enable_cell_expansion, etc. + + Example: + >>> from datafusion.html_formatter import configure_formatter + >>> configure_formatter( + ... max_cell_length=50, + ... max_height=500, + ... enable_cell_expansion=True, + ... use_shared_styles=True + ... ) + """ + set_formatter(DataFrameHtmlFormatter(**kwargs)) + + +def reset_formatter() -> None: + """Reset the global DataFrame HTML formatter to default settings. + + This function creates a new formatter with default configuration + and sets it as the global formatter for all DataFrames. 
+ + Example: + >>> from datafusion.html_formatter import reset_formatter + >>> reset_formatter() # Reset formatter to default settings + """ + formatter = DataFrameHtmlFormatter() + # Reset the styles_loaded flag to ensure styles will be reloaded + DataFrameHtmlFormatter._styles_loaded = False + set_formatter(formatter) + + +def reset_styles_loaded_state() -> None: + """Reset the styles loaded state to force reloading of styles. + + This can be useful when switching between notebook sessions or + when styles need to be refreshed. + + Example: + >>> from datafusion.html_formatter import reset_styles_loaded_state + >>> reset_styles_loaded_state() # Force styles to reload in next render + """ + DataFrameHtmlFormatter._styles_loaded = False + + +def _refresh_formatter_reference() -> None: + """Refresh formatter reference in any modules using it. + + This helps ensure that changes to the formatter are reflected in existing + DataFrames that might be caching the formatter reference. + """ + # This is a no-op but signals modules to refresh their reference diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index eda13930d..464b884db 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -28,8 +28,17 @@ column, literal, ) -from datafusion import functions as f +from datafusion import ( + functions as f, +) from datafusion.expr import Window +from datafusion.html_formatter import ( + DataFrameHtmlFormatter, + configure_formatter, + get_formatter, + reset_formatter, + reset_styles_loaded_state, +) from pyarrow.csv import write_csv @@ -102,6 +111,12 @@ def partitioned_df(): return ctx.create_dataframe([[batch]]) +@pytest.fixture +def clean_formatter_state(): + """Reset the HTML formatter after each test.""" + reset_formatter() + + def test_select(df): df_1 = df.select( column("a") + column("b"), @@ -656,6 +671,252 @@ def test_window_frame_defaults_match_postgres(partitioned_df): assert df_2.sort(col_a).to_pydict() == expected +def test_html_formatter_configuration(df, clean_formatter_state): + """Test configuring the HTML formatter with different options.""" + # Configure with custom settings + configure_formatter( + max_cell_length=5, + max_width=500, + max_height=200, + enable_cell_expansion=False, + ) + + html_output = df._repr_html_() + + # Verify our configuration was applied + assert "max-height: 200px" in html_output + assert "max-width: 500px" in html_output + # With cell expansion disabled, we shouldn't see expandable-container elements + assert "expandable-container" not in html_output + + +def test_html_formatter_custom_style_provider(df, clean_formatter_state): + """Test using custom style providers with the HTML formatter.""" + + class CustomStyleProvider: + def get_cell_style(self) -> str: + return ( + "background-color: #f5f5f5; color: #333; padding: 8px; border: " + "1px solid #ddd;" + ) + + def get_header_style(self) -> str: + return ( + "background-color: #4285f4; color: white; font-weight: bold; " + "padding: 10px; border: 1px solid #3367d6;" + ) + + # Configure with custom style provider + configure_formatter(style_provider=CustomStyleProvider()) + + html_output = df._repr_html_() + + # Verify our custom styles were applied + assert "background-color: #4285f4" in html_output + assert "color: white" in html_output + assert "background-color: #f5f5f5" in html_output + + +def test_html_formatter_type_formatters(df, clean_formatter_state): + """Test registering custom type formatters for specific data types.""" + + # Get current 
formatter and register custom formatters + formatter = get_formatter() + + # Format integers with color based on value + # Using int as the type for the formatter will work since we convert + # Arrow scalar values to Python native types in _get_cell_value + def format_int(value): + return f' 2 else "blue"}">{value}' + + formatter.register_formatter(int, format_int) + + html_output = df._repr_html_() + + # Our test dataframe has values 1,2,3 so we should see: + assert '1' in html_output + + +def test_html_formatter_custom_cell_builder(df, clean_formatter_state): + """Test using a custom cell builder function.""" + + # Create a custom cell builder with distinct styling for different value ranges + def custom_cell_builder(value, row, col, table_id): + try: + num_value = int(value) + if num_value > 5: # Values > 5 get green background with indicator + return ( + '' + ) + if num_value < 3: # Values < 3 get blue background with indicator + return ( + '' + ) + except (ValueError, TypeError): + pass + + # Default styling for other cells (3, 4, 5) + return f'' + + # Set our custom cell builder + formatter = get_formatter() + formatter.set_custom_cell_builder(custom_cell_builder) + + html_output = df._repr_html_() + + # Extract cells with specific styling using regex + low_cells = re.findall( + r'', html_output + ) + mid_cells = re.findall( + r'', html_output + ) + high_cells = re.findall( + r'', html_output + ) + + # Sort the extracted values for consistent comparison + low_cells = sorted(map(int, low_cells)) + mid_cells = sorted(map(int, mid_cells)) + high_cells = sorted(map(int, high_cells)) + + # Verify specific values have the correct styling applied + assert low_cells == [1, 2] # Values < 3 + assert mid_cells == [3, 4, 5, 5] # Values 3-5 + assert high_cells == [6, 8, 8] # Values > 5 + + # Verify the exact content with styling appears in the output + assert ( + '' + in html_output + ) + assert ( + '' + in html_output + ) + assert ( + '' in html_output + ) + assert ( + '' in html_output + ) + assert ( + '' + in html_output + ) + assert ( + '' + in html_output + ) + + # Count occurrences to ensure all cells are properly styled + assert html_output.count("-low") == 2 # Two low values (1, 2) + assert html_output.count("-mid") == 4 # Four mid values (3, 4, 5, 5) + assert html_output.count("-high") == 3 # Three high values (6, 8, 8) + + # Create a custom cell builder that changes background color based on value + def custom_cell_builder(value, row, col, table_id): + # Handle numeric values regardless of their exact type + try: + num_value = int(value) + if num_value > 5: # Values > 5 get green background + return f'' + if num_value < 3: # Values < 3 get light blue background + return f'' + except (ValueError, TypeError): + pass + + # Default styling for other cells + return f'' + + # Set our custom cell builder + formatter = get_formatter() + formatter.set_custom_cell_builder(custom_cell_builder) + + html_output = df._repr_html_() + + # Verify our custom cell styling was applied + assert "background-color: #d3e9f0" in html_output # For values 1,2 + + +def test_html_formatter_custom_header_builder(df, clean_formatter_state): + """Test using a custom header builder function.""" + + # Create a custom header builder with tooltips + def custom_header_builder(field): + tooltips = { + "a": "Primary key column", + "b": "Secondary values", + "c": "Additional data", + } + tooltip = tooltips.get(field.name, "") + return ( + f'' + ) + + # Set our custom header builder + formatter = get_formatter() + 
formatter.set_custom_header_builder(custom_header_builder) + + html_output = df._repr_html_() + + # Verify our custom headers were applied + assert 'title="Primary key column"' in html_output + assert 'title="Secondary values"' in html_output + assert "background-color: #333; color: white" in html_output + + +def test_html_formatter_complex_customization(df, clean_formatter_state): + """Test combining multiple customization options together.""" + + # Create a dark mode style provider + class DarkModeStyleProvider: + def get_cell_style(self) -> str: + return ( + "background-color: #222; color: #eee; " + "padding: 8px; border: 1px solid #444;" + ) + + def get_header_style(self) -> str: + return ( + "background-color: #111; color: #fff; padding: 10px; " + "border: 1px solid #333;" + ) + + # Configure with dark mode style + configure_formatter( + max_cell_length=10, + style_provider=DarkModeStyleProvider(), + custom_css=""" + .datafusion-table { + font-family: monospace; + border-collapse: collapse; + } + .datafusion-table tr:hover td { + background-color: #444 !important; + } + """, + ) + + # Add type formatters for special formatting - now working with native int values + formatter = get_formatter() + formatter.register_formatter( + int, + lambda n: f'{n}', + ) + + html_output = df._repr_html_() + + # Verify our customizations were applied + assert "background-color: #222" in html_output + assert "background-color: #111" in html_output + assert ".datafusion-table" in html_output + assert "color: #5af" in html_output # Even numbers + + def test_get_dataframe(tmp_path): ctx = SessionContext() @@ -1244,7 +1505,10 @@ def add_with_parameter(df_internal, value: Any) -> DataFrame: assert result["new_col"] == [3 for _i in range(3)] -def test_dataframe_repr_html(df) -> None: +def test_dataframe_repr_html_structure(df) -> None: + """Test that DataFrame._repr_html_ produces expected HTML output structure.""" + import re + output = df._repr_html_() # Since we've added a fair bit of processing to the html output, lets just verify @@ -1255,9 +1519,131 @@ def test_dataframe_repr_html(df) -> None: headers = ["a", "b", "c"] headers = [f"{v}" for v in headers] header_pattern = "(.*?)".join(headers) - assert len(re.findall(header_pattern, output, re.DOTALL)) == 1 + header_matches = re.findall(header_pattern, output, re.DOTALL) + assert len(header_matches) == 1 + # Update the pattern to handle values that may be wrapped in spans body_data = [[1, 4, 8], [2, 5, 5], [3, 6, 8]] - body_lines = [f"{v}" for inner in body_data for v in inner] + + body_lines = [ + f"(?:]*?>)?{v}(?:)?" + for inner in body_data + for v in inner + ] body_pattern = "(.*?)".join(body_lines) - assert len(re.findall(body_pattern, output, re.DOTALL)) == 1 + + body_matches = re.findall(body_pattern, output, re.DOTALL) + + assert len(body_matches) == 1, "Expected pattern of values not found in HTML output" + + +def test_dataframe_repr_html_values(df): + """Test that DataFrame._repr_html_ contains the expected data values.""" + html = df._repr_html_() + assert html is not None + + # Create a more flexible pattern that handles values being wrapped in spans + # This pattern will match the sequence of values 1,4,8,2,5,5,3,6,8 regardless + # of formatting + pattern = re.compile( + r"]*?>(?:]*?>)?1(?:)?.*?" + r"]*?>(?:]*?>)?4(?:)?.*?" + r"]*?>(?:]*?>)?8(?:)?.*?" + r"]*?>(?:]*?>)?2(?:)?.*?" + r"]*?>(?:]*?>)?5(?:)?.*?" + r"]*?>(?:]*?>)?5(?:)?.*?" + r"]*?>(?:]*?>)?3(?:)?.*?" + r"]*?>(?:]*?>)?6(?:)?.*?" 
+ r"]*?>(?:]*?>)?8(?:)?", + re.DOTALL, + ) + + # Print debug info if the test fails + matches = re.findall(pattern, html) + if not matches: + print(f"HTML output snippet: {html[:500]}...") # noqa: T201 + + assert len(matches) > 0, "Expected pattern of values not found in HTML output" + + +def test_html_formatter_shared_styles(df, clean_formatter_state): + """Test that shared styles work correctly across multiple tables.""" + + # First, ensure we're using shared styles + configure_formatter(use_shared_styles=True) + + # Get HTML output for first table - should include styles + html_first = df._repr_html_() + + # Verify styles are included in first render + assert " + // Convert record batches to PyObject list + let py_batches = batches + .into_iter() + .map(|rb| rb.to_pyarrow(py)) + .collect::>>()?; -
-
" + f"{field.name}
" + f"
" + "" + "" + f"{formatted_value}" + f"" + f"
" + f"
{formatted_value}{value}-high{value}-low{value}-mid]*>(\d+)-low]*>(\d+)-mid]*>(\d+)-high1-low2-low3-mid4-mid6-high8-high{value}{value}{value}{field.name}
- \n".to_string(); + let py_schema = self.schema().into_pyobject(py)?; - let schema = batches[0].schema(); + // Get the Python formatter module and call format_html + let formatter_module = py.import("datafusion.html_formatter")?; + let get_formatter = formatter_module.getattr("get_formatter")?; + let formatter = get_formatter.call0()?; - let mut header = Vec::new(); - for field in schema.fields() { - header.push(format!("", field.name())); - } - let header_str = header.join(""); - html_str.push_str(&format!("{}\n", header_str)); - - let batch_formatters = batches - .iter() - .map(|batch| { - batch - .columns() - .iter() - .map(|c| ArrayFormatter::try_new(c.as_ref(), &FormatOptions::default())) - .map(|c| { - c.map_err(|e| PyValueError::new_err(format!("Error: {:?}", e.to_string()))) - }) - .collect::, _>>() - }) - .collect::, _>>()?; - - let rows_per_batch = batches.iter().map(|batch| batch.num_rows()); - - // We need to build up row by row for html - let mut table_row = 0; - for (batch_formatter, num_rows_in_batch) in batch_formatters.iter().zip(rows_per_batch) { - for batch_row in 0..num_rows_in_batch { - table_row += 1; - let mut cells = Vec::new(); - for (col, formatter) in batch_formatter.iter().enumerate() { - let cell_data = formatter.value(batch_row).to_string(); - // From testing, primitive data types do not typically get larger than 21 characters - if cell_data.len() > MAX_LENGTH_CELL_WITHOUT_MINIMIZE { - let short_cell_data = &cell_data[0..MAX_LENGTH_CELL_WITHOUT_MINIMIZE]; - cells.push(format!(" - ")); - } else { - cells.push(format!("", formatter.value(batch_row))); - } - } - let row_str = cells.join(""); - html_str.push_str(&format!("{}\n", row_str)); - } - } - html_str.push_str("
{}
-
- {short_cell_data} - {cell_data} - -
-
{}
\n"); - - html_str.push_str(" - - "); + // Call format_html method on the formatter + let kwargs = pyo3::types::PyDict::new(py); + let py_batches_list = PyList::new(py, py_batches.as_slice())?; + kwargs.set_item("batches", py_batches_list)?; + kwargs.set_item("schema", py_schema)?; + kwargs.set_item("has_more", has_more)?; + kwargs.set_item("table_uuid", table_uuid)?; - if has_more { - html_str.push_str("Data truncated due to size."); - } + let html_result = formatter.call_method("format_html", (), Some(&kwargs))?; + let html_str: String = html_result.extract()?; Ok(html_str) } @@ -835,7 +747,7 @@ fn record_batch_into_schema( ) -> Result { let schema = Arc::new(schema.clone()); let base_schema = record_batch.schema(); - if base_schema.fields().len() == 0 { + if base_schema.fields().is_empty() { // Nothing to project return Ok(RecordBatch::new_empty(schema)); } From d0d14f6e1584f9569cbf2e36c8a7abc7c70fd903 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 24 Apr 2025 09:38:38 -0400 Subject: [PATCH 030/145] feat: update datafusion dependency 47 (#1107) * Update cargo to use DF47 release candidate * Need to be explicit for collection of Expr due to change in dataframe API * Add missing enum variant * Add missing enum variants * The interface for last_value of aggregates upstream changed * Cargo fmt * last value aggregate without ordering is ill defined * Clippy warning * Set datafusion version to 47 now that it is released --- Cargo.lock | 600 +++++++++++++++++++------------ Cargo.toml | 24 +- python/tests/test_aggregation.py | 1 - src/dataframe.rs | 2 +- src/dataset_exec.rs | 4 +- src/expr.rs | 16 +- src/functions.rs | 37 +- 7 files changed, 415 insertions(+), 269 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f90038c50..b32d19d4d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -179,9 +179,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84ef243634a39fb6e9d1710737e7a5ef96c9bacabd2326859ff889bc9ef755e5" +checksum = "3095aaf545942ff5abd46654534f15b03a90fba78299d661e045e5d587222f0d" dependencies = [ "arrow-arith", "arrow-array", @@ -201,9 +201,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f420c6aef51dad2e4a96ce29c0ec90ad84880bdb60b321c74c652a6be07b93f" +checksum = "00752064ff47cee746e816ddb8450520c3a52cbad1e256f6fa861a35f86c45e7" dependencies = [ "arrow-array", "arrow-buffer", @@ -215,9 +215,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24bda5ff6461a4ff9739959b3d57b377f45e3f878f7be1a4f28137c0a8f339fa" +checksum = "cebfe926794fbc1f49ddd0cdaf898956ca9f6e79541efce62dabccfd81380472" dependencies = [ "ahash", "arrow-buffer", @@ -232,9 +232,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc6ed265c73f134a583d02c3cab5e16afab9446d8048ede8707e31f85fad58a0" +checksum = "0303c7ec4cf1a2c60310fc4d6bbc3350cd051a17bf9e9c0a8e47b4db79277824" dependencies = [ "bytes", "half", @@ -243,9 +243,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "01c648572391edcef10e5fd458db70ba27ed6f71bcaee04397d0cfb100b34f8b" +checksum = "335f769c5a218ea823d3760a743feba1ef7857cba114c01399a891c2fff34285" dependencies = [ "arrow-array", "arrow-buffer", @@ -264,9 +264,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a02fb265a6d8011a7d3ad1a36f25816ad0a3bb04cb8e9fe7929c165b98c0cbcd" +checksum = "510db7dfbb4d5761826516cc611d97b3a68835d0ece95b034a052601109c0b1b" dependencies = [ "arrow-array", "arrow-cast", @@ -280,9 +280,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f2cebf504bb6a92a134a87fff98f01b14fbb3a93ecf7aef90cd0f888c5fffa4" +checksum = "e8affacf3351a24039ea24adab06f316ded523b6f8c3dbe28fbac5f18743451b" dependencies = [ "arrow-buffer", "arrow-schema", @@ -292,9 +292,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e6405b287671c88846e7751f7291f717b164911474cabac6d3d8614d5aa7374" +checksum = "69880a9e6934d9cba2b8630dd08a3463a91db8693b16b499d54026b6137af284" dependencies = [ "arrow-array", "arrow-buffer", @@ -306,9 +306,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5329bf9e7390cbb6b117ddd4d82e94c5362ea4cab5095697139429f36a38350c" +checksum = "d8dafd17a05449e31e0114d740530e0ada7379d7cb9c338fd65b09a8130960b0" dependencies = [ "arrow-array", "arrow-buffer", @@ -328,9 +328,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e103c13d4b80da28339c1d7aa23dd85bd59f42158acc45d39eeb6770627909ce" +checksum = "895644523af4e17502d42c3cb6b27cb820f0cb77954c22d75c23a85247c849e1" dependencies = [ "arrow-array", "arrow-buffer", @@ -341,9 +341,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "170549a11b8534f3097a0619cfe89c42812345dc998bcf81128fc700b84345b8" +checksum = "9be8a2a4e5e7d9c822b2b8095ecd77010576d824f654d347817640acfc97d229" dependencies = [ "arrow-array", "arrow-buffer", @@ -354,18 +354,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5c53775bba63f319189f366d2b86e9a8889373eb198f07d8544938fc9f8ed9a" +checksum = "7450c76ab7c5a6805be3440dc2e2096010da58f7cab301fdc996a4ee3ee74e49" dependencies = [ - "bitflags 2.8.0", + "bitflags", ] [[package]] name = "arrow-select" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a99003b2eb562b8d9c99dfb672306f15e94b20d3734179d596895703e821dcf" +checksum = "aa5f5a93c75f46ef48e4001535e7b6c922eeb0aa20b73cf58d09e13d057490d8" dependencies = [ "ahash", "arrow-array", @@ -377,9 +377,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90fdb130ee8325f4cd8262e19bb6baa3cbcef2b2573c4bee8c6fda7ea08199d7" +checksum = "6e7005d858d84b56428ba2a98a107fe88c0132c61793cf6b8232a1f9bfc0452b" dependencies = [ 
"arrow-array", "arrow-buffer", @@ -406,11 +406,11 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.18" +version = "0.4.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df895a515f70646414f4b45c0b79082783b80552b373a68283012928df56f522" +checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c" dependencies = [ - "bzip2 0.4.4", + "bzip2 0.5.2", "flate2", "futures-core", "memchr", @@ -438,18 +438,18 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "async-trait" -version = "0.1.86" +version = "0.1.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "644dd749086bf3771a2fbc5f256fdb982d53f011c7d5d560304eafeecebce79d" +checksum = "e539d3fca749fcee5236ab05e93a52867dd549cc157c8cb7f99595f3cedffdb5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -502,9 +502,9 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] name = "bigdecimal" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f31f3af01c5c65a07985c804d3366560e6fa7883d640a122819b14ec327482c" +checksum = "1a22f228ab7a1b23027ccc6c350b72868017af7ea8356fbdf19f8d991c690013" dependencies = [ "autocfg", "libm", @@ -514,12 +514,6 @@ dependencies = [ "serde", ] -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - [[package]] name = "bitflags" version = "2.8.0" @@ -537,9 +531,9 @@ dependencies = [ [[package]] name = "blake3" -version = "1.7.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b17679a8d69b6d7fd9cd9801a536cec9fa5e5970b69f9d4747f70b39b031f5e7" +checksum = "389a099b34312839e16420d499a9cad9650541715937ffbdd40d36f49e77eeb3" dependencies = [ "arrayref", "arrayvec", @@ -608,21 +602,20 @@ dependencies = [ [[package]] name = "bzip2" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75b89e7c29231c673a61a46e722602bcd138298f6b9e81e71119693534585f5c" +checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" dependencies = [ "bzip2-sys", ] [[package]] name = "bzip2-sys" -version = "0.1.12+1.0.8" +version = "0.1.13+1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72ebc2f1a417f01e1da30ef264ee86ae31d2dcd2d603ea283d3c244a883ca2a9" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" dependencies = [ "cc", - "libc", "pkg-config", ] @@ -866,23 +859,26 @@ dependencies = [ [[package]] name = "datafusion" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "914e6f9525599579abbd90b0f7a55afcaaaa40350b9e9ed52563f126dfe45fd3" +checksum = "ffe060b978f74ab446be722adb8a274e052e005bf6dfd171caadc3abaad10080" dependencies = [ - "apache-avro", "arrow", "arrow-ipc", "arrow-schema", "async-trait", "bytes", - "bzip2 0.5.1", + "bzip2 0.5.2", "chrono", "datafusion-catalog", "datafusion-catalog-listing", "datafusion-common", "datafusion-common-runtime", "datafusion-datasource", + "datafusion-datasource-avro", + "datafusion-datasource-csv", + "datafusion-datasource-json", + "datafusion-datasource-parquet", 
"datafusion-execution", "datafusion-expr", "datafusion-expr-common", @@ -897,12 +893,12 @@ dependencies = [ "datafusion-physical-expr-common", "datafusion-physical-optimizer", "datafusion-physical-plan", + "datafusion-session", "datafusion-sql", "flate2", "futures", "itertools 0.14.0", "log", - "num-traits", "object_store", "parking_lot", "parquet", @@ -919,29 +915,35 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "998a6549e6ee4ee3980e05590b2960446a56b343ea30199ef38acd0e0b9036e2" +checksum = "61fe34f401bd03724a1f96d12108144f8cd495a3cdda2bf5e091822fb80b7e66" dependencies = [ "arrow", "async-trait", "dashmap", "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", "datafusion-execution", "datafusion-expr", + "datafusion-physical-expr", "datafusion-physical-plan", + "datafusion-session", "datafusion-sql", "futures", "itertools 0.14.0", "log", + "object_store", "parking_lot", + "tokio", ] [[package]] name = "datafusion-catalog-listing" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5ac10096a5b3c0d8a227176c0e543606860842e943594ccddb45cf42a526e43" +checksum = "a4411b8e3bce5e0fc7521e44f201def2e2d5d1b5f176fb56e8cdc9942c890f00" dependencies = [ "arrow", "async-trait", @@ -953,6 +955,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", + "datafusion-session", "futures", "log", "object_store", @@ -961,9 +964,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f53d7ec508e1b3f68bd301cee3f649834fad51eff9240d898a4b2614cfd0a7a" +checksum = "0734015d81c8375eb5d4869b7f7ecccc2ee8d6cb81948ef737cd0e7b743bd69c" dependencies = [ "ahash", "apache-avro", @@ -986,27 +989,27 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0fcf41523b22e14cc349b01526e8b9f59206653037f2949a4adbfde5f8cb668" +checksum = "5167bb1d2ccbb87c6bc36c295274d7a0519b14afcfdaf401d53cbcaa4ef4968b" dependencies = [ + "futures", "log", "tokio", ] [[package]] name = "datafusion-datasource" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf7f37ad8b6e88b46c7eeab3236147d32ea64b823544f498455a8d9042839c92" +checksum = "04e602dcdf2f50c2abf297cc2203c73531e6f48b29516af7695d338cf2a778b1" dependencies = [ "arrow", "async-compression", "async-trait", "bytes", - "bzip2 0.5.1", + "bzip2 0.5.2", "chrono", - "datafusion-catalog", "datafusion-common", "datafusion-common-runtime", "datafusion-execution", @@ -1014,13 +1017,16 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", + "datafusion-session", "flate2", "futures", "glob", "itertools 0.14.0", "log", "object_store", + "parquet", "rand", + "tempfile", "tokio", "tokio-util", "url", @@ -1028,17 +1034,123 @@ dependencies = [ "zstd", ] +[[package]] +name = "datafusion-datasource-avro" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4ea5111aab9d3f2a8bff570343cccb03ce4c203875ef5a566b7d6f1eb72559e" +dependencies = [ + "apache-avro", + "arrow", + "async-trait", + "bytes", + "chrono", + 
"datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "num-traits", + "object_store", + "tokio", +] + +[[package]] +name = "datafusion-datasource-csv" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bb2253952dc32296ed5b84077cb2e0257fea4be6373e1c376426e17ead4ef6" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "regex", + "tokio", +] + +[[package]] +name = "datafusion-datasource-json" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b8c7f47a5d2fe03bfa521ec9bafdb8a5c82de8377f60967c3663f00c8790352" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "serde_json", + "tokio", +] + +[[package]] +name = "datafusion-datasource-parquet" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27d15868ea39ed2dc266728b554f6304acd473de2142281ecfa1294bb7415923" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "parquet", + "rand", + "tokio", +] + [[package]] name = "datafusion-doc" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7db7a0239fd060f359dc56c6e7db726abaa92babaed2fb2e91c3a8b2fff8b256" +checksum = "a91f8c2c5788ef32f48ff56c68e5b545527b744822a284373ac79bba1ba47292" [[package]] name = "datafusion-execution" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0938f9e5b6bc5782be4111cdfb70c02b7b5451bf34fd57e4de062a7f7c4e31f1" +checksum = "06f004d100f49a3658c9da6fb0c3a9b760062d96cd4ad82ccc3b7b69a9fb2f84" dependencies = [ "arrow", "dashmap", @@ -1055,9 +1167,9 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b36c28b00b00019a8695ad7f1a53ee1673487b90322ecbd604e2cf32894eb14f" +checksum = "7a4e4ce3802609be38eeb607ee72f6fe86c3091460de9dbfae9e18db423b3964" dependencies = [ "arrow", "chrono", @@ -1076,9 +1188,9 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18f0a851a436c5a2139189eb4617a54e6a9ccb9edc96c4b3c83b3bb7c58b950e" +checksum = 
"422ac9cf3b22bbbae8cdf8ceb33039107fde1b5492693168f13bd566b1bcc839" dependencies = [ "arrow", "datafusion-common", @@ -1089,12 +1201,13 @@ dependencies = [ [[package]] name = "datafusion-ffi" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d740dd9f32a4f4ed1b907e6934201bb059efe6c877532512c661771d973c7b21" +checksum = "5cf3fe9ab492c56daeb7beed526690d33622d388b8870472e0b7b7f55490338c" dependencies = [ "abi_stable", "arrow", + "arrow-schema", "async-ffi", "async-trait", "datafusion", @@ -1108,9 +1221,9 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3196e37d7b65469fb79fee4f05e5bb58a456831035f9a38aa5919aeb3298d40" +checksum = "2ddf0a0a2db5d2918349c978d42d80926c6aa2459cd8a3c533a84ec4bb63479e" dependencies = [ "arrow", "arrow-buffer", @@ -1137,9 +1250,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adfc2d074d5ee4d9354fdcc9283d5b2b9037849237ddecb8942a29144b77ca05" +checksum = "408a05dafdc70d05a38a29005b8b15e21b0238734dab1e98483fcb58038c5aba" dependencies = [ "ahash", "arrow", @@ -1158,9 +1271,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cbceba0f98d921309a9121b702bcd49289d383684cccabf9a92cda1602f3bbb" +checksum = "756d21da2dd6c9bef97af1504970ff56cbf35d03fbd4ffd62827f02f4d2279d4" dependencies = [ "ahash", "arrow", @@ -1171,9 +1284,9 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "170e27ce4baa27113ddf5f77f1a7ec484b0dbeda0c7abbd4bad3fc609c8ab71a" +checksum = "8d8d50f6334b378930d992d801a10ac5b3e93b846b39e4a05085742572844537" dependencies = [ "arrow", "arrow-ord", @@ -1192,9 +1305,9 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d3a06a7f0817ded87b026a437e7e51de7f59d48173b0a4e803aa896a7bd6bb5" +checksum = "cc9a97220736c8fff1446e936be90d57216c06f28969f9ffd3b72ac93c958c8a" dependencies = [ "arrow", "async-trait", @@ -1208,9 +1321,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6c608b66496a1e05e3d196131eb9bebea579eed1f59e88d962baf3dda853bc6" +checksum = "cefc2d77646e1aadd1d6a9c40088937aedec04e68c5f0465939912e1291f8193" dependencies = [ "datafusion-common", "datafusion-doc", @@ -1225,9 +1338,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da2f9d83348957b4ad0cd87b5cb9445f2651863a36592fe5484d43b49a5f8d82" +checksum = "dd4aff082c42fa6da99ce0698c85addd5252928c908eb087ca3cfa64ff16b313" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1235,20 +1348,20 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4800e1ff7ecf8f310887e9b54c9c444b8e215ccbc7b21c2f244cfae373b1ece7" +checksum = "df6f88d7ee27daf8b108ba910f9015176b36fbc72902b1ca5c2a5f1d1717e1a1" dependencies = [ "datafusion-expr", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "datafusion-optimizer" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "971c51c54cd309001376fae752fb15a6b41750b6d1552345c46afbfb6458801b" +checksum = "084d9f979c4b155346d3c34b18f4256e6904ded508e9554d90fed416415c3515" dependencies = [ "arrow", "chrono", @@ -1265,9 +1378,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1447c2c6bc8674a16be4786b4abf528c302803fafa186aa6275692570e64d85" +checksum = "64c536062b0076f4e30084065d805f389f9fe38af0ca75bcbac86bc5e9fbab65" dependencies = [ "ahash", "arrow", @@ -1287,9 +1400,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69f8c25dcd069073a75b3d2840a79d0f81e64bdd2c05f2d3d18939afb36a7dcb" +checksum = "f8a92b53b3193fac1916a1c5b8e3f4347c526f6822e56b71faa5fb372327a863" dependencies = [ "ahash", "arrow", @@ -1301,9 +1414,9 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68da5266b5b9847c11d1b3404ee96b1d423814e1973e1ad3789131e5ec912763" +checksum = "6fa0a5ac94c7cf3da97bedabd69d6bbca12aef84b9b37e6e9e8c25286511b5e2" dependencies = [ "arrow", "datafusion-common", @@ -1320,9 +1433,9 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88cc160df00e413e370b3b259c8ea7bfbebc134d32de16325950e9e923846b7f" +checksum = "690c615db468c2e5fe5085b232d8b1c088299a6c63d87fd960a354a71f7acb55" dependencies = [ "ahash", "arrow", @@ -1350,9 +1463,9 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f6ef4c6eb52370cb48639e25e2331a415aac0b2b0a0a472b36e26603bdf184f" +checksum = "a4a1afb2bdb05de7ff65be6883ebfd4ec027bd9f1f21c46aa3afd01927160a83" dependencies = [ "arrow", "chrono", @@ -1366,9 +1479,9 @@ dependencies = [ [[package]] name = "datafusion-proto-common" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5faf4a9bbb0d0a305fea8a6db21ba863286b53e53a212e687d2774028dd6f03f" +checksum = "35b7a5876ebd6b564fb9a1fd2c3a2a9686b787071a256b47e4708f0916f9e46f" dependencies = [ "arrow", "datafusion-common", @@ -1398,11 +1511,35 @@ dependencies = [ "uuid", ] +[[package]] +name = "datafusion-session" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad229a134c7406c057ece00c8743c0c34b97f4e72f78b475fe17b66c5e14fa4f" +dependencies = [ + "arrow", + "async-trait", + "dashmap", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-sql", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "tokio", +] + [[package]] name = "datafusion-sql" -version = "46.0.1" +version = 
"47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "325a212b67b677c0eb91447bf9a11b630f9fc4f62d8e5d145bf859f5a6b29e64" +checksum = "64f6ab28b72b664c21a27b22a2ff815fd390ed224c26e89a93b5a8154a4e8607" dependencies = [ "arrow", "bigdecimal", @@ -1417,9 +1554,9 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "46.0.1" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c2be3226a683e02cff65181e66e62eba9f812ed0e9b7ec8fe11ac8dabf1a73f" +checksum = "061efc0937f0ce3abb37ed0d56cfa01dd0e654b90e408656d05e846c8b7599fe" dependencies = [ "async-recursion", "async-trait", @@ -1453,7 +1590,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -1498,21 +1635,22 @@ checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" [[package]] name = "flatbuffers" -version = "24.12.23" +version = "25.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096" +checksum = "1045398c1bfd89168b5fd3f1fc11f6e70b34f6f66300c87d44d3de849463abf1" dependencies = [ - "bitflags 1.3.2", + "bitflags", "rustc_version", ] [[package]] name = "flate2" -version = "1.1.0" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11faaf5a5236997af9848be0bef4db95824b1d534ebc64d0f0c6cf3e67bd38dc" +checksum = "7ced92e76e966ca2fd84c8f7aa01a4aea65b0eb6648d72f7c8f3e2764a67fece" dependencies = [ "crc32fast", + "libz-rs-sys", "miniz_oxide", ] @@ -1593,7 +1731,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -1703,9 +1841,9 @@ dependencies = [ [[package]] name = "half" -version = "2.4.1" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" dependencies = [ "cfg-if", "crunchy", @@ -1986,7 +2124,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -2012,9 +2150,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.7.1" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" +checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" dependencies = [ "equivalent", "hashbrown 0.15.2", @@ -2207,6 +2345,15 @@ dependencies = [ "libc", ] +[[package]] +name = "libz-rs-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6489ca9bd760fe9642d7644e827b0c9add07df89857b0416ee15c1cc1a3b8c5a" +dependencies = [ + "zlib-rs", +] + [[package]] name = "linux-raw-sys" version = "0.4.15" @@ -2241,7 +2388,7 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5" dependencies = [ - "twox-hash", + "twox-hash 1.6.3", ] [[package]] @@ -2297,9 +2444,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "miniz_oxide" -version = "0.8.4" +version = 
"0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3b1c9bd4fe1f0f8b387f6eb9eb3b4a1aa26185e5750efb9140301703f62cd1b" +checksum = "3be647b768db090acb35d5ec5db2b0e1f1de11133ca123b9eacf5137868f892a" dependencies = [ "adler2", ] @@ -2407,19 +2554,22 @@ dependencies = [ [[package]] name = "object_store" -version = "0.11.2" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cfccb68961a56facde1163f9319e0d15743352344e7808a11795fb99698dcaf" +checksum = "e9ce831b09395f933addbc56d894d889e4b226eba304d4e7adbab591e26daf1e" dependencies = [ "async-trait", "base64 0.22.1", "bytes", "chrono", + "form_urlencoded", "futures", + "http", + "http-body-util", "httparse", "humantime", "hyper", - "itertools 0.13.0", + "itertools 0.14.0", "md-5", "parking_lot", "percent-encoding", @@ -2430,7 +2580,8 @@ dependencies = [ "rustls-pemfile", "serde", "serde_json", - "snafu", + "serde_urlencoded", + "thiserror 2.0.11", "tokio", "tracing", "url", @@ -2483,9 +2634,9 @@ dependencies = [ [[package]] name = "parquet" -version = "54.3.0" +version = "55.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94243778210509a5a5e9e012872127180c155d73a9cd6e2df9243d213e81e100" +checksum = "cd31a8290ac5b19f09ad77ee7a1e6a541f1be7674ad410547d5f1eef6eef4a9c" dependencies = [ "ahash", "arrow-array", @@ -2513,7 +2664,7 @@ dependencies = [ "snap", "thrift", "tokio", - "twox-hash", + "twox-hash 2.1.0", "zstd", ] @@ -2658,12 +2809,12 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.29" +version = "0.2.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6924ced06e1f7dfe3fa48d57b9f74f55d8915f5036121bef647ef4b204895fac" +checksum = "664ec5419c51e34154eec046ebcba56312d5a2fc3b09a06da188e1ad21afadf6" dependencies = [ "proc-macro2", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -2692,7 +2843,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" dependencies = [ "heck", - "itertools 0.14.0", + "itertools 0.13.0", "log", "multimap", "once_cell", @@ -2701,7 +2852,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.98", + "syn 2.0.100", "tempfile", ] @@ -2712,10 +2863,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", - "itertools 0.14.0", + "itertools 0.13.0", "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -2747,9 +2898,9 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.23.4" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57fe09249128b3173d092de9523eaa75136bf7ba85e0d69eca241c7939c933cc" +checksum = "17da310086b068fbdcefbba30aeb3721d5bb9af8db4987d6735b2183ca567229" dependencies = [ "cfg-if", "indoc", @@ -2765,9 +2916,9 @@ dependencies = [ [[package]] name = "pyo3-async-runtimes" -version = "0.23.0" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "977dc837525cfd22919ba6a831413854beb7c99a256c03bf8624ad707e45810e" +checksum = "dd0b83dc42f9d41f50d38180dad65f0c99763b65a3ff2a81bf351dd35a1df8bf" dependencies = [ "futures", "once_cell", @@ -2778,9 +2929,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.23.4" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"1cd3927b5a78757a0d71aa9dff669f903b1eb64b54142a9bd9f757f8fde65fd7" +checksum = "e27165889bd793000a098bb966adc4300c312497ea25cf7a690a9f0ac5aa5fc1" dependencies = [ "once_cell", "target-lexicon", @@ -2788,9 +2939,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.23.4" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dab6bb2102bd8f991e7749f130a70d05dd557613e39ed2deeee8e9ca0c4d548d" +checksum = "05280526e1dbf6b420062f3ef228b78c0c54ba94e157f5cb724a609d0f2faabc" dependencies = [ "libc", "pyo3-build-config", @@ -2798,27 +2949,27 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.23.4" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91871864b353fd5ffcb3f91f2f703a22a9797c91b9ab497b1acac7b07ae509c7" +checksum = "5c3ce5686aa4d3f63359a5100c62a127c9f15e8398e5fdeb5deef1fed5cd5f44" dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "pyo3-macros-backend" -version = "0.23.4" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43abc3b80bc20f3facd86cd3c60beed58c3e2aa26213f3cda368de39c60a27e4" +checksum = "f4cf6faa0cbfb0ed08e89beb8103ae9724eb4750e3a78084ba4017cbe94f3855" dependencies = [ "heck", "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -2891,9 +3042,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.38" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" dependencies = [ "proc-macro2", ] @@ -2945,7 +3096,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -2954,7 +3105,7 @@ version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" dependencies = [ - "bitflags 2.8.0", + "bitflags", ] [[package]] @@ -3104,7 +3255,7 @@ version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ - "bitflags 2.8.0", + "bitflags", "errno", "libc", "linux-raw-sys", @@ -3198,9 +3349,9 @@ dependencies = [ [[package]] name = "schemars" -version = "0.8.21" +version = "0.8.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09c024468a378b7e36765cd36702b7a90cc3cba11654f6685c8f233408e89e92" +checksum = "3fbf2ae1b8bc8e02df939598064d22402220cd5bbcca1c76f7d6a310974d5615" dependencies = [ "dyn-clone", "schemars_derive", @@ -3210,14 +3361,14 @@ dependencies = [ [[package]] name = "schemars_derive" -version = "0.8.21" +version = "0.8.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1eee588578aff73f856ab961cd2f79e36bc45d7ded33a7562adba4667aecc0e" +checksum = "32e265784ad618884abaea0600a9adf15393368d840e0222d101a072f3f7534d" dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3232,7 +3383,7 @@ version = "3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"271720403f46ca04f7ba6f55d438f8bd878d6b8ca0a1046e8228c4145bcbb316" dependencies = [ - "bitflags 2.8.0", + "bitflags", "core-foundation", "core-foundation-sys", "libc", @@ -3251,9 +3402,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.25" +version = "1.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f79dfe2d285b0488816f30e700a7438c5a73d816b5b7d3ac72fbc48b0d185e03" +checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" dependencies = [ "serde", ] @@ -3266,9 +3417,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.217" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" dependencies = [ "serde_derive", ] @@ -3284,13 +3435,13 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.217" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3301,14 +3452,14 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "serde_json" -version = "1.0.138" +version = "1.0.140" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" dependencies = [ "itoa", "memchr", @@ -3325,7 +3476,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3397,27 +3548,6 @@ version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fcf8323ef1faaee30a44a340193b1ac6814fd9b7b4e88e9d4519a3e4abe1cfd" -[[package]] -name = "snafu" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "223891c85e2a29c3fe8fb900c1fae5e69c2e42415e3177752e8718475efa5019" -dependencies = [ - "snafu-derive", -] - -[[package]] -name = "snafu-derive" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03c3c6b7927ffe7ecaa769ee0e3994da3b8cafc8f444578982c83ecb161af917" -dependencies = [ - "heck", - "proc-macro2", - "quote", - "syn 2.0.98", -] - [[package]] name = "snap" version = "1.1.1" @@ -3436,9 +3566,9 @@ dependencies = [ [[package]] name = "sqlparser" -version = "0.54.0" +version = "0.55.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c66e3b7374ad4a6af849b08b3e7a6eda0edbd82f0fd59b57e22671bf16979899" +checksum = "c4521174166bac1ff04fe16ef4524c70144cd29682a45978978ca3d7f4e0be11" dependencies = [ "log", "recursive", @@ -3453,7 +3583,7 @@ checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3497,14 +3627,14 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "substrait" -version = "0.53.2" +version = "0.55.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fac3d70185423235f37b889764e184b81a5af4bb7c95833396ee9bd92577e1b" +checksum = "048fe52a3664881ccdfdc9bdb0f4e8805f3444ee64abf299d365c54f6a2ffabb" dependencies = [ "heck", "pbjson", @@ -3521,7 +3651,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.98", + "syn 2.0.100", "typify", "walkdir", ] @@ -3545,9 +3675,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.98" +version = "2.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1" +checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" dependencies = [ "proc-macro2", "quote", @@ -3571,14 +3701,14 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "target-lexicon" -version = "0.12.16" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" +checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a" [[package]] name = "tempfile" @@ -3620,7 +3750,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3631,7 +3761,7 @@ checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3681,9 +3811,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.43.0" +version = "1.44.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e" +checksum = "e6b88822cbe49de4185e3a4cbf8321dd487cf5fe0c5c65695fef6346371e9c48" dependencies = [ "backtrace", "bytes", @@ -3703,7 +3833,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3718,9 +3848,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.13" +version = "0.7.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7fcaa8d55a2bdd6b83ace262b016eca0d79ee02818c5c1bcdf0305114081078" +checksum = "6b9590b93e6fcc1739458317cccd391ad3955e2bde8913edf6f95f9e65a8f034" dependencies = [ "bytes", "futures-core", @@ -3775,7 +3905,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3818,6 +3948,12 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "twox-hash" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7b17f197b3050ba473acf9181f7b1d3b66d1cf7356c6cc57886662276e65908" + [[package]] name = "typed-arena" version = "2.0.2" @@ -3841,7 +3977,7 @@ checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3875,7 +4011,7 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.98", + "syn 2.0.100", "thiserror 2.0.11", "unicode-ident", ] @@ -3893,7 +4029,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 
2.0.98", + "syn 2.0.100", "typify-impl", ] @@ -4030,7 +4166,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", "wasm-bindgen-shared", ] @@ -4065,7 +4201,7 @@ checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4276,7 +4412,7 @@ version = "0.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" dependencies = [ - "bitflags 2.8.0", + "bitflags", ] [[package]] @@ -4320,7 +4456,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", "synstructure", ] @@ -4342,7 +4478,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -4362,7 +4498,7 @@ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", "synstructure", ] @@ -4391,9 +4527,15 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] +[[package]] +name = "zlib-rs" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "868b928d7949e09af2f6086dfc1e01936064cc7a819253bce650d4e2a2d63ba8" + [[package]] name = "zstd" version = "0.13.2" diff --git a/Cargo.toml b/Cargo.toml index bc8639d4c..2c4188bb0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,25 +34,25 @@ protoc = [ "datafusion-substrait/protoc" ] substrait = ["dep:datafusion-substrait"] [dependencies] -tokio = { version = "1.43", features = ["macros", "rt", "rt-multi-thread", "sync"] } -pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py39"] } -pyo3-async-runtimes = { version = "0.23", features = ["tokio-runtime"]} -arrow = { version = "54.2.1", features = ["pyarrow"] } -datafusion = { version = "46.0.1", features = ["avro", "unicode_expressions"] } -datafusion-substrait = { version = "46.0.1", optional = true } -datafusion-proto = { version = "46.0.1" } -datafusion-ffi = { version = "46.0.1" } +tokio = { version = "1.44", features = ["macros", "rt", "rt-multi-thread", "sync"] } +pyo3 = { version = "0.24", features = ["extension-module", "abi3", "abi3-py39"] } +pyo3-async-runtimes = { version = "0.24", features = ["tokio-runtime"]} +arrow = { version = "55.0.0", features = ["pyarrow"] } +datafusion = { version = "47.0.0", features = ["avro", "unicode_expressions"] } +datafusion-substrait = { version = "47.0.0", optional = true } +datafusion-proto = { version = "47.0.0" } +datafusion-ffi = { version = "47.0.0" } prost = "0.13.1" # keep in line with `datafusion-substrait` -uuid = { version = "1.12", features = ["v4"] } +uuid = { version = "1.16", features = ["v4"] } mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] } -async-trait = "0.1.73" +async-trait = "0.1.88" futures = "0.3" -object_store = { version = "0.11.0", features = ["aws", "gcp", "azure", "http"] } +object_store = { version = "0.12.0", features = ["aws", "gcp", "azure", "http"] } url = "2" [build-dependencies] prost-types = "0.13.1" # keep in line with `datafusion-substrait` -pyo3-build-config = "0.23" +pyo3-build-config = 
"0.24" [lib] name = "datafusion_python" diff --git a/python/tests/test_aggregation.py b/python/tests/test_aggregation.py index 61b1c7d80..49dfb38cf 100644 --- a/python/tests/test_aggregation.py +++ b/python/tests/test_aggregation.py @@ -338,7 +338,6 @@ def test_bit_and_bool_fns(df, name, expr, result): ), [7, 9], ), - ("last_value", f.last_value(column("a")), [3, 6]), ( "last_value_ordered", f.last_value(column("a"), order_by=[column("a").sort(ascending=False)]), diff --git a/src/dataframe.rs b/src/dataframe.rs index 9b610b5d7..787f63520 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -216,7 +216,7 @@ impl PyDataFrame { #[pyo3(signature = (*args))] fn select(&self, args: Vec) -> PyDataFusionResult { - let expr = args.into_iter().map(|e| e.into()).collect(); + let expr: Vec = args.into_iter().map(|e| e.into()).collect(); let df = self.df.as_ref().clone().select(expr)?; Ok(Self::new(df)) } diff --git a/src/dataset_exec.rs b/src/dataset_exec.rs index 445e4fe74..aab8d7566 100644 --- a/src/dataset_exec.rs +++ b/src/dataset_exec.rs @@ -275,7 +275,9 @@ impl DisplayAs for DatasetExec { Python::with_gil(|py| { let number_of_fragments = self.fragments.bind(py).len(); match t { - DisplayFormatType::Default | DisplayFormatType::Verbose => { + DisplayFormatType::Default + | DisplayFormatType::Verbose + | DisplayFormatType::TreeRender => { let projected_columns: Vec = self .schema .fields() diff --git a/src/expr.rs b/src/expr.rs index 561170289..fe0e76daa 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -714,9 +714,19 @@ impl PyExpr { | Operator::BitwiseXor | Operator::BitwiseAnd | Operator::BitwiseOr => DataTypeMap::map_from_arrow_type(&DataType::Binary), - Operator::AtArrow | Operator::ArrowAt => { - Err(py_type_err(format!("Unsupported expr: ${op}"))) - } + Operator::AtArrow + | Operator::ArrowAt + | Operator::Arrow + | Operator::LongArrow + | Operator::HashArrow + | Operator::HashLongArrow + | Operator::AtAt + | Operator::IntegerDivide + | Operator::HashMinus + | Operator::AtQuestion + | Operator::Question + | Operator::QuestionAnd + | Operator::QuestionPipe => Err(py_type_err(format!("Unsupported expr: ${op}"))), }, Expr::Cast(Cast { expr: _, data_type }) => DataTypeMap::map_from_arrow_type(data_type), Expr::Literal(scalar_value) => DataTypeMap::map_from_scalar_value(scalar_value), diff --git a/src/functions.rs b/src/functions.rs index 9c406b95a..476c2b80e 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -375,27 +375,6 @@ macro_rules! aggregate_function { }; } -macro_rules! aggregate_function_vec_args { - ($NAME: ident) => { - aggregate_function_vec_args!($NAME, expr); - }; - ($NAME: ident, $($arg:ident)*) => { - #[pyfunction] - #[pyo3(signature = ($($arg),*, distinct=None, filter=None, order_by=None, null_treatment=None))] - fn $NAME( - $($arg: PyExpr),*, - distinct: Option, - filter: Option, - order_by: Option>, - null_treatment: Option - ) -> PyDataFusionResult { - let agg_fn = functions_aggregate::expr_fn::$NAME(vec![$($arg.into()),*]); - - add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment) - } - }; -} - /// Generates a [pyo3] wrapper for [datafusion::functions::expr_fn] /// /// These functions have explicit named arguments. 
@@ -698,8 +677,22 @@ pub fn approx_percentile_cont_with_weight(
     add_builder_fns_to_aggregate(agg_fn, None, filter, None, None)
 }
 
-aggregate_function_vec_args!(last_value);
+// We handle last_value explicitly because the signature expects an order_by
+// https://github.com/apache/datafusion/issues/12376
+#[pyfunction]
+#[pyo3(signature = (expr, distinct=None, filter=None, order_by=None, null_treatment=None))]
+pub fn last_value(
+    expr: PyExpr,
+    distinct: Option<bool>,
+    filter: Option<PyExpr>,
+    order_by: Option<Vec<PySortExpr>>,
+    null_treatment: Option<NullTreatment>,
+) -> PyDataFusionResult<PyExpr> {
+    // If we initialize the UDAF with order_by directly, then it gets over-written by the builder
+    let agg_fn = functions_aggregate::expr_fn::last_value(expr.expr, None);
+
+    add_builder_fns_to_aggregate(agg_fn, distinct, filter, order_by, null_treatment)
+}
 
 // We handle first_value explicitly because the signature expects an order_by
 // https://github.com/apache/datafusion/issues/12376
 #[pyfunction]

From c9f15547cb8019068bbf2dc8eaf148d6eb42bd48 Mon Sep 17 00:00:00 2001
From: Chen Chongchen
Date: Fri, 25 Apr 2025 21:01:54 +0800
Subject: [PATCH 031/145] feat: alias with metadata (#1111)

* feat: alias with metadata

* fmt
---
 python/datafusion/expr.py      | 14 +++++++++++---
 python/datafusion/functions.py | 15 ++++++++++++---
 python/tests/test_expr.py      |  5 +++++
 python/tests/test_functions.py |  5 +++++
 src/expr.rs                    |  6 ++++--
 src/functions.rs               |  9 +++++++--
 6 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py
index 2697d8143..01e1f3ded 100644
--- a/python/datafusion/expr.py
+++ b/python/datafusion/expr.py
@@ -406,9 +406,17 @@ def column(value: str) -> Expr:
         """Creates a new expression representing a column."""
         return Expr(expr_internal.RawExpr.column(value))
 
-    def alias(self, name: str) -> Expr:
-        """Assign a name to the expression."""
-        return Expr(self.expr.alias(name))
+    def alias(self, name: str, metadata: Optional[dict[str, str]] = None) -> Expr:
+        """Assign a name to the expression.
+
+        Args:
+            name: The name to assign to the expression.
+            metadata: Optional metadata to attach to the expression.
+
+        Returns:
+            A new expression with the assigned name.
+        """
+        return Expr(self.expr.alias(name, metadata))
 
     def sort(self, ascending: bool = True, nulls_first: bool = True) -> SortExpr:
         """Creates a sort :py:class:`Expr` from an existing :py:class:`Expr`.
diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py
index 5cf914e16..f430cdf4b 100644
--- a/python/datafusion/functions.py
+++ b/python/datafusion/functions.py
@@ -372,9 +372,18 @@ def order_by(expr: Expr, ascending: bool = True, nulls_first: bool = True) -> So
     return SortExpr(expr, ascending=ascending, nulls_first=nulls_first)
 
 
-def alias(expr: Expr, name: str) -> Expr:
-    """Creates an alias expression."""
-    return Expr(f.alias(expr.expr, name))
+def alias(expr: Expr, name: str, metadata: Optional[dict[str, str]] = None) -> Expr:
+    """Creates an alias expression with an optional metadata dictionary.
+
+    Args:
+        expr: The expression to alias
+        name: The alias name
+        metadata: Optional metadata to attach to the column
+
+    Returns:
+        An expression with the given alias
+    """
+    return Expr(f.alias(expr.expr, name, metadata))
 
 
 def col(name: str) -> Expr:
diff --git a/python/tests/test_expr.py b/python/tests/test_expr.py
index 926e69845..dcf75f021 100644
--- a/python/tests/test_expr.py
+++ b/python/tests/test_expr.py
@@ -247,3 +247,8 @@ def test_fill_null(df):
     assert result.column(0) == pa.array([1, 2, 100])
     assert result.column(1) == pa.array([4, 25, 6])
     assert result.column(2) == pa.array([1234, 1234, 8])
+
+
+def test_alias_with_metadata(df):
+    df = df.select(col("a").alias("b", {"key": "value"}))
+    assert df.schema().field("b").metadata == {b"key": b"value"}
diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py
index 37f2075f5..90cf01f7e 100644
--- a/python/tests/test_functions.py
+++ b/python/tests/test_functions.py
@@ -1231,3 +1231,8 @@ def test_between_default(df):
     actual = df.collect()[0].to_pydict()
 
     assert actual == expected
+
+
+def test_alias_with_metadata(df):
+    df = df.select(f.alias(f.col("a"), "b", {"key": "value"}))
+    assert df.schema().field("b").metadata == {b"key": b"value"}
diff --git a/src/expr.rs b/src/expr.rs
index fe0e76daa..7d4aa8798 100644
--- a/src/expr.rs
+++ b/src/expr.rs
@@ -22,6 +22,7 @@ use datafusion::logical_expr::{
 };
 use pyo3::IntoPyObjectExt;
 use pyo3::{basic::CompareOp, prelude::*};
+use std::collections::HashMap;
 use std::convert::{From, Into};
 use std::sync::Arc;
 use window::PyWindowFrame;
@@ -275,8 +276,9 @@ impl PyExpr {
     }
 
     /// assign a name to the PyExpr
-    pub fn alias(&self, name: &str) -> PyExpr {
-        self.expr.clone().alias(name).into()
+    #[pyo3(signature = (name, metadata=None))]
+    pub fn alias(&self, name: &str, metadata: Option<HashMap<String, String>>) -> PyExpr {
+        self.expr.clone().alias_with_metadata(name, metadata).into()
     }
 
     /// Create a sort PyExpr from an existing PyExpr.
diff --git a/src/functions.rs b/src/functions.rs
index 476c2b80e..caa79b8ad 100644
--- a/src/functions.rs
+++ b/src/functions.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::collections::HashMap;
+
 use datafusion::functions_aggregate::all_default_aggregate_functions;
 use datafusion::functions_window::all_default_window_functions;
 use datafusion::logical_expr::expr::WindowFunctionParams;
@@ -205,10 +207,13 @@ fn order_by(expr: PyExpr, asc: bool, nulls_first: bool) -> PyResult<PySortExpr>
 
 /// Creates a new Alias Expr
 #[pyfunction]
-fn alias(expr: PyExpr, name: &str) -> PyResult<PyExpr> {
+#[pyo3(signature = (expr, name, metadata=None))]
+fn alias(expr: PyExpr, name: &str, metadata: Option<HashMap<String, String>>) -> PyResult<PyExpr> {
     let relation: Option<String> = None;
     Ok(PyExpr {
-        expr: datafusion::logical_expr::Expr::Alias(Alias::new(expr.expr, relation, name)),
+        expr: datafusion::logical_expr::Expr::Alias(
+            Alias::new(expr.expr, relation, name).with_metadata(metadata),
+        ),
     })
 }
 

From 91b66351fb19d91b62e8db83444141743b106e43 Mon Sep 17 00:00:00 2001
From: kosiew
Date: Sun, 27 Apr 2025 21:41:01 +0800
Subject: [PATCH 032/145] Add DataFrame usage guide with HTML rendering customization options (#1108)

* docs: enhance user guide with detailed DataFrame operations and examples

* move /docs/source/api/dataframe.rst into user-guide

* docs: remove DataFrame API documentation

* docs: fix formatting inconsistencies in DataFrame user guide

* Two minor corrections to documentation rendering

---------

Co-authored-by: Tim Saucer
---
 docs/source/index.rst                |   1 +
 docs/source/user-guide/basics.rst    |   5 +-
 docs/source/user-guide/dataframe.rst | 179 +++++++++++++++++++++++++++
 3 files changed, 184 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/user-guide/dataframe.rst

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 558b2d572..c18793822 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -72,6 +72,7 @@ Example
    user-guide/introduction
    user-guide/basics
    user-guide/data-sources
+   user-guide/dataframe
    user-guide/common-operations/index
    user-guide/io/index
    user-guide/configuration
diff --git a/docs/source/user-guide/basics.rst b/docs/source/user-guide/basics.rst
index 6636c0c6a..2975d9a6b 100644
--- a/docs/source/user-guide/basics.rst
+++ b/docs/source/user-guide/basics.rst
@@ -21,7 +21,8 @@ Concepts
 ========
 
 In this section, we will cover a basic example to introduce a few key concepts. We will use the
-2021 Yellow Taxi Trip Records ([download](https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet)), from the [TLC Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page).
+2021 Yellow Taxi Trip Records (`download <https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet>`_),
+from the `TLC Trip Record Data <https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page>`_.
 
 .. ipython:: python
 
@@ -72,6 +73,8 @@ DataFrames are typically created by calling a method on :py:class:`~datafusion.c
 calling the transformation methods, such as :py:func:`~datafusion.dataframe.DataFrame.filter`,
 :py:func:`~datafusion.dataframe.DataFrame.select`, :py:func:`~datafusion.dataframe.DataFrame.aggregate`,
 and :py:func:`~datafusion.dataframe.DataFrame.limit` to build up a query definition.
 
+For more details on working with DataFrames, including visualization options and conversion to other formats, see :doc:`dataframe`.
+
 Expressions
 -----------
 
diff --git a/docs/source/user-guide/dataframe.rst b/docs/source/user-guide/dataframe.rst
new file mode 100644
index 000000000..a78fd8073
--- /dev/null
+++ b/docs/source/user-guide/dataframe.rst
@@ -0,0 +1,179 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. 
The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +DataFrames +========== + +Overview +-------- + +DataFusion's DataFrame API provides a powerful interface for building and executing queries against data sources. +It offers a familiar API similar to pandas and other DataFrame libraries, but with the performance benefits of Rust +and Arrow. + +A DataFrame represents a logical plan that can be composed through operations like filtering, projection, and aggregation. +The actual execution happens when terminal operations like ``collect()`` or ``show()`` are called. + +Basic Usage +----------- + +.. code-block:: python + + import datafusion + from datafusion import col, lit + + # Create a context and register a data source + ctx = datafusion.SessionContext() + ctx.register_csv("my_table", "path/to/data.csv") + + # Create and manipulate a DataFrame + df = ctx.sql("SELECT * FROM my_table") + + # Or use the DataFrame API directly + df = (ctx.table("my_table") + .filter(col("age") > lit(25)) + .select([col("name"), col("age")])) + + # Execute and collect results + result = df.collect() + + # Display the first few rows + df.show() + +HTML Rendering +-------------- + +When working in Jupyter notebooks or other environments that support HTML rendering, DataFrames will +automatically display as formatted HTML tables, making it easier to visualize your data. + +The ``_repr_html_`` method is called automatically by Jupyter to render a DataFrame. This method +controls how DataFrames appear in notebook environments, providing a richer visualization than +plain text output. + +Customizing HTML Rendering +-------------------------- + +You can customize how DataFrames are rendered in HTML by configuring the formatter: + +.. code-block:: python + + from datafusion.html_formatter import configure_formatter + + # Change the default styling + configure_formatter( + max_rows=50, # Maximum number of rows to display + max_width=None, # Maximum width in pixels (None for auto) + theme="light", # Theme: "light" or "dark" + precision=2, # Floating point precision + thousands_separator=",", # Separator for thousands + date_format="%Y-%m-%d", # Date format + truncate_width=20 # Max width for string columns before truncating + ) + +The formatter settings affect all DataFrames displayed after configuration. + +Custom Style Providers +---------------------- + +For advanced styling needs, you can create a custom style provider: + +.. 
code-block:: python + + from datafusion.html_formatter import StyleProvider, configure_formatter + + class MyStyleProvider(StyleProvider): + def get_table_styles(self): + return { + "table": "border-collapse: collapse; width: 100%;", + "th": "background-color: #007bff; color: white; padding: 8px; text-align: left;", + "td": "border: 1px solid #ddd; padding: 8px;", + "tr:nth-child(even)": "background-color: #f2f2f2;", + } + + def get_value_styles(self, dtype, value): + """Return custom styles for specific values""" + if dtype == "float" and value < 0: + return "color: red;" + return None + + # Apply the custom style provider + configure_formatter(style_provider=MyStyleProvider()) + +Creating a Custom Formatter +--------------------------- + +For complete control over rendering, you can implement a custom formatter: + +.. code-block:: python + + from datafusion.html_formatter import Formatter, get_formatter + + class MyFormatter(Formatter): + def format_html(self, batches, schema, has_more=False, table_uuid=None): + # Create your custom HTML here + html = "
" + # ... formatting logic ... + html += "
" + return html + + # Set as the global formatter + configure_formatter(formatter_class=MyFormatter) + + # Or use the formatter just for specific operations + formatter = get_formatter() + custom_html = formatter.format_html(batches, schema) + +Managing Formatters +------------------- + +Reset to default formatting: + +.. code-block:: python + + from datafusion.html_formatter import reset_formatter + + # Reset to default settings + reset_formatter() + +Get the current formatter settings: + +.. code-block:: python + + from datafusion.html_formatter import get_formatter + + formatter = get_formatter() + print(formatter.max_rows) + print(formatter.theme) + +Contextual Formatting +--------------------- + +You can also use a context manager to temporarily change formatting settings: + +.. code-block:: python + + from datafusion.html_formatter import formatting_context + + # Default formatting + df.show() + + # Temporarily use different formatting + with formatting_context(max_rows=100, theme="dark"): + df.show() # Will use the temporary settings + + # Back to default formatting + df.show() From 00dea113eb85d54b758eb3451ea448c7b9263c1c Mon Sep 17 00:00:00 2001 From: deanm0000 <37878412+deanm0000@users.noreply.github.com> Date: Sun, 27 Apr 2025 10:14:54 -0400 Subject: [PATCH 033/145] Improve col class access using __getattr__ Co-authored-by: Tim Saucer --- python/datafusion/__init__.py | 12 ++-------- python/datafusion/col.py | 45 +++++++++++++++++++++++++++++++++++ python/tests/test_expr.py | 23 ++++++++++++++++++ 3 files changed, 70 insertions(+), 10 deletions(-) create mode 100644 python/datafusion/col.py diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 60d0d61b4..15ceefbdb 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -26,6 +26,8 @@ except ImportError: import importlib_metadata +from datafusion.col import col, column + from . import functions, object_store, substrait, unparser # The following imports are okay to remain as opaque to the user. @@ -95,16 +97,6 @@ ] -def column(value: str) -> Expr: - """Create a column expression.""" - return Expr.column(value) - - -def col(value: str) -> Expr: - """Create a column expression.""" - return Expr.column(value) - - def literal(value) -> Expr: """Create a literal expression.""" return Expr.literal(value) diff --git a/python/datafusion/col.py b/python/datafusion/col.py new file mode 100644 index 000000000..1141dc092 --- /dev/null +++ b/python/datafusion/col.py @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Col class.""" + +from datafusion.expr import Expr + + +class Col: + """Create a column expression. + + This helper class allows an extra syntax of creating columns using the __getattr__ + method. 
+ """ + + def __call__(self, value: str) -> Expr: + """Create a column expression.""" + return Expr.column(value) + + def __getattr__(self, value: str) -> Expr: + """Create a column using attribute syntax.""" + # For autocomplete to work with IPython + if value.startswith("__wrapped__"): + return getattr(type(self), value) + + return Expr.column(value) + + +col: Col = Col() +column: Col = Col() +__all__ = ["col", "column"] diff --git a/python/tests/test_expr.py b/python/tests/test_expr.py index dcf75f021..3651b60d6 100644 --- a/python/tests/test_expr.py +++ b/python/tests/test_expr.py @@ -249,6 +249,29 @@ def test_fill_null(df): assert result.column(2) == pa.array([1234, 1234, 8]) +def test_col_getattr(): + ctx = SessionContext() + data = { + "array_values": [[1, 2, 3], [4, 5], [6], []], + "struct_values": [ + {"name": "Alice", "age": 15}, + {"name": "Bob", "age": 14}, + {"name": "Charlie", "age": 13}, + {"name": None, "age": 12}, + ], + } + df = ctx.from_pydict(data, name="table1") + + names = df.select(col.struct_values["name"].alias("name")).collect() + names = [r.as_py() for rs in names for r in rs["name"]] + + array_values = df.select(col.array_values[1].alias("value")).collect() + array_values = [r.as_py() for rs in array_values for r in rs["value"]] + + assert names == ["Alice", "Bob", "Charlie", None] + assert array_values == [2, 5, None, None] + + def test_alias_with_metadata(df): df = df.select(col("a").alias("b", {"key": "value"})) assert df.schema().field("b").metadata == {b"key": b"value"} From 5a7f638286d2397bbce87e0e8197bebb46f26649 Mon Sep 17 00:00:00 2001 From: deanm0000 <37878412+deanm0000@users.noreply.github.com> Date: Sun, 27 Apr 2025 10:17:41 -0400 Subject: [PATCH 034/145] Add expression chaining of single parameter scalar functions --- python/datafusion/expr.py | 289 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 289 insertions(+) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 01e1f3ded..84e9d4ebb 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -24,6 +24,7 @@ from typing import TYPE_CHECKING, Any, ClassVar, Optional +import functions as F import pyarrow as pa try: @@ -611,6 +612,294 @@ def over(self, window: Window) -> Expr: ) ) + def asin(self) -> Expr: + """Returns the arc sine or inverse sine of a number.""" + return F.asin(self) + + def array_pop_back(self) -> Expr: + """Returns the array without the last element.""" + return F.array_pop_back(self) + + def reverse(self) -> Expr: + """Reverse the string argument.""" + return F.reverse(self) + + def bit_length(self) -> Expr: + """Returns the number of bits in the string argument.""" + return F.bit_length(self) + + def array_length(self) -> Expr: + """Returns the length of the array.""" + return F.array_length(self) + + def array_ndims(self) -> Expr: + """Returns the number of dimensions of the array.""" + return F.array_ndims(self) + + def to_hex(self) -> Expr: + """Converts an integer to a hexadecimal string.""" + return F.to_hex(self) + + def array_dims(self) -> Expr: + """Returns an array of the array's dimensions.""" + return F.array_dims(self) + + def from_unixtime(self) -> Expr: + """Converts an integer to RFC3339 timestamp format string.""" + return F.from_unixtime(self) + + def array_empty(self) -> Expr: + """Returns a boolean indicating whether the array is empty.""" + return F.array_empty(self) + + def sin(self) -> Expr: + """Returns the sine of the argument.""" + return F.sin(self) + + def log10(self) -> Expr: + """Base 10 logarithm of the 
argument.""" + return F.log10(self) + + def initcap(self) -> Expr: + """Set the initial letter of each word to capital. + + Converts the first letter of each word in ``string`` to uppercase and the remaining + characters to lowercase. + """ + return F.initcap(self) + + def list_distinct(self) -> Expr: + """Returns distinct values from the array after removing duplicates. + + This is an alias for :py:func:`array_distinct`. + """ + return F.list_distinct(self) + + def iszero(self) -> Expr: + """Returns true if a given number is +0.0 or -0.0 otherwise returns false.""" + return F.iszero(self) + + def array_distinct(self) -> Expr: + """Returns distinct values from the array after removing duplicates.""" + return F.array_distinct(self) + + def arrow_typeof(self) -> Expr: + """Returns the Arrow type of the expression.""" + return F.arrow_typeof(self) + + def length(self) -> Expr: + """The number of characters in the ``string``.""" + return F.length(self) + + def lower(self) -> Expr: + """Converts a string to lowercase.""" + return F.lower(self) + + def acos(self) -> Expr: + """Returns the arc cosine or inverse cosine of a number. + + Returns: + -------- + Expr + A new expression representing the arc cosine of the input expression. + """ + return F.acos(self) + + def ascii(self) -> Expr: + """Returns the numeric code of the first character of the argument.""" + return F.ascii(self) + + def sha384(self) -> Expr: + """Computes the SHA-384 hash of a binary string.""" + return F.sha384(self) + + def isnan(self) -> Expr: + """Returns true if a given number is +NaN or -NaN otherwise returns false.""" + return F.isnan(self) + + def degrees(self) -> Expr: + """Converts the argument from radians to degrees.""" + return F.degrees(self) + + def cardinality(self) -> Expr: + """Returns the total number of elements in the array.""" + return F.cardinality(self) + + def sha224(self) -> Expr: + """Computes the SHA-224 hash of a binary string.""" + return F.sha224(self) + + def asinh(self) -> Expr: + """Returns inverse hyperbolic sine.""" + return F.asinh(self) + + def flatten(self) -> Expr: + """Flattens an array of arrays into a single array.""" + return F.flatten(self) + + def exp(self) -> Expr: + """Returns the exponential of the argument.""" + return F.exp(self) + + def abs(self) -> Expr: + """Return the absolute value of a given number. + + Returns: + -------- + Expr + A new expression representing the absolute value of the input expression. 
+ """ + return F.abs(self) + + def btrim(self) -> Expr: + """Removes all characters, spaces by default, from both sides of a string.""" + return F.btrim(self) + + def md5(self) -> Expr: + """Computes an MD5 128-bit checksum for a string expression.""" + return F.md5(self) + + def octet_length(self) -> Expr: + """Returns the number of bytes of a string.""" + return F.octet_length(self) + + def cosh(self) -> Expr: + """Returns the hyperbolic cosine of the argument.""" + return F.cosh(self) + + def radians(self) -> Expr: + """Converts the argument from degrees to radians.""" + return F.radians(self) + + def sqrt(self) -> Expr: + """Returns the square root of the argument.""" + return F.sqrt(self) + + def character_length(self) -> Expr: + """Returns the number of characters in the argument.""" + return F.character_length(self) + + def tanh(self) -> Expr: + """Returns the hyperbolic tangent of the argument.""" + return F.tanh(self) + + def atan(self) -> Expr: + """Returns inverse tangent of a number.""" + return F.atan(self) + + def rtrim(self) -> Expr: + """Removes all characters, spaces by default, from the end of a string.""" + return F.rtrim(self) + + def atanh(self) -> Expr: + """Returns inverse hyperbolic tangent.""" + return F.atanh(self) + + def list_dims(self) -> Expr: + """Returns an array of the array's dimensions. + + This is an alias for :py:func:`array_dims`. + """ + return F.list_dims(self) + + def sha256(self) -> Expr: + """Computes the SHA-256 hash of a binary string.""" + return F.sha256(self) + + def factorial(self) -> Expr: + """Returns the factorial of the argument.""" + return F.factorial(self) + + def acosh(self) -> Expr: + """Returns inverse hyperbolic cosine.""" + return F.acosh(self) + + def floor(self) -> Expr: + """Returns the nearest integer less than or equal to the argument.""" + return F.floor(self) + + def ceil(self) -> Expr: + """Returns the nearest integer greater than or equal to argument.""" + return F.ceil(self) + + def list_length(self) -> Expr: + """Returns the length of the array. + + This is an alias for :py:func:`array_length`. + """ + return F.list_length(self) + + def upper(self) -> Expr: + """Converts a string to uppercase.""" + return F.upper(self) + + def chr(self) -> Expr: + """Converts the Unicode code point to a UTF8 character.""" + return F.chr(self) + + def ln(self) -> Expr: + """Returns the natural logarithm (base e) of the argument.""" + return F.ln(self) + + def tan(self) -> Expr: + """Returns the tangent of the argument.""" + return F.tan(self) + + def array_pop_front(self) -> Expr: + """Returns the array without the first element.""" + return F.array_pop_front(self) + + def cbrt(self) -> Expr: + """Returns the cube root of a number.""" + return F.cbrt(self) + + def sha512(self) -> Expr: + """Computes the SHA-512 hash of a binary string.""" + return F.sha512(self) + + def char_length(self) -> Expr: + """The number of characters in the ``string``.""" + return F.char_length(self) + + def list_ndims(self) -> Expr: + """Returns the number of dimensions of the array. + + This is an alias for :py:func:`array_ndims`. 
+ """ + return F.list_ndims(self) + + def trim(self) -> Expr: + """Removes all characters, spaces by default, from both sides of a string.""" + return F.trim(self) + + def cos(self) -> Expr: + """Returns the cosine of the argument.""" + return F.cos(self) + + def sinh(self) -> Expr: + """Returns the hyperbolic sine of the argument.""" + return F.sinh(self) + + def empty(self) -> Expr: + """This is an alias for :py:func:`array_empty`.""" + return F.empty(self) + + def ltrim(self) -> Expr: + """Removes all characters, spaces by default, from the beginning of a string.""" + return F.ltrim(self) + + def signum(self) -> Expr: + """Returns the sign of the argument (-1, 0, +1).""" + return F.signum(self) + + def log2(self) -> Expr: + """Base 2 logarithm of the argument.""" + return F.log2(self) + + def cot(self) -> Expr: + """Returns the cotangent of the argument.""" + return F.cot(self) + class ExprFuncBuilder: def __init__(self, builder: expr_internal.ExprFuncBuilder) -> None: From 10600fb8fc32eba43b0b0f198325b55c63f8223d Mon Sep 17 00:00:00 2001 From: Chen Chongchen Date: Mon, 28 Apr 2025 21:25:59 +0800 Subject: [PATCH 035/145] fix: recursive import (#1117) * fix: recursive import * format * format --- python/datafusion/expr.py | 135 +++++++++++++++++++++++++++++++++++++- 1 file changed, 132 insertions(+), 3 deletions(-) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 84e9d4ebb..3750eeb3f 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -24,7 +24,6 @@ from typing import TYPE_CHECKING, Any, ClassVar, Optional -import functions as F import pyarrow as pa try: @@ -614,58 +613,84 @@ def over(self, window: Window) -> Expr: def asin(self) -> Expr: """Returns the arc sine or inverse sine of a number.""" + from . import functions as F + return F.asin(self) def array_pop_back(self) -> Expr: """Returns the array without the last element.""" + from . import functions as F + return F.array_pop_back(self) def reverse(self) -> Expr: """Reverse the string argument.""" + from . import functions as F + return F.reverse(self) def bit_length(self) -> Expr: """Returns the number of bits in the string argument.""" + from . import functions as F + return F.bit_length(self) def array_length(self) -> Expr: """Returns the length of the array.""" + from . import functions as F + return F.array_length(self) def array_ndims(self) -> Expr: """Returns the number of dimensions of the array.""" + from . import functions as F + return F.array_ndims(self) def to_hex(self) -> Expr: """Converts an integer to a hexadecimal string.""" + from . import functions as F + return F.to_hex(self) def array_dims(self) -> Expr: """Returns an array of the array's dimensions.""" + from . import functions as F + return F.array_dims(self) def from_unixtime(self) -> Expr: """Converts an integer to RFC3339 timestamp format string.""" + from . import functions as F + return F.from_unixtime(self) def array_empty(self) -> Expr: """Returns a boolean indicating whether the array is empty.""" + from . import functions as F + return F.array_empty(self) def sin(self) -> Expr: """Returns the sine of the argument.""" + from . import functions as F + return F.sin(self) def log10(self) -> Expr: """Base 10 logarithm of the argument.""" + from . import functions as F + return F.log10(self) def initcap(self) -> Expr: """Set the initial letter of each word to capital. - Converts the first letter of each word in ``string`` to uppercase and the remaining - characters to lowercase. 
+ Converts the first letter of each word in ``string`` + to uppercase and the remaining characters to lowercase. """ + from . import functions as F + return F.initcap(self) def list_distinct(self) -> Expr: @@ -673,26 +698,38 @@ def list_distinct(self) -> Expr: This is an alias for :py:func:`array_distinct`. """ + from . import functions as F + return F.list_distinct(self) def iszero(self) -> Expr: """Returns true if a given number is +0.0 or -0.0 otherwise returns false.""" + from . import functions as F + return F.iszero(self) def array_distinct(self) -> Expr: """Returns distinct values from the array after removing duplicates.""" + from . import functions as F + return F.array_distinct(self) def arrow_typeof(self) -> Expr: """Returns the Arrow type of the expression.""" + from . import functions as F + return F.arrow_typeof(self) def length(self) -> Expr: """The number of characters in the ``string``.""" + from . import functions as F + return F.length(self) def lower(self) -> Expr: """Converts a string to lowercase.""" + from . import functions as F + return F.lower(self) def acos(self) -> Expr: @@ -703,42 +740,62 @@ def acos(self) -> Expr: Expr A new expression representing the arc cosine of the input expression. """ + from . import functions as F + return F.acos(self) def ascii(self) -> Expr: """Returns the numeric code of the first character of the argument.""" + from . import functions as F + return F.ascii(self) def sha384(self) -> Expr: """Computes the SHA-384 hash of a binary string.""" + from . import functions as F + return F.sha384(self) def isnan(self) -> Expr: """Returns true if a given number is +NaN or -NaN otherwise returns false.""" + from . import functions as F + return F.isnan(self) def degrees(self) -> Expr: """Converts the argument from radians to degrees.""" + from . import functions as F + return F.degrees(self) def cardinality(self) -> Expr: """Returns the total number of elements in the array.""" + from . import functions as F + return F.cardinality(self) def sha224(self) -> Expr: """Computes the SHA-224 hash of a binary string.""" + from . import functions as F + return F.sha224(self) def asinh(self) -> Expr: """Returns inverse hyperbolic sine.""" + from . import functions as F + return F.asinh(self) def flatten(self) -> Expr: """Flattens an array of arrays into a single array.""" + from . import functions as F + return F.flatten(self) def exp(self) -> Expr: """Returns the exponential of the argument.""" + from . import functions as F + return F.exp(self) def abs(self) -> Expr: @@ -749,50 +806,74 @@ def abs(self) -> Expr: Expr A new expression representing the absolute value of the input expression. """ + from . import functions as F + return F.abs(self) def btrim(self) -> Expr: """Removes all characters, spaces by default, from both sides of a string.""" + from . import functions as F + return F.btrim(self) def md5(self) -> Expr: """Computes an MD5 128-bit checksum for a string expression.""" + from . import functions as F + return F.md5(self) def octet_length(self) -> Expr: """Returns the number of bytes of a string.""" + from . import functions as F + return F.octet_length(self) def cosh(self) -> Expr: """Returns the hyperbolic cosine of the argument.""" + from . import functions as F + return F.cosh(self) def radians(self) -> Expr: """Converts the argument from degrees to radians.""" + from . import functions as F + return F.radians(self) def sqrt(self) -> Expr: """Returns the square root of the argument.""" + from . 
import functions as F + return F.sqrt(self) def character_length(self) -> Expr: """Returns the number of characters in the argument.""" + from . import functions as F + return F.character_length(self) def tanh(self) -> Expr: """Returns the hyperbolic tangent of the argument.""" + from . import functions as F + return F.tanh(self) def atan(self) -> Expr: """Returns inverse tangent of a number.""" + from . import functions as F + return F.atan(self) def rtrim(self) -> Expr: """Removes all characters, spaces by default, from the end of a string.""" + from . import functions as F + return F.rtrim(self) def atanh(self) -> Expr: """Returns inverse hyperbolic tangent.""" + from . import functions as F + return F.atanh(self) def list_dims(self) -> Expr: @@ -800,26 +881,38 @@ def list_dims(self) -> Expr: This is an alias for :py:func:`array_dims`. """ + from . import functions as F + return F.list_dims(self) def sha256(self) -> Expr: """Computes the SHA-256 hash of a binary string.""" + from . import functions as F + return F.sha256(self) def factorial(self) -> Expr: """Returns the factorial of the argument.""" + from . import functions as F + return F.factorial(self) def acosh(self) -> Expr: """Returns inverse hyperbolic cosine.""" + from . import functions as F + return F.acosh(self) def floor(self) -> Expr: """Returns the nearest integer less than or equal to the argument.""" + from . import functions as F + return F.floor(self) def ceil(self) -> Expr: """Returns the nearest integer greater than or equal to argument.""" + from . import functions as F + return F.ceil(self) def list_length(self) -> Expr: @@ -827,38 +920,56 @@ def list_length(self) -> Expr: This is an alias for :py:func:`array_length`. """ + from . import functions as F + return F.list_length(self) def upper(self) -> Expr: """Converts a string to uppercase.""" + from . import functions as F + return F.upper(self) def chr(self) -> Expr: """Converts the Unicode code point to a UTF8 character.""" + from . import functions as F + return F.chr(self) def ln(self) -> Expr: """Returns the natural logarithm (base e) of the argument.""" + from . import functions as F + return F.ln(self) def tan(self) -> Expr: """Returns the tangent of the argument.""" + from . import functions as F + return F.tan(self) def array_pop_front(self) -> Expr: """Returns the array without the first element.""" + from . import functions as F + return F.array_pop_front(self) def cbrt(self) -> Expr: """Returns the cube root of a number.""" + from . import functions as F + return F.cbrt(self) def sha512(self) -> Expr: """Computes the SHA-512 hash of a binary string.""" + from . import functions as F + return F.sha512(self) def char_length(self) -> Expr: """The number of characters in the ``string``.""" + from . import functions as F + return F.char_length(self) def list_ndims(self) -> Expr: @@ -866,38 +977,56 @@ def list_ndims(self) -> Expr: This is an alias for :py:func:`array_ndims`. """ + from . import functions as F + return F.list_ndims(self) def trim(self) -> Expr: """Removes all characters, spaces by default, from both sides of a string.""" + from . import functions as F + return F.trim(self) def cos(self) -> Expr: """Returns the cosine of the argument.""" + from . import functions as F + return F.cos(self) def sinh(self) -> Expr: """Returns the hyperbolic sine of the argument.""" + from . import functions as F + return F.sinh(self) def empty(self) -> Expr: """This is an alias for :py:func:`array_empty`.""" + from . 
import functions as F + return F.empty(self) def ltrim(self) -> Expr: """Removes all characters, spaces by default, from the beginning of a string.""" + from . import functions as F + return F.ltrim(self) def signum(self) -> Expr: """Returns the sign of the argument (-1, 0, +1).""" + from . import functions as F + return F.signum(self) def log2(self) -> Expr: """Base 2 logarithm of the argument.""" + from . import functions as F + return F.log2(self) def cot(self) -> Expr: """Returns the cotangent of the argument.""" + from . import functions as F + return F.cot(self) From 6fbeceff6091aee610273d9b27106483f9ce24ea Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 1 May 2025 12:10:40 -0400 Subject: [PATCH 036/145] Copy over protected branch rule from datafusion repo (#1122) --- .asf.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.asf.yaml b/.asf.yaml index e96b43cf0..75b2262de 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -29,6 +29,10 @@ github: rebase: false features: issues: true + protected_branches: + main: + required_pull_request_reviews: + required_approving_review_count: 1 staging: whoami: asf-staging From 15b96c48eb76ad8ea19022df427aa25b06c3012b Mon Sep 17 00:00:00 2001 From: Chen Chongchen Date: Mon, 5 May 2025 21:43:03 +0800 Subject: [PATCH 037/145] feat: add missing PyLogicalPlan to_variant (#1085) * add expr * format * clippy * add license * update * ruff * Update expr.py * add test * ruff * Minor ruff whitespace change * Minor format change --------- Co-authored-by: Tim Saucer --- python/datafusion/common.py | 6 + python/datafusion/expr.py | 54 +++- python/tests/test_expr.py | 86 ++++++ src/common.rs | 3 + src/common/schema.rs | 89 ++++++ src/expr.rs | 41 +++ src/expr/copy_to.rs | 138 +++++++++ src/expr/create_catalog.rs | 100 +++++++ src/expr/create_catalog_schema.rs | 100 +++++++ src/expr/create_external_table.rs | 183 ++++++++++++ src/expr/create_function.rs | 182 ++++++++++++ src/expr/create_index.rs | 129 +++++++++ src/expr/describe_table.rs | 92 ++++++ src/expr/dml.rs | 136 +++++++++ src/expr/drop_catalog_schema.rs | 116 ++++++++ src/expr/drop_function.rs | 95 +++++++ src/expr/drop_view.rs | 102 +++++++ src/expr/recursive_query.rs | 111 ++++++++ src/expr/statement.rs | 454 ++++++++++++++++++++++++++++++ src/expr/values.rs | 86 ++++++ src/sql/logical.rs | 85 +++++- 21 files changed, 2372 insertions(+), 16 deletions(-) create mode 100644 src/expr/copy_to.rs create mode 100644 src/expr/create_catalog.rs create mode 100644 src/expr/create_catalog_schema.rs create mode 100644 src/expr/create_external_table.rs create mode 100644 src/expr/create_function.rs create mode 100644 src/expr/create_index.rs create mode 100644 src/expr/describe_table.rs create mode 100644 src/expr/dml.rs create mode 100644 src/expr/drop_catalog_schema.rs create mode 100644 src/expr/drop_function.rs create mode 100644 src/expr/drop_view.rs create mode 100644 src/expr/recursive_query.rs create mode 100644 src/expr/statement.rs create mode 100644 src/expr/values.rs diff --git a/python/datafusion/common.py b/python/datafusion/common.py index e762a993b..c689a816d 100644 --- a/python/datafusion/common.py +++ b/python/datafusion/common.py @@ -33,8 +33,12 @@ SqlTable = common_internal.SqlTable SqlType = common_internal.SqlType SqlView = common_internal.SqlView +TableType = common_internal.TableType +TableSource = common_internal.TableSource +Constraints = common_internal.Constraints __all__ = [ + "Constraints", "DFSchema", "DataType", "DataTypeMap", @@ -47,6 +51,8 @@ "SqlTable", "SqlType", "SqlView", 
+ "TableSource", + "TableType", ] diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 3750eeb3f..9e58873d0 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -54,14 +54,29 @@ Case = expr_internal.Case Cast = expr_internal.Cast Column = expr_internal.Column +CopyTo = expr_internal.CopyTo +CreateCatalog = expr_internal.CreateCatalog +CreateCatalogSchema = expr_internal.CreateCatalogSchema +CreateExternalTable = expr_internal.CreateExternalTable +CreateFunction = expr_internal.CreateFunction +CreateFunctionBody = expr_internal.CreateFunctionBody +CreateIndex = expr_internal.CreateIndex CreateMemoryTable = expr_internal.CreateMemoryTable CreateView = expr_internal.CreateView +Deallocate = expr_internal.Deallocate +DescribeTable = expr_internal.DescribeTable Distinct = expr_internal.Distinct +DmlStatement = expr_internal.DmlStatement +DropCatalogSchema = expr_internal.DropCatalogSchema +DropFunction = expr_internal.DropFunction DropTable = expr_internal.DropTable +DropView = expr_internal.DropView EmptyRelation = expr_internal.EmptyRelation +Execute = expr_internal.Execute Exists = expr_internal.Exists Explain = expr_internal.Explain Extension = expr_internal.Extension +FileType = expr_internal.FileType Filter = expr_internal.Filter GroupingSet = expr_internal.GroupingSet Join = expr_internal.Join @@ -83,21 +98,31 @@ Literal = expr_internal.Literal Negative = expr_internal.Negative Not = expr_internal.Not +OperateFunctionArg = expr_internal.OperateFunctionArg Partitioning = expr_internal.Partitioning Placeholder = expr_internal.Placeholder +Prepare = expr_internal.Prepare Projection = expr_internal.Projection +RecursiveQuery = expr_internal.RecursiveQuery Repartition = expr_internal.Repartition ScalarSubquery = expr_internal.ScalarSubquery ScalarVariable = expr_internal.ScalarVariable +SetVariable = expr_internal.SetVariable SimilarTo = expr_internal.SimilarTo Sort = expr_internal.Sort Subquery = expr_internal.Subquery SubqueryAlias = expr_internal.SubqueryAlias TableScan = expr_internal.TableScan +TransactionAccessMode = expr_internal.TransactionAccessMode +TransactionConclusion = expr_internal.TransactionConclusion +TransactionEnd = expr_internal.TransactionEnd +TransactionIsolationLevel = expr_internal.TransactionIsolationLevel +TransactionStart = expr_internal.TransactionStart TryCast = expr_internal.TryCast Union = expr_internal.Union Unnest = expr_internal.Unnest UnnestExpr = expr_internal.UnnestExpr +Values = expr_internal.Values WindowExpr = expr_internal.WindowExpr __all__ = [ @@ -111,15 +136,30 @@ "CaseBuilder", "Cast", "Column", + "CopyTo", + "CreateCatalog", + "CreateCatalogSchema", + "CreateExternalTable", + "CreateFunction", + "CreateFunctionBody", + "CreateIndex", "CreateMemoryTable", "CreateView", + "Deallocate", + "DescribeTable", "Distinct", + "DmlStatement", + "DropCatalogSchema", + "DropFunction", "DropTable", + "DropView", "EmptyRelation", + "Execute", "Exists", "Explain", "Expr", "Extension", + "FileType", "Filter", "GroupingSet", "ILike", @@ -142,22 +182,32 @@ "Literal", "Negative", "Not", + "OperateFunctionArg", "Partitioning", "Placeholder", + "Prepare", "Projection", + "RecursiveQuery", "Repartition", "ScalarSubquery", "ScalarVariable", + "SetVariable", "SimilarTo", "Sort", "SortExpr", "Subquery", "SubqueryAlias", "TableScan", + "TransactionAccessMode", + "TransactionConclusion", + "TransactionEnd", + "TransactionIsolationLevel", + "TransactionStart", "TryCast", "Union", "Unnest", "UnnestExpr", + "Values", "Window", 
"WindowExpr", "WindowFrame", @@ -686,8 +736,8 @@ def log10(self) -> Expr: def initcap(self) -> Expr: """Set the initial letter of each word to capital. - Converts the first letter of each word in ``string`` - to uppercase and the remaining characters to lowercase. + Converts the first letter of each word in ``string`` to uppercase and the + remaining characters to lowercase. """ from . import functions as F diff --git a/python/tests/test_expr.py b/python/tests/test_expr.py index 3651b60d6..58a202724 100644 --- a/python/tests/test_expr.py +++ b/python/tests/test_expr.py @@ -23,12 +23,21 @@ AggregateFunction, BinaryExpr, Column, + CopyTo, + CreateIndex, + DescribeTable, + DmlStatement, + DropCatalogSchema, Filter, Limit, Literal, Projection, + RecursiveQuery, Sort, TableScan, + TransactionEnd, + TransactionStart, + Values, ) @@ -249,6 +258,83 @@ def test_fill_null(df): assert result.column(2) == pa.array([1234, 1234, 8]) +def test_copy_to(): + ctx = SessionContext() + ctx.sql("CREATE TABLE foo (a int, b int)").collect() + df = ctx.sql("COPY foo TO bar STORED AS CSV") + plan = df.logical_plan() + plan = plan.to_variant() + assert isinstance(plan, CopyTo) + + +def test_create_index(): + ctx = SessionContext() + ctx.sql("CREATE TABLE foo (a int, b int)").collect() + plan = ctx.sql("create index idx on foo (a)").logical_plan() + plan = plan.to_variant() + assert isinstance(plan, CreateIndex) + + +def test_describe_table(): + ctx = SessionContext() + ctx.sql("CREATE TABLE foo (a int, b int)").collect() + plan = ctx.sql("describe foo").logical_plan() + plan = plan.to_variant() + assert isinstance(plan, DescribeTable) + + +def test_dml_statement(): + ctx = SessionContext() + ctx.sql("CREATE TABLE foo (a int, b int)").collect() + plan = ctx.sql("insert into foo values (1, 2)").logical_plan() + plan = plan.to_variant() + assert isinstance(plan, DmlStatement) + + +def drop_catalog_schema(): + ctx = SessionContext() + plan = ctx.sql("drop schema cat").logical_plan() + plan = plan.to_variant() + assert isinstance(plan, DropCatalogSchema) + + +def test_recursive_query(): + ctx = SessionContext() + plan = ctx.sql( + """ + WITH RECURSIVE cte AS ( + SELECT 1 as n + UNION ALL + SELECT n + 1 FROM cte WHERE n < 5 + ) + SELECT * FROM cte; + """ + ).logical_plan() + plan = plan.inputs()[0].inputs()[0].to_variant() + assert isinstance(plan, RecursiveQuery) + + +def test_values(): + ctx = SessionContext() + plan = ctx.sql("values (1, 'foo'), (2, 'bar')").logical_plan() + plan = plan.to_variant() + assert isinstance(plan, Values) + + +def test_transaction_start(): + ctx = SessionContext() + plan = ctx.sql("START TRANSACTION").logical_plan() + plan = plan.to_variant() + assert isinstance(plan, TransactionStart) + + +def test_transaction_end(): + ctx = SessionContext() + plan = ctx.sql("COMMIT").logical_plan() + plan = plan.to_variant() + assert isinstance(plan, TransactionEnd) + + def test_col_getattr(): ctx = SessionContext() data = { diff --git a/src/common.rs b/src/common.rs index 453bf67a4..88d2fdd5f 100644 --- a/src/common.rs +++ b/src/common.rs @@ -36,5 +36,8 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; Ok(()) } diff --git a/src/common/schema.rs b/src/common/schema.rs index 66ce925ae..5a54fe333 100644 --- a/src/common/schema.rs +++ b/src/common/schema.rs @@ -15,14 +15,22 @@ // specific language governing permissions and limitations // under the License. 
+use std::fmt::{self, Display, Formatter}; +use std::sync::Arc; use std::{any::Any, borrow::Cow}; +use arrow::datatypes::Schema; +use arrow::pyarrow::PyArrowType; use datafusion::arrow::datatypes::SchemaRef; +use datafusion::common::Constraints; +use datafusion::datasource::TableType; use datafusion::logical_expr::{Expr, TableProviderFilterPushDown, TableSource}; use pyo3::prelude::*; use datafusion::logical_expr::utils::split_conjunction; +use crate::sql::logical::PyLogicalPlan; + use super::{data_type::DataTypeMap, function::SqlFunction}; #[pyclass(name = "SqlSchema", module = "datafusion.common", subclass)] @@ -218,3 +226,84 @@ impl SqlStatistics { self.row_count } } + +#[pyclass(name = "Constraints", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyConstraints { + pub constraints: Constraints, +} + +impl From for Constraints { + fn from(constraints: PyConstraints) -> Self { + constraints.constraints + } +} + +impl From for PyConstraints { + fn from(constraints: Constraints) -> Self { + PyConstraints { constraints } + } +} + +impl Display for PyConstraints { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "Constraints: {:?}", self.constraints) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[pyclass(eq, eq_int, name = "TableType", module = "datafusion.common")] +pub enum PyTableType { + Base, + View, + Temporary, +} + +impl From for datafusion::logical_expr::TableType { + fn from(table_type: PyTableType) -> Self { + match table_type { + PyTableType::Base => datafusion::logical_expr::TableType::Base, + PyTableType::View => datafusion::logical_expr::TableType::View, + PyTableType::Temporary => datafusion::logical_expr::TableType::Temporary, + } + } +} + +impl From for PyTableType { + fn from(table_type: TableType) -> Self { + match table_type { + datafusion::logical_expr::TableType::Base => PyTableType::Base, + datafusion::logical_expr::TableType::View => PyTableType::View, + datafusion::logical_expr::TableType::Temporary => PyTableType::Temporary, + } + } +} + +#[pyclass(name = "TableSource", module = "datafusion.common", subclass)] +#[derive(Clone)] +pub struct PyTableSource { + pub table_source: Arc, +} + +#[pymethods] +impl PyTableSource { + pub fn schema(&self) -> PyArrowType { + (*self.table_source.schema()).clone().into() + } + + pub fn constraints(&self) -> Option { + self.table_source.constraints().map(|c| PyConstraints { + constraints: c.clone(), + }) + } + + pub fn table_type(&self) -> PyTableType { + self.table_source.table_type().into() + } + + pub fn get_logical_plan(&self) -> Option { + self.table_source + .get_logical_plan() + .map(|plan| PyLogicalPlan::new(plan.into_owned())) + } +} diff --git a/src/expr.rs b/src/expr.rs index 7d4aa8798..404e575f8 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -67,10 +67,21 @@ pub mod case; pub mod cast; pub mod column; pub mod conditional_expr; +pub mod copy_to; +pub mod create_catalog; +pub mod create_catalog_schema; +pub mod create_external_table; +pub mod create_function; +pub mod create_index; pub mod create_memory_table; pub mod create_view; +pub mod describe_table; pub mod distinct; +pub mod dml; +pub mod drop_catalog_schema; +pub mod drop_function; pub mod drop_table; +pub mod drop_view; pub mod empty_relation; pub mod exists; pub mod explain; @@ -86,18 +97,21 @@ pub mod literal; pub mod logical_node; pub mod placeholder; pub mod projection; +pub mod recursive_query; pub mod repartition; pub mod scalar_subquery; pub mod scalar_variable; pub mod signature; pub mod sort; 
pub mod sort_expr; +pub mod statement; pub mod subquery; pub mod subquery_alias; pub mod table_scan; pub mod union; pub mod unnest; pub mod unnest_expr; +pub mod values; pub mod window; use sort_expr::{to_sort_expressions, PySortExpr}; @@ -802,5 +816,32 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + Ok(()) } diff --git a/src/expr/copy_to.rs b/src/expr/copy_to.rs new file mode 100644 index 000000000..ebfcb8ebc --- /dev/null +++ b/src/expr/copy_to.rs @@ -0,0 +1,138 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
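+
+//! Python wrapper for DataFusion's `CopyTo` logical plan node.
+//! `PyCopyTo` exposes the plan's input, output URL, partition columns,
+//! file type, and options; `PyFileType` wraps the file-format handle
+//! used by `COPY ... TO ... STORED AS ...`.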
+ +use std::{ + collections::HashMap, + fmt::{self, Display, Formatter}, + sync::Arc, +}; + +use datafusion::{common::file_options::file_type::FileType, logical_expr::dml::CopyTo}; +use pyo3::{prelude::*, IntoPyObjectExt}; + +use crate::sql::logical::PyLogicalPlan; + +use super::logical_node::LogicalNode; + +#[pyclass(name = "CopyTo", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyCopyTo { + copy: CopyTo, +} + +impl From for CopyTo { + fn from(copy: PyCopyTo) -> Self { + copy.copy + } +} + +impl From for PyCopyTo { + fn from(copy: CopyTo) -> PyCopyTo { + PyCopyTo { copy } + } +} + +impl Display for PyCopyTo { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "CopyTo: {:?}", self.copy.output_url) + } +} + +impl LogicalNode for PyCopyTo { + fn inputs(&self) -> Vec { + vec![PyLogicalPlan::from((*self.copy.input).clone())] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} + +#[pymethods] +impl PyCopyTo { + #[new] + pub fn new( + input: PyLogicalPlan, + output_url: String, + partition_by: Vec, + file_type: PyFileType, + options: HashMap, + ) -> Self { + PyCopyTo { + copy: CopyTo { + input: input.plan(), + output_url, + partition_by, + file_type: file_type.file_type, + options, + }, + } + } + + fn input(&self) -> PyLogicalPlan { + PyLogicalPlan::from((*self.copy.input).clone()) + } + + fn output_url(&self) -> String { + self.copy.output_url.clone() + } + + fn partition_by(&self) -> Vec { + self.copy.partition_by.clone() + } + + fn file_type(&self) -> PyFileType { + PyFileType { + file_type: self.copy.file_type.clone(), + } + } + + fn options(&self) -> HashMap { + self.copy.options.clone() + } + + fn __repr__(&self) -> PyResult { + Ok(format!("CopyTo({})", self)) + } + + fn __name__(&self) -> PyResult { + Ok("CopyTo".to_string()) + } +} + +#[pyclass(name = "FileType", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyFileType { + file_type: Arc, +} + +impl Display for PyFileType { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "FileType: {}", self.file_type) + } +} + +#[pymethods] +impl PyFileType { + fn __repr__(&self) -> PyResult { + Ok(format!("FileType({})", self)) + } + + fn __name__(&self) -> PyResult { + Ok("FileType".to_string()) + } +} diff --git a/src/expr/create_catalog.rs b/src/expr/create_catalog.rs new file mode 100644 index 000000000..f4ea0f517 --- /dev/null +++ b/src/expr/create_catalog.rs @@ -0,0 +1,100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
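+
+//! Python wrapper for the `CreateCatalog` logical plan node, exposing
+//! the catalog name, the `IF NOT EXISTS` flag, and the schema.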
+ +use std::{ + fmt::{self, Display, Formatter}, + sync::Arc, +}; + +use datafusion::logical_expr::CreateCatalog; +use pyo3::{prelude::*, IntoPyObjectExt}; + +use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; + +use super::logical_node::LogicalNode; + +#[pyclass(name = "CreateCatalog", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyCreateCatalog { + create: CreateCatalog, +} + +impl From for CreateCatalog { + fn from(create: PyCreateCatalog) -> Self { + create.create + } +} + +impl From for PyCreateCatalog { + fn from(create: CreateCatalog) -> PyCreateCatalog { + PyCreateCatalog { create } + } +} + +impl Display for PyCreateCatalog { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "CreateCatalog: {:?}", self.create.catalog_name) + } +} + +#[pymethods] +impl PyCreateCatalog { + #[new] + pub fn new( + catalog_name: String, + if_not_exists: bool, + schema: PyDFSchema, + ) -> PyResult { + Ok(PyCreateCatalog { + create: CreateCatalog { + catalog_name, + if_not_exists, + schema: Arc::new(schema.into()), + }, + }) + } + + pub fn catalog_name(&self) -> String { + self.create.catalog_name.clone() + } + + pub fn if_not_exists(&self) -> bool { + self.create.if_not_exists + } + + pub fn schema(&self) -> PyDFSchema { + (*self.create.schema).clone().into() + } + + fn __repr__(&self) -> PyResult { + Ok(format!("CreateCatalog({})", self)) + } + + fn __name__(&self) -> PyResult { + Ok("CreateCatalog".to_string()) + } +} + +impl LogicalNode for PyCreateCatalog { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} diff --git a/src/expr/create_catalog_schema.rs b/src/expr/create_catalog_schema.rs new file mode 100644 index 000000000..85f447e1e --- /dev/null +++ b/src/expr/create_catalog_schema.rs @@ -0,0 +1,100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
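+
+//! Python wrapper for the `CreateCatalogSchema` logical plan node,
+//! exposing the schema name, the `IF NOT EXISTS` flag, and the schema.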
+ +use std::{ + fmt::{self, Display, Formatter}, + sync::Arc, +}; + +use datafusion::logical_expr::CreateCatalogSchema; +use pyo3::{prelude::*, IntoPyObjectExt}; + +use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; + +use super::logical_node::LogicalNode; + +#[pyclass(name = "CreateCatalogSchema", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyCreateCatalogSchema { + create: CreateCatalogSchema, +} + +impl From for CreateCatalogSchema { + fn from(create: PyCreateCatalogSchema) -> Self { + create.create + } +} + +impl From for PyCreateCatalogSchema { + fn from(create: CreateCatalogSchema) -> PyCreateCatalogSchema { + PyCreateCatalogSchema { create } + } +} + +impl Display for PyCreateCatalogSchema { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "CreateCatalogSchema: {:?}", self.create.schema_name) + } +} + +#[pymethods] +impl PyCreateCatalogSchema { + #[new] + pub fn new( + schema_name: String, + if_not_exists: bool, + schema: PyDFSchema, + ) -> PyResult { + Ok(PyCreateCatalogSchema { + create: CreateCatalogSchema { + schema_name, + if_not_exists, + schema: Arc::new(schema.into()), + }, + }) + } + + pub fn schema_name(&self) -> String { + self.create.schema_name.clone() + } + + pub fn if_not_exists(&self) -> bool { + self.create.if_not_exists + } + + pub fn schema(&self) -> PyDFSchema { + (*self.create.schema).clone().into() + } + + fn __repr__(&self) -> PyResult { + Ok(format!("CreateCatalogSchema({})", self)) + } + + fn __name__(&self) -> PyResult { + Ok("CreateCatalogSchema".to_string()) + } +} + +impl LogicalNode for PyCreateCatalogSchema { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} diff --git a/src/expr/create_external_table.rs b/src/expr/create_external_table.rs new file mode 100644 index 000000000..01ce7d0ca --- /dev/null +++ b/src/expr/create_external_table.rs @@ -0,0 +1,183 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
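+
+//! Python wrapper for the `CreateExternalTable` logical plan node,
+//! exposing the table name, location, file type, partition columns,
+//! ordering expressions, constraints, column defaults, and options.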
+ +use crate::{common::schema::PyConstraints, expr::PyExpr, sql::logical::PyLogicalPlan}; +use std::{ + collections::HashMap, + fmt::{self, Display, Formatter}, + sync::Arc, +}; + +use datafusion::logical_expr::CreateExternalTable; +use pyo3::{prelude::*, IntoPyObjectExt}; + +use crate::common::df_schema::PyDFSchema; + +use super::{logical_node::LogicalNode, sort_expr::PySortExpr}; + +#[pyclass(name = "CreateExternalTable", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyCreateExternalTable { + create: CreateExternalTable, +} + +impl From for CreateExternalTable { + fn from(create: PyCreateExternalTable) -> Self { + create.create + } +} + +impl From for PyCreateExternalTable { + fn from(create: CreateExternalTable) -> PyCreateExternalTable { + PyCreateExternalTable { create } + } +} + +impl Display for PyCreateExternalTable { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!( + f, + "CreateExternalTable: {:?}{}", + self.create.name, self.create.constraints + ) + } +} + +#[pymethods] +impl PyCreateExternalTable { + #[allow(clippy::too_many_arguments)] + #[new] + #[pyo3(signature = (schema, name, location, file_type, table_partition_cols, if_not_exists, temporary, order_exprs, unbounded, options, constraints, column_defaults, definition=None))] + pub fn new( + schema: PyDFSchema, + name: String, + location: String, + file_type: String, + table_partition_cols: Vec, + if_not_exists: bool, + temporary: bool, + order_exprs: Vec>, + unbounded: bool, + options: HashMap, + constraints: PyConstraints, + column_defaults: HashMap, + definition: Option, + ) -> Self { + let create = CreateExternalTable { + schema: Arc::new(schema.into()), + name: name.into(), + location, + file_type, + table_partition_cols, + if_not_exists, + temporary, + definition, + order_exprs: order_exprs + .into_iter() + .map(|vec| vec.into_iter().map(|s| s.into()).collect::>()) + .collect::>(), + unbounded, + options, + constraints: constraints.constraints, + column_defaults: column_defaults + .into_iter() + .map(|(k, v)| (k, v.into())) + .collect(), + }; + PyCreateExternalTable { create } + } + + pub fn schema(&self) -> PyDFSchema { + (*self.create.schema).clone().into() + } + + pub fn name(&self) -> PyResult { + Ok(self.create.name.to_string()) + } + + pub fn location(&self) -> String { + self.create.location.clone() + } + + pub fn file_type(&self) -> String { + self.create.file_type.clone() + } + + pub fn table_partition_cols(&self) -> Vec { + self.create.table_partition_cols.clone() + } + + pub fn if_not_exists(&self) -> bool { + self.create.if_not_exists + } + + pub fn temporary(&self) -> bool { + self.create.temporary + } + + pub fn definition(&self) -> Option { + self.create.definition.clone() + } + + pub fn order_exprs(&self) -> Vec> { + self.create + .order_exprs + .iter() + .map(|vec| vec.iter().map(|s| s.clone().into()).collect()) + .collect() + } + + pub fn unbounded(&self) -> bool { + self.create.unbounded + } + + pub fn options(&self) -> HashMap { + self.create.options.clone() + } + + pub fn constraints(&self) -> PyConstraints { + PyConstraints { + constraints: self.create.constraints.clone(), + } + } + + pub fn column_defaults(&self) -> HashMap { + self.create + .column_defaults + .iter() + .map(|(k, v)| (k.clone(), v.clone().into())) + .collect() + } + + fn __repr__(&self) -> PyResult { + Ok(format!("CreateExternalTable({})", self)) + } + + fn __name__(&self) -> PyResult { + Ok("CreateExternalTable".to_string()) + } +} + +impl LogicalNode for PyCreateExternalTable { + fn 
inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} diff --git a/src/expr/create_function.rs b/src/expr/create_function.rs new file mode 100644 index 000000000..6f3c3f0ff --- /dev/null +++ b/src/expr/create_function.rs @@ -0,0 +1,182 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{ + fmt::{self, Display, Formatter}, + sync::Arc, +}; + +use datafusion::logical_expr::{ + CreateFunction, CreateFunctionBody, OperateFunctionArg, Volatility, +}; +use pyo3::{prelude::*, IntoPyObjectExt}; + +use super::logical_node::LogicalNode; +use super::PyExpr; +use crate::common::{data_type::PyDataType, df_schema::PyDFSchema}; +use crate::sql::logical::PyLogicalPlan; + +#[pyclass(name = "CreateFunction", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyCreateFunction { + create: CreateFunction, +} + +impl From for CreateFunction { + fn from(create: PyCreateFunction) -> Self { + create.create + } +} + +impl From for PyCreateFunction { + fn from(create: CreateFunction) -> PyCreateFunction { + PyCreateFunction { create } + } +} + +impl Display for PyCreateFunction { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "CreateFunction: name {:?}", self.create.name) + } +} + +#[pyclass(name = "OperateFunctionArg", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyOperateFunctionArg { + arg: OperateFunctionArg, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[pyclass(eq, eq_int, name = "Volatility", module = "datafusion.expr")] +pub enum PyVolatility { + Immutable, + Stable, + Volatile, +} + +#[pyclass(name = "CreateFunctionBody", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyCreateFunctionBody { + body: CreateFunctionBody, +} + +#[pymethods] +impl PyCreateFunctionBody { + pub fn language(&self) -> Option { + self.body + .language + .as_ref() + .map(|language| language.to_string()) + } + + pub fn behavior(&self) -> Option { + self.body.behavior.as_ref().map(|behavior| match behavior { + Volatility::Immutable => PyVolatility::Immutable, + Volatility::Stable => PyVolatility::Stable, + Volatility::Volatile => PyVolatility::Volatile, + }) + } + + pub fn function_body(&self) -> Option { + self.body + .function_body + .as_ref() + .map(|function_body| function_body.clone().into()) + } +} + +#[pymethods] +impl PyCreateFunction { + #[new] + #[pyo3(signature = (or_replace, temporary, name, params, schema, return_type=None, args=None))] + pub fn new( + or_replace: bool, + temporary: bool, + name: String, + params: PyCreateFunctionBody, + schema: PyDFSchema, + return_type: Option, + args: Option>, + ) -> Self { + PyCreateFunction { + create: CreateFunction { + or_replace, + 
temporary, + name, + args: args.map(|args| args.into_iter().map(|arg| arg.arg).collect()), + return_type: return_type.map(|return_type| return_type.data_type), + params: params.body, + schema: Arc::new(schema.into()), + }, + } + } + + pub fn or_replace(&self) -> bool { + self.create.or_replace + } + + pub fn temporary(&self) -> bool { + self.create.temporary + } + + pub fn name(&self) -> String { + self.create.name.clone() + } + + pub fn params(&self) -> PyCreateFunctionBody { + PyCreateFunctionBody { + body: self.create.params.clone(), + } + } + + pub fn schema(&self) -> PyDFSchema { + (*self.create.schema).clone().into() + } + + pub fn return_type(&self) -> Option { + self.create + .return_type + .as_ref() + .map(|return_type| return_type.clone().into()) + } + + pub fn args(&self) -> Option> { + self.create.args.as_ref().map(|args| { + args.iter() + .map(|arg| PyOperateFunctionArg { arg: arg.clone() }) + .collect() + }) + } + + fn __repr__(&self) -> PyResult { + Ok(format!("CreateFunction({})", self)) + } + + fn __name__(&self) -> PyResult { + Ok("CreateFunction".to_string()) + } +} + +impl LogicalNode for PyCreateFunction { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} diff --git a/src/expr/create_index.rs b/src/expr/create_index.rs new file mode 100644 index 000000000..13dadbc3f --- /dev/null +++ b/src/expr/create_index.rs @@ -0,0 +1,129 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
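+
+//! Python wrapper for the `CreateIndex` logical plan node, exposing the
+//! index name, target table, indexed columns, uniqueness, and schema.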
+ +use std::{ + fmt::{self, Display, Formatter}, + sync::Arc, +}; + +use datafusion::logical_expr::CreateIndex; +use pyo3::{prelude::*, IntoPyObjectExt}; + +use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; + +use super::{logical_node::LogicalNode, sort_expr::PySortExpr}; + +#[pyclass(name = "CreateIndex", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyCreateIndex { + create: CreateIndex, +} + +impl From for CreateIndex { + fn from(create: PyCreateIndex) -> Self { + create.create + } +} + +impl From for PyCreateIndex { + fn from(create: CreateIndex) -> PyCreateIndex { + PyCreateIndex { create } + } +} + +impl Display for PyCreateIndex { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "CreateIndex: {:?}", self.create.name) + } +} + +#[pymethods] +impl PyCreateIndex { + #[new] + #[pyo3(signature = (table, columns, unique, if_not_exists, schema, name=None, using=None))] + pub fn new( + table: String, + columns: Vec, + unique: bool, + if_not_exists: bool, + schema: PyDFSchema, + name: Option, + using: Option, + ) -> PyResult { + Ok(PyCreateIndex { + create: CreateIndex { + name, + table: table.into(), + using, + columns: columns.iter().map(|c| c.clone().into()).collect(), + unique, + if_not_exists, + schema: Arc::new(schema.into()), + }, + }) + } + + pub fn name(&self) -> Option { + self.create.name.clone() + } + + pub fn table(&self) -> PyResult { + Ok(self.create.table.to_string()) + } + + pub fn using(&self) -> Option { + self.create.using.clone() + } + + pub fn columns(&self) -> Vec { + self.create + .columns + .iter() + .map(|c| c.clone().into()) + .collect() + } + + pub fn unique(&self) -> bool { + self.create.unique + } + + pub fn if_not_exists(&self) -> bool { + self.create.if_not_exists + } + + pub fn schema(&self) -> PyDFSchema { + (*self.create.schema).clone().into() + } + + fn __repr__(&self) -> PyResult { + Ok(format!("CreateIndex({})", self)) + } + + fn __name__(&self) -> PyResult { + Ok("CreateIndex".to_string()) + } +} + +impl LogicalNode for PyCreateIndex { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} diff --git a/src/expr/describe_table.rs b/src/expr/describe_table.rs new file mode 100644 index 000000000..5658a13f2 --- /dev/null +++ b/src/expr/describe_table.rs @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
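+
+//! Python wrapper for the `DescribeTable` logical plan node, exposing
+//! the schema of the table being described and the plan's output schema.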
+
+use std::{
+    fmt::{self, Display, Formatter},
+    sync::Arc,
+};
+
+use arrow::{datatypes::Schema, pyarrow::PyArrowType};
+use datafusion::logical_expr::DescribeTable;
+use pyo3::{prelude::*, IntoPyObjectExt};
+
+use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan};
+
+use super::logical_node::LogicalNode;
+
+#[pyclass(name = "DescribeTable", module = "datafusion.expr", subclass)]
+#[derive(Clone)]
+pub struct PyDescribeTable {
+    describe: DescribeTable,
+}
+
+impl Display for PyDescribeTable {
+    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+        write!(f, "DescribeTable")
+    }
+}
+
+#[pymethods]
+impl PyDescribeTable {
+    #[new]
+    fn new(schema: PyArrowType<Schema>, output_schema: PyDFSchema) -> Self {
+        Self {
+            describe: DescribeTable {
+                schema: Arc::new(schema.0),
+                output_schema: Arc::new(output_schema.into()),
+            },
+        }
+    }
+
+    pub fn schema(&self) -> PyArrowType<Schema> {
+        (*self.describe.schema).clone().into()
+    }
+
+    pub fn output_schema(&self) -> PyDFSchema {
+        (*self.describe.output_schema).clone().into()
+    }
+
+    fn __repr__(&self) -> PyResult<String> {
+        Ok(format!("DescribeTable({})", self))
+    }
+
+    fn __name__(&self) -> PyResult<String> {
+        Ok("DescribeTable".to_string())
+    }
+}
+
+impl From<PyDescribeTable> for DescribeTable {
+    fn from(describe: PyDescribeTable) -> Self {
+        describe.describe
+    }
+}
+
+impl From<DescribeTable> for PyDescribeTable {
+    fn from(describe: DescribeTable) -> PyDescribeTable {
+        PyDescribeTable { describe }
+    }
+}
+
+impl LogicalNode for PyDescribeTable {
+    fn inputs(&self) -> Vec<PyLogicalPlan> {
+        vec![]
+    }
+
+    fn to_variant<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
+        self.clone().into_bound_py_any(py)
+    }
+}
diff --git a/src/expr/dml.rs b/src/expr/dml.rs
new file mode 100644
index 000000000..251e336cc
--- /dev/null
+++ b/src/expr/dml.rs
@@ -0,0 +1,136 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
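+
+//! Python wrappers for DML plan nodes: `PyDmlStatement` exposes the
+//! target table, write operation, input plan, and output schema, and
+//! `PyWriteOp` mirrors `WriteOp` (insert append/overwrite/replace,
+//! update, delete, CTAS).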
+ +use datafusion::logical_expr::dml::InsertOp; +use datafusion::logical_expr::{DmlStatement, WriteOp}; +use pyo3::{prelude::*, IntoPyObjectExt}; + +use crate::common::schema::PyTableSource; +use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; + +use super::logical_node::LogicalNode; + +#[pyclass(name = "DmlStatement", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyDmlStatement { + dml: DmlStatement, +} + +impl From for DmlStatement { + fn from(dml: PyDmlStatement) -> Self { + dml.dml + } +} + +impl From for PyDmlStatement { + fn from(dml: DmlStatement) -> PyDmlStatement { + PyDmlStatement { dml } + } +} + +impl LogicalNode for PyDmlStatement { + fn inputs(&self) -> Vec { + vec![PyLogicalPlan::from((*self.dml.input).clone())] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} + +#[pymethods] +impl PyDmlStatement { + pub fn table_name(&self) -> PyResult { + Ok(self.dml.table_name.to_string()) + } + + pub fn target(&self) -> PyResult { + Ok(PyTableSource { + table_source: self.dml.target.clone(), + }) + } + + pub fn op(&self) -> PyWriteOp { + self.dml.op.clone().into() + } + + pub fn input(&self) -> PyLogicalPlan { + PyLogicalPlan { + plan: self.dml.input.clone(), + } + } + + pub fn output_schema(&self) -> PyDFSchema { + (*self.dml.output_schema).clone().into() + } + + fn __repr__(&self) -> PyResult { + Ok("DmlStatement".to_string()) + } + + fn __name__(&self) -> PyResult { + Ok("DmlStatement".to_string()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[pyclass(eq, eq_int, name = "WriteOp", module = "datafusion.expr")] +pub enum PyWriteOp { + Append, + Overwrite, + Replace, + + Update, + Delete, + Ctas, +} + +impl From for PyWriteOp { + fn from(write_op: WriteOp) -> Self { + match write_op { + WriteOp::Insert(InsertOp::Append) => PyWriteOp::Append, + WriteOp::Insert(InsertOp::Overwrite) => PyWriteOp::Overwrite, + WriteOp::Insert(InsertOp::Replace) => PyWriteOp::Replace, + + WriteOp::Update => PyWriteOp::Update, + WriteOp::Delete => PyWriteOp::Delete, + WriteOp::Ctas => PyWriteOp::Ctas, + } + } +} + +impl From for WriteOp { + fn from(py: PyWriteOp) -> Self { + match py { + PyWriteOp::Append => WriteOp::Insert(InsertOp::Append), + PyWriteOp::Overwrite => WriteOp::Insert(InsertOp::Overwrite), + PyWriteOp::Replace => WriteOp::Insert(InsertOp::Replace), + + PyWriteOp::Update => WriteOp::Update, + PyWriteOp::Delete => WriteOp::Delete, + PyWriteOp::Ctas => WriteOp::Ctas, + } + } +} + +#[pymethods] +impl PyWriteOp { + fn name(&self) -> String { + let write_op: WriteOp = self.clone().into(); + write_op.name().to_string() + } +} diff --git a/src/expr/drop_catalog_schema.rs b/src/expr/drop_catalog_schema.rs new file mode 100644 index 000000000..b7420a99c --- /dev/null +++ b/src/expr/drop_catalog_schema.rs @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{ + fmt::{self, Display, Formatter}, + sync::Arc, +}; + +use datafusion::{common::SchemaReference, logical_expr::DropCatalogSchema, sql::TableReference}; +use pyo3::{exceptions::PyValueError, prelude::*, IntoPyObjectExt}; + +use crate::common::df_schema::PyDFSchema; + +use super::logical_node::LogicalNode; +use crate::sql::logical::PyLogicalPlan; + +#[pyclass(name = "DropCatalogSchema", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyDropCatalogSchema { + drop: DropCatalogSchema, +} + +impl From for DropCatalogSchema { + fn from(drop: PyDropCatalogSchema) -> Self { + drop.drop + } +} + +impl From for PyDropCatalogSchema { + fn from(drop: DropCatalogSchema) -> PyDropCatalogSchema { + PyDropCatalogSchema { drop } + } +} + +impl Display for PyDropCatalogSchema { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "DropCatalogSchema") + } +} + +fn parse_schema_reference(name: String) -> PyResult { + match name.into() { + TableReference::Bare { table } => Ok(SchemaReference::Bare { schema: table }), + TableReference::Partial { schema, table } => Ok(SchemaReference::Full { + schema: table, + catalog: schema, + }), + TableReference::Full { + catalog: _, + schema: _, + table: _, + } => Err(PyErr::new::( + "Invalid schema specifier (has 3 parts)".to_string(), + )), + } +} + +#[pymethods] +impl PyDropCatalogSchema { + #[new] + fn new(name: String, schema: PyDFSchema, if_exists: bool, cascade: bool) -> PyResult { + let name = parse_schema_reference(name)?; + Ok(PyDropCatalogSchema { + drop: DropCatalogSchema { + name, + schema: Arc::new(schema.into()), + if_exists, + cascade, + }, + }) + } + + fn name(&self) -> PyResult { + Ok(self.drop.name.to_string()) + } + + fn schema(&self) -> PyDFSchema { + (*self.drop.schema).clone().into() + } + + fn if_exists(&self) -> PyResult { + Ok(self.drop.if_exists) + } + + fn cascade(&self) -> PyResult { + Ok(self.drop.cascade) + } + + fn __repr__(&self) -> PyResult { + Ok(format!("DropCatalogSchema({})", self)) + } +} + +impl LogicalNode for PyDropCatalogSchema { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} diff --git a/src/expr/drop_function.rs b/src/expr/drop_function.rs new file mode 100644 index 000000000..9fbd78fdc --- /dev/null +++ b/src/expr/drop_function.rs @@ -0,0 +1,95 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{ + fmt::{self, Display, Formatter}, + sync::Arc, +}; + +use datafusion::logical_expr::DropFunction; +use pyo3::{prelude::*, IntoPyObjectExt}; + +use super::logical_node::LogicalNode; +use crate::common::df_schema::PyDFSchema; +use crate::sql::logical::PyLogicalPlan; + +#[pyclass(name = "DropFunction", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyDropFunction { + drop: DropFunction, +} + +impl From for DropFunction { + fn from(drop: PyDropFunction) -> Self { + drop.drop + } +} + +impl From for PyDropFunction { + fn from(drop: DropFunction) -> PyDropFunction { + PyDropFunction { drop } + } +} + +impl Display for PyDropFunction { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "DropFunction") + } +} + +#[pymethods] +impl PyDropFunction { + #[new] + fn new(name: String, schema: PyDFSchema, if_exists: bool) -> PyResult { + Ok(PyDropFunction { + drop: DropFunction { + name, + schema: Arc::new(schema.into()), + if_exists, + }, + }) + } + fn name(&self) -> PyResult { + Ok(self.drop.name.clone()) + } + + fn schema(&self) -> PyDFSchema { + (*self.drop.schema).clone().into() + } + + fn if_exists(&self) -> PyResult { + Ok(self.drop.if_exists) + } + + fn __repr__(&self) -> PyResult { + Ok(format!("DropFunction({})", self)) + } + + fn __name__(&self) -> PyResult { + Ok("DropFunction".to_string()) + } +} + +impl LogicalNode for PyDropFunction { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} diff --git a/src/expr/drop_view.rs b/src/expr/drop_view.rs new file mode 100644 index 000000000..1d1ab1e59 --- /dev/null +++ b/src/expr/drop_view.rs @@ -0,0 +1,102 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
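+
+//! Python wrapper for the `DropView` logical plan node (`DROP VIEW`),
+//! exposing the view name, schema, and `IF EXISTS` flag.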
+
+use std::{
+    fmt::{self, Display, Formatter},
+    sync::Arc,
+};
+
+use datafusion::logical_expr::DropView;
+use pyo3::{prelude::*, IntoPyObjectExt};
+
+use crate::common::df_schema::PyDFSchema;
+
+use super::logical_node::LogicalNode;
+use crate::sql::logical::PyLogicalPlan;
+
+#[pyclass(name = "DropView", module = "datafusion.expr", subclass)]
+#[derive(Clone)]
+pub struct PyDropView {
+    drop: DropView,
+}
+
+impl From<PyDropView> for DropView {
+    fn from(drop: PyDropView) -> Self {
+        drop.drop
+    }
+}
+
+impl From<DropView> for PyDropView {
+    fn from(drop: DropView) -> PyDropView {
+        PyDropView { drop }
+    }
+}
+
+impl Display for PyDropView {
+    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+        write!(
+            f,
+            "DropView: {name:?} if not exist:={if_exists}",
+            name = self.drop.name,
+            if_exists = self.drop.if_exists
+        )
+    }
+}
+
+#[pymethods]
+impl PyDropView {
+    #[new]
+    fn new(name: String, schema: PyDFSchema, if_exists: bool) -> PyResult<Self> {
+        Ok(PyDropView {
+            drop: DropView {
+                name: name.into(),
+                schema: Arc::new(schema.into()),
+                if_exists,
+            },
+        })
+    }
+
+    fn name(&self) -> PyResult<String> {
+        Ok(self.drop.name.to_string())
+    }
+
+    fn schema(&self) -> PyDFSchema {
+        (*self.drop.schema).clone().into()
+    }
+
+    fn if_exists(&self) -> PyResult<bool> {
+        Ok(self.drop.if_exists)
+    }
+
+    fn __repr__(&self) -> PyResult<String> {
+        Ok(format!("DropView({})", self))
+    }
+
+    fn __name__(&self) -> PyResult<String> {
+        Ok("DropView".to_string())
+    }
+}
+
+impl LogicalNode for PyDropView {
+    fn inputs(&self) -> Vec<PyLogicalPlan> {
+        vec![]
+    }
+
+    fn to_variant<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
+        self.clone().into_bound_py_any(py)
+    }
+}
diff --git a/src/expr/recursive_query.rs b/src/expr/recursive_query.rs
new file mode 100644
index 000000000..65181f7d3
--- /dev/null
+++ b/src/expr/recursive_query.rs
@@ -0,0 +1,111 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
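+
+//! Python wrapper for the `RecursiveQuery` plan node produced by
+//! `WITH RECURSIVE`, exposing the query name, static and recursive
+//! terms, and the distinctness flag.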
+ +use std::fmt::{self, Display, Formatter}; + +use datafusion::logical_expr::RecursiveQuery; +use pyo3::{prelude::*, IntoPyObjectExt}; + +use crate::sql::logical::PyLogicalPlan; + +use super::logical_node::LogicalNode; + +#[pyclass(name = "RecursiveQuery", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyRecursiveQuery { + query: RecursiveQuery, +} + +impl From for RecursiveQuery { + fn from(query: PyRecursiveQuery) -> Self { + query.query + } +} + +impl From for PyRecursiveQuery { + fn from(query: RecursiveQuery) -> PyRecursiveQuery { + PyRecursiveQuery { query } + } +} + +impl Display for PyRecursiveQuery { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!( + f, + "RecursiveQuery {name:?} is_distinct:={is_distinct}", + name = self.query.name, + is_distinct = self.query.is_distinct + ) + } +} + +#[pymethods] +impl PyRecursiveQuery { + #[new] + fn new( + name: String, + static_term: PyLogicalPlan, + recursive_term: PyLogicalPlan, + is_distinct: bool, + ) -> Self { + Self { + query: RecursiveQuery { + name, + static_term: static_term.plan(), + recursive_term: recursive_term.plan(), + is_distinct, + }, + } + } + + fn name(&self) -> PyResult { + Ok(self.query.name.clone()) + } + + fn static_term(&self) -> PyLogicalPlan { + PyLogicalPlan::from((*self.query.static_term).clone()) + } + + fn recursive_term(&self) -> PyLogicalPlan { + PyLogicalPlan::from((*self.query.recursive_term).clone()) + } + + fn is_distinct(&self) -> PyResult { + Ok(self.query.is_distinct) + } + + fn __repr__(&self) -> PyResult { + Ok(format!("RecursiveQuery({})", self)) + } + + fn __name__(&self) -> PyResult { + Ok("RecursiveQuery".to_string()) + } +} + +impl LogicalNode for PyRecursiveQuery { + fn inputs(&self) -> Vec { + vec![ + PyLogicalPlan::from((*self.query.static_term).clone()), + PyLogicalPlan::from((*self.query.recursive_term).clone()), + ] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} diff --git a/src/expr/statement.rs b/src/expr/statement.rs new file mode 100644 index 000000000..83774cda1 --- /dev/null +++ b/src/expr/statement.rs @@ -0,0 +1,454 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
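+
+//! Python wrappers for statement plan nodes: transaction start/end,
+//! `SET` variables, and prepared statements (`Prepare`, `Execute`,
+//! `Deallocate`), together with their access-mode, isolation-level,
+//! and conclusion enums.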
+ +use datafusion::logical_expr::{ + Deallocate, Execute, Prepare, SetVariable, TransactionAccessMode, TransactionConclusion, + TransactionEnd, TransactionIsolationLevel, TransactionStart, +}; +use pyo3::{prelude::*, IntoPyObjectExt}; + +use crate::{common::data_type::PyDataType, sql::logical::PyLogicalPlan}; + +use super::{logical_node::LogicalNode, PyExpr}; + +#[pyclass(name = "TransactionStart", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyTransactionStart { + transaction_start: TransactionStart, +} + +impl From for PyTransactionStart { + fn from(transaction_start: TransactionStart) -> PyTransactionStart { + PyTransactionStart { transaction_start } + } +} + +impl TryFrom for TransactionStart { + type Error = PyErr; + + fn try_from(py: PyTransactionStart) -> Result { + Ok(py.transaction_start) + } +} + +impl LogicalNode for PyTransactionStart { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[pyclass(eq, eq_int, name = "TransactionAccessMode", module = "datafusion.expr")] +pub enum PyTransactionAccessMode { + ReadOnly, + ReadWrite, +} + +impl From for PyTransactionAccessMode { + fn from(access_mode: TransactionAccessMode) -> PyTransactionAccessMode { + match access_mode { + TransactionAccessMode::ReadOnly => PyTransactionAccessMode::ReadOnly, + TransactionAccessMode::ReadWrite => PyTransactionAccessMode::ReadWrite, + } + } +} + +impl TryFrom for TransactionAccessMode { + type Error = PyErr; + + fn try_from(py: PyTransactionAccessMode) -> Result { + match py { + PyTransactionAccessMode::ReadOnly => Ok(TransactionAccessMode::ReadOnly), + PyTransactionAccessMode::ReadWrite => Ok(TransactionAccessMode::ReadWrite), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[pyclass( + eq, + eq_int, + name = "TransactionIsolationLevel", + module = "datafusion.expr" +)] +pub enum PyTransactionIsolationLevel { + ReadUncommitted, + ReadCommitted, + RepeatableRead, + Serializable, + Snapshot, +} + +impl From for PyTransactionIsolationLevel { + fn from(isolation_level: TransactionIsolationLevel) -> PyTransactionIsolationLevel { + match isolation_level { + TransactionIsolationLevel::ReadUncommitted => { + PyTransactionIsolationLevel::ReadUncommitted + } + TransactionIsolationLevel::ReadCommitted => PyTransactionIsolationLevel::ReadCommitted, + TransactionIsolationLevel::RepeatableRead => { + PyTransactionIsolationLevel::RepeatableRead + } + TransactionIsolationLevel::Serializable => PyTransactionIsolationLevel::Serializable, + TransactionIsolationLevel::Snapshot => PyTransactionIsolationLevel::Snapshot, + } + } +} + +impl TryFrom for TransactionIsolationLevel { + type Error = PyErr; + + fn try_from(value: PyTransactionIsolationLevel) -> Result { + match value { + PyTransactionIsolationLevel::ReadUncommitted => { + Ok(TransactionIsolationLevel::ReadUncommitted) + } + PyTransactionIsolationLevel::ReadCommitted => { + Ok(TransactionIsolationLevel::ReadCommitted) + } + PyTransactionIsolationLevel::RepeatableRead => { + Ok(TransactionIsolationLevel::RepeatableRead) + } + PyTransactionIsolationLevel::Serializable => { + Ok(TransactionIsolationLevel::Serializable) + } + PyTransactionIsolationLevel::Snapshot => Ok(TransactionIsolationLevel::Snapshot), + } + } +} + +#[pymethods] +impl PyTransactionStart { + #[new] + pub fn new( + access_mode: PyTransactionAccessMode, + isolation_level: 
PyTransactionIsolationLevel, + ) -> PyResult { + let access_mode = access_mode.try_into()?; + let isolation_level = isolation_level.try_into()?; + Ok(PyTransactionStart { + transaction_start: TransactionStart { + access_mode, + isolation_level, + }, + }) + } + + pub fn access_mode(&self) -> PyResult { + Ok(self.transaction_start.access_mode.clone().into()) + } + + pub fn isolation_level(&self) -> PyResult { + Ok(self.transaction_start.isolation_level.clone().into()) + } +} + +#[pyclass(name = "TransactionEnd", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyTransactionEnd { + transaction_end: TransactionEnd, +} + +impl From for PyTransactionEnd { + fn from(transaction_end: TransactionEnd) -> PyTransactionEnd { + PyTransactionEnd { transaction_end } + } +} + +impl TryFrom for TransactionEnd { + type Error = PyErr; + + fn try_from(py: PyTransactionEnd) -> Result { + Ok(py.transaction_end) + } +} + +impl LogicalNode for PyTransactionEnd { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[pyclass(eq, eq_int, name = "TransactionConclusion", module = "datafusion.expr")] +pub enum PyTransactionConclusion { + Commit, + Rollback, +} + +impl From for PyTransactionConclusion { + fn from(value: TransactionConclusion) -> Self { + match value { + TransactionConclusion::Commit => PyTransactionConclusion::Commit, + TransactionConclusion::Rollback => PyTransactionConclusion::Rollback, + } + } +} + +impl TryFrom for TransactionConclusion { + type Error = PyErr; + + fn try_from(value: PyTransactionConclusion) -> Result { + match value { + PyTransactionConclusion::Commit => Ok(TransactionConclusion::Commit), + PyTransactionConclusion::Rollback => Ok(TransactionConclusion::Rollback), + } + } +} +#[pymethods] +impl PyTransactionEnd { + #[new] + pub fn new(conclusion: PyTransactionConclusion, chain: bool) -> PyResult { + let conclusion = conclusion.try_into()?; + Ok(PyTransactionEnd { + transaction_end: TransactionEnd { conclusion, chain }, + }) + } + + pub fn conclusion(&self) -> PyResult { + Ok(self.transaction_end.conclusion.clone().into()) + } + + pub fn chain(&self) -> bool { + self.transaction_end.chain + } +} + +#[pyclass(name = "SetVariable", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PySetVariable { + set_variable: SetVariable, +} + +impl From for PySetVariable { + fn from(set_variable: SetVariable) -> PySetVariable { + PySetVariable { set_variable } + } +} + +impl TryFrom for SetVariable { + type Error = PyErr; + + fn try_from(py: PySetVariable) -> Result { + Ok(py.set_variable) + } +} + +impl LogicalNode for PySetVariable { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} + +#[pymethods] +impl PySetVariable { + #[new] + pub fn new(variable: String, value: String) -> Self { + PySetVariable { + set_variable: SetVariable { variable, value }, + } + } + + pub fn variable(&self) -> String { + self.set_variable.variable.clone() + } + + pub fn value(&self) -> String { + self.set_variable.value.clone() + } +} + +#[pyclass(name = "Prepare", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyPrepare { + prepare: Prepare, +} + +impl From for PyPrepare { + fn from(prepare: Prepare) -> PyPrepare { + PyPrepare { prepare } + } +} + +impl TryFrom for Prepare { + type Error = PyErr; + + fn 
try_from(py: PyPrepare) -> Result { + Ok(py.prepare) + } +} + +impl LogicalNode for PyPrepare { + fn inputs(&self) -> Vec { + vec![PyLogicalPlan::from((*self.prepare.input).clone())] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} + +#[pymethods] +impl PyPrepare { + #[new] + pub fn new(name: String, data_types: Vec, input: PyLogicalPlan) -> Self { + let input = input.plan().clone(); + let data_types = data_types + .into_iter() + .map(|data_type| data_type.into()) + .collect(); + PyPrepare { + prepare: Prepare { + name, + data_types, + input, + }, + } + } + + pub fn name(&self) -> String { + self.prepare.name.clone() + } + + pub fn data_types(&self) -> Vec { + self.prepare + .data_types + .clone() + .into_iter() + .map(|t| t.into()) + .collect() + } + + pub fn input(&self) -> PyLogicalPlan { + PyLogicalPlan { + plan: self.prepare.input.clone(), + } + } +} + +#[pyclass(name = "Execute", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyExecute { + execute: Execute, +} + +impl From for PyExecute { + fn from(execute: Execute) -> PyExecute { + PyExecute { execute } + } +} + +impl TryFrom for Execute { + type Error = PyErr; + + fn try_from(py: PyExecute) -> Result { + Ok(py.execute) + } +} + +impl LogicalNode for PyExecute { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} + +#[pymethods] +impl PyExecute { + #[new] + pub fn new(name: String, parameters: Vec) -> Self { + let parameters = parameters + .into_iter() + .map(|parameter| parameter.into()) + .collect(); + PyExecute { + execute: Execute { name, parameters }, + } + } + + pub fn name(&self) -> String { + self.execute.name.clone() + } + + pub fn parameters(&self) -> Vec { + self.execute + .parameters + .clone() + .into_iter() + .map(|t| t.into()) + .collect() + } +} + +#[pyclass(name = "Deallocate", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyDeallocate { + deallocate: Deallocate, +} + +impl From for PyDeallocate { + fn from(deallocate: Deallocate) -> PyDeallocate { + PyDeallocate { deallocate } + } +} + +impl TryFrom for Deallocate { + type Error = PyErr; + + fn try_from(py: PyDeallocate) -> Result { + Ok(py.deallocate) + } +} + +impl LogicalNode for PyDeallocate { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} + +#[pymethods] +impl PyDeallocate { + #[new] + pub fn new(name: String) -> Self { + PyDeallocate { + deallocate: Deallocate { name }, + } + } + + pub fn name(&self) -> String { + self.deallocate.name.clone() + } +} diff --git a/src/expr/values.rs b/src/expr/values.rs new file mode 100644 index 000000000..fb2692230 --- /dev/null +++ b/src/expr/values.rs @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use datafusion::logical_expr::Values; +use pyo3::{prelude::*, IntoPyObjectExt}; +use pyo3::{pyclass, PyErr, PyResult, Python}; + +use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; + +use super::{logical_node::LogicalNode, PyExpr}; + +#[pyclass(name = "Values", module = "datafusion.expr", subclass)] +#[derive(Clone)] +pub struct PyValues { + values: Values, +} + +impl From for PyValues { + fn from(values: Values) -> PyValues { + PyValues { values } + } +} + +impl TryFrom for Values { + type Error = PyErr; + + fn try_from(py: PyValues) -> Result { + Ok(py.values) + } +} + +impl LogicalNode for PyValues { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} + +#[pymethods] +impl PyValues { + #[new] + pub fn new(schema: PyDFSchema, values: Vec>) -> PyResult { + let values = values + .into_iter() + .map(|row| row.into_iter().map(|expr| expr.into()).collect()) + .collect(); + Ok(PyValues { + values: Values { + schema: Arc::new(schema.into()), + values, + }, + }) + } + + pub fn schema(&self) -> PyResult { + Ok((*self.values.schema).clone().into()) + } + + pub fn values(&self) -> Vec> { + self.values + .values + .clone() + .into_iter() + .map(|row| row.into_iter().map(|expr| expr.into()).collect()) + .collect() + } +} diff --git a/src/sql/logical.rs b/src/sql/logical.rs index 96561c434..198d68bdc 100644 --- a/src/sql/logical.rs +++ b/src/sql/logical.rs @@ -17,10 +17,25 @@ use std::sync::Arc; +use crate::context::PySessionContext; use crate::errors::PyDataFusionResult; use crate::expr::aggregate::PyAggregate; use crate::expr::analyze::PyAnalyze; +use crate::expr::copy_to::PyCopyTo; +use crate::expr::create_catalog::PyCreateCatalog; +use crate::expr::create_catalog_schema::PyCreateCatalogSchema; +use crate::expr::create_external_table::PyCreateExternalTable; +use crate::expr::create_function::PyCreateFunction; +use crate::expr::create_index::PyCreateIndex; +use crate::expr::create_memory_table::PyCreateMemoryTable; +use crate::expr::create_view::PyCreateView; +use crate::expr::describe_table::PyDescribeTable; use crate::expr::distinct::PyDistinct; +use crate::expr::dml::PyDmlStatement; +use crate::expr::drop_catalog_schema::PyDropCatalogSchema; +use crate::expr::drop_function::PyDropFunction; +use crate::expr::drop_table::PyDropTable; +use crate::expr::drop_view::PyDropView; use crate::expr::empty_relation::PyEmptyRelation; use crate::expr::explain::PyExplain; use crate::expr::extension::PyExtension; @@ -28,14 +43,20 @@ use crate::expr::filter::PyFilter; use crate::expr::join::PyJoin; use crate::expr::limit::PyLimit; use crate::expr::projection::PyProjection; +use crate::expr::recursive_query::PyRecursiveQuery; +use crate::expr::repartition::PyRepartition; use crate::expr::sort::PySort; +use crate::expr::statement::{ + PyDeallocate, PyExecute, PyPrepare, PySetVariable, PyTransactionEnd, PyTransactionStart, +}; use crate::expr::subquery::PySubquery; use crate::expr::subquery_alias::PySubqueryAlias; use crate::expr::table_scan::PyTableScan; 
+use crate::expr::union::PyUnion;
 use crate::expr::unnest::PyUnnest;
+use crate::expr::values::PyValues;
 use crate::expr::window::PyWindowExpr;
-use crate::{context::PySessionContext, errors::py_unsupported_variant_err};
-use datafusion::logical_expr::LogicalPlan;
+use datafusion::logical_expr::{DdlStatement, LogicalPlan, Statement};
 use datafusion_proto::logical_plan::{AsLogicalPlan, DefaultLogicalExtensionCodec};
 use prost::Message;
 use pyo3::{exceptions::PyRuntimeError, prelude::*, types::PyBytes};
@@ -82,18 +103,54 @@ impl PyLogicalPlan {
         LogicalPlan::SubqueryAlias(plan) => PySubqueryAlias::from(plan.clone()).to_variant(py),
         LogicalPlan::Unnest(plan) => PyUnnest::from(plan.clone()).to_variant(py),
         LogicalPlan::Window(plan) => PyWindowExpr::from(plan.clone()).to_variant(py),
-        LogicalPlan::Repartition(_)
-        | LogicalPlan::Union(_)
-        | LogicalPlan::Statement(_)
-        | LogicalPlan::Values(_)
-        | LogicalPlan::Dml(_)
-        | LogicalPlan::Ddl(_)
-        | LogicalPlan::Copy(_)
-        | LogicalPlan::DescribeTable(_)
-        | LogicalPlan::RecursiveQuery(_) => Err(py_unsupported_variant_err(format!(
-            "Conversion of variant not implemented: {:?}",
-            self.plan
-        ))),
+        LogicalPlan::Repartition(plan) => PyRepartition::from(plan.clone()).to_variant(py),
+        LogicalPlan::Union(plan) => PyUnion::from(plan.clone()).to_variant(py),
+        LogicalPlan::Statement(plan) => match plan {
+            Statement::TransactionStart(plan) => {
+                PyTransactionStart::from(plan.clone()).to_variant(py)
+            }
+            Statement::TransactionEnd(plan) => {
+                PyTransactionEnd::from(plan.clone()).to_variant(py)
+            }
+            Statement::SetVariable(plan) => PySetVariable::from(plan.clone()).to_variant(py),
+            Statement::Prepare(plan) => PyPrepare::from(plan.clone()).to_variant(py),
+            Statement::Execute(plan) => PyExecute::from(plan.clone()).to_variant(py),
+            Statement::Deallocate(plan) => PyDeallocate::from(plan.clone()).to_variant(py),
+        },
+        LogicalPlan::Values(plan) => PyValues::from(plan.clone()).to_variant(py),
+        LogicalPlan::Dml(plan) => PyDmlStatement::from(plan.clone()).to_variant(py),
+        LogicalPlan::Ddl(plan) => match plan {
+            DdlStatement::CreateExternalTable(plan) => {
+                PyCreateExternalTable::from(plan.clone()).to_variant(py)
+            }
+            DdlStatement::CreateMemoryTable(plan) => {
+                PyCreateMemoryTable::from(plan.clone()).to_variant(py)
+            }
+            DdlStatement::CreateView(plan) => PyCreateView::from(plan.clone()).to_variant(py),
+            DdlStatement::CreateCatalogSchema(plan) => {
+                PyCreateCatalogSchema::from(plan.clone()).to_variant(py)
+            }
+            DdlStatement::CreateCatalog(plan) => {
+                PyCreateCatalog::from(plan.clone()).to_variant(py)
+            }
+            DdlStatement::CreateIndex(plan) => PyCreateIndex::from(plan.clone()).to_variant(py),
+            DdlStatement::DropTable(plan) => PyDropTable::from(plan.clone()).to_variant(py),
+            DdlStatement::DropView(plan) => PyDropView::from(plan.clone()).to_variant(py),
+            DdlStatement::DropCatalogSchema(plan) => {
+                PyDropCatalogSchema::from(plan.clone()).to_variant(py)
+            }
+            DdlStatement::CreateFunction(plan) => {
+                PyCreateFunction::from(plan.clone()).to_variant(py)
+            }
+            DdlStatement::DropFunction(plan) => {
+                PyDropFunction::from(plan.clone()).to_variant(py)
+            }
+        },
+        LogicalPlan::Copy(plan) => PyCopyTo::from(plan.clone()).to_variant(py),
+        LogicalPlan::DescribeTable(plan) => PyDescribeTable::from(plan.clone()).to_variant(py),
+        LogicalPlan::RecursiveQuery(plan) => {
+            PyRecursiveQuery::from(plan.clone()).to_variant(py)
+        }
         }
     }
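With the expanded match above, `LogicalPlan.to_variant()` returns typed wrappers for plan variants that previously raised `py_unsupported_variant_err`. A hedged sketch of the observable effect: transaction statements are planned but not executed eagerly by `SessionContext.sql`, so their logical plans can be inspected directly (the class names follow the `#[pyclass]` names in this patch):

.. code-block:: python

    from datafusion import SessionContext

    ctx = SessionContext()

    # Before this patch both calls raised
    # "Conversion of variant not implemented".
    begin = ctx.sql("BEGIN TRANSACTION").logical_plan().to_variant()
    end = ctx.sql("COMMIT").logical_plan().to_variant()

    print(type(begin).__name__)             # TransactionStart
    print(type(end).__name__, end.chain())  # TransactionEnd False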
From 7d8bcd8d20623beb76a397eb4fddfb18781589eb Mon Sep 17 00:00:00 2001
From: kosiew
Date: Mon, 5 May 2025 21:50:52 +0800
Subject: [PATCH 038/145] Partial fix for 1078: Enhance DataFrame Formatter
 Configuration with Memory and Display Controls (#1119)

* feat: add configurable max table bytes and min table rows for DataFrame display

* Revert "feat: add configurable max table bytes and min table rows for DataFrame display"

This reverts commit f9b78fa3180c5d6c20eaa3b6d0af7426d7084093.

* feat: add FormatterConfig for configurable DataFrame display options
* refactor: simplify attribute extraction in get_formatter_config function
* refactor: remove hardcoded constants and use FormatterConfig for display options
* refactor: simplify record batch collection by using FormatterConfig for display options
* feat: add max_memory_bytes, min_rows_display, and repr_rows parameters to DataFrameHtmlFormatter
* feat: add tests for HTML formatter row display settings and memory limit
* refactor: extract Python formatter retrieval into a separate function

* Revert "feat: add tests for HTML formatter row display settings and memory limit"

This reverts commit e089d7b282e53e587116b11d92760e6d292ec871.

* feat: add tests for HTML formatter row and memory limit configurations

* Revert "feat: add tests for HTML formatter row and memory limit configurations"

This reverts commit 4090fd2f7378855b045d6bfd1368d088cc9ada75.

* feat: add tests for new parameters and validation in DataFrameHtmlFormatter
* Reorganize tests
* refactor: rename and restructure formatter functions for clarity and maintainability
* feat: implement PythonFormatter struct and refactor formatter retrieval for improved clarity
* refactor: improve comments and restructure FormatterConfig usage in PyDataFrame

* Add DataFrame usage guide with HTML rendering customization options (#1108)

* docs: enhance user guide with detailed DataFrame operations and examples
* move /docs/source/api/dataframe.rst into user-guide
* docs: remove DataFrame API documentation
* docs: fix formatting inconsistencies in DataFrame user guide
* Two minor corrections to documentation rendering

---------

Co-authored-by: Tim Saucer

* Update documentation
* refactor: streamline HTML rendering documentation
* refactor: extract validation logic into separate functions for clarity
* Implement feature X to enhance user experience and optimize performance
* feat: add validation method for FormatterConfig to ensure positive integer values
* add comment - ensure minimum rows are collected even if memory or row limits are hit
* Update html_formatter documentation
* update tests
* remove unused type hints from imports in html_formatter.py
* remove redundant tests for DataFrameHtmlFormatter and clean up assertions
* refactor get_attr function to support generic default values
* build_formatter_config_from_python return PyResult
* fix ruff errors
* trigger ci
* fix: remove redundant newline in test_custom_style_provider_html_formatter
* add more tests
* trigger ci
* Fix ruff errors
* fix clippy error
* feat: add validation for parameters in configure_formatter
* test: add tests for invalid parameters in configure_formatter
* Fix ruff errors

---------

Co-authored-by: Tim Saucer
---
 docs/source/user-guide/dataframe.rst |  52 +++++++-
 python/datafusion/html_formatter.py  | 104 ++++++++++++---
 python/tests/test_dataframe.py       | 183 ++++++++++++++++++++++++---
 src/dataframe.rs                     | 142 +++++++++++++++++----
 4 files changed, 413 insertions(+), 68 deletions(-)

diff --git a/docs/source/user-guide/dataframe.rst b/docs/source/user-guide/dataframe.rst
index a78fd8073..11e3d7e72 100644
--- a/docs/source/user-guide/dataframe.rst
+++ 
b/docs/source/user-guide/dataframe.rst @@ -75,13 +75,17 @@ You can customize how DataFrames are rendered in HTML by configuring the formatt # Change the default styling configure_formatter( - max_rows=50, # Maximum number of rows to display - max_width=None, # Maximum width in pixels (None for auto) - theme="light", # Theme: "light" or "dark" - precision=2, # Floating point precision - thousands_separator=",", # Separator for thousands - date_format="%Y-%m-%d", # Date format - truncate_width=20 # Max width for string columns before truncating + max_cell_length=25, # Maximum characters in a cell before truncation + max_width=1000, # Maximum width in pixels + max_height=300, # Maximum height in pixels + max_memory_bytes=2097152, # Maximum memory for rendering (2MB) + min_rows_display=20, # Minimum number of rows to display + repr_rows=10, # Number of rows to display in __repr__ + enable_cell_expansion=True,# Allow expanding truncated cells + custom_css=None, # Additional custom CSS + show_truncation_message=True, # Show message when data is truncated + style_provider=None, # Custom styling provider + use_shared_styles=True # Share styles across tables ) The formatter settings affect all DataFrames displayed after configuration. @@ -113,6 +117,25 @@ For advanced styling needs, you can create a custom style provider: # Apply the custom style provider configure_formatter(style_provider=MyStyleProvider()) +Performance Optimization with Shared Styles +------------------------------------------- +The ``use_shared_styles`` parameter (enabled by default) optimizes performance when displaying +multiple DataFrames in notebook environments: + + .. code-block:: python + from datafusion.html_formatter import StyleProvider, configure_formatter + # Default: Use shared styles (recommended for notebooks) + configure_formatter(use_shared_styles=True) + + # Disable shared styles (each DataFrame includes its own styles) + configure_formatter(use_shared_styles=False) + +When ``use_shared_styles=True``: +- CSS styles and JavaScript are included only once per notebook session +- This reduces HTML output size and prevents style duplication +- Improves rendering performance with many DataFrames +- Applies consistent styling across all DataFrames + Creating a Custom Formatter --------------------------- @@ -177,3 +200,18 @@ You can also use a context manager to temporarily change formatting settings: # Back to default formatting df.show() + +Memory and Display Controls +--------------------------- + +You can control how much data is displayed and how much memory is used for rendering: + + .. code-block:: python + + configure_formatter( + max_memory_bytes=4 * 1024 * 1024, # 4MB maximum memory for display + min_rows_display=50, # Always show at least 50 rows + repr_rows=20 # Show 20 rows in __repr__ output + ) + +These parameters help balance comprehensive data display against performance considerations. \ No newline at end of file diff --git a/python/datafusion/html_formatter.py b/python/datafusion/html_formatter.py index a50e14fd5..12a7e4553 100644 --- a/python/datafusion/html_formatter.py +++ b/python/datafusion/html_formatter.py @@ -27,6 +27,36 @@ ) +def _validate_positive_int(value: Any, param_name: str) -> None: + """Validate that a parameter is a positive integer. 
+ + Args: + value: The value to validate + param_name: Name of the parameter (used in error message) + + Raises: + ValueError: If the value is not a positive integer + """ + if not isinstance(value, int) or value <= 0: + msg = f"{param_name} must be a positive integer" + raise ValueError(msg) + + +def _validate_bool(value: Any, param_name: str) -> None: + """Validate that a parameter is a boolean. + + Args: + value: The value to validate + param_name: Name of the parameter (used in error message) + + Raises: + TypeError: If the value is not a boolean + """ + if not isinstance(value, bool): + msg = f"{param_name} must be a boolean" + raise TypeError(msg) + + @runtime_checkable class CellFormatter(Protocol): """Protocol for cell value formatters.""" @@ -91,6 +121,9 @@ class DataFrameHtmlFormatter: max_cell_length: Maximum characters to display in a cell before truncation max_width: Maximum width of the HTML table in pixels max_height: Maximum height of the HTML table in pixels + max_memory_bytes: Maximum memory in bytes for rendered data (default: 2MB) + min_rows_display: Minimum number of rows to display + repr_rows: Default number of rows to display in repr output enable_cell_expansion: Whether to add expand/collapse buttons for long cell values custom_css: Additional CSS to include in the HTML output @@ -108,6 +141,9 @@ def __init__( max_cell_length: int = 25, max_width: int = 1000, max_height: int = 300, + max_memory_bytes: int = 2 * 1024 * 1024, # 2 MB + min_rows_display: int = 20, + repr_rows: int = 10, enable_cell_expansion: bool = True, custom_css: Optional[str] = None, show_truncation_message: bool = True, @@ -124,6 +160,12 @@ def __init__( Maximum width of the displayed table in pixels. max_height : int, default 300 Maximum height of the displayed table in pixels. + max_memory_bytes : int, default 2097152 (2MB) + Maximum memory in bytes for rendered data. + min_rows_display : int, default 20 + Minimum number of rows to display. + repr_rows : int, default 10 + Default number of rows to display in repr output. enable_cell_expansion : bool, default True Whether to allow cells to expand when clicked. custom_css : str, optional @@ -139,7 +181,8 @@ def __init__( Raises: ------ ValueError - If max_cell_length, max_width, or max_height is not a positive integer. + If max_cell_length, max_width, max_height, max_memory_bytes, + min_rows_display, or repr_rows is not a positive integer. TypeError If enable_cell_expansion, show_truncation_message, or use_shared_styles is not a boolean, @@ -148,27 +191,17 @@ def __init__( protocol. 
""" # Validate numeric parameters - - if not isinstance(max_cell_length, int) or max_cell_length <= 0: - msg = "max_cell_length must be a positive integer" - raise ValueError(msg) - if not isinstance(max_width, int) or max_width <= 0: - msg = "max_width must be a positive integer" - raise ValueError(msg) - if not isinstance(max_height, int) or max_height <= 0: - msg = "max_height must be a positive integer" - raise ValueError(msg) + _validate_positive_int(max_cell_length, "max_cell_length") + _validate_positive_int(max_width, "max_width") + _validate_positive_int(max_height, "max_height") + _validate_positive_int(max_memory_bytes, "max_memory_bytes") + _validate_positive_int(min_rows_display, "min_rows_display") + _validate_positive_int(repr_rows, "repr_rows") # Validate boolean parameters - if not isinstance(enable_cell_expansion, bool): - msg = "enable_cell_expansion must be a boolean" - raise TypeError(msg) - if not isinstance(show_truncation_message, bool): - msg = "show_truncation_message must be a boolean" - raise TypeError(msg) - if not isinstance(use_shared_styles, bool): - msg = "use_shared_styles must be a boolean" - raise TypeError(msg) + _validate_bool(enable_cell_expansion, "enable_cell_expansion") + _validate_bool(show_truncation_message, "show_truncation_message") + _validate_bool(use_shared_styles, "use_shared_styles") # Validate custom_css if custom_css is not None and not isinstance(custom_css, str): @@ -183,6 +216,9 @@ def __init__( self.max_cell_length = max_cell_length self.max_width = max_width self.max_height = max_height + self.max_memory_bytes = max_memory_bytes + self.min_rows_display = min_rows_display + self.repr_rows = repr_rows self.enable_cell_expansion = enable_cell_expansion self.custom_css = custom_css self.show_truncation_message = show_truncation_message @@ -597,6 +633,9 @@ def configure_formatter(**kwargs: Any) -> None: **kwargs: Formatter configuration parameters like max_cell_length, max_width, max_height, enable_cell_expansion, etc. + Raises: + ValueError: If any invalid parameters are provided + Example: >>> from datafusion.html_formatter import configure_formatter >>> configure_formatter( @@ -606,6 +645,31 @@ def configure_formatter(**kwargs: Any) -> None: ... use_shared_styles=True ... ) """ + # Valid parameters accepted by DataFrameHtmlFormatter + valid_params = { + "max_cell_length", + "max_width", + "max_height", + "max_memory_bytes", + "min_rows_display", + "repr_rows", + "enable_cell_expansion", + "custom_css", + "show_truncation_message", + "style_provider", + "use_shared_styles", + } + + # Check for invalid parameters + invalid_params = set(kwargs) - valid_params + if invalid_params: + msg = ( + f"Invalid formatter parameters: {', '.join(invalid_params)}. 
" + f"Valid parameters are: {', '.join(valid_params)}" + ) + raise ValueError(msg) + + # Create and set formatter with validated parameters set_formatter(DataFrameHtmlFormatter(**kwargs)) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 464b884db..e01308c86 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -41,6 +41,8 @@ ) from pyarrow.csv import write_csv +MB = 1024 * 1024 + @pytest.fixture def ctx(): @@ -117,6 +119,31 @@ def clean_formatter_state(): reset_formatter() +# custom style for testing with html formatter +class CustomStyleProvider: + def get_cell_style(self) -> str: + return ( + "background-color: #f5f5f5; color: #333; padding: 8px; border: " + "1px solid #ddd;" + ) + + def get_header_style(self) -> str: + return ( + "background-color: #4285f4; color: white; font-weight: bold; " + "padding: 10px; border: 1px solid #3367d6;" + ) + + +def count_table_rows(html_content: str) -> int: + """Count the number of table rows in HTML content. + Args: + html_content: HTML string to analyze + Returns: + Number of table rows found (number of tags) + """ + return len(re.findall(r" str: - return ( - "background-color: #f5f5f5; color: #333; padding: 8px; border: " - "1px solid #ddd;" - ) - - def get_header_style(self) -> str: - return ( - "background-color: #4285f4; color: white; font-weight: bold; " - "padding: 10px; border: 1px solid #3367d6;" - ) - # Configure with custom style provider configure_formatter(style_provider=CustomStyleProvider()) @@ -917,6 +930,141 @@ def get_header_style(self) -> str: assert "color: #5af" in html_output # Even numbers +def test_html_formatter_memory(df, clean_formatter_state): + """Test the memory and row control parameters in DataFrameHtmlFormatter.""" + configure_formatter(max_memory_bytes=10, min_rows_display=1) + html_output = df._repr_html_() + + # Count the number of table rows in the output + tr_count = count_table_rows(html_output) + # With a tiny memory limit of 10 bytes, the formatter should display + # the minimum number of rows (1) plus a message about truncation + assert tr_count == 2 # 1 for header row, 1 for data row + assert "data truncated" in html_output.lower() + + configure_formatter(max_memory_bytes=10 * MB, min_rows_display=1) + html_output = df._repr_html_() + # With larger memory limit and min_rows=2, should display all rows + tr_count = count_table_rows(html_output) + # Table should have header row (1) + 3 data rows = 4 rows + assert tr_count == 4 + # No truncation message should appear + assert "data truncated" not in html_output.lower() + + +def test_html_formatter_repr_rows(df, clean_formatter_state): + configure_formatter(min_rows_display=2, repr_rows=2) + html_output = df._repr_html_() + + tr_count = count_table_rows(html_output) + # Tabe should have header row (1) + 2 data rows = 3 rows + assert tr_count == 3 + + configure_formatter(min_rows_display=2, repr_rows=3) + html_output = df._repr_html_() + + tr_count = count_table_rows(html_output) + # Tabe should have header row (1) + 3 data rows = 4 rows + assert tr_count == 4 + + +def test_html_formatter_validation(): + # Test validation for invalid parameters + + with pytest.raises(ValueError, match="max_cell_length must be a positive integer"): + DataFrameHtmlFormatter(max_cell_length=0) + + with pytest.raises(ValueError, match="max_width must be a positive integer"): + DataFrameHtmlFormatter(max_width=0) + + with pytest.raises(ValueError, match="max_height must be a positive integer"): + 
DataFrameHtmlFormatter(max_height=0) + + with pytest.raises(ValueError, match="max_memory_bytes must be a positive integer"): + DataFrameHtmlFormatter(max_memory_bytes=0) + + with pytest.raises(ValueError, match="max_memory_bytes must be a positive integer"): + DataFrameHtmlFormatter(max_memory_bytes=-100) + + with pytest.raises(ValueError, match="min_rows_display must be a positive integer"): + DataFrameHtmlFormatter(min_rows_display=0) + + with pytest.raises(ValueError, match="min_rows_display must be a positive integer"): + DataFrameHtmlFormatter(min_rows_display=-5) + + with pytest.raises(ValueError, match="repr_rows must be a positive integer"): + DataFrameHtmlFormatter(repr_rows=0) + + with pytest.raises(ValueError, match="repr_rows must be a positive integer"): + DataFrameHtmlFormatter(repr_rows=-10) + + +def test_configure_formatter(df, clean_formatter_state): + """Test using custom style providers with the HTML formatter and configured + parameters.""" + + # these are non-default values + max_cell_length = 10 + max_width = 500 + max_height = 30 + max_memory_bytes = 3 * MB + min_rows_display = 2 + repr_rows = 2 + enable_cell_expansion = False + show_truncation_message = False + use_shared_styles = False + + reset_formatter() + formatter_default = get_formatter() + + assert formatter_default.max_cell_length != max_cell_length + assert formatter_default.max_width != max_width + assert formatter_default.max_height != max_height + assert formatter_default.max_memory_bytes != max_memory_bytes + assert formatter_default.min_rows_display != min_rows_display + assert formatter_default.repr_rows != repr_rows + assert formatter_default.enable_cell_expansion != enable_cell_expansion + assert formatter_default.show_truncation_message != show_truncation_message + assert formatter_default.use_shared_styles != use_shared_styles + + # Configure with custom style provider and additional parameters + configure_formatter( + max_cell_length=max_cell_length, + max_width=max_width, + max_height=max_height, + max_memory_bytes=max_memory_bytes, + min_rows_display=min_rows_display, + repr_rows=repr_rows, + enable_cell_expansion=enable_cell_expansion, + show_truncation_message=show_truncation_message, + use_shared_styles=use_shared_styles, + ) + formatter_custom = get_formatter() + assert formatter_custom.max_cell_length == max_cell_length + assert formatter_custom.max_width == max_width + assert formatter_custom.max_height == max_height + assert formatter_custom.max_memory_bytes == max_memory_bytes + assert formatter_custom.min_rows_display == min_rows_display + assert formatter_custom.repr_rows == repr_rows + assert formatter_custom.enable_cell_expansion == enable_cell_expansion + assert formatter_custom.show_truncation_message == show_truncation_message + assert formatter_custom.use_shared_styles == use_shared_styles + + +def test_configure_formatter_invalid_params(clean_formatter_state): + """Test that configure_formatter rejects invalid parameters.""" + with pytest.raises(ValueError, match="Invalid formatter parameters"): + configure_formatter(invalid_param=123) + + # Test with multiple parameters, one valid and one invalid + with pytest.raises(ValueError, match="Invalid formatter parameters"): + configure_formatter(max_width=500, not_a_real_param="test") + + # Test with multiple invalid parameters + with pytest.raises(ValueError, match="Invalid formatter parameters"): + configure_formatter(fake_param1="test", fake_param2=456) + + def test_get_dataframe(tmp_path): ctx = SessionContext() @@ -1505,9 
+1653,8 @@ def add_with_parameter(df_internal, value: Any) -> DataFrame: assert result["new_col"] == [3 for _i in range(3)] -def test_dataframe_repr_html_structure(df) -> None: +def test_dataframe_repr_html_structure(df, clean_formatter_state) -> None: """Test that DataFrame._repr_html_ produces expected HTML output structure.""" - import re output = df._repr_html_() @@ -1537,7 +1684,7 @@ def test_dataframe_repr_html_structure(df) -> None: assert len(body_matches) == 1, "Expected pattern of values not found in HTML output" -def test_dataframe_repr_html_values(df): +def test_dataframe_repr_html_values(df, clean_formatter_state): """Test that DataFrame._repr_html_ contains the expected data values.""" html = df._repr_html_() assert html is not None diff --git a/src/dataframe.rs b/src/dataframe.rs index 787f63520..211e31bd1 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -71,8 +71,103 @@ impl PyTableProvider { PyTable::new(table_provider) } } -const MAX_TABLE_BYTES_TO_DISPLAY: usize = 2 * 1024 * 1024; // 2 MB -const MIN_TABLE_ROWS_TO_DISPLAY: usize = 20; + +/// Configuration for DataFrame display formatting +#[derive(Debug, Clone)] +pub struct FormatterConfig { + /// Maximum memory in bytes to use for display (default: 2MB) + pub max_bytes: usize, + /// Minimum number of rows to display (default: 20) + pub min_rows: usize, + /// Number of rows to include in __repr__ output (default: 10) + pub repr_rows: usize, +} + +impl Default for FormatterConfig { + fn default() -> Self { + Self { + max_bytes: 2 * 1024 * 1024, // 2MB + min_rows: 20, + repr_rows: 10, + } + } +} + +impl FormatterConfig { + /// Validates that all configuration values are positive integers. + /// + /// # Returns + /// + /// `Ok(())` if all values are valid, or an `Err` with a descriptive error message. 
+ pub fn validate(&self) -> Result<(), String> { + if self.max_bytes == 0 { + return Err("max_bytes must be a positive integer".to_string()); + } + + if self.min_rows == 0 { + return Err("min_rows must be a positive integer".to_string()); + } + + if self.repr_rows == 0 { + return Err("repr_rows must be a positive integer".to_string()); + } + + Ok(()) + } +} + +/// Holds the Python formatter and its configuration +struct PythonFormatter<'py> { + /// The Python formatter object + formatter: Bound<'py, PyAny>, + /// The formatter configuration + config: FormatterConfig, +} + +/// Get the Python formatter and its configuration +fn get_python_formatter_with_config(py: Python) -> PyResult { + let formatter = import_python_formatter(py)?; + let config = build_formatter_config_from_python(&formatter)?; + Ok(PythonFormatter { formatter, config }) +} + +/// Get the Python formatter from the datafusion.html_formatter module +fn import_python_formatter(py: Python) -> PyResult> { + let formatter_module = py.import("datafusion.html_formatter")?; + let get_formatter = formatter_module.getattr("get_formatter")?; + get_formatter.call0() +} + +// Helper function to extract attributes with fallback to default +fn get_attr<'a, T>(py_object: &'a Bound<'a, PyAny>, attr_name: &str, default_value: T) -> T +where + T: for<'py> pyo3::FromPyObject<'py> + Clone, +{ + py_object + .getattr(attr_name) + .and_then(|v| v.extract::()) + .unwrap_or_else(|_| default_value.clone()) +} + +/// Helper function to create a FormatterConfig from a Python formatter object +fn build_formatter_config_from_python(formatter: &Bound<'_, PyAny>) -> PyResult { + let default_config = FormatterConfig::default(); + let max_bytes = get_attr(formatter, "max_memory_bytes", default_config.max_bytes); + let min_rows = get_attr(formatter, "min_rows_display", default_config.min_rows); + let repr_rows = get_attr(formatter, "repr_rows", default_config.repr_rows); + + let config = FormatterConfig { + max_bytes, + min_rows, + repr_rows, + }; + + // Return the validated config, converting String error to PyErr + config + .validate() + .map_err(|e| pyo3::exceptions::PyValueError::new_err(e))?; + Ok(config) +} /// A PyDataFrame is a representation of a logical plan and an API to compose statements. /// Use it to build a plan and `.collect()` to execute the plan and collect the result. 
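Note that `build_formatter_config_from_python` duck-types against whatever `get_formatter()` returns: only the three attributes named above are read, and each falls back to its `FormatterConfig` default (2 MB / 20 rows / 10 rows) when missing or of the wrong type. A small sketch of that contract using the public API from this patch; the values are illustrative:

.. code-block:: python

    from datafusion.html_formatter import configure_formatter, get_formatter

    # These three attributes are exactly what the Rust helper reads;
    # each must be a positive integer or validation raises ValueError.
    configure_formatter(
        max_memory_bytes=4 * 1024 * 1024,  # 4 MB rendering budget
        min_rows_display=25,
        repr_rows=15,
    )

    f = get_formatter()
    print(f.max_memory_bytes, f.min_rows_display, f.repr_rows)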
@@ -114,9 +209,14 @@ impl PyDataFrame { } fn __repr__(&self, py: Python) -> PyDataFusionResult { + // Get the Python formatter config + let PythonFormatter { + formatter: _, + config, + } = get_python_formatter_with_config(py)?; let (batches, has_more) = wait_for_future( py, - collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10), + collect_record_batches_to_display(self.df.as_ref().clone(), config), )?; if batches.is_empty() { // This should not be reached, but do it for safety since we index into the vector below @@ -135,13 +235,11 @@ impl PyDataFrame { } fn _repr_html_(&self, py: Python) -> PyDataFusionResult { + // Get the Python formatter and config + let PythonFormatter { formatter, config } = get_python_formatter_with_config(py)?; let (batches, has_more) = wait_for_future( py, - collect_record_batches_to_display( - self.df.as_ref().clone(), - MIN_TABLE_ROWS_TO_DISPLAY, - usize::MAX, - ), + collect_record_batches_to_display(self.df.as_ref().clone(), config), )?; if batches.is_empty() { // This should not be reached, but do it for safety since we index into the vector below @@ -158,12 +256,6 @@ impl PyDataFrame { let py_schema = self.schema().into_pyobject(py)?; - // Get the Python formatter module and call format_html - let formatter_module = py.import("datafusion.html_formatter")?; - let get_formatter = formatter_module.getattr("get_formatter")?; - let formatter = get_formatter.call0()?; - - // Call format_html method on the formatter let kwargs = pyo3::types::PyDict::new(py); let py_batches_list = PyList::new(py, py_batches.as_slice())?; kwargs.set_item("batches", py_batches_list)?; @@ -796,9 +888,14 @@ fn record_batch_into_schema( /// rows, set min_rows == max_rows. async fn collect_record_batches_to_display( df: DataFrame, - min_rows: usize, - max_rows: usize, + config: FormatterConfig, ) -> Result<(Vec, bool), DataFusionError> { + let FormatterConfig { + max_bytes, + min_rows, + repr_rows, + } = config; + let partitioned_stream = df.execute_stream_partitioned().await?; let mut stream = futures::stream::iter(partitioned_stream).flatten(); let mut size_estimate_so_far = 0; @@ -806,9 +903,8 @@ async fn collect_record_batches_to_display( let mut record_batches = Vec::default(); let mut has_more = false; - while (size_estimate_so_far < MAX_TABLE_BYTES_TO_DISPLAY && rows_so_far < max_rows) - || rows_so_far < min_rows - { + // ensure minimum rows even if memory/row limits are hit + while (size_estimate_so_far < max_bytes && rows_so_far < repr_rows) || rows_so_far < min_rows { let mut rb = match stream.next().await { None => { break; @@ -821,8 +917,8 @@ async fn collect_record_batches_to_display( if rows_in_rb > 0 { size_estimate_so_far += rb.get_array_memory_size(); - if size_estimate_so_far > MAX_TABLE_BYTES_TO_DISPLAY { - let ratio = MAX_TABLE_BYTES_TO_DISPLAY as f32 / size_estimate_so_far as f32; + if size_estimate_so_far > max_bytes { + let ratio = max_bytes as f32 / size_estimate_so_far as f32; let total_rows = rows_in_rb + rows_so_far; let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; @@ -838,8 +934,8 @@ async fn collect_record_batches_to_display( } } - if rows_in_rb + rows_so_far > max_rows { - rb = rb.slice(0, max_rows - rows_so_far); + if rows_in_rb + rows_so_far > repr_rows { + rb = rb.slice(0, repr_rows - rows_so_far); has_more = true; } From f3c98ec7a2eb325041530b1ae8d6de41aa558037 Mon Sep 17 00:00:00 2001 From: kosiew Date: Fri, 16 May 2025 14:34:19 +0800 Subject: [PATCH 039/145] Add fill_null method to DataFrame API for handling missing 
values (#1019) * feat: add fill_null method to DataFrame for handling null values * test: add coalesce function tests for handling default values * Resolve test cases for fill_null * feat: add fill_nan method to DataFrame for handling NaN values * move imports out of functions * docs: add documentation for fill_null and fill_nan methods in DataFrame * Add more tests * fix ruff errors * amend def fill_null to invoke PyDataFrame's fill_null - Implemented `fill_null` method in `dataframe.rs` to allow filling null values with a specified value for specific columns or all columns. - Added a helper function `python_value_to_scalar_value` to convert Python values to DataFusion ScalarValues, supporting various types including integers, floats, booleans, strings, and timestamps. - Updated the `count` method in `PyDataFrame` to maintain functionality. * refactor: remove fill_nan method documentation from functions.rst * refactor: remove unused import of Enum from dataframe.py * refactor: improve error handling and type extraction in python_value_to_scalar_value function * refactor: enhance datetime and date conversion logic in python_value_to_scalar_value function * refactor: streamline type extraction in python_value_to_scalar_value function * fix try_convert_to_string * refactor: improve type handling in python_value_to_scalar_value function * refactor: move py_obj_to_scalar_value function to utils module * refactor: update fill_null to use py_obj_to_scalar_value from utils * Remove python_object_to_scalar_value code * refactor: enhance py_obj_to_scalar_value to utilize PyArrow for complex type conversion * refactor: update py_obj_to_scalar_value to handle errors and use extract_bound for PyArrow scalar conversion * refactor: modify py_obj_to_scalar_value to return ScalarValue directly and streamline error handling * refactor: update py_obj_to_scalar_value to return a Result for better error handling * test: add tests for fill_null functionality in DataFrame with null values * test: enhance null DataFrame tests to include date32 and date64 columns * refactor: simplify py_obj_to_scalar_value by removing direct extraction of basic types * refactor: remove unnecessary documentation from py_obj_to_scalar_value function * Fix ruff errors * test: update datetime handling in coalesce tests to include timezone information * Fix ruff errors * trigger ci --- .../common-operations/functions.rst | 21 ++ python/datafusion/dataframe.py | 26 +- python/tests/test_dataframe.py | 266 ++++++++++++++++++ python/tests/test_functions.py | 61 ++++ src/config.rs | 21 +- src/dataframe.rs | 23 +- src/utils.rs | 18 ++ 7 files changed, 414 insertions(+), 22 deletions(-) diff --git a/docs/source/user-guide/common-operations/functions.rst b/docs/source/user-guide/common-operations/functions.rst index 12097be8f..d458d3eb0 100644 --- a/docs/source/user-guide/common-operations/functions.rst +++ b/docs/source/user-guide/common-operations/functions.rst @@ -129,3 +129,24 @@ The function :py:func:`~datafusion.functions.in_list` allows to check a column f .limit(20) .to_pandas() ) + + +Handling Missing Values +===================== + +DataFusion provides methods to handle missing values in DataFrames: + +fill_null +--------- + +The ``fill_null()`` method replaces NULL values in specified columns with a provided value: + +.. 
code-block:: python + + # Fill all NULL values with 0 where possible + df = df.fill_null(0) + + # Fill NULL values only in specific string columns + df = df.fill_null("missing", subset=["name", "category"]) + +The fill value will be cast to match each column's type. If casting fails for a column, that column remains unchanged. diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 26fe8f453..a1df7e080 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -37,6 +37,8 @@ except ImportError: from typing_extensions import deprecated # Python 3.12 +from datafusion._internal import DataFrame as DataFrameInternal +from datafusion.expr import Expr, SortExpr, sort_or_default from datafusion.plan import ExecutionPlan, LogicalPlan from datafusion.record_batch import RecordBatchStream @@ -53,8 +55,6 @@ from enum import Enum -from datafusion.expr import Expr, SortExpr, sort_or_default - # excerpt from deltalake # https://github.com/apache/datafusion-python/pull/981#discussion_r1905619163 @@ -869,3 +869,25 @@ def within_limit(df: DataFrame, limit: int) -> DataFrame: DataFrame: After applying func to the original dataframe. """ return func(self, *args) + + def fill_null(self, value: Any, subset: list[str] | None = None) -> DataFrame: + """Fill null values in specified columns with a value. + + Args: + value: Value to replace nulls with. Will be cast to match column type. + subset: Optional list of column names to fill. If None, fills all columns. + + Returns: + DataFrame with null values replaced where type casting is possible + + Examples: + >>> df = df.fill_null(0) # Fill all nulls with 0 where possible + >>> # Fill nulls in specific string columns + >>> df = df.fill_null("missing", subset=["name", "category"]) + + Notes: + - Only fills nulls in columns where the value can be cast to the column type + - For columns where casting fails, the original column is kept unchanged + - For columns not in subset, the original column is kept unchanged + """ + return DataFrame(self.df.fill_null(value, subset)) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index e01308c86..dd5f962b2 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+import datetime import os import re from typing import Any @@ -119,6 +120,38 @@ def clean_formatter_state(): reset_formatter() +@pytest.fixture +def null_df(): + """Create a DataFrame with null values of different types.""" + ctx = SessionContext() + + # Create a RecordBatch with nulls across different types + batch = pa.RecordBatch.from_arrays( + [ + pa.array([1, None, 3, None], type=pa.int64()), + pa.array([4.5, 6.7, None, None], type=pa.float64()), + pa.array(["a", None, "c", None], type=pa.string()), + pa.array([True, None, False, None], type=pa.bool_()), + pa.array( + [10957, None, 18993, None], type=pa.date32() + ), # 2000-01-01, null, 2022-01-01, null + pa.array( + [946684800000, None, 1640995200000, None], type=pa.date64() + ), # 2000-01-01, null, 2022-01-01, null + ], + names=[ + "int_col", + "float_col", + "str_col", + "bool_col", + "date32_col", + "date64_col", + ], + ) + + return ctx.create_dataframe([[batch]]) + + # custom style for testing with html formatter class CustomStyleProvider: def get_cell_style(self) -> str: @@ -1794,3 +1827,236 @@ def test_html_formatter_manual_format_html(clean_formatter_state): assert "") + return html + + def _build_table_container_start(self) -> list[str]: + """Build the opening tags for the table container.""" + html = [] + html.append( + f'
' + ) + html.append('') + return html + + def _build_table_header(self, schema: Any) -> list[str]: + """Build the HTML table header with column names.""" + html = [] + html.append("") + html.append("") + for field in schema: + if self._custom_header_builder: + html.append(self._custom_header_builder(field)) + else: + html.append( + f"" + ) + html.append("") + html.append("") + return html + + def _build_table_body(self, batches: list, table_uuid: str) -> list[str]: + """Build the HTML table body with data rows.""" + html = [] + html.append("") + + row_count = 0 + for batch in batches: + for row_idx in range(batch.num_rows): + row_count += 1 + html.append("") + + for col_idx, column in enumerate(batch.columns): + # Get the raw value from the column + raw_value = self._get_cell_value(column, row_idx) + + # Always check for type formatters first to format the value + formatted_value = self._format_cell_value(raw_value) + + # Then apply either custom cell builder or standard cell formatting + if self._custom_cell_builder: + # Pass both the raw value and formatted value to let the + # builder decide + cell_html = self._custom_cell_builder( + raw_value, row_count, col_idx, table_uuid + ) + html.append(cell_html) + else: + # Standard cell formatting with formatted value + if ( + len(str(raw_value)) > self.max_cell_length + and self.enable_cell_expansion + ): + cell_html = self._build_expandable_cell( + formatted_value, row_count, col_idx, table_uuid + ) + else: + cell_html = self._build_regular_cell(formatted_value) + html.append(cell_html) + + html.append("") + + html.append("") + return html + + def _get_cell_value(self, column: Any, row_idx: int) -> Any: + """Extract a cell value from a column. + + Args: + column: Arrow array + row_idx: Row index + + Returns: + The raw cell value + """ + try: + value = column[row_idx] + + if hasattr(value, "as_py"): + return value.as_py() + except (AttributeError, TypeError): + pass + else: + return value + + def _format_cell_value(self, value: Any) -> str: + """Format a cell value for display. + + Uses registered type formatters if available. + + Args: + value: The cell value to format + + Returns: + Formatted cell value as string + """ + # Check for custom type formatters + for type_cls, formatter in self._type_formatters.items(): + if isinstance(value, type_cls): + return formatter(value) + + # If no formatter matched, return string representation + return str(value) + + def _build_expandable_cell( + self, formatted_value: str, row_count: int, col_idx: int, table_uuid: str + ) -> str: + """Build an expandable cell for long content.""" + short_value = str(formatted_value)[: self.max_cell_length] + return ( + f"" + ) + + def _build_regular_cell(self, formatted_value: str) -> str: + """Build a regular table cell.""" + return ( + f"" + ) + + def _build_html_footer(self, has_more: bool) -> list[str]: + """Build the HTML footer with JavaScript and messages.""" + html = [] + + # Add JavaScript for interactivity only if cell expansion is enabled + # and we're not using the shared styles approach + if self.enable_cell_expansion and not self.use_shared_styles: + html.append(self._get_javascript()) + + # Add truncation message if needed + if has_more and self.show_truncation_message: + html.append("
Data truncated due to size.
") + + return html + + def _get_default_css(self) -> str: + """Get default CSS styles for the HTML table.""" + return """ + .expandable-container { + display: inline-block; + max-width: 200px; + } + .expandable { + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; + display: block; + } + .full-text { + display: none; + white-space: normal; + } + .expand-btn { + cursor: pointer; + color: blue; + text-decoration: underline; + border: none; + background: none; + font-size: inherit; + display: block; + margin-top: 5px; + } + """ + + def _get_javascript(self) -> str: + """Get JavaScript code for interactive elements.""" + return """ + + """ + + +class FormatterManager: + """Manager class for the global DataFrame HTML formatter instance.""" + + _default_formatter: DataFrameHtmlFormatter = DataFrameHtmlFormatter() + + @classmethod + def set_formatter(cls, formatter: DataFrameHtmlFormatter) -> None: + """Set the global DataFrame HTML formatter. + + Args: + formatter: The formatter instance to use globally + """ + cls._default_formatter = formatter + _refresh_formatter_reference() + + @classmethod + def get_formatter(cls) -> DataFrameHtmlFormatter: + """Get the current global DataFrame HTML formatter. + + Returns: + The global HTML formatter instance + """ + return cls._default_formatter + + +def get_formatter() -> DataFrameHtmlFormatter: + """Get the current global DataFrame HTML formatter. + + This function is used by the DataFrame._repr_html_ implementation to access + the shared formatter instance. It can also be used directly when custom + HTML rendering is needed. + + Returns: + The global HTML formatter instance + + Example: + >>> from datafusion.html_formatter import get_formatter + >>> formatter = get_formatter() + >>> formatter.max_cell_length = 50 # Increase cell length + """ + return FormatterManager.get_formatter() + + +def set_formatter(formatter: DataFrameHtmlFormatter) -> None: + """Set the global DataFrame HTML formatter. + + Args: + formatter: The formatter instance to use globally + + Example: + >>> from datafusion.html_formatter import get_formatter, set_formatter + >>> custom_formatter = DataFrameHtmlFormatter(max_cell_length=100) + >>> set_formatter(custom_formatter) + """ + FormatterManager.set_formatter(formatter) + + +def configure_formatter(**kwargs: Any) -> None: + """Configure the global DataFrame HTML formatter. + + This function creates a new formatter with the provided configuration + and sets it as the global formatter for all DataFrames. + + Args: + **kwargs: Formatter configuration parameters like max_cell_length, + max_width, max_height, enable_cell_expansion, etc. + + Raises: + ValueError: If any invalid parameters are provided + + Example: + >>> from datafusion.html_formatter import configure_formatter + >>> configure_formatter( + ... max_cell_length=50, + ... max_height=500, + ... enable_cell_expansion=True, + ... use_shared_styles=True + ... ) + """ + # Valid parameters accepted by DataFrameHtmlFormatter + valid_params = { + "max_cell_length", + "max_width", + "max_height", + "max_memory_bytes", + "min_rows_display", + "repr_rows", + "enable_cell_expansion", + "custom_css", + "show_truncation_message", + "style_provider", + "use_shared_styles", + } + + # Check for invalid parameters + invalid_params = set(kwargs) - valid_params + if invalid_params: + msg = ( + f"Invalid formatter parameters: {', '.join(invalid_params)}. 
" + f"Valid parameters are: {', '.join(valid_params)}" + ) + raise ValueError(msg) + + # Create and set formatter with validated parameters + set_formatter(DataFrameHtmlFormatter(**kwargs)) + + +def reset_formatter() -> None: + """Reset the global DataFrame HTML formatter to default settings. + + This function creates a new formatter with default configuration + and sets it as the global formatter for all DataFrames. + + Example: + >>> from datafusion.html_formatter import reset_formatter + >>> reset_formatter() # Reset formatter to default settings + """ + formatter = DataFrameHtmlFormatter() + # Reset the styles_loaded flag to ensure styles will be reloaded + DataFrameHtmlFormatter._styles_loaded = False + set_formatter(formatter) + + +def reset_styles_loaded_state() -> None: + """Reset the styles loaded state to force reloading of styles. + + This can be useful when switching between notebook sessions or + when styles need to be refreshed. + + Example: + >>> from datafusion.html_formatter import reset_styles_loaded_state + >>> reset_styles_loaded_state() # Force styles to reload in next render + """ + DataFrameHtmlFormatter._styles_loaded = False + + +def _refresh_formatter_reference() -> None: + """Refresh formatter reference in any modules using it. + + This helps ensure that changes to the formatter are reflected in existing + DataFrames that might be caching the formatter reference. + """ + # This is a no-op but signals modules to refresh their reference diff --git a/python/datafusion/html_formatter.py b/python/datafusion/html_formatter.py index 12a7e4553..65eb1f042 100644 --- a/python/datafusion/html_formatter.py +++ b/python/datafusion/html_formatter.py @@ -14,698 +14,16 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""HTML formatting utilities for DataFusion DataFrames.""" -from __future__ import annotations +"""Deprecated module for dataframe formatting.""" -from typing import ( - Any, - Callable, - Optional, - Protocol, - runtime_checkable, -) - - -def _validate_positive_int(value: Any, param_name: str) -> None: - """Validate that a parameter is a positive integer. - - Args: - value: The value to validate - param_name: Name of the parameter (used in error message) - - Raises: - ValueError: If the value is not a positive integer - """ - if not isinstance(value, int) or value <= 0: - msg = f"{param_name} must be a positive integer" - raise ValueError(msg) - - -def _validate_bool(value: Any, param_name: str) -> None: - """Validate that a parameter is a boolean. - - Args: - value: The value to validate - param_name: Name of the parameter (used in error message) - - Raises: - TypeError: If the value is not a boolean - """ - if not isinstance(value, bool): - msg = f"{param_name} must be a boolean" - raise TypeError(msg) - - -@runtime_checkable -class CellFormatter(Protocol): - """Protocol for cell value formatters.""" - - def __call__(self, value: Any) -> str: - """Format a cell value to string representation.""" - ... - - -@runtime_checkable -class StyleProvider(Protocol): - """Protocol for HTML style providers.""" - - def get_cell_style(self) -> str: - """Get the CSS style for table cells.""" - ... - - def get_header_style(self) -> str: - """Get the CSS style for header cells.""" - ... - - -class DefaultStyleProvider: - """Default implementation of StyleProvider.""" - - def get_cell_style(self) -> str: - """Get the CSS style for table cells. 
- - Returns: - CSS style string - """ - return ( - "border: 1px solid black; padding: 8px; text-align: left; " - "white-space: nowrap;" - ) - - def get_header_style(self) -> str: - """Get the CSS style for header cells. - - Returns: - CSS style string - """ - return ( - "border: 1px solid black; padding: 8px; text-align: left; " - "background-color: #f2f2f2; white-space: nowrap; min-width: fit-content; " - "max-width: fit-content;" - ) - - -class DataFrameHtmlFormatter: - """Configurable HTML formatter for DataFusion DataFrames. - - This class handles the HTML rendering of DataFrames for display in - Jupyter notebooks and other rich display contexts. - - This class supports extension through composition. Key extension points: - - Provide a custom StyleProvider for styling cells and headers - - Register custom formatters for specific types - - Provide custom cell builders for specialized cell rendering - - Args: - max_cell_length: Maximum characters to display in a cell before truncation - max_width: Maximum width of the HTML table in pixels - max_height: Maximum height of the HTML table in pixels - max_memory_bytes: Maximum memory in bytes for rendered data (default: 2MB) - min_rows_display: Minimum number of rows to display - repr_rows: Default number of rows to display in repr output - enable_cell_expansion: Whether to add expand/collapse buttons for long cell - values - custom_css: Additional CSS to include in the HTML output - show_truncation_message: Whether to display a message when data is truncated - style_provider: Custom provider for cell and header styles - use_shared_styles: Whether to load styles and scripts only once per notebook - session - """ - - # Class variable to track if styles have been loaded in the notebook - _styles_loaded = False - - def __init__( - self, - max_cell_length: int = 25, - max_width: int = 1000, - max_height: int = 300, - max_memory_bytes: int = 2 * 1024 * 1024, # 2 MB - min_rows_display: int = 20, - repr_rows: int = 10, - enable_cell_expansion: bool = True, - custom_css: Optional[str] = None, - show_truncation_message: bool = True, - style_provider: Optional[StyleProvider] = None, - use_shared_styles: bool = True, - ) -> None: - """Initialize the HTML formatter. - - Parameters - ---------- - max_cell_length : int, default 25 - Maximum length of cell content before truncation. - max_width : int, default 1000 - Maximum width of the displayed table in pixels. - max_height : int, default 300 - Maximum height of the displayed table in pixels. - max_memory_bytes : int, default 2097152 (2MB) - Maximum memory in bytes for rendered data. - min_rows_display : int, default 20 - Minimum number of rows to display. - repr_rows : int, default 10 - Default number of rows to display in repr output. - enable_cell_expansion : bool, default True - Whether to allow cells to expand when clicked. - custom_css : str, optional - Custom CSS to apply to the HTML table. - show_truncation_message : bool, default True - Whether to show a message indicating that content has been truncated. - style_provider : StyleProvider, optional - Provider of CSS styles for the HTML table. If None, DefaultStyleProvider - is used. - use_shared_styles : bool, default True - Whether to use shared styles across multiple tables. - - Raises: - ------ - ValueError - If max_cell_length, max_width, max_height, max_memory_bytes, - min_rows_display, or repr_rows is not a positive integer. 
- TypeError - If enable_cell_expansion, show_truncation_message, or use_shared_styles is - not a boolean, - or if custom_css is provided but is not a string, - or if style_provider is provided but does not implement the StyleProvider - protocol. - """ - # Validate numeric parameters - _validate_positive_int(max_cell_length, "max_cell_length") - _validate_positive_int(max_width, "max_width") - _validate_positive_int(max_height, "max_height") - _validate_positive_int(max_memory_bytes, "max_memory_bytes") - _validate_positive_int(min_rows_display, "min_rows_display") - _validate_positive_int(repr_rows, "repr_rows") - - # Validate boolean parameters - _validate_bool(enable_cell_expansion, "enable_cell_expansion") - _validate_bool(show_truncation_message, "show_truncation_message") - _validate_bool(use_shared_styles, "use_shared_styles") - - # Validate custom_css - if custom_css is not None and not isinstance(custom_css, str): - msg = "custom_css must be None or a string" - raise TypeError(msg) - - # Validate style_provider - if style_provider is not None and not isinstance(style_provider, StyleProvider): - msg = "style_provider must implement the StyleProvider protocol" - raise TypeError(msg) - - self.max_cell_length = max_cell_length - self.max_width = max_width - self.max_height = max_height - self.max_memory_bytes = max_memory_bytes - self.min_rows_display = min_rows_display - self.repr_rows = repr_rows - self.enable_cell_expansion = enable_cell_expansion - self.custom_css = custom_css - self.show_truncation_message = show_truncation_message - self.style_provider = style_provider or DefaultStyleProvider() - self.use_shared_styles = use_shared_styles - # Registry for custom type formatters - self._type_formatters: dict[type, CellFormatter] = {} - # Custom cell builders - self._custom_cell_builder: Optional[Callable[[Any, int, int, str], str]] = None - self._custom_header_builder: Optional[Callable[[Any], str]] = None - - def register_formatter(self, type_class: type, formatter: CellFormatter) -> None: - """Register a custom formatter for a specific data type. - - Args: - type_class: The type to register a formatter for - formatter: Function that takes a value of the given type and returns - a formatted string - """ - self._type_formatters[type_class] = formatter - - def set_custom_cell_builder( - self, builder: Callable[[Any, int, int, str], str] - ) -> None: - """Set a custom cell builder function. - - Args: - builder: Function that takes (value, row, col, table_id) and returns HTML - """ - self._custom_cell_builder = builder - - def set_custom_header_builder(self, builder: Callable[[Any], str]) -> None: - """Set a custom header builder function. - - Args: - builder: Function that takes a field and returns HTML - """ - self._custom_header_builder = builder - - @classmethod - def is_styles_loaded(cls) -> bool: - """Check if HTML styles have been loaded in the current session. - - This method is primarily intended for debugging UI rendering issues - related to style loading. - - Returns: - True if styles have been loaded, False otherwise - - Example: - >>> from datafusion.html_formatter import DataFrameHtmlFormatter - >>> DataFrameHtmlFormatter.is_styles_loaded() - False - """ - return cls._styles_loaded - - def format_html( - self, - batches: list, - schema: Any, - has_more: bool = False, - table_uuid: str | None = None, - ) -> str: - """Format record batches as HTML. 
- - This method is used by DataFrame's _repr_html_ implementation and can be - called directly when custom HTML rendering is needed. - - Args: - batches: List of Arrow RecordBatch objects - schema: Arrow Schema object - has_more: Whether there are more batches not shown - table_uuid: Unique ID for the table, used for JavaScript interactions - - Returns: - HTML string representation of the data - - Raises: - TypeError: If schema is invalid and no batches are provided - """ - if not batches: - return "No data to display" - - # Validate schema - if schema is None or not hasattr(schema, "__iter__"): - msg = "Schema must be provided" - raise TypeError(msg) - - # Generate a unique ID if none provided - table_uuid = table_uuid or f"df-{id(batches)}" - - # Build HTML components - html = [] - - # Only include styles and scripts if: - # 1. Not using shared styles, OR - # 2. Using shared styles but they haven't been loaded yet - include_styles = ( - not self.use_shared_styles or not DataFrameHtmlFormatter._styles_loaded - ) - - if include_styles: - html.extend(self._build_html_header()) - # If we're using shared styles, mark them as loaded - if self.use_shared_styles: - DataFrameHtmlFormatter._styles_loaded = True - - html.extend(self._build_table_container_start()) - - # Add table header and body - html.extend(self._build_table_header(schema)) - html.extend(self._build_table_body(batches, table_uuid)) - - html.append("
" + f"{field.name}
" + f"
" + "" + "" + f"{formatted_value}" + f"" + f"
" + f"
{formatted_value}
") - html.append("
") - - # Add footer (JavaScript and messages) - if include_styles and self.enable_cell_expansion: - html.append(self._get_javascript()) - - # Always add truncation message if needed (independent of styles) - if has_more and self.show_truncation_message: - html.append("
<div>Data truncated due to size.</div>
") - - return "\n".join(html) - - def _build_html_header(self) -> list[str]: - """Build the HTML header with CSS styles.""" - html = [] - html.append("") - return html +import warnings - def _build_table_container_start(self) -> list[str]: - """Build the opening tags for the table container.""" - html = [] - html.append( - f'
' - ) - html.append('') - return html +from datafusion.dataframe_formatter import * # noqa: F403 - def _build_table_header(self, schema: Any) -> list[str]: - """Build the HTML table header with column names.""" - html = [] - html.append("") - html.append("") - for field in schema: - if self._custom_header_builder: - html.append(self._custom_header_builder(field)) - else: - html.append( - f"" - ) - html.append("") - html.append("") - return html - - def _build_table_body(self, batches: list, table_uuid: str) -> list[str]: - """Build the HTML table body with data rows.""" - html = [] - html.append("") - - row_count = 0 - for batch in batches: - for row_idx in range(batch.num_rows): - row_count += 1 - html.append("") - - for col_idx, column in enumerate(batch.columns): - # Get the raw value from the column - raw_value = self._get_cell_value(column, row_idx) - - # Always check for type formatters first to format the value - formatted_value = self._format_cell_value(raw_value) - - # Then apply either custom cell builder or standard cell formatting - if self._custom_cell_builder: - # Pass both the raw value and formatted value to let the - # builder decide - cell_html = self._custom_cell_builder( - raw_value, row_count, col_idx, table_uuid - ) - html.append(cell_html) - else: - # Standard cell formatting with formatted value - if ( - len(str(raw_value)) > self.max_cell_length - and self.enable_cell_expansion - ): - cell_html = self._build_expandable_cell( - formatted_value, row_count, col_idx, table_uuid - ) - else: - cell_html = self._build_regular_cell(formatted_value) - html.append(cell_html) - - html.append("") - - html.append("") - return html - - def _get_cell_value(self, column: Any, row_idx: int) -> Any: - """Extract a cell value from a column. - - Args: - column: Arrow array - row_idx: Row index - - Returns: - The raw cell value - """ - try: - value = column[row_idx] - - if hasattr(value, "as_py"): - return value.as_py() - except (AttributeError, TypeError): - pass - else: - return value - - def _format_cell_value(self, value: Any) -> str: - """Format a cell value for display. - - Uses registered type formatters if available. - - Args: - value: The cell value to format - - Returns: - Formatted cell value as string - """ - # Check for custom type formatters - for type_cls, formatter in self._type_formatters.items(): - if isinstance(value, type_cls): - return formatter(value) - - # If no formatter matched, return string representation - return str(value) - - def _build_expandable_cell( - self, formatted_value: str, row_count: int, col_idx: int, table_uuid: str - ) -> str: - """Build an expandable cell for long content.""" - short_value = str(formatted_value)[: self.max_cell_length] - return ( - f"" - ) - - def _build_regular_cell(self, formatted_value: str) -> str: - """Build a regular table cell.""" - return ( - f"" - ) - - def _build_html_footer(self, has_more: bool) -> list[str]: - """Build the HTML footer with JavaScript and messages.""" - html = [] - - # Add JavaScript for interactivity only if cell expansion is enabled - # and we're not using the shared styles approach - if self.enable_cell_expansion and not self.use_shared_styles: - html.append(self._get_javascript()) - - # Add truncation message if needed - if has_more and self.show_truncation_message: - html.append("
<div>Data truncated due to size.</div>
") - - return html - - def _get_default_css(self) -> str: - """Get default CSS styles for the HTML table.""" - return """ - .expandable-container { - display: inline-block; - max-width: 200px; - } - .expandable { - white-space: nowrap; - overflow: hidden; - text-overflow: ellipsis; - display: block; - } - .full-text { - display: none; - white-space: normal; - } - .expand-btn { - cursor: pointer; - color: blue; - text-decoration: underline; - border: none; - background: none; - font-size: inherit; - display: block; - margin-top: 5px; - } - """ - - def _get_javascript(self) -> str: - """Get JavaScript code for interactive elements.""" - return """ - - """ - - -class FormatterManager: - """Manager class for the global DataFrame HTML formatter instance.""" - - _default_formatter: DataFrameHtmlFormatter = DataFrameHtmlFormatter() - - @classmethod - def set_formatter(cls, formatter: DataFrameHtmlFormatter) -> None: - """Set the global DataFrame HTML formatter. - - Args: - formatter: The formatter instance to use globally - """ - cls._default_formatter = formatter - _refresh_formatter_reference() - - @classmethod - def get_formatter(cls) -> DataFrameHtmlFormatter: - """Get the current global DataFrame HTML formatter. - - Returns: - The global HTML formatter instance - """ - return cls._default_formatter - - -def get_formatter() -> DataFrameHtmlFormatter: - """Get the current global DataFrame HTML formatter. - - This function is used by the DataFrame._repr_html_ implementation to access - the shared formatter instance. It can also be used directly when custom - HTML rendering is needed. - - Returns: - The global HTML formatter instance - - Example: - >>> from datafusion.html_formatter import get_formatter - >>> formatter = get_formatter() - >>> formatter.max_cell_length = 50 # Increase cell length - """ - return FormatterManager.get_formatter() - - -def set_formatter(formatter: DataFrameHtmlFormatter) -> None: - """Set the global DataFrame HTML formatter. - - Args: - formatter: The formatter instance to use globally - - Example: - >>> from datafusion.html_formatter import get_formatter, set_formatter - >>> custom_formatter = DataFrameHtmlFormatter(max_cell_length=100) - >>> set_formatter(custom_formatter) - """ - FormatterManager.set_formatter(formatter) - - -def configure_formatter(**kwargs: Any) -> None: - """Configure the global DataFrame HTML formatter. - - This function creates a new formatter with the provided configuration - and sets it as the global formatter for all DataFrames. - - Args: - **kwargs: Formatter configuration parameters like max_cell_length, - max_width, max_height, enable_cell_expansion, etc. - - Raises: - ValueError: If any invalid parameters are provided - - Example: - >>> from datafusion.html_formatter import configure_formatter - >>> configure_formatter( - ... max_cell_length=50, - ... max_height=500, - ... enable_cell_expansion=True, - ... use_shared_styles=True - ... ) - """ - # Valid parameters accepted by DataFrameHtmlFormatter - valid_params = { - "max_cell_length", - "max_width", - "max_height", - "max_memory_bytes", - "min_rows_display", - "repr_rows", - "enable_cell_expansion", - "custom_css", - "show_truncation_message", - "style_provider", - "use_shared_styles", - } - - # Check for invalid parameters - invalid_params = set(kwargs) - valid_params - if invalid_params: - msg = ( - f"Invalid formatter parameters: {', '.join(invalid_params)}. 
" - f"Valid parameters are: {', '.join(valid_params)}" - ) - raise ValueError(msg) - - # Create and set formatter with validated parameters - set_formatter(DataFrameHtmlFormatter(**kwargs)) - - -def reset_formatter() -> None: - """Reset the global DataFrame HTML formatter to default settings. - - This function creates a new formatter with default configuration - and sets it as the global formatter for all DataFrames. - - Example: - >>> from datafusion.html_formatter import reset_formatter - >>> reset_formatter() # Reset formatter to default settings - """ - formatter = DataFrameHtmlFormatter() - # Reset the styles_loaded flag to ensure styles will be reloaded - DataFrameHtmlFormatter._styles_loaded = False - set_formatter(formatter) - - -def reset_styles_loaded_state() -> None: - """Reset the styles loaded state to force reloading of styles. - - This can be useful when switching between notebook sessions or - when styles need to be refreshed. - - Example: - >>> from datafusion.html_formatter import reset_styles_loaded_state - >>> reset_styles_loaded_state() # Force styles to reload in next render - """ - DataFrameHtmlFormatter._styles_loaded = False - - -def _refresh_formatter_reference() -> None: - """Refresh formatter reference in any modules using it. - - This helps ensure that changes to the formatter are reflected in existing - DataFrames that might be caching the formatter reference. - """ - # This is a no-op but signals modules to refresh their reference +warnings.warn( + "The module 'html_formatter' is deprecated and will be removed in the next release." + "Please use 'dataframe_formatter' instead.", + DeprecationWarning, + stacklevel=3, +) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index deaa30b3d..c9ae38d8e 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -37,14 +37,14 @@ from datafusion import ( functions as f, ) -from datafusion.expr import Window -from datafusion.html_formatter import ( +from datafusion.dataframe_formatter import ( DataFrameHtmlFormatter, configure_formatter, get_formatter, reset_formatter, reset_styles_loaded_state, ) +from datafusion.expr import Window from pyarrow.csv import write_csv MB = 1024 * 1024 diff --git a/src/dataframe.rs b/src/dataframe.rs index 3d68db279..c2ad4771e 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -24,6 +24,7 @@ use arrow::compute::can_cast_types; use arrow::error::ArrowError; use arrow::ffi::FFI_ArrowSchema; use arrow::ffi_stream::FFI_ArrowArrayStream; +use arrow::pyarrow::FromPyArrow; use datafusion::arrow::datatypes::Schema; use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow}; use datafusion::arrow::util::pretty; @@ -150,9 +151,9 @@ fn get_python_formatter_with_config(py: Python) -> PyResult { Ok(PythonFormatter { formatter, config }) } -/// Get the Python formatter from the datafusion.html_formatter module +/// Get the Python formatter from the datafusion.dataframe_formatter module fn import_python_formatter(py: Python) -> PyResult> { - let formatter_module = py.import("datafusion.html_formatter")?; + let formatter_module = py.import("datafusion.dataframe_formatter")?; let get_formatter = formatter_module.getattr("get_formatter")?; get_formatter.call0() } @@ -295,6 +296,46 @@ impl PyDataFrame { pub fn new(df: DataFrame) -> Self { Self { df: Arc::new(df) } } + + fn prepare_repr_string(&self, py: Python, as_html: bool) -> PyDataFusionResult { + // Get the Python formatter and config + let PythonFormatter { formatter, config } = 
diff --git a/src/dataframe.rs b/src/dataframe.rs
index 3d68db279..c2ad4771e 100644
--- a/src/dataframe.rs
+++ b/src/dataframe.rs
@@ -24,6 +24,7 @@ use arrow::compute::can_cast_types;
 use arrow::error::ArrowError;
 use arrow::ffi::FFI_ArrowSchema;
 use arrow::ffi_stream::FFI_ArrowArrayStream;
+use arrow::pyarrow::FromPyArrow;
 use datafusion::arrow::datatypes::Schema;
 use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow};
 use datafusion::arrow::util::pretty;
@@ -150,9 +151,9 @@ fn get_python_formatter_with_config(py: Python) -> PyResult<PythonFormatter> {
     Ok(PythonFormatter { formatter, config })
 }
 
-/// Get the Python formatter from the datafusion.html_formatter module
+/// Get the Python formatter from the datafusion.dataframe_formatter module
 fn import_python_formatter(py: Python) -> PyResult<Bound<'_, PyAny>> {
-    let formatter_module = py.import("datafusion.html_formatter")?;
+    let formatter_module = py.import("datafusion.dataframe_formatter")?;
     let get_formatter = formatter_module.getattr("get_formatter")?;
     get_formatter.call0()
 }
@@ -295,6 +296,46 @@ impl PyDataFrame {
     pub fn new(df: DataFrame) -> Self {
         Self { df: Arc::new(df) }
     }
+
+    fn prepare_repr_string(&self, py: Python, as_html: bool) -> PyDataFusionResult<String> {
+        // Get the Python formatter and config
+        let PythonFormatter { formatter, config } = get_python_formatter_with_config(py)?;
+        let (batches, has_more) = wait_for_future(
+            py,
+            collect_record_batches_to_display(self.df.as_ref().clone(), config),
+        )??;
+        if batches.is_empty() {
+            // This should not be reached, but do it for safety since we index into the vector below
+            return Ok("No data to display".to_string());
+        }
+
+        let table_uuid = uuid::Uuid::new_v4().to_string();
+
+        // Convert record batches to PyObject list
+        let py_batches = batches
+            .into_iter()
+            .map(|rb| rb.to_pyarrow(py))
+            .collect::<PyResult<Vec<PyObject>>>()?;
+
+        let py_schema = self.schema().into_pyobject(py)?;
+
+        let kwargs = pyo3::types::PyDict::new(py);
+        let py_batches_list = PyList::new(py, py_batches.as_slice())?;
+        kwargs.set_item("batches", py_batches_list)?;
+        kwargs.set_item("schema", py_schema)?;
+        kwargs.set_item("has_more", has_more)?;
+        kwargs.set_item("table_uuid", table_uuid)?;
+
+        let method_name = match as_html {
+            true => "format_html",
+            false => "format_str",
+        };
+
+        let html_result = formatter.call_method(method_name, (), Some(&kwargs))?;
+        let html_str: String = html_result.extract()?;
+
+        Ok(html_str)
+    }
 }
 
 #[pymethods]
@@ -321,18 +362,27 @@ impl PyDataFrame {
     }
 
     fn __repr__(&self, py: Python) -> PyDataFusionResult<String> {
-        // Get the Python formatter config
-        let PythonFormatter {
-            formatter: _,
-            config,
-        } = get_python_formatter_with_config(py)?;
-        let (batches, has_more) = wait_for_future(
-            py,
-            collect_record_batches_to_display(self.df.as_ref().clone(), config),
-        )??;
+        self.prepare_repr_string(py, false)
+    }
+
+    #[staticmethod]
+    #[expect(unused_variables)]
+    fn default_str_repr<'py>(
+        batches: Vec<Bound<'py, PyAny>>,
+        schema: &Bound<'py, PyAny>,
+        has_more: bool,
+        table_uuid: &str,
+    ) -> PyResult<String> {
+        let batches = batches
+            .into_iter()
+            .map(|batch| RecordBatch::from_pyarrow_bound(&batch))
+            .collect::<PyResult<Vec<RecordBatch>>>()?
+            .into_iter()
+            .filter(|batch| batch.num_rows() > 0)
+            .collect::<Vec<_>>();
+
         if batches.is_empty() {
-            // This should not be reached, but do it for safety since we index into the vector below
-            return Ok("No data to display".to_string());
+            return Ok("No data to display".to_owned());
         }
 
         let batches_as_displ =
@@ -347,38 +397,7 @@ impl PyDataFrame {
     }
 
     fn _repr_html_(&self, py: Python) -> PyDataFusionResult<String> {
-        // Get the Python formatter and config
-        let PythonFormatter { formatter, config } = get_python_formatter_with_config(py)?;
-        let (batches, has_more) = wait_for_future(
-            py,
-            collect_record_batches_to_display(self.df.as_ref().clone(), config),
-        )??;
-        if batches.is_empty() {
-            // This should not be reached, but do it for safety since we index into the vector below
-            return Ok("No data to display".to_string());
-        }
-
-        let table_uuid = uuid::Uuid::new_v4().to_string();
-
-        // Convert record batches to PyObject list
-        let py_batches = batches
-            .into_iter()
-            .map(|rb| rb.to_pyarrow(py))
-            .collect::<PyResult<Vec<PyObject>>>()?;
-
-        let py_schema = self.schema().into_pyobject(py)?;
-
-        let kwargs = pyo3::types::PyDict::new(py);
-        let py_batches_list = PyList::new(py, py_batches.as_slice())?;
-        kwargs.set_item("batches", py_batches_list)?;
-        kwargs.set_item("schema", py_schema)?;
-        kwargs.set_item("has_more", has_more)?;
-        kwargs.set_item("table_uuid", table_uuid)?;
-
-        let html_result = formatter.call_method("format_html", (), Some(&kwargs))?;
-        let html_str: String = html_result.extract()?;
-
-        Ok(html_str)
+        self.prepare_repr_string(py, true)
     }
 
     /// Calculate summary statistics for a DataFrame
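With both dunders delegating to `prepare_repr_string`, the only difference between console
and notebook output is which method the frontend invokes. A short usage sketch of the public
Python API (the query itself is illustrative):

    from datafusion import SessionContext

    ctx = SessionContext()
    df = ctx.sql("SELECT 1 AS a, 'x' AS b")

    text = repr(df)          # plain-text path: __repr__ -> formatter.format_str
    html = df._repr_html_()  # rich path used by Jupyter: -> formatter.format_html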
From 954563429384078a9e85c56ad553c7e3be7ac52a Mon Sep 17 00:00:00 2001
From: Tim Saucer
Date: Wed, 25 Jun 2025 11:29:35 -0400
Subject: [PATCH 055/145] feat: collect once during display() in jupyter
 notebooks (#1167)

* Only collect one time during display() in jupyter notebooks

* Check for jupyter notebook environment specifically

* Remove approach of checking environment which could not differentiate
 between jupyter console and notebook

* Instead of trying to detect notebook vs console, collect one time when we
 have any kind of ipython environment.

---
 src/dataframe.rs | 36 ++++++++++++++++++++++++++----------
 src/utils.rs     | 11 +++++++++++
 2 files changed, 37 insertions(+), 10 deletions(-)
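The detection strategy settled on above, any active IPython environment rather than
notebook-versus-console sniffing, amounts to the following Python check. This is an
illustrative sketch of what the new `is_ipython_env` helper in the diff below does on the
Rust side, not code from the patch:

    def in_ipython() -> bool:
        # Import IPython and ask whether an interactive shell is active,
        # mirroring py.import("IPython") + call_method0("get_ipython").
        try:
            from IPython import get_ipython
        except ImportError:
            return False
        return get_ipython() is not None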
diff --git a/src/dataframe.rs b/src/dataframe.rs
index c2ad4771e..ab4749e35 100644
--- a/src/dataframe.rs
+++ b/src/dataframe.rs
@@ -51,7 +51,7 @@ use crate::physical_plan::PyExecutionPlan;
 use crate::record_batch::PyRecordBatchStream;
 use crate::sql::logical::PyLogicalPlan;
 use crate::utils::{
-    get_tokio_runtime, py_obj_to_scalar_value, validate_pycapsule, wait_for_future,
+    get_tokio_runtime, is_ipython_env, py_obj_to_scalar_value, validate_pycapsule, wait_for_future,
 };
 use crate::{
     errors::PyDataFusionResult,
@@ -289,21 +289,33 @@ impl PyParquetColumnOptions {
 #[derive(Clone)]
 pub struct PyDataFrame {
     df: Arc<DataFrame>,
+
+    // In an IPython environment, cache batches between __repr__ and _repr_html_ calls.
+    batches: Option<(Vec<RecordBatch>, bool)>,
 }
 
 impl PyDataFrame {
     /// creates a new PyDataFrame
     pub fn new(df: DataFrame) -> Self {
-        Self { df: Arc::new(df) }
+        Self {
+            df: Arc::new(df),
+            batches: None,
+        }
     }
 
-    fn prepare_repr_string(&self, py: Python, as_html: bool) -> PyDataFusionResult<String> {
+    fn prepare_repr_string(&mut self, py: Python, as_html: bool) -> PyDataFusionResult<String> {
         // Get the Python formatter and config
         let PythonFormatter { formatter, config } = get_python_formatter_with_config(py)?;
+
+        let should_cache = *is_ipython_env(py) && self.batches.is_none();
+        let (batches, has_more) = match self.batches.take() {
+            Some(b) => b,
+            None => wait_for_future(
+                py,
+                collect_record_batches_to_display(self.df.as_ref().clone(), config),
+            )??,
+        };
+
         if batches.is_empty() {
             // This should not be reached, but do it for safety since we index into the vector below
             return Ok("No data to display".to_string());
@@ -313,7 +325,7 @@ impl PyDataFrame {
 
         // Convert record batches to PyObject list
         let py_batches = batches
-            .into_iter()
+            .iter()
             .map(|rb| rb.to_pyarrow(py))
             .collect::<PyResult<Vec<PyObject>>>()?;
 
@@ -334,6 +346,10 @@ impl PyDataFrame {
         let html_result = formatter.call_method(method_name, (), Some(&kwargs))?;
         let html_str: String = html_result.extract()?;
 
+        if should_cache {
+            self.batches = Some((batches, has_more));
+        }
+
         Ok(html_str)
     }
 }
@@ -361,7 +377,7 @@ impl PyDataFrame {
     }
 
-    fn __repr__(&self, py: Python) -> PyDataFusionResult<String> {
+    fn __repr__(&mut self, py: Python) -> PyDataFusionResult<String> {
         self.prepare_repr_string(py, false)
     }
 
@@ -396,7 +412,7 @@ impl PyDataFrame {
         Ok(format!("DataFrame()\n{batches_as_displ}{additional_str}"))
     }
 
-    fn _repr_html_(&self, py: Python) -> PyDataFusionResult<String> {
+    fn _repr_html_(&mut self, py: Python) -> PyDataFusionResult<String> {
         self.prepare_repr_string(py, true)
     }
 
diff --git a/src/utils.rs b/src/utils.rs
index 90d654385..f4e121fd5 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -39,6 +39,17 @@ pub(crate) fn get_tokio_runtime() -> &'static TokioRuntime {
     RUNTIME.get_or_init(|| TokioRuntime(tokio::runtime::Runtime::new().unwrap()))
 }
 
+#[inline]
+pub(crate) fn is_ipython_env(py: Python) -> &'static bool {
+    static IS_IPYTHON_ENV: OnceLock<bool> = OnceLock::new();
+    IS_IPYTHON_ENV.get_or_init(|| {
+        py.import("IPython")
+            .and_then(|ipython| ipython.call_method0("get_ipython"))
+            .map(|ipython| !ipython.is_none())
+            .unwrap_or(false)
+    })
+}
+
 /// Utility to get the Global DataFusion CTX
 #[inline]
 pub(crate) fn get_global_ctx() -> &'static SessionContext {

From 9362f53150e5423581757ed56883b3ca2c95b8a2 Mon Sep 17 00:00:00 2001
From: Tim Saucer
Date: Wed, 2 Jul 2025 08:08:53 -0400
Subject: [PATCH 056/145] feat: python based catalog and schema provider
 (#1156)

* Exposing FFI to python

* Work in progress on python catalog

* Flushing out schema and catalog providers

* Adding implementation of python based catalog and schema providers

* Small updates after rebase

* Add default in memory options for adding schema and catalogs

* Add support for creating in memory catalog and schema

* Update from database to schema in unit tests

* xfailed label no longer applies to these unit tests

* Defining abstract methods for catalog and schema providers

* Working through issues between custom catalog and built in schema

* Check types on schema provider to return

* Add docstring

* Add documentation about how to use catalog and schema providers

* Re-add module to __all__
after rebase * Minor bugfix * Clippy updates from the new rust version --------- Co-authored-by: renato2099 --- Cargo.lock | 19 + Cargo.toml | 2 + docs/source/user-guide/data-sources.rst | 56 ++ examples/datafusion-ffi-example/Cargo.lock | 1 + examples/datafusion-ffi-example/Cargo.toml | 1 + .../python/tests/_test_catalog_provider.py | 60 +++ .../src/catalog_provider.rs | 179 +++++++ examples/datafusion-ffi-example/src/lib.rs | 3 + python/datafusion/__init__.py | 1 + python/datafusion/catalog.py | 195 ++++++- python/datafusion/context.py | 24 +- python/datafusion/dataframe.py | 9 +- python/tests/test_catalog.py | 173 ++++++- python/tests/test_context.py | 40 +- python/tests/test_sql.py | 30 +- python/tests/test_substrait.py | 4 +- src/catalog.rs | 490 ++++++++++++++++-- src/common/data_type.rs | 120 ++--- src/context.rs | 61 ++- src/expr.rs | 15 +- src/expr/aggregate.rs | 2 +- src/expr/aggregate_expr.rs | 2 +- src/expr/alias.rs | 2 +- src/expr/analyze.rs | 2 +- src/expr/between.rs | 2 +- src/expr/column.rs | 2 +- src/expr/copy_to.rs | 4 +- src/expr/create_catalog.rs | 2 +- src/expr/create_catalog_schema.rs | 2 +- src/expr/create_external_table.rs | 2 +- src/expr/create_function.rs | 2 +- src/expr/create_index.rs | 2 +- src/expr/create_memory_table.rs | 2 +- src/expr/create_view.rs | 2 +- src/expr/describe_table.rs | 2 +- src/expr/distinct.rs | 5 +- src/expr/drop_catalog_schema.rs | 2 +- src/expr/drop_function.rs | 2 +- src/expr/drop_table.rs | 2 +- src/expr/drop_view.rs | 2 +- src/expr/empty_relation.rs | 2 +- src/expr/filter.rs | 2 +- src/expr/join.rs | 2 +- src/expr/like.rs | 6 +- src/expr/limit.rs | 2 +- src/expr/projection.rs | 2 +- src/expr/recursive_query.rs | 2 +- src/expr/repartition.rs | 2 +- src/expr/sort.rs | 2 +- src/expr/sort_expr.rs | 2 +- src/expr/subquery.rs | 2 +- src/expr/subquery_alias.rs | 2 +- src/expr/table_scan.rs | 2 +- src/expr/union.rs | 2 +- src/expr/unnest.rs | 2 +- src/expr/unnest_expr.rs | 2 +- src/expr/window.rs | 11 +- src/functions.rs | 2 +- src/lib.rs | 10 +- src/physical_plan.rs | 3 +- src/sql/logical.rs | 3 +- src/utils.rs | 5 +- 62 files changed, 1340 insertions(+), 258 deletions(-) create mode 100644 examples/datafusion-ffi-example/python/tests/_test_catalog_provider.py create mode 100644 examples/datafusion-ffi-example/src/catalog_provider.rs diff --git a/Cargo.lock b/Cargo.lock index 112167cb4..a3e9336cf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -165,6 +165,12 @@ dependencies = [ "zstd", ] +[[package]] +name = "arc-swap" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457" + [[package]] name = "arrayref" version = "0.3.9" @@ -1503,6 +1509,7 @@ dependencies = [ "datafusion-proto", "datafusion-substrait", "futures", + "log", "mimalloc", "object_store", "prost", @@ -1510,6 +1517,7 @@ dependencies = [ "pyo3", "pyo3-async-runtimes", "pyo3-build-config", + "pyo3-log", "tokio", "url", "uuid", @@ -2953,6 +2961,17 @@ dependencies = [ "pyo3-build-config", ] +[[package]] +name = "pyo3-log" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45192e5e4a4d2505587e27806c7b710c231c40c56f3bfc19535d0bb25df52264" +dependencies = [ + "arc-swap", + "log", + "pyo3", +] + [[package]] name = "pyo3-macros" version = "0.24.2" diff --git a/Cargo.toml b/Cargo.toml index 4135e64e2..1f7895a50 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,6 +37,7 @@ substrait = ["dep:datafusion-substrait"] tokio = { version = "1.45", 
features = ["macros", "rt", "rt-multi-thread", "sync"] }
 pyo3 = { version = "0.24", features = ["extension-module", "abi3", "abi3-py39"] }
 pyo3-async-runtimes = { version = "0.24", features = ["tokio-runtime"]}
+pyo3-log = "0.12.4"
 arrow = { version = "55.1.0", features = ["pyarrow"] }
 datafusion = { version = "48.0.0", features = ["avro", "unicode_expressions"] }
 datafusion-substrait = { version = "48.0.0", optional = true }
@@ -49,6 +50,7 @@ async-trait = "0.1.88"
 futures = "0.3"
 object_store = { version = "0.12.1", features = ["aws", "gcp", "azure", "http"] }
 url = "2"
+log = "0.4.27"
 
 [build-dependencies]
 prost-types = "0.13.1" # keep in line with `datafusion-substrait`
diff --git a/docs/source/user-guide/data-sources.rst b/docs/source/user-guide/data-sources.rst
index ba5967c97..9c95d58e0 100644
--- a/docs/source/user-guide/data-sources.rst
+++ b/docs/source/user-guide/data-sources.rst
@@ -185,3 +185,59 @@ the interface as described in the :ref:`Custom Table Provider
 `_ is provided in the DataFusion repository.
+
+Catalog
+=======
+
+A common technique for organizing tables is to use a three-level hierarchical approach. DataFusion
+supports this form of organization using the :py:class:`~datafusion.catalog.Catalog`,
+:py:class:`~datafusion.catalog.Schema`, and :py:class:`~datafusion.catalog.Table`. By default,
+a :py:class:`~datafusion.context.SessionContext` comes with a single Catalog and a single Schema
+with the names ``datafusion`` and ``default``, respectively.
+
+The default implementation uses an in-memory approach to the catalog and schema. We have support
+for adding additional in-memory catalogs and schemas. This can be done as in the following
+example:
+
+.. code-block:: python
+
+    from datafusion.catalog import Catalog, Schema
+
+    my_catalog = Catalog.memory_catalog()
+    my_schema = Schema.memory_schema()
+
+    my_catalog.register_schema("my_schema_name", my_schema)
+
+    ctx.register_catalog_provider("my_catalog_name", my_catalog)
+
+You could then register tables in ``my_schema`` and access them either through the DataFrame
+API or via SQL commands such as ``"SELECT * from my_catalog_name.my_schema_name.my_table"``.
+
+User Defined Catalog and Schema
+-------------------------------
+
+If the in-memory catalogs are insufficient for your uses, there are two approaches you can take
+to implementing a custom catalog and/or schema. In the discussion below, we describe how to
+implement these for a Catalog, but the approach to implementing for a Schema is nearly
+identical.
+
+DataFusion supports Catalogs written in either Rust or Python. If you write a Catalog in Rust,
+you will need to export it as a Python library via PyO3. There is a complete example of a
+catalog implemented this way in the
+`examples folder `_
+of our repository. Writing catalog providers in Rust can typically lead to significant
+performance improvements over the Python based approach.
+
+To implement a Catalog in Python, you will need to inherit from the abstract base class
+:py:class:`~datafusion.catalog.CatalogProvider`. There are examples in the
+`unit tests `_ of
+implementing a basic Catalog in Python where we simply keep a dictionary of the
+registered Schemas.
+
+One important note for developers is that when we have a Catalog defined in Python, we have
+two different ways of accessing this Catalog. First, we register the catalog with a Rust
+wrapper. This allows any Rust based code to call the Python functions as necessary.
+Second, if the user accesses the Catalog via the Python API, we identify this and return
+the original Python object that implements the Catalog. This is an important distinction
+for developers because we do *not* return a Python wrapper around the Rust wrapper of the
+original Python object.
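The abstract base classes referenced in the docs above require only a handful of methods. As a
rough sketch of the Python-side shape, under the assumption that a dictionary is an adequate
backing store (class names here are illustrative, not part of the patch), a provider pair might
look like:

    import datafusion as dfn
    from datafusion import SessionContext

    class StaticSchemaProvider(dfn.catalog.SchemaProvider):
        def __init__(self, tables: dict):
            # Map of table name -> datafusion Table
            self.tables = tables

        def table_names(self) -> set[str]:
            return set(self.tables)

        def table(self, name: str):
            return self.tables.get(name)

        def table_exist(self, name: str) -> bool:
            return name in self.tables

    class StaticCatalogProvider(dfn.catalog.CatalogProvider):
        def __init__(self):
            # Map of schema name -> SchemaProvider
            self.schemas = {"my_schema": StaticSchemaProvider({})}

        def schema_names(self) -> set[str]:
            return set(self.schemas)

        def schema(self, name: str):
            return self.schemas.get(name)

    ctx = SessionContext()
    ctx.register_catalog_provider("my_catalog", StaticCatalogProvider())

The unit tests added later in this patch under ``python/tests/test_catalog.py`` exercise
exactly this dictionary-backed pattern.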
diff --git a/examples/datafusion-ffi-example/Cargo.lock b/examples/datafusion-ffi-example/Cargo.lock
index 075ebd5a1..e5a1ca8d1 100644
--- a/examples/datafusion-ffi-example/Cargo.lock
+++ b/examples/datafusion-ffi-example/Cargo.lock
@@ -1448,6 +1448,7 @@ dependencies = [
 "arrow",
 "arrow-array",
 "arrow-schema",
+ "async-trait",
 "datafusion",
 "datafusion-ffi",
 "pyo3",
diff --git a/examples/datafusion-ffi-example/Cargo.toml b/examples/datafusion-ffi-example/Cargo.toml
index 0e17567b9..319163554 100644
--- a/examples/datafusion-ffi-example/Cargo.toml
+++ b/examples/datafusion-ffi-example/Cargo.toml
@@ -27,6 +27,7 @@ pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py39"]
 arrow = { version = "55.0.0" }
 arrow-array = { version = "55.0.0" }
 arrow-schema = { version = "55.0.0" }
+async-trait = "0.1.88"
 
 [build-dependencies]
 pyo3-build-config = "0.23"
diff --git a/examples/datafusion-ffi-example/python/tests/_test_catalog_provider.py b/examples/datafusion-ffi-example/python/tests/_test_catalog_provider.py
new file mode 100644
index 000000000..72aadf64c
--- /dev/null
+++ b/examples/datafusion-ffi-example/python/tests/_test_catalog_provider.py
@@ -0,0 +1,60 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+ +from __future__ import annotations + +import pyarrow as pa +from datafusion import SessionContext +from datafusion_ffi_example import MyCatalogProvider + + +def test_catalog_provider(): + ctx = SessionContext() + + my_catalog_name = "my_catalog" + expected_schema_name = "my_schema" + expected_table_name = "my_table" + expected_table_columns = ["units", "price"] + + catalog_provider = MyCatalogProvider() + ctx.register_catalog_provider(my_catalog_name, catalog_provider) + my_catalog = ctx.catalog(my_catalog_name) + + my_catalog_schemas = my_catalog.names() + assert expected_schema_name in my_catalog_schemas + my_database = my_catalog.database(expected_schema_name) + assert expected_table_name in my_database.names() + my_table = my_database.table(expected_table_name) + assert expected_table_columns == my_table.schema.names + + result = ctx.table( + f"{my_catalog_name}.{expected_schema_name}.{expected_table_name}" + ).collect() + assert len(result) == 2 + + col0_result = [r.column(0) for r in result] + col1_result = [r.column(1) for r in result] + expected_col0 = [ + pa.array([10, 20, 30], type=pa.int32()), + pa.array([5, 7], type=pa.int32()), + ] + expected_col1 = [ + pa.array([1, 2, 5], type=pa.float64()), + pa.array([1.5, 2.5], type=pa.float64()), + ] + assert col0_result == expected_col0 + assert col1_result == expected_col1 diff --git a/examples/datafusion-ffi-example/src/catalog_provider.rs b/examples/datafusion-ffi-example/src/catalog_provider.rs new file mode 100644 index 000000000..54e61cf3e --- /dev/null +++ b/examples/datafusion-ffi-example/src/catalog_provider.rs @@ -0,0 +1,179 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+use pyo3::{pyclass, pymethods, Bound, PyResult, Python};
+use std::{any::Any, fmt::Debug, sync::Arc};
+
+use arrow::datatypes::Schema;
+use async_trait::async_trait;
+use datafusion::{
+    catalog::{
+        CatalogProvider, MemoryCatalogProvider, MemorySchemaProvider, SchemaProvider, TableProvider,
+    },
+    common::exec_err,
+    datasource::MemTable,
+    error::{DataFusionError, Result},
+};
+use datafusion_ffi::catalog_provider::FFI_CatalogProvider;
+use pyo3::types::PyCapsule;
+
+pub fn my_table() -> Arc<dyn TableProvider> {
+    use arrow::datatypes::{DataType, Field};
+    use datafusion::common::record_batch;
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("units", DataType::Int32, true),
+        Field::new("price", DataType::Float64, true),
+    ]));
+
+    let partitions = vec![
+        record_batch!(
+            ("units", Int32, vec![10, 20, 30]),
+            ("price", Float64, vec![1.0, 2.0, 5.0])
+        )
+        .unwrap(),
+        record_batch!(
+            ("units", Int32, vec![5, 7]),
+            ("price", Float64, vec![1.5, 2.5])
+        )
+        .unwrap(),
+    ];
+
+    Arc::new(MemTable::try_new(schema, vec![partitions]).unwrap())
+}
+
+#[derive(Debug)]
+pub struct FixedSchemaProvider {
+    inner: MemorySchemaProvider,
+}
+
+impl Default for FixedSchemaProvider {
+    fn default() -> Self {
+        let inner = MemorySchemaProvider::new();
+
+        let table = my_table();
+
+        let _ = inner.register_table("my_table".to_string(), table).unwrap();
+
+        Self { inner }
+    }
+}
+
+#[async_trait]
+impl SchemaProvider for FixedSchemaProvider {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn table_names(&self) -> Vec<String> {
+        self.inner.table_names()
+    }
+
+    async fn table(&self, name: &str) -> Result<Option<Arc<dyn TableProvider>>, DataFusionError> {
+        self.inner.table(name).await
+    }
+
+    fn register_table(
+        &self,
+        name: String,
+        table: Arc<dyn TableProvider>,
+    ) -> Result<Option<Arc<dyn TableProvider>>> {
+        self.inner.register_table(name, table)
+    }
+
+    fn deregister_table(&self, name: &str) -> Result<Option<Arc<dyn TableProvider>>> {
+        self.inner.deregister_table(name)
+    }
+
+    fn table_exist(&self, name: &str) -> bool {
+        self.inner.table_exist(name)
+    }
+}
+
+/// This catalog provider is intended only for unit tests. It prepopulates with one
+/// schema and delegates all operations to an in-memory catalog provider.
+#[pyclass( + name = "MyCatalogProvider", + module = "datafusion_ffi_example", + subclass +)] +#[derive(Debug)] +pub(crate) struct MyCatalogProvider { + inner: MemoryCatalogProvider, +} + +impl Default for MyCatalogProvider { + fn default() -> Self { + let inner = MemoryCatalogProvider::new(); + + let schema_name: &str = "my_schema"; + let _ = inner.register_schema(schema_name, Arc::new(FixedSchemaProvider::default())); + + Self { inner } + } +} + +impl CatalogProvider for MyCatalogProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + self.inner.schema_names() + } + + fn schema(&self, name: &str) -> Option> { + self.inner.schema(name) + } + + fn register_schema( + &self, + name: &str, + schema: Arc, + ) -> Result>> { + self.inner.register_schema(name, schema) + } + + fn deregister_schema( + &self, + name: &str, + cascade: bool, + ) -> Result>> { + self.inner.deregister_schema(name, cascade) + } +} + +#[pymethods] +impl MyCatalogProvider { + #[new] + pub fn new() -> Self { + Self { + inner: Default::default(), + } + } + + pub fn __datafusion_catalog_provider__<'py>( + &self, + py: Python<'py>, + ) -> PyResult> { + let name = cr"datafusion_catalog_provider".into(); + let catalog_provider = + FFI_CatalogProvider::new(Arc::new(MyCatalogProvider::default()), None); + + PyCapsule::new(py, catalog_provider, Some(name)) + } +} diff --git a/examples/datafusion-ffi-example/src/lib.rs b/examples/datafusion-ffi-example/src/lib.rs index ae08c3b65..3a4cf2247 100644 --- a/examples/datafusion-ffi-example/src/lib.rs +++ b/examples/datafusion-ffi-example/src/lib.rs @@ -15,10 +15,12 @@ // specific language governing permissions and limitations // under the License. +use crate::catalog_provider::MyCatalogProvider; use crate::table_function::MyTableFunction; use crate::table_provider::MyTableProvider; use pyo3::prelude::*; +pub(crate) mod catalog_provider; pub(crate) mod table_function; pub(crate) mod table_provider; @@ -26,5 +28,6 @@ pub(crate) mod table_provider; fn datafusion_ffi_example(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; + m.add_class::()?; Ok(()) } diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index fd7f4fc06..e9d2dba75 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -92,6 +92,7 @@ "TableFunction", "WindowFrame", "WindowUDF", + "catalog", "col", "column", "common", diff --git a/python/datafusion/catalog.py b/python/datafusion/catalog.py index 67ab3ead2..536b3a790 100644 --- a/python/datafusion/catalog.py +++ b/python/datafusion/catalog.py @@ -19,18 +19,33 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Protocol import datafusion._internal as df_internal if TYPE_CHECKING: import pyarrow as pa +try: + from warnings import deprecated # Python 3.13+ +except ImportError: + from typing_extensions import deprecated # Python 3.12 + + +__all__ = [ + "Catalog", + "CatalogProvider", + "Schema", + "SchemaProvider", + "Table", +] + class Catalog: """DataFusion data catalog.""" - def __init__(self, catalog: df_internal.Catalog) -> None: + def __init__(self, catalog: df_internal.catalog.RawCatalog) -> None: """This constructor is not typically called by the end user.""" self.catalog = catalog @@ -38,39 +53,95 @@ def __repr__(self) -> str: """Print a string representation of the catalog.""" return self.catalog.__repr__() - def names(self) -> list[str]: - """Returns the list of 
databases in this catalog.""" - return self.catalog.names() + def names(self) -> set[str]: + """This is an alias for `schema_names`.""" + return self.schema_names() + + def schema_names(self) -> set[str]: + """Returns the list of schemas in this catalog.""" + return self.catalog.schema_names() + + @staticmethod + def memory_catalog() -> Catalog: + """Create an in-memory catalog provider.""" + catalog = df_internal.catalog.RawCatalog.memory_catalog() + return Catalog(catalog) - def database(self, name: str = "public") -> Database: + def schema(self, name: str = "public") -> Schema: """Returns the database with the given ``name`` from this catalog.""" - return Database(self.catalog.database(name)) + schema = self.catalog.schema(name) + + return ( + Schema(schema) + if isinstance(schema, df_internal.catalog.RawSchema) + else schema + ) + + @deprecated("Use `schema` instead.") + def database(self, name: str = "public") -> Schema: + """Returns the database with the given ``name`` from this catalog.""" + return self.schema(name) + + def register_schema(self, name, schema) -> Schema | None: + """Register a schema with this catalog.""" + if isinstance(schema, Schema): + return self.catalog.register_schema(name, schema._raw_schema) + return self.catalog.register_schema(name, schema) + + def deregister_schema(self, name: str, cascade: bool = True) -> Schema | None: + """Deregister a schema from this catalog.""" + return self.catalog.deregister_schema(name, cascade) -class Database: - """DataFusion Database.""" +class Schema: + """DataFusion Schema.""" - def __init__(self, db: df_internal.Database) -> None: + def __init__(self, schema: df_internal.catalog.RawSchema) -> None: """This constructor is not typically called by the end user.""" - self.db = db + self._raw_schema = schema def __repr__(self) -> str: - """Print a string representation of the database.""" - return self.db.__repr__() + """Print a string representation of the schema.""" + return self._raw_schema.__repr__() + + @staticmethod + def memory_schema() -> Schema: + """Create an in-memory schema provider.""" + schema = df_internal.catalog.RawSchema.memory_schema() + return Schema(schema) def names(self) -> set[str]: - """Returns the list of all tables in this database.""" - return self.db.names() + """This is an alias for `table_names`.""" + return self.table_names() + + def table_names(self) -> set[str]: + """Returns the list of all tables in this schema.""" + return self._raw_schema.table_names def table(self, name: str) -> Table: - """Return the table with the given ``name`` from this database.""" - return Table(self.db.table(name)) + """Return the table with the given ``name`` from this schema.""" + return Table(self._raw_schema.table(name)) + + def register_table(self, name, table) -> None: + """Register a table provider in this schema.""" + if isinstance(table, Table): + return self._raw_schema.register_table(name, table.table) + return self._raw_schema.register_table(name, table) + + def deregister_table(self, name: str) -> None: + """Deregister a table provider from this schema.""" + return self._raw_schema.deregister_table(name) + + +@deprecated("Use `Schema` instead.") +class Database(Schema): + """See `Schema`.""" class Table: """DataFusion table.""" - def __init__(self, table: df_internal.Table) -> None: + def __init__(self, table: df_internal.catalog.RawTable) -> None: """This constructor is not typically called by the end user.""" self.table = table @@ -78,6 +149,11 @@ def __repr__(self) -> str: """Print a string 
representation of the table.""" return self.table.__repr__() + @staticmethod + def from_dataset(dataset: pa.dataset.Dataset) -> Table: + """Turn a pyarrow Dataset into a Table.""" + return Table(df_internal.catalog.RawTable.from_dataset(dataset)) + @property def schema(self) -> pa.Schema: """Returns the schema associated with this table.""" @@ -87,3 +163,86 @@ def schema(self) -> pa.Schema: def kind(self) -> str: """Returns the kind of table.""" return self.table.kind + + +class CatalogProvider(ABC): + """Abstract class for defining a Python based Catalog Provider.""" + + @abstractmethod + def schema_names(self) -> set[str]: + """Set of the names of all schemas in this catalog.""" + ... + + @abstractmethod + def schema(self, name: str) -> Schema | None: + """Retrieve a specific schema from this catalog.""" + ... + + def register_schema( # noqa: B027 + self, name: str, schema: SchemaProviderExportable | SchemaProvider | Schema + ) -> None: + """Add a schema to this catalog. + + This method is optional. If your catalog provides a fixed list of schemas, you + do not need to implement this method. + """ + + def deregister_schema(self, name: str, cascade: bool) -> None: # noqa: B027 + """Remove a schema from this catalog. + + This method is optional. If your catalog provides a fixed list of schemas, you + do not need to implement this method. + + Args: + name: The name of the schema to remove. + cascade: If true, deregister the tables within the schema. + """ + + +class SchemaProvider(ABC): + """Abstract class for defining a Python based Schema Provider.""" + + def owner_name(self) -> str | None: + """Returns the owner of the schema. + + This is an optional method. The default return is None. + """ + return None + + @abstractmethod + def table_names(self) -> set[str]: + """Set of the names of all tables in this schema.""" + ... + + @abstractmethod + def table(self, name: str) -> Table | None: + """Retrieve a specific table from this schema.""" + ... + + def register_table(self, name: str, table: Table) -> None: # noqa: B027 + """Add a table from this schema. + + This method is optional. If your schema provides a fixed list of tables, you do + not need to implement this method. + """ + + def deregister_table(self, name, cascade: bool) -> None: # noqa: B027 + """Remove a table from this schema. + + This method is optional. If your schema provides a fixed list of tables, you do + not need to implement this method. + """ + + @abstractmethod + def table_exist(self, name: str) -> bool: + """Returns true if the table exists in this schema.""" + ... + + +class SchemaProviderExportable(Protocol): + """Type hint for object that has __datafusion_schema_provider__ PyCapsule. + + https://docs.rs/datafusion/latest/datafusion/catalog/trait.SchemaProvider.html + """ + + def __datafusion_schema_provider__(self) -> object: ... diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 5b99b0d26..bce51d644 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -29,7 +29,7 @@ except ImportError: from typing_extensions import deprecated # Python 3.12 -from datafusion.catalog import Catalog, Table +from datafusion.catalog import Catalog, CatalogProvider, Table from datafusion.dataframe import DataFrame from datafusion.expr import Expr, SortExpr, sort_list_to_raw_sort_list from datafusion.record_batch import RecordBatchStream @@ -80,6 +80,15 @@ class TableProviderExportable(Protocol): def __datafusion_table_provider__(self) -> object: ... 
# noqa: D105
 
 
+class CatalogProviderExportable(Protocol):
+    """Type hint for object that has __datafusion_catalog_provider__ PyCapsule.
+
+    https://docs.rs/datafusion/latest/datafusion/catalog/trait.CatalogProvider.html
+    """
+
+    def __datafusion_catalog_provider__(self) -> object: ...  # noqa: D105
+
+
 class SessionConfig:
     """Session configuration options."""
 
@@ -749,6 +758,19 @@ def deregister_table(self, name: str) -> None:
         """Remove a table from the session."""
         self.ctx.deregister_table(name)
 
+    def catalog_names(self) -> set[str]:
+        """Returns the set of catalog names in this context."""
+        return self.ctx.catalog_names()
+
+    def register_catalog_provider(
+        self, name: str, provider: CatalogProviderExportable | CatalogProvider | Catalog
+    ) -> None:
+        """Register a catalog provider."""
+        if isinstance(provider, Catalog):
+            self.ctx.register_catalog_provider(name, provider.catalog)
+        else:
+            self.ctx.register_catalog_provider(name, provider)
+
     def register_table_provider(
         self, name: str, provider: TableProviderExportable
     ) -> None:
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
index 991e6875a..61cb09438 100644
--- a/python/datafusion/dataframe.py
+++ b/python/datafusion/dataframe.py
@@ -760,19 +760,16 @@ def join_on(
         exprs = [expr.expr for expr in on_exprs]
         return DataFrame(self.df.join_on(right.df, exprs, how))
 
-    def explain(self, verbose: bool = False, analyze: bool = False) -> DataFrame:
-        """Return a DataFrame with the explanation of its plan so far.
+    def explain(self, verbose: bool = False, analyze: bool = False) -> None:
+        """Print an explanation of the DataFrame's plan so far.
 
         If ``analyze`` is specified, runs the plan and reports metrics.
 
         Args:
             verbose: If ``True``, more details will be included.
             analyze: If ``True``, the plan will run and metrics reported.
-
-        Returns:
-            DataFrame with the explanation of its plan.
         """
-        return DataFrame(self.df.explain(verbose, analyze))
+        self.df.explain(verbose, analyze)
 
     def logical_plan(self) -> LogicalPlan:
         """Return the unoptimized ``LogicalPlan``.
diff --git a/python/tests/test_catalog.py b/python/tests/test_catalog.py
index 23b328458..1f9ecbfc3 100644
--- a/python/tests/test_catalog.py
+++ b/python/tests/test_catalog.py
@@ -14,9 +14,13 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+from __future__ import annotations +import datafusion as dfn import pyarrow as pa +import pyarrow.dataset as ds import pytest +from datafusion import SessionContext, Table # Note we take in `database` as a variable even though we don't use @@ -27,9 +31,9 @@ def test_basic(ctx, database): ctx.catalog("non-existent") default = ctx.catalog() - assert default.names() == ["public"] + assert default.names() == {"public"} - for db in [default.database("public"), default.database()]: + for db in [default.schema("public"), default.schema()]: assert db.names() == {"csv1", "csv", "csv2"} table = db.table("csv") @@ -41,3 +45,168 @@ def test_basic(ctx, database): pa.field("float", pa.float64(), nullable=True), ] ) + + +def create_dataset() -> Table: + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"], + ) + dataset = ds.dataset([batch]) + return Table.from_dataset(dataset) + + +class CustomSchemaProvider(dfn.catalog.SchemaProvider): + def __init__(self): + self.tables = {"table1": create_dataset()} + + def table_names(self) -> set[str]: + return set(self.tables.keys()) + + def register_table(self, name: str, table: Table): + self.tables[name] = table + + def deregister_table(self, name, cascade: bool = True): + del self.tables[name] + + def table(self, name: str) -> Table | None: + return self.tables[name] + + def table_exist(self, name: str) -> bool: + return name in self.tables + + +class CustomCatalogProvider(dfn.catalog.CatalogProvider): + def __init__(self): + self.schemas = {"my_schema": CustomSchemaProvider()} + + def schema_names(self) -> set[str]: + return set(self.schemas.keys()) + + def schema(self, name: str): + return self.schemas[name] + + def register_schema(self, name: str, schema: dfn.catalog.Schema): + self.schemas[name] = schema + + def deregister_schema(self, name, cascade: bool): + del self.schemas[name] + + +def test_python_catalog_provider(ctx: SessionContext): + ctx.register_catalog_provider("my_catalog", CustomCatalogProvider()) + + # Check the default catalog provider + assert ctx.catalog("datafusion").names() == {"public"} + + my_catalog = ctx.catalog("my_catalog") + assert my_catalog.names() == {"my_schema"} + + my_catalog.register_schema("second_schema", CustomSchemaProvider()) + assert my_catalog.schema_names() == {"my_schema", "second_schema"} + + my_catalog.deregister_schema("my_schema") + assert my_catalog.schema_names() == {"second_schema"} + + +def test_in_memory_providers(ctx: SessionContext): + catalog = dfn.catalog.Catalog.memory_catalog() + ctx.register_catalog_provider("in_mem_catalog", catalog) + + assert ctx.catalog_names() == {"datafusion", "in_mem_catalog"} + + schema = dfn.catalog.Schema.memory_schema() + catalog.register_schema("in_mem_schema", schema) + + schema.register_table("my_table", create_dataset()) + + batches = ctx.sql("select * from in_mem_catalog.in_mem_schema.my_table").collect() + + assert len(batches) == 1 + assert batches[0].column(0) == pa.array([1, 2, 3]) + assert batches[0].column(1) == pa.array([4, 5, 6]) + + +def test_python_schema_provider(ctx: SessionContext): + catalog = ctx.catalog() + + catalog.deregister_schema("public") + + catalog.register_schema("test_schema1", CustomSchemaProvider()) + assert catalog.names() == {"test_schema1"} + + catalog.register_schema("test_schema2", CustomSchemaProvider()) + catalog.deregister_schema("test_schema1") + assert catalog.names() == {"test_schema2"} + + +def test_python_table_provider(ctx: SessionContext): + catalog = ctx.catalog() + + 
catalog.register_schema("custom_schema", CustomSchemaProvider()) + schema = catalog.schema("custom_schema") + + assert schema.table_names() == {"table1"} + + schema.deregister_table("table1") + schema.register_table("table2", create_dataset()) + assert schema.table_names() == {"table2"} + + # Use the default schema instead of our custom schema + + schema = catalog.schema() + + schema.register_table("table3", create_dataset()) + assert schema.table_names() == {"table3"} + + schema.deregister_table("table3") + schema.register_table("table4", create_dataset()) + assert schema.table_names() == {"table4"} + + +def test_in_end_to_end_python_providers(ctx: SessionContext): + """Test registering all python providers and running a query against them.""" + + all_catalog_names = [ + "datafusion", + "custom_catalog", + "in_mem_catalog", + ] + + all_schema_names = [ + "custom_schema", + "in_mem_schema", + ] + + ctx.register_catalog_provider(all_catalog_names[1], CustomCatalogProvider()) + ctx.register_catalog_provider( + all_catalog_names[2], dfn.catalog.Catalog.memory_catalog() + ) + + for catalog_name in all_catalog_names: + catalog = ctx.catalog(catalog_name) + + # Clean out previous schemas if they exist so we can start clean + for schema_name in catalog.schema_names(): + catalog.deregister_schema(schema_name, cascade=False) + + catalog.register_schema(all_schema_names[0], CustomSchemaProvider()) + catalog.register_schema(all_schema_names[1], dfn.catalog.Schema.memory_schema()) + + for schema_name in all_schema_names: + schema = catalog.schema(schema_name) + + for table_name in schema.table_names(): + schema.deregister_table(table_name) + + schema.register_table("test_table", create_dataset()) + + for catalog_name in all_catalog_names: + for schema_name in all_schema_names: + table_full_name = f"{catalog_name}.{schema_name}.test_table" + + batches = ctx.sql(f"select * from {table_full_name}").collect() + + assert len(batches) == 1 + assert batches[0].column(0) == pa.array([1, 2, 3]) + assert batches[0].column(1) == pa.array([4, 5, 6]) diff --git a/python/tests/test_context.py b/python/tests/test_context.py index 4a15ac9cf..6dbcc0d5e 100644 --- a/python/tests/test_context.py +++ b/python/tests/test_context.py @@ -57,7 +57,7 @@ def test_runtime_configs(tmp_path, path_to_str): ctx = SessionContext(config, runtime) assert ctx is not None - db = ctx.catalog("foo").database("bar") + db = ctx.catalog("foo").schema("bar") assert db is not None @@ -70,7 +70,7 @@ def test_temporary_files(tmp_path, path_to_str): ctx = SessionContext(config, runtime) assert ctx is not None - db = ctx.catalog("foo").database("bar") + db = ctx.catalog("foo").schema("bar") assert db is not None @@ -91,7 +91,7 @@ def test_create_context_with_all_valid_args(): ctx = SessionContext(config, runtime) # verify that at least some of the arguments worked - ctx.catalog("foo").database("bar") + ctx.catalog("foo").schema("bar") with pytest.raises(KeyError): ctx.catalog("datafusion") @@ -105,7 +105,7 @@ def test_register_record_batches(ctx): ctx.register_record_batches("t", [[batch]]) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} result = ctx.sql("SELECT a+b, a-b FROM t").collect() @@ -121,7 +121,7 @@ def test_create_dataframe_registers_unique_table_name(ctx): ) df = ctx.create_dataframe([[batch]]) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -141,7 +141,7 @@ def 
test_create_dataframe_registers_with_defined_table_name(ctx): ) df = ctx.create_dataframe([[batch]], name="tbl") - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -155,7 +155,7 @@ def test_from_arrow_table(ctx): # convert to DataFrame df = ctx.from_arrow(table) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -200,7 +200,7 @@ def test_from_arrow_table_with_name(ctx): # convert to DataFrame with optional name df = ctx.from_arrow(table, name="tbl") - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert tables[0] == "tbl" @@ -213,7 +213,7 @@ def test_from_arrow_table_empty(ctx): # convert to DataFrame df = ctx.from_arrow(table) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -228,7 +228,7 @@ def test_from_arrow_table_empty_no_schema(ctx): # convert to DataFrame df = ctx.from_arrow(table) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -246,7 +246,7 @@ def test_from_pylist(ctx): ] df = ctx.from_pylist(data) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -260,7 +260,7 @@ def test_from_pydict(ctx): data = {"a": [1, 2, 3], "b": [4, 5, 6]} df = ctx.from_pydict(data) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -276,7 +276,7 @@ def test_from_pandas(ctx): pandas_df = pd.DataFrame(data) df = ctx.from_pandas(pandas_df) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -292,7 +292,7 @@ def test_from_polars(ctx): polars_df = pd.DataFrame(data) df = ctx.from_polars(polars_df) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -303,7 +303,7 @@ def test_from_polars(ctx): def test_register_table(ctx, database): default = ctx.catalog() - public = default.database("public") + public = default.schema("public") assert public.names() == {"csv", "csv1", "csv2"} table = public.table("csv") @@ -313,7 +313,7 @@ def test_register_table(ctx, database): def test_read_table(ctx, database): default = ctx.catalog() - public = default.database("public") + public = default.schema("public") assert public.names() == {"csv", "csv1", "csv2"} table = public.table("csv") @@ -323,7 +323,7 @@ def test_read_table(ctx, database): def test_deregister_table(ctx, database): default = ctx.catalog() - public = default.database("public") + public = default.schema("public") assert public.names() == {"csv", "csv1", "csv2"} ctx.deregister_table("csv") @@ -339,7 +339,7 @@ def test_register_dataset(ctx): dataset = ds.dataset([batch]) ctx.register_dataset("t", dataset) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} result = ctx.sql("SELECT a+b, a-b FROM t").collect() @@ -356,7 +356,7 @@ def test_dataset_filter(ctx, capfd): dataset = ds.dataset([batch]) ctx.register_dataset("t", dataset) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} df = ctx.sql("SELECT a+b, a-b FROM t WHERE a BETWEEN 2 and 3 AND b > 5") 
# Make sure the filter was pushed down in Physical Plan @@ -455,7 +455,7 @@ def test_dataset_filter_nested_data(ctx): dataset = ds.dataset([batch]) ctx.register_dataset("t", dataset) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} df = ctx.table("t") diff --git a/python/tests/test_sql.py b/python/tests/test_sql.py index 41cee4ef3..c383edc60 100644 --- a/python/tests/test_sql.py +++ b/python/tests/test_sql.py @@ -75,7 +75,7 @@ def test_register_csv(ctx, tmp_path): ) ctx.register_csv("csv3", path, schema=alternative_schema) - assert ctx.catalog().database().names() == { + assert ctx.catalog().schema().names() == { "csv", "csv1", "csv2", @@ -150,7 +150,7 @@ def test_register_parquet(ctx, tmp_path): path = helpers.write_parquet(tmp_path / "a.parquet", helpers.data()) ctx.register_parquet("t", path) ctx.register_parquet("t1", str(path)) - assert ctx.catalog().database().names() == {"t", "t1"} + assert ctx.catalog().schema().names() == {"t", "t1"} result = ctx.sql("SELECT COUNT(a) AS cnt FROM t").collect() result = pa.Table.from_batches(result) @@ -188,7 +188,7 @@ def test_register_parquet_partitioned(ctx, tmp_path, path_to_str, legacy_data_ty parquet_pruning=True, file_extension=".parquet", ) - assert ctx.catalog().database().names() == {"datapp"} + assert ctx.catalog().schema().names() == {"datapp"} result = ctx.sql("SELECT grp, COUNT(*) AS cnt FROM datapp GROUP BY grp").collect() result = pa.Table.from_batches(result) @@ -204,7 +204,7 @@ def test_register_dataset(ctx, tmp_path, path_to_str): dataset = ds.dataset(path, format="parquet") ctx.register_dataset("t", dataset) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} result = ctx.sql("SELECT COUNT(a) AS cnt FROM t").collect() result = pa.Table.from_batches(result) @@ -251,7 +251,7 @@ def test_register_json(ctx, tmp_path): ) ctx.register_json("json3", path, schema=alternative_schema) - assert ctx.catalog().database().names() == { + assert ctx.catalog().schema().names() == { "json", "json1", "json2", @@ -308,7 +308,7 @@ def test_execute(ctx, tmp_path): path = helpers.write_parquet(tmp_path / "a.parquet", pa.array(data)) ctx.register_parquet("t", path) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} # count result = ctx.sql("SELECT COUNT(a) AS cnt FROM t WHERE a IS NOT NULL").collect() @@ -451,18 +451,10 @@ def test_udf( id="datetime_ns", ), # Not writtable to parquet - pytest.param( - helpers.data_timedelta("s"), id="timedelta_s", marks=pytest.mark.xfail - ), - pytest.param( - helpers.data_timedelta("ms"), id="timedelta_ms", marks=pytest.mark.xfail - ), - pytest.param( - helpers.data_timedelta("us"), id="timedelta_us", marks=pytest.mark.xfail - ), - pytest.param( - helpers.data_timedelta("ns"), id="timedelta_ns", marks=pytest.mark.xfail - ), + pytest.param(helpers.data_timedelta("s"), id="timedelta_s"), + pytest.param(helpers.data_timedelta("ms"), id="timedelta_ms"), + pytest.param(helpers.data_timedelta("us"), id="timedelta_us"), + pytest.param(helpers.data_timedelta("ns"), id="timedelta_ns"), ], ) def test_simple_select(ctx, tmp_path, arr): @@ -524,7 +516,7 @@ def test_register_listing_table( schema=table.schema if pass_schema else None, file_sort_order=file_sort_order, ) - assert ctx.catalog().database().names() == {"my_table"} + assert ctx.catalog().schema().names() == {"my_table"} result = ctx.sql( "SELECT grp, COUNT(*) AS count FROM my_table GROUP BY grp" diff --git 
a/python/tests/test_substrait.py b/python/tests/test_substrait.py index f367a447d..43aa327d4 100644 --- a/python/tests/test_substrait.py +++ b/python/tests/test_substrait.py @@ -34,7 +34,7 @@ def test_substrait_serialization(ctx): ctx.register_record_batches("t", [[batch]]) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} # For now just make sure the method calls blow up substrait_plan = ss.Serde.serialize_to_plan("SELECT * FROM t", ctx) @@ -59,7 +59,7 @@ def test_substrait_file_serialization(ctx, tmp_path, path_to_str): ctx.register_record_batches("t", [[batch]]) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} path = tmp_path / "substrait_plan" path = str(path) if path_to_str else path diff --git a/src/catalog.rs b/src/catalog.rs index 83f8d08cb..17d4ec3b8 100644 --- a/src/catalog.rs +++ b/src/catalog.rs @@ -15,44 +15,54 @@ // specific language governing permissions and limitations // under the License. -use std::collections::HashSet; -use std::sync::Arc; - -use pyo3::exceptions::PyKeyError; -use pyo3::prelude::*; - -use crate::errors::{PyDataFusionError, PyDataFusionResult}; -use crate::utils::wait_for_future; +use crate::dataset::Dataset; +use crate::errors::{py_datafusion_err, to_datafusion_err, PyDataFusionError, PyDataFusionResult}; +use crate::utils::{validate_pycapsule, wait_for_future}; +use async_trait::async_trait; +use datafusion::catalog::{MemoryCatalogProvider, MemorySchemaProvider}; +use datafusion::common::DataFusionError; use datafusion::{ arrow::pyarrow::ToPyArrow, catalog::{CatalogProvider, SchemaProvider}, datasource::{TableProvider, TableType}, }; +use datafusion_ffi::schema_provider::{FFI_SchemaProvider, ForeignSchemaProvider}; +use datafusion_ffi::table_provider::{FFI_TableProvider, ForeignTableProvider}; +use pyo3::exceptions::PyKeyError; +use pyo3::prelude::*; +use pyo3::types::PyCapsule; +use pyo3::IntoPyObjectExt; +use std::any::Any; +use std::collections::HashSet; +use std::sync::Arc; -#[pyclass(name = "Catalog", module = "datafusion", subclass)] +#[pyclass(name = "RawCatalog", module = "datafusion.catalog", subclass)] +#[derive(Clone)] pub struct PyCatalog { pub catalog: Arc, } -#[pyclass(name = "Database", module = "datafusion", subclass)] -pub struct PyDatabase { - pub database: Arc, +#[pyclass(name = "RawSchema", module = "datafusion.catalog", subclass)] +#[derive(Clone)] +pub struct PySchema { + pub schema: Arc, } -#[pyclass(name = "Table", module = "datafusion", subclass)] +#[pyclass(name = "RawTable", module = "datafusion.catalog", subclass)] +#[derive(Clone)] pub struct PyTable { pub table: Arc, } -impl PyCatalog { - pub fn new(catalog: Arc) -> Self { +impl From> for PyCatalog { + fn from(catalog: Arc) -> Self { Self { catalog } } } -impl PyDatabase { - pub fn new(database: Arc) -> Self { - Self { database } +impl From> for PySchema { + fn from(schema: Arc) -> Self { + Self { schema } } } @@ -68,36 +78,109 @@ impl PyTable { #[pymethods] impl PyCatalog { - fn names(&self) -> Vec { - self.catalog.schema_names() + #[new] + fn new(catalog: PyObject) -> Self { + let catalog_provider = + Arc::new(RustWrappedPyCatalogProvider::new(catalog)) as Arc; + catalog_provider.into() + } + + #[staticmethod] + fn memory_catalog() -> Self { + let catalog_provider = + Arc::new(MemoryCatalogProvider::default()) as Arc; + catalog_provider.into() + } + + fn schema_names(&self) -> HashSet { + self.catalog.schema_names().into_iter().collect() } #[pyo3(signature = 
(name="public"))] - fn database(&self, name: &str) -> PyResult { - match self.catalog.schema(name) { - Some(database) => Ok(PyDatabase::new(database)), - None => Err(PyKeyError::new_err(format!( - "Database with name {name} doesn't exist." - ))), - } + fn schema(&self, name: &str) -> PyResult { + let schema = self + .catalog + .schema(name) + .ok_or(PyKeyError::new_err(format!( + "Schema with name {name} doesn't exist." + )))?; + + Python::with_gil(|py| { + match schema + .as_any() + .downcast_ref::() + { + Some(wrapped_schema) => Ok(wrapped_schema.schema_provider.clone_ref(py)), + None => PySchema::from(schema).into_py_any(py), + } + }) + } + + fn register_schema(&self, name: &str, schema_provider: Bound<'_, PyAny>) -> PyResult<()> { + let provider = if schema_provider.hasattr("__datafusion_schema_provider__")? { + let capsule = schema_provider + .getattr("__datafusion_schema_provider__")? + .call0()?; + let capsule = capsule.downcast::().map_err(py_datafusion_err)?; + validate_pycapsule(capsule, "datafusion_schema_provider")?; + + let provider = unsafe { capsule.reference::() }; + let provider: ForeignSchemaProvider = provider.into(); + Arc::new(provider) as Arc + } else { + match schema_provider.extract::() { + Ok(py_schema) => py_schema.schema, + Err(_) => Arc::new(RustWrappedPySchemaProvider::new(schema_provider.into())) + as Arc, + } + }; + + let _ = self + .catalog + .register_schema(name, provider) + .map_err(py_datafusion_err)?; + + Ok(()) + } + + fn deregister_schema(&self, name: &str, cascade: bool) -> PyResult<()> { + let _ = self + .catalog + .deregister_schema(name, cascade) + .map_err(py_datafusion_err)?; + + Ok(()) } fn __repr__(&self) -> PyResult { - Ok(format!( - "Catalog(schema_names=[{}])", - self.names().join(";") - )) + let mut names: Vec = self.schema_names().into_iter().collect(); + names.sort(); + Ok(format!("Catalog(schema_names=[{}])", names.join(", "))) } } #[pymethods] -impl PyDatabase { - fn names(&self) -> HashSet { - self.database.table_names().into_iter().collect() +impl PySchema { + #[new] + fn new(schema_provider: PyObject) -> Self { + let schema_provider = + Arc::new(RustWrappedPySchemaProvider::new(schema_provider)) as Arc; + schema_provider.into() + } + + #[staticmethod] + fn memory_schema() -> Self { + let schema_provider = Arc::new(MemorySchemaProvider::default()) as Arc; + schema_provider.into() + } + + #[getter] + fn table_names(&self) -> HashSet { + self.schema.table_names().into_iter().collect() } fn table(&self, name: &str, py: Python) -> PyDataFusionResult { - if let Some(table) = wait_for_future(py, self.database.table(name))?? { + if let Some(table) = wait_for_future(py, self.schema.table(name))?? { Ok(PyTable::new(table)) } else { Err(PyDataFusionError::Common(format!( @@ -107,14 +190,49 @@ impl PyDatabase { } fn __repr__(&self) -> PyResult { - Ok(format!( - "Database(table_names=[{}])", - Vec::from_iter(self.names()).join(";") - )) + let mut names: Vec = self.table_names().into_iter().collect(); + names.sort(); + Ok(format!("Schema(table_names=[{}])", names.join(";"))) } - // register_table - // deregister_table + fn register_table(&self, name: &str, table_provider: Bound<'_, PyAny>) -> PyResult<()> { + let provider = if table_provider.hasattr("__datafusion_table_provider__")? { + let capsule = table_provider + .getattr("__datafusion_table_provider__")? 
+ .call0()?; + let capsule = capsule.downcast::().map_err(py_datafusion_err)?; + validate_pycapsule(capsule, "datafusion_table_provider")?; + + let provider = unsafe { capsule.reference::() }; + let provider: ForeignTableProvider = provider.into(); + Arc::new(provider) as Arc + } else { + match table_provider.extract::() { + Ok(py_table) => py_table.table, + Err(_) => { + let py = table_provider.py(); + let provider = Dataset::new(&table_provider, py)?; + Arc::new(provider) as Arc + } + } + }; + + let _ = self + .schema + .register_table(name.to_string(), provider) + .map_err(py_datafusion_err)?; + + Ok(()) + } + + fn deregister_table(&self, name: &str) -> PyResult<()> { + let _ = self + .schema + .deregister_table(name) + .map_err(py_datafusion_err)?; + + Ok(()) + } } #[pymethods] @@ -125,6 +243,14 @@ impl PyTable { self.table.schema().to_pyarrow(py) } + #[staticmethod] + fn from_dataset(py: Python<'_>, dataset: &Bound<'_, PyAny>) -> PyResult { + let ds = Arc::new(Dataset::new(dataset, py).map_err(py_datafusion_err)?) + as Arc; + + Ok(Self::new(ds)) + } + /// Get the type of this table for metadata/catalog purposes. #[getter] fn kind(&self) -> &str { @@ -145,3 +271,285 @@ impl PyTable { // fn has_exact_statistics // fn supports_filter_pushdown } + +#[derive(Debug)] +pub(crate) struct RustWrappedPySchemaProvider { + schema_provider: PyObject, + owner_name: Option, +} + +impl RustWrappedPySchemaProvider { + pub fn new(schema_provider: PyObject) -> Self { + let owner_name = Python::with_gil(|py| { + schema_provider + .bind(py) + .getattr("owner_name") + .ok() + .map(|name| name.to_string()) + }); + + Self { + schema_provider, + owner_name, + } + } + + fn table_inner(&self, name: &str) -> PyResult>> { + Python::with_gil(|py| { + let provider = self.schema_provider.bind(py); + let py_table_method = provider.getattr("table")?; + + let py_table = py_table_method.call((name,), None)?; + if py_table.is_none() { + return Ok(None); + } + + if py_table.hasattr("__datafusion_table_provider__")? 
{ + let capsule = provider.getattr("__datafusion_table_provider__")?.call0()?; + let capsule = capsule.downcast::().map_err(py_datafusion_err)?; + validate_pycapsule(capsule, "datafusion_table_provider")?; + + let provider = unsafe { capsule.reference::() }; + let provider: ForeignTableProvider = provider.into(); + + Ok(Some(Arc::new(provider) as Arc)) + } else { + if let Ok(inner_table) = py_table.getattr("table") { + if let Ok(inner_table) = inner_table.extract::() { + return Ok(Some(inner_table.table)); + } + } + + match py_table.extract::() { + Ok(py_table) => Ok(Some(py_table.table)), + Err(_) => { + let ds = Dataset::new(&py_table, py).map_err(py_datafusion_err)?; + Ok(Some(Arc::new(ds) as Arc)) + } + } + } + }) + } +} + +#[async_trait] +impl SchemaProvider for RustWrappedPySchemaProvider { + fn owner_name(&self) -> Option<&str> { + self.owner_name.as_deref() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + Python::with_gil(|py| { + let provider = self.schema_provider.bind(py); + + provider + .getattr("table_names") + .and_then(|names| names.extract::>()) + .unwrap_or_else(|err| { + log::error!("Unable to get table_names: {err}"); + Vec::default() + }) + }) + } + + async fn table( + &self, + name: &str, + ) -> datafusion::common::Result>, DataFusionError> { + self.table_inner(name).map_err(to_datafusion_err) + } + + fn register_table( + &self, + name: String, + table: Arc, + ) -> datafusion::common::Result>> { + let py_table = PyTable::new(table); + Python::with_gil(|py| { + let provider = self.schema_provider.bind(py); + let _ = provider + .call_method1("register_table", (name, py_table)) + .map_err(to_datafusion_err)?; + // Since the definition of `register_table` says that an error + // will be returned if the table already exists, there is no + // case where we want to return a table provider as output. + Ok(None) + }) + } + + fn deregister_table( + &self, + name: &str, + ) -> datafusion::common::Result>> { + Python::with_gil(|py| { + let provider = self.schema_provider.bind(py); + let table = provider + .call_method1("deregister_table", (name,)) + .map_err(to_datafusion_err)?; + if table.is_none() { + return Ok(None); + } + + // If we can turn this table provider into a `Dataset`, return it. + // Otherwise, return None. + let dataset = match Dataset::new(&table, py) { + Ok(dataset) => Some(Arc::new(dataset) as Arc), + Err(_) => None, + }; + + Ok(dataset) + }) + } + + fn table_exist(&self, name: &str) -> bool { + Python::with_gil(|py| { + let provider = self.schema_provider.bind(py); + provider + .call_method1("table_exist", (name,)) + .and_then(|pyobj| pyobj.extract()) + .unwrap_or(false) + }) + } +} + +#[derive(Debug)] +pub(crate) struct RustWrappedPyCatalogProvider { + pub(crate) catalog_provider: PyObject, +} + +impl RustWrappedPyCatalogProvider { + pub fn new(catalog_provider: PyObject) -> Self { + Self { catalog_provider } + } + + fn schema_inner(&self, name: &str) -> PyResult>> { + Python::with_gil(|py| { + let provider = self.catalog_provider.bind(py); + + let py_schema = provider.call_method1("schema", (name,))?; + if py_schema.is_none() { + return Ok(None); + } + + if py_schema.hasattr("__datafusion_schema_provider__")? { + let capsule = provider + .getattr("__datafusion_schema_provider__")? 
+ .call0()?; + let capsule = capsule.downcast::().map_err(py_datafusion_err)?; + validate_pycapsule(capsule, "datafusion_schema_provider")?; + + let provider = unsafe { capsule.reference::() }; + let provider: ForeignSchemaProvider = provider.into(); + + Ok(Some(Arc::new(provider) as Arc)) + } else { + if let Ok(inner_schema) = py_schema.getattr("schema") { + if let Ok(inner_schema) = inner_schema.extract::() { + return Ok(Some(inner_schema.schema)); + } + } + match py_schema.extract::() { + Ok(inner_schema) => Ok(Some(inner_schema.schema)), + Err(_) => { + let py_schema = RustWrappedPySchemaProvider::new(py_schema.into()); + + Ok(Some(Arc::new(py_schema) as Arc)) + } + } + } + }) + } +} + +#[async_trait] +impl CatalogProvider for RustWrappedPyCatalogProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + Python::with_gil(|py| { + let provider = self.catalog_provider.bind(py); + provider + .getattr("schema_names") + .and_then(|names| names.extract::>()) + .unwrap_or_else(|err| { + log::error!("Unable to get schema_names: {err}"); + Vec::default() + }) + }) + } + + fn schema(&self, name: &str) -> Option> { + self.schema_inner(name).unwrap_or_else(|err| { + log::error!("CatalogProvider schema returned error: {err}"); + None + }) + } + + fn register_schema( + &self, + name: &str, + schema: Arc, + ) -> datafusion::common::Result>> { + // JRIGHT HERE + // let py_schema: PySchema = schema.into(); + Python::with_gil(|py| { + let py_schema = match schema + .as_any() + .downcast_ref::() + { + Some(wrapped_schema) => wrapped_schema.schema_provider.as_any(), + None => &PySchema::from(schema) + .into_py_any(py) + .map_err(to_datafusion_err)?, + }; + + let provider = self.catalog_provider.bind(py); + let schema = provider + .call_method1("register_schema", (name, py_schema)) + .map_err(to_datafusion_err)?; + if schema.is_none() { + return Ok(None); + } + + let schema = Arc::new(RustWrappedPySchemaProvider::new(schema.into())) + as Arc; + + Ok(Some(schema)) + }) + } + + fn deregister_schema( + &self, + name: &str, + cascade: bool, + ) -> datafusion::common::Result>> { + Python::with_gil(|py| { + let provider = self.catalog_provider.bind(py); + let schema = provider + .call_method1("deregister_schema", (name, cascade)) + .map_err(to_datafusion_err)?; + if schema.is_none() { + return Ok(None); + } + + let schema = Arc::new(RustWrappedPySchemaProvider::new(schema.into())) + as Arc; + + Ok(Some(schema)) + }) + } +} + +pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + + Ok(()) +} diff --git a/src/common/data_type.rs b/src/common/data_type.rs index f5f8a6b06..5cf9d6e9f 100644 --- a/src/common/data_type.rs +++ b/src/common/data_type.rs @@ -172,7 +172,7 @@ impl DataTypeMap { SqlType::DATE, )), DataType::Duration(_) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), DataType::Interval(interval_unit) => Ok(DataTypeMap::new( DataType::Interval(*interval_unit), @@ -189,7 +189,7 @@ impl DataTypeMap { SqlType::BINARY, )), DataType::FixedSizeBinary(_) => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", arrow_type)), + DataFusionError::NotImplemented(format!("{arrow_type:?}")), )), DataType::LargeBinary => Ok(DataTypeMap::new( DataType::LargeBinary, @@ -207,23 +207,22 @@ impl DataTypeMap { SqlType::VARCHAR, )), DataType::List(_) => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - 
arrow_type + "{arrow_type:?}" )))), DataType::FixedSizeList(_, _) => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", arrow_type)), + DataFusionError::NotImplemented(format!("{arrow_type:?}")), )), DataType::LargeList(_) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), DataType::Struct(_) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), DataType::Union(_, _) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), DataType::Dictionary(_, _) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), DataType::Decimal128(precision, scale) => Ok(DataTypeMap::new( DataType::Decimal128(*precision, *scale), @@ -236,23 +235,22 @@ impl DataTypeMap { SqlType::DECIMAL, )), DataType::Map(_, _) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), DataType::RunEndEncoded(_, _) => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", arrow_type)), + DataFusionError::NotImplemented(format!("{arrow_type:?}")), )), DataType::BinaryView => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), DataType::Utf8View => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - arrow_type + "{arrow_type:?}" )))), DataType::ListView(_) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), DataType::LargeListView(_) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), } } @@ -379,8 +377,7 @@ impl DataTypeMap { "double" => Ok(DataType::Float64), "byte_array" => Ok(DataType::Utf8), _ => Err(PyValueError::new_err(format!( - "Unable to determine Arrow Data Type from Parquet String type: {:?}", - parquet_str_type + "Unable to determine Arrow Data Type from Parquet String type: {parquet_str_type:?}" ))), }; DataTypeMap::map_from_arrow_type(&arrow_dtype?) 
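The data_type.rs hunks in this stretch only inline the format arguments (`format!("{arrow_type:?}")` instead of `format!("{:?}", arrow_type)`); the mapping logic is unchanged. For readers skimming past them, the Parquet-string lookup that the function above finishes with behaves roughly like the following Python paraphrase. Only the match arms visible in this hunk are included, and the helper name and `ValueError` belong to the sketch, not to the package API:

    import pyarrow as pa

    # Visible arms of the Rust match; the real function handles more strings.
    _PARQUET_STR_TO_ARROW = {
        "double": pa.float64(),
        "byte_array": pa.string(),
    }


    def map_from_parquet_type_str(parquet_str_type: str) -> pa.DataType:
        # Resolve the string to an Arrow type, mirroring the PyValueError arm
        # with a Python ValueError.
        try:
            return _PARQUET_STR_TO_ARROW[parquet_str_type]
        except KeyError:
            msg = (
                "Unable to determine Arrow Data Type from Parquet String type: "
                f"{parquet_str_type!r}"
            )
            raise ValueError(msg) from None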
@@ -404,12 +401,10 @@ impl DataTypeMap { pub fn py_map_from_sql_type(sql_type: &SqlType) -> PyResult { match sql_type { SqlType::ANY => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::ARRAY => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::BIGINT => Ok(DataTypeMap::new( DataType::Int64, @@ -432,11 +427,10 @@ impl DataTypeMap { SqlType::CHAR, )), SqlType::COLUMN_LIST => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::CURSOR => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::DATE => Ok(DataTypeMap::new( DataType::Date64, @@ -449,8 +443,7 @@ impl DataTypeMap { SqlType::DECIMAL, )), SqlType::DISTINCT => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::DOUBLE => Ok(DataTypeMap::new( DataType::Decimal256(1, 1), @@ -458,7 +451,7 @@ impl DataTypeMap { SqlType::DOUBLE, )), SqlType::DYNAMIC_STAR => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::FLOAT => Ok(DataTypeMap::new( DataType::Decimal128(1, 1), @@ -466,8 +459,7 @@ impl DataTypeMap { SqlType::FLOAT, )), SqlType::GEOMETRY => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::INTEGER => Ok(DataTypeMap::new( DataType::Int8, @@ -475,55 +467,52 @@ impl DataTypeMap { SqlType::INTEGER, )), SqlType::INTERVAL => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::INTERVAL_DAY => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::INTERVAL_DAY_HOUR => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::INTERVAL_DAY_MINUTE => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::INTERVAL_DAY_SECOND => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::INTERVAL_HOUR => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::INTERVAL_HOUR_MINUTE => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::INTERVAL_HOUR_SECOND => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::INTERVAL_MINUTE => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::INTERVAL_MINUTE_SECOND => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::INTERVAL_MONTH => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::INTERVAL_SECOND => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), 
SqlType::INTERVAL_YEAR => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::INTERVAL_YEAR_MONTH => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::MAP => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::MULTISET => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::NULL => Ok(DataTypeMap::new( DataType::Null, @@ -531,20 +520,16 @@ impl DataTypeMap { SqlType::NULL, )), SqlType::OTHER => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::REAL => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::ROW => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::SARG => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::SMALLINT => Ok(DataTypeMap::new( DataType::Int16, @@ -552,25 +537,22 @@ impl DataTypeMap { SqlType::SMALLINT, )), SqlType::STRUCTURED => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::SYMBOL => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::TIME => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::TIME_WITH_LOCAL_TIME_ZONE => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::TIMESTAMP => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::TIMESTAMP_WITH_LOCAL_TIME_ZONE => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::TINYINT => Ok(DataTypeMap::new( DataType::Int8, @@ -578,8 +560,7 @@ impl DataTypeMap { SqlType::TINYINT, )), SqlType::UNKNOWN => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::VARBINARY => Ok(DataTypeMap::new( DataType::LargeBinary, @@ -682,8 +663,7 @@ impl PyDataType { "datetime64" => Ok(DataType::Date64), "object" => Ok(DataType::Utf8), _ => Err(PyValueError::new_err(format!( - "Unable to determine Arrow Data Type from Arrow String type: {:?}", - arrow_str_type + "Unable to determine Arrow Data Type from Arrow String type: {arrow_str_type:?}" ))), }; Ok(PyDataType { diff --git a/src/context.rs b/src/context.rs index 6ce1f12bc..36133a33d 100644 --- a/src/context.rs +++ b/src/context.rs @@ -31,7 +31,7 @@ use uuid::Uuid; use pyo3::exceptions::{PyKeyError, PyValueError}; use pyo3::prelude::*; -use crate::catalog::{PyCatalog, PyTable}; +use crate::catalog::{PyCatalog, PyTable, RustWrappedPyCatalogProvider}; use crate::dataframe::PyDataFrame; use crate::dataset::Dataset; use crate::errors::{py_datafusion_err, to_datafusion_err, PyDataFusionResult}; @@ -49,6 +49,7 @@ use crate::utils::{get_global_ctx, get_tokio_runtime, validate_pycapsule, wait_f use datafusion::arrow::datatypes::{DataType, Schema, SchemaRef}; use datafusion::arrow::pyarrow::PyArrowType; 
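The context.rs import hunk underway here feeds the new `register_catalog_provider` method shown shortly below, which dispatches on three provider shapes: an object exporting an FFI PyCapsule, a `PyCatalog`, or any other Python object. A hedged Python-side sketch of those paths, using only classes that appear in this patch (`EmptyCatalog` is invented for the example):

    from datafusion import SessionContext
    from datafusion.catalog import Catalog, CatalogProvider


    class EmptyCatalog(CatalogProvider):
        # Smallest possible ABC implementation, used only to show dispatch.
        def schema_names(self) -> set[str]:
            return set()

        def schema(self, name: str):
            return None


    ctx = SessionContext()

    # Path 1: objects exposing __datafusion_catalog_provider__ are unwrapped
    # from the FFI PyCapsule (not constructible in pure Python, so not shown).
    # Path 2: a Catalog, such as the in-memory catalog, is unwrapped directly.
    ctx.register_catalog_provider("mem_catalog", Catalog.memory_catalog())
    # Path 3: anything else is wrapped in RustWrappedPyCatalogProvider.
    ctx.register_catalog_provider("py_catalog", EmptyCatalog())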
use datafusion::arrow::record_batch::RecordBatch; +use datafusion::catalog::CatalogProvider; use datafusion::common::TableReference; use datafusion::common::{exec_err, ScalarValue}; use datafusion::datasource::file_format::file_compression_type::FileCompressionType; @@ -69,8 +70,10 @@ use datafusion::physical_plan::SendableRecordBatchStream; use datafusion::prelude::{ AvroReadOptions, CsvReadOptions, DataFrame, NdJsonReadOptions, ParquetReadOptions, }; +use datafusion_ffi::catalog_provider::{FFI_CatalogProvider, ForeignCatalogProvider}; use datafusion_ffi::table_provider::{FFI_TableProvider, ForeignTableProvider}; use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple, PyType}; +use pyo3::IntoPyObjectExt; use tokio::task::JoinHandle; /// Configuration options for a SessionContext @@ -365,7 +368,7 @@ impl PySessionContext { } else { &upstream_host }; - let url_string = format!("{}{}", scheme, derived_host); + let url_string = format!("{scheme}{derived_host}"); let url = Url::parse(&url_string).unwrap(); self.ctx.runtime_env().register_object_store(&url, store); Ok(()) @@ -614,6 +617,34 @@ impl PySessionContext { Ok(()) } + pub fn register_catalog_provider( + &mut self, + name: &str, + provider: Bound<'_, PyAny>, + ) -> PyDataFusionResult<()> { + let provider = if provider.hasattr("__datafusion_catalog_provider__")? { + let capsule = provider + .getattr("__datafusion_catalog_provider__")? + .call0()?; + let capsule = capsule.downcast::().map_err(py_datafusion_err)?; + validate_pycapsule(capsule, "datafusion_catalog_provider")?; + + let provider = unsafe { capsule.reference::() }; + let provider: ForeignCatalogProvider = provider.into(); + Arc::new(provider) as Arc + } else { + match provider.extract::() { + Ok(py_catalog) => py_catalog.catalog, + Err(_) => Arc::new(RustWrappedPyCatalogProvider::new(provider.into())) + as Arc, + } + }; + + let _ = self.ctx.register_catalog(name, provider); + + Ok(()) + } + /// Construct datafusion dataframe from Arrow Table pub fn register_table_provider( &mut self, @@ -845,14 +876,24 @@ impl PySessionContext { } #[pyo3(signature = (name="datafusion"))] - pub fn catalog(&self, name: &str) -> PyResult { - match self.ctx.catalog(name) { - Some(catalog) => Ok(PyCatalog::new(catalog)), - None => Err(PyKeyError::new_err(format!( - "Catalog with name {} doesn't exist.", - &name, - ))), - } + pub fn catalog(&self, name: &str) -> PyResult { + let catalog = self.ctx.catalog(name).ok_or(PyKeyError::new_err(format!( + "Catalog with name {name} doesn't exist." 
+ )))?; + + Python::with_gil(|py| { + match catalog + .as_any() + .downcast_ref::() + { + Some(wrapped_schema) => Ok(wrapped_schema.catalog_provider.clone_ref(py)), + None => PyCatalog::from(catalog).into_py_any(py), + } + }) + } + + pub fn catalog_names(&self) -> HashSet { + self.ctx.catalog_names().into_iter().collect() } pub fn tables(&self) -> HashSet { diff --git a/src/expr.rs b/src/expr.rs index 6b1d01d65..f1e002367 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -171,12 +171,10 @@ impl PyExpr { Expr::Cast(value) => Ok(cast::PyCast::from(value.clone()).into_bound_py_any(py)?), Expr::TryCast(value) => Ok(cast::PyTryCast::from(value.clone()).into_bound_py_any(py)?), Expr::ScalarFunction(value) => Err(py_unsupported_variant_err(format!( - "Converting Expr::ScalarFunction to a Python object is not implemented: {:?}", - value + "Converting Expr::ScalarFunction to a Python object is not implemented: {value:?}" ))), Expr::WindowFunction(value) => Err(py_unsupported_variant_err(format!( - "Converting Expr::WindowFunction to a Python object is not implemented: {:?}", - value + "Converting Expr::WindowFunction to a Python object is not implemented: {value:?}" ))), Expr::InList(value) => Ok(in_list::PyInList::from(value.clone()).into_bound_py_any(py)?), Expr::Exists(value) => Ok(exists::PyExists::from(value.clone()).into_bound_py_any(py)?), @@ -188,8 +186,7 @@ impl PyExpr { } #[allow(deprecated)] Expr::Wildcard { qualifier, options } => Err(py_unsupported_variant_err(format!( - "Converting Expr::Wildcard to a Python object is not implemented : {:?} {:?}", - qualifier, options + "Converting Expr::Wildcard to a Python object is not implemented : {qualifier:?} {options:?}" ))), Expr::GroupingSet(value) => { Ok(grouping_set::PyGroupingSet::from(value.clone()).into_bound_py_any(py)?) @@ -198,8 +195,7 @@ impl PyExpr { Ok(placeholder::PyPlaceholder::from(value.clone()).into_bound_py_any(py)?) 
} Expr::OuterReferenceColumn(data_type, column) => Err(py_unsupported_variant_err(format!( - "Converting Expr::OuterReferenceColumn to a Python object is not implemented: {:?} - {:?}", - data_type, column + "Converting Expr::OuterReferenceColumn to a Python object is not implemented: {data_type:?} - {column:?}" ))), Expr::Unnest(value) => Ok(unnest_expr::PyUnnestExpr::from(value.clone()).into_bound_py_any(py)?), } @@ -755,8 +751,7 @@ impl PyExpr { Expr::Cast(Cast { expr: _, data_type }) => DataTypeMap::map_from_arrow_type(data_type), Expr::Literal(scalar_value, _) => DataTypeMap::map_from_scalar_value(scalar_value), _ => Err(py_type_err(format!( - "Non Expr::Literal encountered in types: {:?}", - expr + "Non Expr::Literal encountered in types: {expr:?}" ))), } } diff --git a/src/expr/aggregate.rs b/src/expr/aggregate.rs index a99d83d23..fd4393271 100644 --- a/src/expr/aggregate.rs +++ b/src/expr/aggregate.rs @@ -116,7 +116,7 @@ impl PyAggregate { } fn __repr__(&self) -> PyResult { - Ok(format!("Aggregate({})", self)) + Ok(format!("Aggregate({self})")) } } diff --git a/src/expr/aggregate_expr.rs b/src/expr/aggregate_expr.rs index c09f116e3..7c5d3d31f 100644 --- a/src/expr/aggregate_expr.rs +++ b/src/expr/aggregate_expr.rs @@ -75,6 +75,6 @@ impl PyAggregateFunction { /// Get a String representation of this column fn __repr__(&self) -> String { - format!("{}", self) + format!("{self}") } } diff --git a/src/expr/alias.rs b/src/expr/alias.rs index e8e03cfad..40746f200 100644 --- a/src/expr/alias.rs +++ b/src/expr/alias.rs @@ -64,6 +64,6 @@ impl PyAlias { /// Get a String representation of this column fn __repr__(&self) -> String { - format!("{}", self) + format!("{self}") } } diff --git a/src/expr/analyze.rs b/src/expr/analyze.rs index 62f93cd26..e8081e95b 100644 --- a/src/expr/analyze.rs +++ b/src/expr/analyze.rs @@ -69,7 +69,7 @@ impl PyAnalyze { } fn __repr__(&self) -> PyResult { - Ok(format!("Analyze({})", self)) + Ok(format!("Analyze({self})")) } } diff --git a/src/expr/between.rs b/src/expr/between.rs index a2cac1442..817f1baae 100644 --- a/src/expr/between.rs +++ b/src/expr/between.rs @@ -71,6 +71,6 @@ impl PyBetween { } fn __repr__(&self) -> String { - format!("{}", self) + format!("{self}") } } diff --git a/src/expr/column.rs b/src/expr/column.rs index 365dbc0d2..50f316f1c 100644 --- a/src/expr/column.rs +++ b/src/expr/column.rs @@ -45,7 +45,7 @@ impl PyColumn { /// Get the column relation fn relation(&self) -> Option { - self.col.relation.as_ref().map(|r| format!("{}", r)) + self.col.relation.as_ref().map(|r| format!("{r}")) } /// Get the fully-qualified column name diff --git a/src/expr/copy_to.rs b/src/expr/copy_to.rs index ebfcb8ebc..473dabfed 100644 --- a/src/expr/copy_to.rs +++ b/src/expr/copy_to.rs @@ -106,7 +106,7 @@ impl PyCopyTo { } fn __repr__(&self) -> PyResult { - Ok(format!("CopyTo({})", self)) + Ok(format!("CopyTo({self})")) } fn __name__(&self) -> PyResult { @@ -129,7 +129,7 @@ impl Display for PyFileType { #[pymethods] impl PyFileType { fn __repr__(&self) -> PyResult { - Ok(format!("FileType({})", self)) + Ok(format!("FileType({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/create_catalog.rs b/src/expr/create_catalog.rs index f4ea0f517..d2d2ee8f6 100644 --- a/src/expr/create_catalog.rs +++ b/src/expr/create_catalog.rs @@ -81,7 +81,7 @@ impl PyCreateCatalog { } fn __repr__(&self) -> PyResult { - Ok(format!("CreateCatalog({})", self)) + Ok(format!("CreateCatalog({self})")) } fn __name__(&self) -> PyResult { diff --git 
a/src/expr/create_catalog_schema.rs b/src/expr/create_catalog_schema.rs index 85f447e1e..e794962f5 100644 --- a/src/expr/create_catalog_schema.rs +++ b/src/expr/create_catalog_schema.rs @@ -81,7 +81,7 @@ impl PyCreateCatalogSchema { } fn __repr__(&self) -> PyResult { - Ok(format!("CreateCatalogSchema({})", self)) + Ok(format!("CreateCatalogSchema({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/create_external_table.rs b/src/expr/create_external_table.rs index 01ce7d0ca..3e35af006 100644 --- a/src/expr/create_external_table.rs +++ b/src/expr/create_external_table.rs @@ -164,7 +164,7 @@ impl PyCreateExternalTable { } fn __repr__(&self) -> PyResult { - Ok(format!("CreateExternalTable({})", self)) + Ok(format!("CreateExternalTable({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/create_function.rs b/src/expr/create_function.rs index 6f3c3f0ff..c02ceebb1 100644 --- a/src/expr/create_function.rs +++ b/src/expr/create_function.rs @@ -163,7 +163,7 @@ impl PyCreateFunction { } fn __repr__(&self) -> PyResult { - Ok(format!("CreateFunction({})", self)) + Ok(format!("CreateFunction({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/create_index.rs b/src/expr/create_index.rs index 13dadbc3f..0f4b5011a 100644 --- a/src/expr/create_index.rs +++ b/src/expr/create_index.rs @@ -110,7 +110,7 @@ impl PyCreateIndex { } fn __repr__(&self) -> PyResult { - Ok(format!("CreateIndex({})", self)) + Ok(format!("CreateIndex({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/create_memory_table.rs b/src/expr/create_memory_table.rs index 8872b2d47..37f4d3420 100644 --- a/src/expr/create_memory_table.rs +++ b/src/expr/create_memory_table.rs @@ -78,7 +78,7 @@ impl PyCreateMemoryTable { } fn __repr__(&self) -> PyResult { - Ok(format!("CreateMemoryTable({})", self)) + Ok(format!("CreateMemoryTable({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/create_view.rs b/src/expr/create_view.rs index 87bb76876..718e404d0 100644 --- a/src/expr/create_view.rs +++ b/src/expr/create_view.rs @@ -75,7 +75,7 @@ impl PyCreateView { } fn __repr__(&self) -> PyResult { - Ok(format!("CreateView({})", self)) + Ok(format!("CreateView({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/describe_table.rs b/src/expr/describe_table.rs index 5658a13f2..6c48f3c77 100644 --- a/src/expr/describe_table.rs +++ b/src/expr/describe_table.rs @@ -61,7 +61,7 @@ impl PyDescribeTable { } fn __repr__(&self) -> PyResult { - Ok(format!("DescribeTable({})", self)) + Ok(format!("DescribeTable({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/distinct.rs b/src/expr/distinct.rs index b62b776f8..889e7099d 100644 --- a/src/expr/distinct.rs +++ b/src/expr/distinct.rs @@ -48,8 +48,7 @@ impl Display for PyDistinct { Distinct::All(input) => write!( f, "Distinct ALL - \nInput: {:?}", - input, + \nInput: {input:?}", ), Distinct::On(distinct_on) => { write!( @@ -71,7 +70,7 @@ impl PyDistinct { } fn __repr__(&self) -> PyResult { - Ok(format!("Distinct({})", self)) + Ok(format!("Distinct({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/drop_catalog_schema.rs b/src/expr/drop_catalog_schema.rs index b7420a99c..b4a4c521c 100644 --- a/src/expr/drop_catalog_schema.rs +++ b/src/expr/drop_catalog_schema.rs @@ -101,7 +101,7 @@ impl PyDropCatalogSchema { } fn __repr__(&self) -> PyResult { - Ok(format!("DropCatalogSchema({})", self)) + Ok(format!("DropCatalogSchema({self})")) } } diff --git a/src/expr/drop_function.rs b/src/expr/drop_function.rs 
index 9fbd78fdc..fca9eb94b 100644 --- a/src/expr/drop_function.rs +++ b/src/expr/drop_function.rs @@ -76,7 +76,7 @@ impl PyDropFunction { } fn __repr__(&self) -> PyResult { - Ok(format!("DropFunction({})", self)) + Ok(format!("DropFunction({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/drop_table.rs b/src/expr/drop_table.rs index 96983c1cf..3f442539a 100644 --- a/src/expr/drop_table.rs +++ b/src/expr/drop_table.rs @@ -70,7 +70,7 @@ impl PyDropTable { } fn __repr__(&self) -> PyResult { - Ok(format!("DropTable({})", self)) + Ok(format!("DropTable({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/drop_view.rs b/src/expr/drop_view.rs index 1d1ab1e59..6196c8bb5 100644 --- a/src/expr/drop_view.rs +++ b/src/expr/drop_view.rs @@ -83,7 +83,7 @@ impl PyDropView { } fn __repr__(&self) -> PyResult { - Ok(format!("DropView({})", self)) + Ok(format!("DropView({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/empty_relation.rs b/src/expr/empty_relation.rs index a1534ac15..758213423 100644 --- a/src/expr/empty_relation.rs +++ b/src/expr/empty_relation.rs @@ -65,7 +65,7 @@ impl PyEmptyRelation { /// Get a String representation of this column fn __repr__(&self) -> String { - format!("{}", self) + format!("{self}") } fn __name__(&self) -> PyResult { diff --git a/src/expr/filter.rs b/src/expr/filter.rs index 9bdb667cd..4fcb600cd 100644 --- a/src/expr/filter.rs +++ b/src/expr/filter.rs @@ -72,7 +72,7 @@ impl PyFilter { } fn __repr__(&self) -> String { - format!("Filter({})", self) + format!("Filter({self})") } } diff --git a/src/expr/join.rs b/src/expr/join.rs index 76ec532e7..b8d1d9da7 100644 --- a/src/expr/join.rs +++ b/src/expr/join.rs @@ -177,7 +177,7 @@ impl PyJoin { } fn __repr__(&self) -> PyResult { - Ok(format!("Join({})", self)) + Ok(format!("Join({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/like.rs b/src/expr/like.rs index 2e1f060bd..f180f5d4c 100644 --- a/src/expr/like.rs +++ b/src/expr/like.rs @@ -75,7 +75,7 @@ impl PyLike { } fn __repr__(&self) -> String { - format!("Like({})", self) + format!("Like({self})") } } @@ -133,7 +133,7 @@ impl PyILike { } fn __repr__(&self) -> String { - format!("Like({})", self) + format!("Like({self})") } } @@ -191,6 +191,6 @@ impl PySimilarTo { } fn __repr__(&self) -> String { - format!("Like({})", self) + format!("Like({self})") } } diff --git a/src/expr/limit.rs b/src/expr/limit.rs index c2a33ff89..92552814e 100644 --- a/src/expr/limit.rs +++ b/src/expr/limit.rs @@ -81,7 +81,7 @@ impl PyLimit { } fn __repr__(&self) -> PyResult { - Ok(format!("Limit({})", self)) + Ok(format!("Limit({self})")) } } diff --git a/src/expr/projection.rs b/src/expr/projection.rs index dc7e5e3c1..b5a9ef34a 100644 --- a/src/expr/projection.rs +++ b/src/expr/projection.rs @@ -85,7 +85,7 @@ impl PyProjection { } fn __repr__(&self) -> PyResult { - Ok(format!("Projection({})", self)) + Ok(format!("Projection({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/recursive_query.rs b/src/expr/recursive_query.rs index 65181f7d3..2517b7417 100644 --- a/src/expr/recursive_query.rs +++ b/src/expr/recursive_query.rs @@ -89,7 +89,7 @@ impl PyRecursiveQuery { } fn __repr__(&self) -> PyResult { - Ok(format!("RecursiveQuery({})", self)) + Ok(format!("RecursiveQuery({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/repartition.rs b/src/expr/repartition.rs index 3e782d6af..48b5e7041 100644 --- a/src/expr/repartition.rs +++ b/src/expr/repartition.rs @@ -108,7 +108,7 @@ impl PyRepartition { } fn 
__repr__(&self) -> PyResult { - Ok(format!("Repartition({})", self)) + Ok(format!("Repartition({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/sort.rs b/src/expr/sort.rs index ed4947591..79a8aee50 100644 --- a/src/expr/sort.rs +++ b/src/expr/sort.rs @@ -87,7 +87,7 @@ impl PySort { } fn __repr__(&self) -> PyResult { - Ok(format!("Sort({})", self)) + Ok(format!("Sort({self})")) } } diff --git a/src/expr/sort_expr.rs b/src/expr/sort_expr.rs index 12f74e4d8..79e35d978 100644 --- a/src/expr/sort_expr.rs +++ b/src/expr/sort_expr.rs @@ -85,6 +85,6 @@ impl PySortExpr { } fn __repr__(&self) -> String { - format!("{}", self) + format!("{self}") } } diff --git a/src/expr/subquery.rs b/src/expr/subquery.rs index 5ebfe6927..77f56f9a9 100644 --- a/src/expr/subquery.rs +++ b/src/expr/subquery.rs @@ -62,7 +62,7 @@ impl PySubquery { } fn __repr__(&self) -> PyResult { - Ok(format!("Subquery({})", self)) + Ok(format!("Subquery({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/subquery_alias.rs b/src/expr/subquery_alias.rs index 267a4d485..3302e7f23 100644 --- a/src/expr/subquery_alias.rs +++ b/src/expr/subquery_alias.rs @@ -72,7 +72,7 @@ impl PySubqueryAlias { } fn __repr__(&self) -> PyResult { - Ok(format!("SubqueryAlias({})", self)) + Ok(format!("SubqueryAlias({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/table_scan.rs b/src/expr/table_scan.rs index 6a0d53f0f..329964687 100644 --- a/src/expr/table_scan.rs +++ b/src/expr/table_scan.rs @@ -136,7 +136,7 @@ impl PyTableScan { } fn __repr__(&self) -> PyResult { - Ok(format!("TableScan({})", self)) + Ok(format!("TableScan({self})")) } } diff --git a/src/expr/union.rs b/src/expr/union.rs index 5a08ccc13..e0b221398 100644 --- a/src/expr/union.rs +++ b/src/expr/union.rs @@ -66,7 +66,7 @@ impl PyUnion { } fn __repr__(&self) -> PyResult { - Ok(format!("Union({})", self)) + Ok(format!("Union({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/unnest.rs b/src/expr/unnest.rs index 8e70e0990..c8833347f 100644 --- a/src/expr/unnest.rs +++ b/src/expr/unnest.rs @@ -66,7 +66,7 @@ impl PyUnnest { } fn __repr__(&self) -> PyResult { - Ok(format!("Unnest({})", self)) + Ok(format!("Unnest({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/unnest_expr.rs b/src/expr/unnest_expr.rs index 2234d24b1..634186ed8 100644 --- a/src/expr/unnest_expr.rs +++ b/src/expr/unnest_expr.rs @@ -58,7 +58,7 @@ impl PyUnnestExpr { } fn __repr__(&self) -> PyResult { - Ok(format!("UnnestExpr({})", self)) + Ok(format!("UnnestExpr({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/window.rs b/src/expr/window.rs index 052d9eeb4..a408731c2 100644 --- a/src/expr/window.rs +++ b/src/expr/window.rs @@ -185,8 +185,7 @@ impl PyWindowFrame { "groups" => WindowFrameUnits::Groups, _ => { return Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - units, + "{units:?}", )))); } }; @@ -197,8 +196,7 @@ impl PyWindowFrame { WindowFrameUnits::Rows => WindowFrameBound::Preceding(ScalarValue::UInt64(None)), WindowFrameUnits::Groups => { return Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - units, + "{units:?}", )))); } }, @@ -210,8 +208,7 @@ impl PyWindowFrame { WindowFrameUnits::Range => WindowFrameBound::Following(ScalarValue::UInt64(None)), WindowFrameUnits::Groups => { return Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - units, + "{units:?}", )))); } }, @@ -236,7 +233,7 @@ impl PyWindowFrame { /// Get a String representation of this 
window frame fn __repr__(&self) -> String { - format!("{}", self) + format!("{self}") } } diff --git a/src/functions.rs b/src/functions.rs index b2bafcb65..b40500b8b 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -937,7 +937,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(left))?; m.add_wrapped(wrap_pyfunction!(length))?; m.add_wrapped(wrap_pyfunction!(ln))?; - m.add_wrapped(wrap_pyfunction!(log))?; + m.add_wrapped(wrap_pyfunction!(self::log))?; m.add_wrapped(wrap_pyfunction!(log10))?; m.add_wrapped(wrap_pyfunction!(log2))?; m.add_wrapped(wrap_pyfunction!(lower))?; diff --git a/src/lib.rs b/src/lib.rs index 1293eee3c..29d3f41da 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -77,10 +77,10 @@ pub(crate) struct TokioRuntime(tokio::runtime::Runtime); /// datafusion directory. #[pymodule] fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { + // Initialize logging + pyo3_log::init(); + // Register the python classes - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; @@ -98,6 +98,10 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; + let catalog = PyModule::new(py, "catalog")?; + catalog::init_module(&catalog)?; + m.add_submodule(&catalog)?; + // Register `common` as a submodule. Matching `datafusion-common` https://docs.rs/datafusion-common/latest/datafusion_common/ let common = PyModule::new(py, "common")?; common::init_module(&common)?; diff --git a/src/physical_plan.rs b/src/physical_plan.rs index f0be45c6a..49db643e1 100644 --- a/src/physical_plan.rs +++ b/src/physical_plan.rs @@ -78,8 +78,7 @@ impl PyExecutionPlan { let proto_plan = datafusion_proto::protobuf::PhysicalPlanNode::decode(bytes).map_err(|e| { PyRuntimeError::new_err(format!( - "Unable to decode logical node from serialized bytes: {}", - e + "Unable to decode logical node from serialized bytes: {e}" )) })?; diff --git a/src/sql/logical.rs b/src/sql/logical.rs index 198d68bdc..97d320470 100644 --- a/src/sql/logical.rs +++ b/src/sql/logical.rs @@ -201,8 +201,7 @@ impl PyLogicalPlan { let proto_plan = datafusion_proto::protobuf::LogicalPlanNode::decode(bytes).map_err(|e| { PyRuntimeError::new_err(format!( - "Unable to decode logical node from serialized bytes: {}", - e + "Unable to decode logical node from serialized bytes: {e}" )) })?; diff --git a/src/utils.rs b/src/utils.rs index f4e121fd5..3b30de5de 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -109,8 +109,7 @@ pub(crate) fn validate_pycapsule(capsule: &Bound, name: &str) -> PyRe let capsule_name = capsule_name.unwrap().to_str()?; if capsule_name != name { return Err(PyValueError::new_err(format!( - "Expected name '{}' in PyCapsule, instead got '{}'", - name, capsule_name + "Expected name '{name}' in PyCapsule, instead got '{capsule_name}'" ))); } @@ -127,7 +126,7 @@ pub(crate) fn py_obj_to_scalar_value(py: Python, obj: PyObject) -> PyResult Date: Wed, 2 Jul 2025 09:59:02 -0400 Subject: [PATCH 057/145] feat: add FFI support for user defined functions (#1145) * Intermediate work adding ffi scalar udf * Add scalar UDF and example * Add aggregate udf via ffi * Initial commit for window ffi integration * Remove unused import --- docs/source/contributor-guide/ffi.rst | 2 +- examples/datafusion-ffi-example/Cargo.lock | 217 ++++++++++-------- examples/datafusion-ffi-example/Cargo.toml | 8 +- .../python/tests/_test_aggregate_udf.py | 77 +++++++ .../python/tests/_test_scalar_udf.py | 70 
++++++ .../python/tests/_test_window_udf.py | 89 +++++++ .../src/aggregate_udf.rs | 81 +++++++ .../src/catalog_provider.rs | 1 - examples/datafusion-ffi-example/src/lib.rs | 9 + .../datafusion-ffi-example/src/scalar_udf.rs | 91 ++++++++ .../datafusion-ffi-example/src/window_udf.rs | 81 +++++++ python/datafusion/user_defined.py | 107 ++++++++- src/functions.rs | 2 +- src/udaf.rs | 31 ++- src/udf.rs | 25 +- src/udwf.rs | 27 ++- 16 files changed, 805 insertions(+), 113 deletions(-) create mode 100644 examples/datafusion-ffi-example/python/tests/_test_aggregate_udf.py create mode 100644 examples/datafusion-ffi-example/python/tests/_test_scalar_udf.py create mode 100644 examples/datafusion-ffi-example/python/tests/_test_window_udf.py create mode 100644 examples/datafusion-ffi-example/src/aggregate_udf.rs create mode 100644 examples/datafusion-ffi-example/src/scalar_udf.rs create mode 100644 examples/datafusion-ffi-example/src/window_udf.rs diff --git a/docs/source/contributor-guide/ffi.rst b/docs/source/contributor-guide/ffi.rst index c1f9806b3..a40af1234 100644 --- a/docs/source/contributor-guide/ffi.rst +++ b/docs/source/contributor-guide/ffi.rst @@ -176,7 +176,7 @@ By convention the ``datafusion-python`` library expects a Python object that has ``TableProvider`` PyCapsule to have this capsule accessible by calling a function named ``__datafusion_table_provider__``. You can see a complete working example of how to share a ``TableProvider`` from one python library to DataFusion Python in the -`repository examples folder `_. +`repository examples folder `_. This section has been written using ``TableProvider`` as an example. It is the first extension that has been written using this approach and the most thoroughly implemented. diff --git a/examples/datafusion-ffi-example/Cargo.lock b/examples/datafusion-ffi-example/Cargo.lock index e5a1ca8d1..1b4ca6bee 100644 --- a/examples/datafusion-ffi-example/Cargo.lock +++ b/examples/datafusion-ffi-example/Cargo.lock @@ -323,6 +323,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73a47aa0c771b5381de2b7f16998d351a6f4eb839f1e13d48353e17e873d969b" dependencies = [ "bitflags", + "serde", + "serde_json", ] [[package]] @@ -748,9 +750,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffe060b978f74ab446be722adb8a274e052e005bf6dfd171caadc3abaad10080" +checksum = "cc6cb8c2c81eada072059983657d6c9caf3fddefc43b4a65551d243253254a96" dependencies = [ "arrow", "arrow-ipc", @@ -775,7 +777,6 @@ dependencies = [ "datafusion-functions-nested", "datafusion-functions-table", "datafusion-functions-window", - "datafusion-macros", "datafusion-optimizer", "datafusion-physical-expr", "datafusion-physical-expr-common", @@ -790,7 +791,7 @@ dependencies = [ "object_store", "parking_lot", "parquet", - "rand", + "rand 0.9.1", "regex", "sqlparser", "tempfile", @@ -803,9 +804,9 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61fe34f401bd03724a1f96d12108144f8cd495a3cdda2bf5e091822fb80b7e66" +checksum = "b7be8d1b627843af62e447396db08fe1372d882c0eb8d0ea655fd1fbc33120ee" dependencies = [ "arrow", "async-trait", @@ -829,9 +830,9 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a4411b8e3bce5e0fc7521e44f201def2e2d5d1b5f176fb56e8cdc9942c890f00" +checksum = "38ab16c5ae43f65ee525fc493ceffbc41f40dee38b01f643dfcfc12959e92038" dependencies = [ "arrow", "async-trait", @@ -852,9 +853,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0734015d81c8375eb5d4869b7f7ecccc2ee8d6cb81948ef737cd0e7b743bd69c" +checksum = "d3d56b2ac9f476b93ca82e4ef5fb00769c8a3f248d12b4965af7e27635fa7e12" dependencies = [ "ahash", "arrow", @@ -876,9 +877,9 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5167bb1d2ccbb87c6bc36c295274d7a0519b14afcfdaf401d53cbcaa4ef4968b" +checksum = "16015071202d6133bc84d72756176467e3e46029f3ce9ad2cb788f9b1ff139b2" dependencies = [ "futures", "log", @@ -887,9 +888,9 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04e602dcdf2f50c2abf297cc2203c73531e6f48b29516af7695d338cf2a778b1" +checksum = "b77523c95c89d2a7eb99df14ed31390e04ab29b43ff793e562bdc1716b07e17b" dependencies = [ "arrow", "async-compression", @@ -912,7 +913,7 @@ dependencies = [ "log", "object_store", "parquet", - "rand", + "rand 0.9.1", "tempfile", "tokio", "tokio-util", @@ -923,9 +924,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-csv" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3bb2253952dc32296ed5b84077cb2e0257fea4be6373e1c376426e17ead4ef6" +checksum = "40d25c5e2c0ebe8434beeea997b8e88d55b3ccc0d19344293f2373f65bc524fc" dependencies = [ "arrow", "async-trait", @@ -948,9 +949,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b8c7f47a5d2fe03bfa521ec9bafdb8a5c82de8377f60967c3663f00c8790352" +checksum = "3dc6959e1155741ab35369e1dc7673ba30fc45ed568fad34c01b7cb1daeb4d4c" dependencies = [ "arrow", "async-trait", @@ -973,9 +974,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-parquet" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27d15868ea39ed2dc266728b554f6304acd473de2142281ecfa1294bb7415923" +checksum = "b7a6afdfe358d70f4237f60eaef26ae5a1ce7cb2c469d02d5fc6c7fd5d84e58b" dependencies = [ "arrow", "async-trait", @@ -998,21 +999,21 @@ dependencies = [ "object_store", "parking_lot", "parquet", - "rand", + "rand 0.9.1", "tokio", ] [[package]] name = "datafusion-doc" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a91f8c2c5788ef32f48ff56c68e5b545527b744822a284373ac79bba1ba47292" +checksum = "9bcd8a3e3e3d02ea642541be23d44376b5d5c37c2938cce39b3873cdf7186eea" [[package]] name = "datafusion-execution" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06f004d100f49a3658c9da6fb0c3a9b760062d96cd4ad82ccc3b7b69a9fb2f84" +checksum = "670da1d45d045eee4c2319b8c7ea57b26cf48ab77b630aaa50b779e406da476a" dependencies = [ "arrow", "dashmap", @@ -1022,16 +1023,16 @@ dependencies = [ "log", "object_store", "parking_lot", - "rand", + "rand 0.9.1", "tempfile", "url", ] [[package]] name = 
"datafusion-expr" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a4e4ce3802609be38eeb607ee72f6fe86c3091460de9dbfae9e18db423b3964" +checksum = "b3a577f64bdb7e2cc4043cd97f8901d8c504711fde2dbcb0887645b00d7c660b" dependencies = [ "arrow", "chrono", @@ -1050,9 +1051,9 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "422ac9cf3b22bbbae8cdf8ceb33039107fde1b5492693168f13bd566b1bcc839" +checksum = "51b7916806ace3e9f41884f230f7f38ebf0e955dfbd88266da1826f29a0b9a6a" dependencies = [ "arrow", "datafusion-common", @@ -1063,9 +1064,9 @@ dependencies = [ [[package]] name = "datafusion-ffi" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cf3fe9ab492c56daeb7beed526690d33622d388b8870472e0b7b7f55490338c" +checksum = "980cca31de37f5dadf7ea18e4ffc2b6833611f45bed5ef9de0831d2abb50f1ef" dependencies = [ "abi_stable", "arrow", @@ -1073,7 +1074,9 @@ dependencies = [ "async-ffi", "async-trait", "datafusion", + "datafusion-functions-aggregate-common", "datafusion-proto", + "datafusion-proto-common", "futures", "log", "prost", @@ -1081,11 +1084,25 @@ dependencies = [ "tokio", ] +[[package]] +name = "datafusion-ffi-example" +version = "0.2.0" +dependencies = [ + "arrow", + "arrow-array", + "arrow-schema", + "async-trait", + "datafusion", + "datafusion-ffi", + "pyo3", + "pyo3-build-config", +] + [[package]] name = "datafusion-functions" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ddf0a0a2db5d2918349c978d42d80926c6aa2459cd8a3c533a84ec4bb63479e" +checksum = "7fb31c9dc73d3e0c365063f91139dc273308f8a8e124adda9898db8085d68357" dependencies = [ "arrow", "arrow-buffer", @@ -1103,7 +1120,7 @@ dependencies = [ "itertools", "log", "md-5", - "rand", + "rand 0.9.1", "regex", "sha2", "unicode-segmentation", @@ -1112,9 +1129,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "408a05dafdc70d05a38a29005b8b15e21b0238734dab1e98483fcb58038c5aba" +checksum = "ebb72c6940697eaaba9bd1f746a697a07819de952b817e3fb841fb75331ad5d4" dependencies = [ "ahash", "arrow", @@ -1133,9 +1150,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "756d21da2dd6c9bef97af1504970ff56cbf35d03fbd4ffd62827f02f4d2279d4" +checksum = "d7fdc54656659e5ecd49bf341061f4156ab230052611f4f3609612a0da259696" dependencies = [ "ahash", "arrow", @@ -1146,9 +1163,9 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d8d50f6334b378930d992d801a10ac5b3e93b846b39e4a05085742572844537" +checksum = "fad94598e3374938ca43bca6b675febe557e7a14eb627d617db427d70d65118b" dependencies = [ "arrow", "arrow-ord", @@ -1167,9 +1184,9 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc9a97220736c8fff1446e936be90d57216c06f28969f9ffd3b72ac93c958c8a" +checksum = 
"de2fc6c2946da5cab8364fb28b5cac3115f0f3a87960b235ed031c3f7e2e639b" dependencies = [ "arrow", "async-trait", @@ -1183,10 +1200,11 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cefc2d77646e1aadd1d6a9c40088937aedec04e68c5f0465939912e1291f8193" +checksum = "3e5746548a8544870a119f556543adcd88fe0ba6b93723fe78ad0439e0fbb8b4" dependencies = [ + "arrow", "datafusion-common", "datafusion-doc", "datafusion-expr", @@ -1200,9 +1218,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd4aff082c42fa6da99ce0698c85addd5252928c908eb087ca3cfa64ff16b313" +checksum = "dcbe9404382cda257c434f22e13577bee7047031dfdb6216dd5e841b9465e6fe" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1210,9 +1228,9 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df6f88d7ee27daf8b108ba910f9015176b36fbc72902b1ca5c2a5f1d1717e1a1" +checksum = "8dce50e3b637dab0d25d04d2fe79dfdca2b257eabd76790bffd22c7f90d700c8" dependencies = [ "datafusion-expr", "quote", @@ -1221,9 +1239,9 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "084d9f979c4b155346d3c34b18f4256e6904ded508e9554d90fed416415c3515" +checksum = "03cfaacf06445dc3bbc1e901242d2a44f2cae99a744f49f3fefddcee46240058" dependencies = [ "arrow", "chrono", @@ -1240,9 +1258,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64c536062b0076f4e30084065d805f389f9fe38af0ca75bcbac86bc5e9fbab65" +checksum = "1908034a89d7b2630898e06863583ae4c00a0dd310c1589ca284195ee3f7f8a6" dependencies = [ "ahash", "arrow", @@ -1262,9 +1280,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8a92b53b3193fac1916a1c5b8e3f4347c526f6822e56b71faa5fb372327a863" +checksum = "47b7a12dd59ea07614b67dbb01d85254fbd93df45bcffa63495e11d3bdf847df" dependencies = [ "ahash", "arrow", @@ -1276,9 +1294,9 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fa0a5ac94c7cf3da97bedabd69d6bbca12aef84b9b37e6e9e8c25286511b5e2" +checksum = "4371cc4ad33978cc2a8be93bd54a232d3f2857b50401a14631c0705f3f910aae" dependencies = [ "arrow", "datafusion-common", @@ -1295,9 +1313,9 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "690c615db468c2e5fe5085b232d8b1c088299a6c63d87fd960a354a71f7acb55" +checksum = "dc47bc33025757a5c11f2cd094c5b6b5ed87f46fa33c023e6fdfa25fcbfade23" dependencies = [ "ahash", "arrow", @@ -1325,9 +1343,9 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a4a1afb2bdb05de7ff65be6883ebfd4ec027bd9f1f21c46aa3afd01927160a83" +checksum = "d8f5d9acd7d96e3bf2a7bb04818373cab6e51de0356e3694b94905fee7b4e8b6" dependencies = [ "arrow", "chrono", @@ -1341,9 +1359,9 @@ dependencies = [ [[package]] name = "datafusion-proto-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35b7a5876ebd6b564fb9a1fd2c3a2a9686b787071a256b47e4708f0916f9e46f" +checksum = "09ecb5ec152c4353b60f7a5635489834391f7a291d2b39a4820cd469e318b78e" dependencies = [ "arrow", "datafusion-common", @@ -1352,9 +1370,9 @@ dependencies = [ [[package]] name = "datafusion-session" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad229a134c7406c057ece00c8743c0c34b97f4e72f78b475fe17b66c5e14fa4f" +checksum = "d7485da32283985d6b45bd7d13a65169dcbe8c869e25d01b2cfbc425254b4b49" dependencies = [ "arrow", "async-trait", @@ -1376,9 +1394,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64f6ab28b72b664c21a27b22a2ff815fd390ed224c26e89a93b5a8154a4e8607" +checksum = "a466b15632befddfeac68c125f0260f569ff315c6831538cbb40db754134e0df" dependencies = [ "arrow", "bigdecimal", @@ -1441,20 +1459,6 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" -[[package]] -name = "ffi-table-provider" -version = "0.1.0" -dependencies = [ - "arrow", - "arrow-array", - "arrow-schema", - "async-trait", - "datafusion", - "datafusion-ffi", - "pyo3", - "pyo3-build-config", -] - [[package]] name = "fixedbitset" version = "0.5.7" @@ -1488,6 +1492,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -1666,6 +1676,11 @@ name = "hashbrown" version = "0.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] [[package]] name = "heck" @@ -2271,12 +2286,14 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "petgraph" -version = "0.7.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +checksum = "54acf3a685220b533e437e264e4d932cfbdc4cc7ec0cd232ed73c08d03b8a7ca" dependencies = [ "fixedbitset", + "hashbrown 0.15.3", "indexmap", + "serde", ] [[package]] @@ -2305,7 +2322,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" dependencies = [ "phf_shared", - "rand", + "rand 0.8.5", ] [[package]] @@ -2484,19 +2501,27 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ - "libc", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97" +dependencies = [ "rand_chacha", - "rand_core", + "rand_core 0.9.3", ] [[package]] name = "rand_chacha" -version = "0.3.1" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.9.3", ] [[package]] @@ -2504,8 +2529,14 @@ name = "rand_core" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" + +[[package]] +name = "rand_core" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" dependencies = [ - "getrandom 0.2.16", + "getrandom 0.3.3", ] [[package]] @@ -3032,9 +3063,9 @@ checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" [[package]] name = "uuid" -version = "1.16.0" +version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9" +checksum = "3cf4199d1e5d15ddd86a694e4d0dffa9c323ce759fea589f00fef9d81cc1931d" dependencies = [ "getrandom 0.3.3", "js-sys", diff --git a/examples/datafusion-ffi-example/Cargo.toml b/examples/datafusion-ffi-example/Cargo.toml index 319163554..b26ab48e3 100644 --- a/examples/datafusion-ffi-example/Cargo.toml +++ b/examples/datafusion-ffi-example/Cargo.toml @@ -16,13 +16,13 @@ # under the License. [package] -name = "ffi-table-provider" -version = "0.1.0" +name = "datafusion-ffi-example" +version = "0.2.0" edition = "2021" [dependencies] -datafusion = { version = "47.0.0" } -datafusion-ffi = { version = "47.0.0" } +datafusion = { version = "48.0.0" } +datafusion-ffi = { version = "48.0.0" } pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py39"] } arrow = { version = "55.0.0" } arrow-array = { version = "55.0.0" } diff --git a/examples/datafusion-ffi-example/python/tests/_test_aggregate_udf.py b/examples/datafusion-ffi-example/python/tests/_test_aggregate_udf.py new file mode 100644 index 000000000..7ea6b295c --- /dev/null +++ b/examples/datafusion-ffi-example/python/tests/_test_aggregate_udf.py @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from __future__ import annotations + +import pyarrow as pa +from datafusion import SessionContext, col, udaf +from datafusion_ffi_example import MySumUDF + + +def setup_context_with_table(): + ctx = SessionContext() + + # Pick numbers here so we get the same value in both groups + # since we cannot be certain of the output order of batches + batch = pa.RecordBatch.from_arrays( + [ + pa.array([1, 2, 3, None], type=pa.int64()), + pa.array([1, 1, 2, 2], type=pa.int64()), + ], + names=["a", "b"], + ) + ctx.register_record_batches("test_table", [[batch]]) + return ctx + + +def test_ffi_aggregate_register(): + ctx = setup_context_with_table() + my_udaf = udaf(MySumUDF()) + ctx.register_udaf(my_udaf) + + result = ctx.sql("select my_custom_sum(a) from test_table group by b").collect() + + assert len(result) == 2 + assert result[0].num_columns == 1 + + result = [r.column(0) for r in result] + expected = [ + pa.array([3], type=pa.int64()), + pa.array([3], type=pa.int64()), + ] + + assert result == expected + + +def test_ffi_aggregate_call_directly(): + ctx = setup_context_with_table() + my_udaf = udaf(MySumUDF()) + + result = ( + ctx.table("test_table").aggregate([col("b")], [my_udaf(col("a"))]).collect() + ) + + assert len(result) == 2 + assert result[0].num_columns == 2 + + result = [r.column(1) for r in result] + expected = [ + pa.array([3], type=pa.int64()), + pa.array([3], type=pa.int64()), + ] + + assert result == expected diff --git a/examples/datafusion-ffi-example/python/tests/_test_scalar_udf.py b/examples/datafusion-ffi-example/python/tests/_test_scalar_udf.py new file mode 100644 index 000000000..0c949c34a --- /dev/null +++ b/examples/datafusion-ffi-example/python/tests/_test_scalar_udf.py @@ -0,0 +1,70 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
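# A note on the aggregate tests above: `register_record_batches` takes a list
# of partitions, each itself a list of record batches, so `[[batch]]` means a
# single partition holding a single batch. The input values are chosen so that
# group b=1 sums to 1+2=3 and group b=2 sums to 3+NULL=3, which keeps the
# assertions independent of the non-deterministic output order of the groups.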
+ +from __future__ import annotations + +import pyarrow as pa +from datafusion import SessionContext, col, udf +from datafusion_ffi_example import IsNullUDF + + +def setup_context_with_table(): + ctx = SessionContext() + + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3, None])], + names=["a"], + ) + ctx.register_record_batches("test_table", [[batch]]) + return ctx + + +def test_ffi_scalar_register(): + ctx = setup_context_with_table() + my_udf = udf(IsNullUDF()) + ctx.register_udf(my_udf) + + result = ctx.sql("select my_custom_is_null(a) from test_table").collect() + + assert len(result) == 1 + assert result[0].num_columns == 1 + print(result) + + result = [r.column(0) for r in result] + expected = [ + pa.array([False, False, False, True], type=pa.bool_()), + ] + + assert result == expected + + +def test_ffi_scalar_call_directly(): + ctx = setup_context_with_table() + my_udf = udf(IsNullUDF()) + + result = ctx.table("test_table").select(my_udf(col("a"))).collect() + + assert len(result) == 1 + assert result[0].num_columns == 1 + print(result) + + result = [r.column(0) for r in result] + expected = [ + pa.array([False, False, False, True], type=pa.bool_()), + ] + + assert result == expected diff --git a/examples/datafusion-ffi-example/python/tests/_test_window_udf.py b/examples/datafusion-ffi-example/python/tests/_test_window_udf.py new file mode 100644 index 000000000..7d96994b9 --- /dev/null +++ b/examples/datafusion-ffi-example/python/tests/_test_window_udf.py @@ -0,0 +1,89 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
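# For contrast with the FFI-backed IsNullUDF exercised above, the same result
# could be produced by a pure-Python scalar UDF (a sketch, assuming the
# pyarrow array-in/array-out UDF signature used elsewhere in this project):
#
#     from datafusion import udf
#     py_is_null = udf(
#         lambda arr: arr.is_null(),  # pyarrow computes the null mask
#         [pa.int64()],
#         pa.bool_(),
#         "immutable",
#         name="py_is_null",
#     )
#
# The FFI variant trades this per-batch Python round trip for a native call.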
+ +from __future__ import annotations + +import pyarrow as pa +from datafusion import SessionContext, col, udwf +from datafusion_ffi_example import MyRankUDF + + +def setup_context_with_table(): + ctx = SessionContext() + + # Pick numbers here so we get the same value in both groups + # since we cannot be certain of the output order of batches + batch = pa.RecordBatch.from_arrays( + [ + pa.array([40, 10, 30, 20], type=pa.int64()), + ], + names=["a"], + ) + ctx.register_record_batches("test_table", [[batch]]) + return ctx + + +def test_ffi_window_register(): + ctx = setup_context_with_table() + my_udwf = udwf(MyRankUDF()) + ctx.register_udwf(my_udwf) + + result = ctx.sql( + "select a, my_custom_rank() over (order by a) from test_table" + ).collect() + assert len(result) == 1 + assert result[0].num_columns == 2 + + results = [ + (result[0][0][idx].as_py(), result[0][1][idx].as_py()) for idx in range(4) + ] + results.sort() + + expected = [ + (10, 1), + (20, 2), + (30, 3), + (40, 4), + ] + assert results == expected + + +def test_ffi_window_call_directly(): + ctx = setup_context_with_table() + my_udwf = udwf(MyRankUDF()) + + result = ( + ctx.table("test_table") + .select(col("a"), my_udwf().order_by(col("a")).build()) + .collect() + ) + + assert len(result) == 1 + assert result[0].num_columns == 2 + + results = [ + (result[0][0][idx].as_py(), result[0][1][idx].as_py()) for idx in range(4) + ] + results.sort() + + expected = [ + (10, 1), + (20, 2), + (30, 3), + (40, 4), + ] + assert results == expected diff --git a/examples/datafusion-ffi-example/src/aggregate_udf.rs b/examples/datafusion-ffi-example/src/aggregate_udf.rs new file mode 100644 index 000000000..9481fe9c6 --- /dev/null +++ b/examples/datafusion-ffi-example/src/aggregate_udf.rs @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
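// The wrapper below is intentionally the thinnest possible FFI example: it
// implements no aggregation logic of its own. It holds DataFusion's built-in
// `Sum`, forwards signature/return_type/accumulator/coerce_types to it, and
// contributes only a custom name plus the PyCapsule export method. The window
// example later in this patch follows the same delegation pattern, while the
// scalar example implements its logic directly.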
+ +use arrow_schema::DataType; +use datafusion::error::Result as DataFusionResult; +use datafusion::functions_aggregate::sum::Sum; +use datafusion::logical_expr::function::AccumulatorArgs; +use datafusion::logical_expr::{Accumulator, AggregateUDF, AggregateUDFImpl, Signature}; +use datafusion_ffi::udaf::FFI_AggregateUDF; +use pyo3::types::PyCapsule; +use pyo3::{pyclass, pymethods, Bound, PyResult, Python}; +use std::any::Any; +use std::sync::Arc; + +#[pyclass(name = "MySumUDF", module = "datafusion_ffi_example", subclass)] +#[derive(Debug, Clone)] +pub(crate) struct MySumUDF { + inner: Arc, +} + +#[pymethods] +impl MySumUDF { + #[new] + fn new() -> Self { + Self { + inner: Arc::new(Sum::new()), + } + } + + fn __datafusion_aggregate_udf__<'py>( + &self, + py: Python<'py>, + ) -> PyResult> { + let name = cr"datafusion_aggregate_udf".into(); + + let func = Arc::new(AggregateUDF::from(self.clone())); + let provider = FFI_AggregateUDF::from(func); + + PyCapsule::new(py, provider, Some(name)) + } +} + +impl AggregateUDFImpl for MySumUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "my_custom_sum" + } + + fn signature(&self) -> &Signature { + self.inner.signature() + } + + fn return_type(&self, arg_types: &[DataType]) -> DataFusionResult { + self.inner.return_type(arg_types) + } + + fn accumulator(&self, acc_args: AccumulatorArgs) -> DataFusionResult> { + self.inner.accumulator(acc_args) + } + + fn coerce_types(&self, arg_types: &[DataType]) -> DataFusionResult> { + self.inner.coerce_types(arg_types) + } +} diff --git a/examples/datafusion-ffi-example/src/catalog_provider.rs b/examples/datafusion-ffi-example/src/catalog_provider.rs index 54e61cf3e..cd2616916 100644 --- a/examples/datafusion-ffi-example/src/catalog_provider.rs +++ b/examples/datafusion-ffi-example/src/catalog_provider.rs @@ -24,7 +24,6 @@ use datafusion::{ catalog::{ CatalogProvider, MemoryCatalogProvider, MemorySchemaProvider, SchemaProvider, TableProvider, }, - common::exec_err, datasource::MemTable, error::{DataFusionError, Result}, }; diff --git a/examples/datafusion-ffi-example/src/lib.rs b/examples/datafusion-ffi-example/src/lib.rs index 3a4cf2247..79af276fd 100644 --- a/examples/datafusion-ffi-example/src/lib.rs +++ b/examples/datafusion-ffi-example/src/lib.rs @@ -16,18 +16,27 @@ // under the License. use crate::catalog_provider::MyCatalogProvider; +use crate::aggregate_udf::MySumUDF; +use crate::scalar_udf::IsNullUDF; use crate::table_function::MyTableFunction; use crate::table_provider::MyTableProvider; +use crate::window_udf::MyRankUDF; use pyo3::prelude::*; pub(crate) mod catalog_provider; +pub(crate) mod aggregate_udf; +pub(crate) mod scalar_udf; pub(crate) mod table_function; pub(crate) mod table_provider; +pub(crate) mod window_udf; #[pymodule] fn datafusion_ffi_example(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; Ok(()) } diff --git a/examples/datafusion-ffi-example/src/scalar_udf.rs b/examples/datafusion-ffi-example/src/scalar_udf.rs new file mode 100644 index 000000000..727666638 --- /dev/null +++ b/examples/datafusion-ffi-example/src/scalar_udf.rs @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::{Array, BooleanArray}; +use arrow_schema::DataType; +use datafusion::common::ScalarValue; +use datafusion::error::Result as DataFusionResult; +use datafusion::logical_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, TypeSignature, + Volatility, +}; +use datafusion_ffi::udf::FFI_ScalarUDF; +use pyo3::types::PyCapsule; +use pyo3::{pyclass, pymethods, Bound, PyResult, Python}; +use std::any::Any; +use std::sync::Arc; + +#[pyclass(name = "IsNullUDF", module = "datafusion_ffi_example", subclass)] +#[derive(Debug, Clone)] +pub(crate) struct IsNullUDF { + signature: Signature, +} + +#[pymethods] +impl IsNullUDF { + #[new] + fn new() -> Self { + Self { + signature: Signature::new(TypeSignature::Any(1), Volatility::Immutable), + } + } + + fn __datafusion_scalar_udf__<'py>(&self, py: Python<'py>) -> PyResult> { + let name = cr"datafusion_scalar_udf".into(); + + let func = Arc::new(ScalarUDF::from(self.clone())); + let provider = FFI_ScalarUDF::from(func); + + PyCapsule::new(py, provider, Some(name)) + } +} + +impl ScalarUDFImpl for IsNullUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "my_custom_is_null" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> DataFusionResult { + Ok(DataType::Boolean) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> DataFusionResult { + let input = &args.args[0]; + + Ok(match input { + ColumnarValue::Array(arr) => match arr.is_nullable() { + true => { + let nulls = arr.nulls().unwrap(); + let nulls = BooleanArray::from_iter(nulls.iter().map(|x| Some(!x))); + ColumnarValue::Array(Arc::new(nulls)) + } + false => ColumnarValue::Scalar(ScalarValue::Boolean(Some(false))), + }, + ColumnarValue::Scalar(sv) => { + ColumnarValue::Scalar(ScalarValue::Boolean(Some(sv == &ScalarValue::Null))) + } + }) + } +} diff --git a/examples/datafusion-ffi-example/src/window_udf.rs b/examples/datafusion-ffi-example/src/window_udf.rs new file mode 100644 index 000000000..e0d397956 --- /dev/null +++ b/examples/datafusion-ffi-example/src/window_udf.rs @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
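// A note on IsNullUDF::invoke_with_args above: `arr.nulls()` returns the
// Arrow validity bitmap, in which a set bit means "valid", so each bit is
// inverted (`!x`) to answer "is this row null?". When `is_nullable()` is
// false the array carries no null buffer at all, and a scalar `false` is
// returned instead of materializing a per-row answer.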
See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_schema::{DataType, FieldRef}; +use datafusion::error::Result as DataFusionResult; +use datafusion::functions_window::rank::rank_udwf; +use datafusion::logical_expr::function::{PartitionEvaluatorArgs, WindowUDFFieldArgs}; +use datafusion::logical_expr::{PartitionEvaluator, Signature, WindowUDF, WindowUDFImpl}; +use datafusion_ffi::udwf::FFI_WindowUDF; +use pyo3::types::PyCapsule; +use pyo3::{pyclass, pymethods, Bound, PyResult, Python}; +use std::any::Any; +use std::sync::Arc; + +#[pyclass(name = "MyRankUDF", module = "datafusion_ffi_example", subclass)] +#[derive(Debug, Clone)] +pub(crate) struct MyRankUDF { + inner: Arc, +} + +#[pymethods] +impl MyRankUDF { + #[new] + fn new() -> Self { + Self { inner: rank_udwf() } + } + + fn __datafusion_window_udf__<'py>(&self, py: Python<'py>) -> PyResult> { + let name = cr"datafusion_window_udf".into(); + + let func = Arc::new(WindowUDF::from(self.clone())); + let provider = FFI_WindowUDF::from(func); + + PyCapsule::new(py, provider, Some(name)) + } +} + +impl WindowUDFImpl for MyRankUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "my_custom_rank" + } + + fn signature(&self) -> &Signature { + self.inner.signature() + } + + fn partition_evaluator( + &self, + partition_evaluator_args: PartitionEvaluatorArgs, + ) -> DataFusionResult> { + self.inner + .inner() + .partition_evaluator(partition_evaluator_args) + } + + fn field(&self, field_args: WindowUDFFieldArgs) -> DataFusionResult { + self.inner.inner().field(field_args) + } + + fn coerce_types(&self, arg_types: &[DataType]) -> DataFusionResult> { + self.inner.coerce_types(arg_types) + } +} diff --git a/python/datafusion/user_defined.py b/python/datafusion/user_defined.py index dd634c7fb..bd686acbb 100644 --- a/python/datafusion/user_defined.py +++ b/python/datafusion/user_defined.py @@ -22,7 +22,7 @@ import functools from abc import ABCMeta, abstractmethod from enum import Enum -from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar, overload +from typing import TYPE_CHECKING, Any, Callable, Optional, Protocol, TypeVar, overload import pyarrow as pa @@ -77,6 +77,12 @@ def __str__(self) -> str: return self.name.lower() +class ScalarUDFExportable(Protocol): + """Type hint for object that has __datafusion_scalar_udf__ PyCapsule.""" + + def __datafusion_scalar_udf__(self) -> object: ... # noqa: D105 + + class ScalarUDF: """Class for performing scalar user-defined functions (UDF). @@ -96,6 +102,9 @@ def __init__( See helper method :py:func:`udf` for argument details. """ + if hasattr(func, "__datafusion_scalar_udf__"): + self._udf = df_internal.ScalarUDF.from_pycapsule(func) + return if isinstance(input_types, pa.DataType): input_types = [input_types] self._udf = df_internal.ScalarUDF( @@ -134,6 +143,10 @@ def udf( name: Optional[str] = None, ) -> ScalarUDF: ... + @overload + @staticmethod + def udf(func: ScalarUDFExportable) -> ScalarUDF: ... + @staticmethod def udf(*args: Any, **kwargs: Any): # noqa: D417 """Create a new User-Defined Function (UDF). @@ -147,7 +160,10 @@ def udf(*args: Any, **kwargs: Any): # noqa: D417 Args: func (Callable, optional): Only needed when calling as a function. - Skip this argument when using ``udf`` as a decorator. + Skip this argument when using `udf` as a decorator. If you have a Rust + backed ScalarUDF within a PyCapsule, you can pass this parameter + and ignore the rest. 
They will be determined directly from the + underlying function. See the online documentation for more information. input_types (list[pa.DataType]): The data types of the arguments to ``func``. This list must be of the same length as the number of arguments. @@ -215,12 +231,31 @@ def wrapper(*args: Any, **kwargs: Any): return decorator + if hasattr(args[0], "__datafusion_scalar_udf__"): + return ScalarUDF.from_pycapsule(args[0]) + if args and callable(args[0]): # Case 1: Used as a function, require the first parameter to be callable return _function(*args, **kwargs) # Case 2: Used as a decorator with parameters return _decorator(*args, **kwargs) + @staticmethod + def from_pycapsule(func: ScalarUDFExportable) -> ScalarUDF: + """Create a Scalar UDF from ScalarUDF PyCapsule object. + + This function will instantiate a Scalar UDF that uses a DataFusion + ScalarUDF that is exported via the FFI bindings. + """ + name = str(func.__class__) + return ScalarUDF( + name=name, + func=func, + input_types=None, + return_type=None, + volatility=None, + ) + class Accumulator(metaclass=ABCMeta): """Defines how an :py:class:`AggregateUDF` accumulates values.""" @@ -242,6 +277,12 @@ def evaluate(self) -> pa.Scalar: """Return the resultant value.""" +class AggregateUDFExportable(Protocol): + """Type hint for object that has __datafusion_aggregate_udf__ PyCapsule.""" + + def __datafusion_aggregate_udf__(self) -> object: ... # noqa: D105 + + class AggregateUDF: """Class for performing scalar user-defined functions (UDF). @@ -263,6 +304,9 @@ def __init__( See :py:func:`udaf` for a convenience function and argument descriptions. """ + if hasattr(accumulator, "__datafusion_aggregate_udf__"): + self._udaf = df_internal.AggregateUDF.from_pycapsule(accumulator) + return self._udaf = df_internal.AggregateUDF( name, accumulator, @@ -307,7 +351,7 @@ def udaf( ) -> AggregateUDF: ... @staticmethod - def udaf(*args: Any, **kwargs: Any): # noqa: D417 + def udaf(*args: Any, **kwargs: Any): # noqa: D417, C901 """Create a new User-Defined Aggregate Function (UDAF). This class allows you to define an aggregate function that can be used in @@ -364,6 +408,10 @@ def udf4() -> Summarize: Args: accum: The accumulator python function. Only needed when calling as a function. Skip this argument when using ``udaf`` as a decorator. + If you have a Rust backed AggregateUDF within a PyCapsule, you can + pass this parameter and ignore the rest. They will be determined + directly from the underlying function. See the online documentation + for more information. input_types: The data types of the arguments to ``accum``. return_type: The data type of the return value. state_type: The data types of the intermediate accumulation. @@ -422,12 +470,32 @@ def wrapper(*args: Any, **kwargs: Any) -> Expr: return decorator + if hasattr(args[0], "__datafusion_aggregate_udf__"): + return AggregateUDF.from_pycapsule(args[0]) + if args and callable(args[0]): # Case 1: Used as a function, require the first parameter to be callable return _function(*args, **kwargs) # Case 2: Used as a decorator with parameters return _decorator(*args, **kwargs) + @staticmethod + def from_pycapsule(func: AggregateUDFExportable) -> AggregateUDF: + """Create an Aggregate UDF from AggregateUDF PyCapsule object. + + This function will instantiate a Aggregate UDF that uses a DataFusion + AggregateUDF that is exported via the FFI bindings. 
+ """ + name = str(func.__class__) + return AggregateUDF( + name=name, + accumulator=func, + input_types=None, + return_type=None, + state_type=None, + volatility=None, + ) + class WindowEvaluator: """Evaluator class for user-defined window functions (UDWF). @@ -588,6 +656,12 @@ def include_rank(self) -> bool: return False +class WindowUDFExportable(Protocol): + """Type hint for object that has __datafusion_window_udf__ PyCapsule.""" + + def __datafusion_window_udf__(self) -> object: ... # noqa: D105 + + class WindowUDF: """Class for performing window user-defined functions (UDF). @@ -608,6 +682,9 @@ def __init__( See :py:func:`udwf` for a convenience function and argument descriptions. """ + if hasattr(func, "__datafusion_window_udf__"): + self._udwf = df_internal.WindowUDF.from_pycapsule(func) + return self._udwf = df_internal.WindowUDF( name, func, input_types, return_type, str(volatility) ) @@ -683,7 +760,10 @@ def biased_numbers() -> BiasedNumbers: Args: func: Only needed when calling as a function. Skip this argument when - using ``udwf`` as a decorator. + using ``udwf`` as a decorator. If you have a Rust backed WindowUDF + within a PyCapsule, you can pass this parameter and ignore the rest. + They will be determined directly from the underlying function. See + the online documentation for more information. input_types: The data types of the arguments. return_type: The data type of the return value. volatility: See :py:class:`Volatility` for allowed values. @@ -692,6 +772,9 @@ def biased_numbers() -> BiasedNumbers: Returns: A user-defined window function that can be used in window function calls. """ + if hasattr(args[0], "__datafusion_window_udf__"): + return WindowUDF.from_pycapsule(args[0]) + if args and callable(args[0]): # Case 1: Used as a function, require the first parameter to be callable return WindowUDF._create_window_udf(*args, **kwargs) @@ -759,6 +842,22 @@ def wrapper(*args: Any, **kwargs: Any) -> Expr: return decorator + @staticmethod + def from_pycapsule(func: WindowUDFExportable) -> WindowUDF: + """Create a Window UDF from WindowUDF PyCapsule object. + + This function will instantiate a Window UDF that uses a DataFusion + WindowUDF that is exported via the FFI bindings. + """ + name = str(func.__class__) + return WindowUDF( + name=name, + func=func, + input_types=None, + return_type=None, + volatility=None, + ) + class TableFunction: """Class for performing user-defined table functions (UDTF). 
diff --git a/src/functions.rs b/src/functions.rs index b40500b8b..eeef48385 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -682,7 +682,7 @@ pub fn approx_percentile_cont_with_weight( add_builder_fns_to_aggregate(agg_fn, None, filter, None, None) } -// We handle first_value explicitly because the signature expects an order_by +// We handle last_value explicitly because the signature expects an order_by // https://github.com/apache/datafusion/issues/12376 #[pyfunction] #[pyo3(signature = (expr, distinct=None, filter=None, order_by=None, null_treatment=None))] diff --git a/src/udaf.rs b/src/udaf.rs index 34a9cd51d..78f4e2b0c 100644 --- a/src/udaf.rs +++ b/src/udaf.rs @@ -19,6 +19,10 @@ use std::sync::Arc; use pyo3::{prelude::*, types::PyTuple}; +use crate::common::data_type::PyScalarValue; +use crate::errors::{py_datafusion_err, to_datafusion_err, PyDataFusionResult}; +use crate::expr::PyExpr; +use crate::utils::{parse_volatility, validate_pycapsule}; use datafusion::arrow::array::{Array, ArrayRef}; use datafusion::arrow::datatypes::DataType; use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow}; @@ -27,11 +31,8 @@ use datafusion::error::{DataFusionError, Result}; use datafusion::logical_expr::{ create_udaf, Accumulator, AccumulatorFactoryFunction, AggregateUDF, }; - -use crate::common::data_type::PyScalarValue; -use crate::errors::to_datafusion_err; -use crate::expr::PyExpr; -use crate::utils::parse_volatility; +use datafusion_ffi::udaf::{FFI_AggregateUDF, ForeignAggregateUDF}; +use pyo3::types::PyCapsule; #[derive(Debug)] struct RustAccumulator { @@ -183,6 +184,26 @@ impl PyAggregateUDF { Ok(Self { function }) } + #[staticmethod] + pub fn from_pycapsule(func: Bound<'_, PyAny>) -> PyDataFusionResult { + if func.hasattr("__datafusion_aggregate_udf__")? { + let capsule = func.getattr("__datafusion_aggregate_udf__")?.call0()?; + let capsule = capsule.downcast::().map_err(py_datafusion_err)?; + validate_pycapsule(capsule, "datafusion_aggregate_udf")?; + + let udaf = unsafe { capsule.reference::() }; + let udaf: ForeignAggregateUDF = udaf.try_into()?; + + Ok(Self { + function: udaf.into(), + }) + } else { + Err(crate::errors::PyDataFusionError::Common( + "__datafusion_aggregate_udf__ does not exist on AggregateUDF object.".to_string(), + )) + } + } + /// creates a new PyExpr with the call of the udf #[pyo3(signature = (*args))] fn __call__(&self, args: Vec) -> PyResult { diff --git a/src/udf.rs b/src/udf.rs index 574c9d7b5..de1e3f18c 100644 --- a/src/udf.rs +++ b/src/udf.rs @@ -17,6 +17,8 @@ use std::sync::Arc; +use datafusion_ffi::udf::{FFI_ScalarUDF, ForeignScalarUDF}; +use pyo3::types::PyCapsule; use pyo3::{prelude::*, types::PyTuple}; use datafusion::arrow::array::{make_array, Array, ArrayData, ArrayRef}; @@ -29,8 +31,9 @@ use datafusion::logical_expr::ScalarUDF; use datafusion::logical_expr::{create_udf, ColumnarValue}; use crate::errors::to_datafusion_err; +use crate::errors::{py_datafusion_err, PyDataFusionResult}; use crate::expr::PyExpr; -use crate::utils::parse_volatility; +use crate::utils::{parse_volatility, validate_pycapsule}; /// Create a Rust callable function from a python function that expects pyarrow arrays fn pyarrow_function_to_rust( @@ -105,6 +108,26 @@ impl PyScalarUDF { Ok(Self { function }) } + #[staticmethod] + pub fn from_pycapsule(func: Bound<'_, PyAny>) -> PyDataFusionResult { + if func.hasattr("__datafusion_scalar_udf__")? 
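// (The scalar, aggregate, and window `from_pycapsule` constructors added in
// this commit all follow the same sequence: call the double-underscore method
// to obtain a PyCapsule, verify the capsule's name via `validate_pycapsule`
// before the unsafe pointer dereference, then convert the FFI struct into its
// `Foreign*` counterpart with `try_into`.)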
{ + let capsule = func.getattr("__datafusion_scalar_udf__")?.call0()?; + let capsule = capsule.downcast::().map_err(py_datafusion_err)?; + validate_pycapsule(capsule, "datafusion_scalar_udf")?; + + let udf = unsafe { capsule.reference::() }; + let udf: ForeignScalarUDF = udf.try_into()?; + + Ok(Self { + function: udf.into(), + }) + } else { + Err(crate::errors::PyDataFusionError::Common( + "__datafusion_scalar_udf__ does not exist on ScalarUDF object.".to_string(), + )) + } + } + /// creates a new PyExpr with the call of the udf #[pyo3(signature = (*args))] fn __call__(&self, args: Vec) -> PyResult { diff --git a/src/udwf.rs b/src/udwf.rs index a0c8cc59a..4fb98916b 100644 --- a/src/udwf.rs +++ b/src/udwf.rs @@ -27,16 +27,17 @@ use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use crate::common::data_type::PyScalarValue; -use crate::errors::to_datafusion_err; +use crate::errors::{py_datafusion_err, to_datafusion_err, PyDataFusionResult}; use crate::expr::PyExpr; -use crate::utils::parse_volatility; +use crate::utils::{parse_volatility, validate_pycapsule}; use datafusion::arrow::datatypes::DataType; use datafusion::arrow::pyarrow::{FromPyArrow, PyArrowType, ToPyArrow}; use datafusion::error::{DataFusionError, Result}; use datafusion::logical_expr::{ PartitionEvaluator, PartitionEvaluatorFactory, Signature, Volatility, WindowUDF, WindowUDFImpl, }; -use pyo3::types::{PyList, PyTuple}; +use datafusion_ffi::udwf::{FFI_WindowUDF, ForeignWindowUDF}; +use pyo3::types::{PyCapsule, PyList, PyTuple}; #[derive(Debug)] struct RustPartitionEvaluator { @@ -245,6 +246,26 @@ impl PyWindowUDF { Ok(self.function.call(args).into()) } + #[staticmethod] + pub fn from_pycapsule(func: Bound<'_, PyAny>) -> PyDataFusionResult { + if func.hasattr("__datafusion_window_udf__")? { + let capsule = func.getattr("__datafusion_window_udf__")?.call0()?; + let capsule = capsule.downcast::().map_err(py_datafusion_err)?; + validate_pycapsule(capsule, "datafusion_window_udf")?; + + let udwf = unsafe { capsule.reference::() }; + let udwf: ForeignWindowUDF = udwf.try_into()?; + + Ok(Self { + function: udwf.into(), + }) + } else { + Err(crate::errors::PyDataFusionError::Common( + "__datafusion_window_udf__ does not exist on WindowUDF object.".to_string(), + )) + } + } + fn __repr__(&self) -> PyResult { Ok(format!("WindowUDF({})", self.function.name())) } From 2e1b71369eefc97c22b82be84bbabb414f748fb9 Mon Sep 17 00:00:00 2001 From: kosiew Date: Fri, 4 Jul 2025 20:36:05 +0800 Subject: [PATCH 058/145] refactor: style loading logic in DataFrameHtmlFormatter (#1177) --- python/datafusion/dataframe_formatter.py | 117 ++++++++--------------- python/tests/test_dataframe.py | 60 +++++------- 2 files changed, 63 insertions(+), 114 deletions(-) diff --git a/python/datafusion/dataframe_formatter.py b/python/datafusion/dataframe_formatter.py index 27f00f9c3..2323224b8 100644 --- a/python/datafusion/dataframe_formatter.py +++ b/python/datafusion/dataframe_formatter.py @@ -135,9 +135,6 @@ class DataFrameHtmlFormatter: session """ - # Class variable to track if styles have been loaded in the notebook - _styles_loaded = False - def __init__( self, max_cell_length: int = 25, @@ -260,23 +257,6 @@ def set_custom_header_builder(self, builder: Callable[[Any], str]) -> None: """ self._custom_header_builder = builder - @classmethod - def is_styles_loaded(cls) -> bool: - """Check if HTML styles have been loaded in the current session. - - This method is primarily intended for debugging UI rendering issues - related to style loading. 
- - Returns: - True if styles have been loaded, False otherwise - - Example: - >>> from datafusion.dataframe_formatter import DataFrameHtmlFormatter - >>> DataFrameHtmlFormatter.is_styles_loaded() - False - """ - return cls._styles_loaded - def format_html( self, batches: list, @@ -315,18 +295,7 @@ def format_html( # Build HTML components html = [] - # Only include styles and scripts if: - # 1. Not using shared styles, OR - # 2. Using shared styles but they haven't been loaded yet - include_styles = ( - not self.use_shared_styles or not DataFrameHtmlFormatter._styles_loaded - ) - - if include_styles: - html.extend(self._build_html_header()) - # If we're using shared styles, mark them as loaded - if self.use_shared_styles: - DataFrameHtmlFormatter._styles_loaded = True + html.extend(self._build_html_header()) html.extend(self._build_table_container_start()) @@ -338,7 +307,7 @@ def format_html( html.append("") # Add footer (JavaScript and messages) - if include_styles and self.enable_cell_expansion: + if self.enable_cell_expansion: html.append(self._get_javascript()) # Always add truncation message if needed (independent of styles) @@ -375,14 +344,20 @@ def format_str( def _build_html_header(self) -> list[str]: """Build the HTML header with CSS styles.""" - html = [] - html.append("") + html.append(f"") return html def _build_table_container_start(self) -> list[str]: @@ -570,28 +545,31 @@ def _get_default_css(self) -> str: def _get_javascript(self) -> str: """Get JavaScript code for interactive elements.""" return """ - - """ + +""" class FormatterManager: @@ -712,24 +690,9 @@ def reset_formatter() -> None: >>> reset_formatter() # Reset formatter to default settings """ formatter = DataFrameHtmlFormatter() - # Reset the styles_loaded flag to ensure styles will be reloaded - DataFrameHtmlFormatter._styles_loaded = False set_formatter(formatter) -def reset_styles_loaded_state() -> None: - """Reset the styles loaded state to force reloading of styles. - - This can be useful when switching between notebook sessions or - when styles need to be refreshed. - - Example: - >>> from datafusion.html_formatter import reset_styles_loaded_state - >>> reset_styles_loaded_state() # Force styles to reload in next render - """ - DataFrameHtmlFormatter._styles_loaded = False - - def _refresh_formatter_reference() -> None: """Refresh formatter reference in any modules using it. diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index c9ae38d8e..a3870ead8 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -42,7 +42,6 @@ configure_formatter, get_formatter, reset_formatter, - reset_styles_loaded_state, ) from datafusion.expr import Window from pyarrow.csv import write_csv @@ -2177,27 +2176,15 @@ def test_html_formatter_shared_styles(df, clean_formatter_state): # First, ensure we're using shared styles configure_formatter(use_shared_styles=True) - # Get HTML output for first table - should include styles html_first = df._repr_html_() - - # Verify styles are included in first render - assert "
" - f"{field.name}
" - f"
" - "" - "" - f"{formatted_value}" - f"" - f"
" - f"
{formatted_value}
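With the `_styles_loaded` class state gone, the formatter is easier to reason
about: every render emits its own styling and there is no cross-session flag
to reset. A minimal sketch of the resulting contract (assuming the public
helpers shown in this diff; the HTML details are illustrative, not asserted
by this patch):

    from datafusion import SessionContext
    from datafusion.dataframe_formatter import configure_formatter, reset_formatter

    ctx = SessionContext()
    df = ctx.sql("SELECT 1 AS a")

    configure_formatter(use_shared_styles=True)
    first = df._repr_html_()
    second = df._repr_html_()
    # Renders are now self-contained: the second render carries the same
    # styling as the first instead of depending on a styles-loaded flag.
    assert ("<style" in first) == ("<style" in second)

    reset_formatter()  # restores defaults; no separate style-state reset needed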