From 98dc06b553b69576d50157542f761b44f456dcf2 Mon Sep 17 00:00:00 2001 From: Nuno Faria Date: Sat, 21 Jun 2025 15:20:21 +0100 Subject: [PATCH 01/96] feat: Support Parquet writer options (#1123) * feat: Support Parquet writer options * Create dedicated write_parquet_options function * Rename write_parquet_options to write_parquet_with_options * Merge remote-tracking branch 'origin/main' into write_parquet_options * Fix ruff errors --- python/datafusion/__init__.py | 4 +- python/datafusion/dataframe.py | 221 ++++++++++++++++++ python/tests/test_dataframe.py | 406 +++++++++++++++++++++++++++++++++ src/dataframe.rs | 126 +++++++++- src/lib.rs | 2 + 5 files changed, 757 insertions(+), 2 deletions(-) diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 4f7700251..16d65f685 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -46,7 +46,7 @@ SessionContext, SQLOptions, ) -from .dataframe import DataFrame +from .dataframe import DataFrame, ParquetColumnOptions, ParquetWriterOptions from .expr import ( Expr, WindowFrame, @@ -80,6 +80,8 @@ "ExecutionPlan", "Expr", "LogicalPlan", + "ParquetColumnOptions", + "ParquetWriterOptions", "RecordBatch", "RecordBatchStream", "RuntimeEnvBuilder", diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index a1df7e080..769271c7e 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -38,6 +38,8 @@ from typing_extensions import deprecated # Python 3.12 from datafusion._internal import DataFrame as DataFrameInternal +from datafusion._internal import ParquetColumnOptions as ParquetColumnOptionsInternal +from datafusion._internal import ParquetWriterOptions as ParquetWriterOptionsInternal from datafusion.expr import Expr, SortExpr, sort_or_default from datafusion.plan import ExecutionPlan, LogicalPlan from datafusion.record_batch import RecordBatchStream @@ -114,6 +116,173 @@ def get_default_level(self) -> Optional[int]: return None +class ParquetWriterOptions: + """Advanced parquet writer options. + + Allows settings the writer options that apply to the entire file. Some options can + also be set on a column by column basis, with the field `column_specific_options` + (see `ParquetColumnOptions`). + + Attributes: + data_pagesize_limit: Sets best effort maximum size of data page in bytes. + write_batch_size: Sets write_batch_size in bytes. + writer_version: Sets parquet writer version. Valid values are `1.0` and + `2.0`. + skip_arrow_metadata: Skip encoding the embedded arrow metadata in the + KV_meta. + compression: Compression type to use. Default is "zstd(3)". + Available compression types are + - "uncompressed": No compression. + - "snappy": Snappy compression. + - "gzip(n)": Gzip compression with level n. + - "brotli(n)": Brotli compression with level n. + - "lz4": LZ4 compression. + - "lz4_raw": LZ4_RAW compression. + - "zstd(n)": Zstandard compression with level n. + dictionary_enabled: Sets if dictionary encoding is enabled. If None, uses + the default parquet writer setting. + dictionary_page_size_limit: Sets best effort maximum dictionary page size, + in bytes. + statistics_enabled: Sets if statistics are enabled for any column Valid + values are `none`, `chunk`, and `page`. If None, uses the default + parquet writer setting. + max_row_group_size: Target maximum number of rows in each row group + (defaults to 1M rows). Writing larger row groups requires more memory to + write, but can get better compression and be faster to read. 
+ created_by: Sets "created by" property. + column_index_truncate_length: Sets column index truncate length. + statistics_truncate_length: Sets statistics truncate length. If None, uses + the default parquet writer setting. + data_page_row_count_limit: Sets best effort maximum number of rows in a data + page. + encoding: Sets default encoding for any column. Valid values are `plain`, + `plain_dictionary`, `rle`, `bit_packed`, `delta_binary_packed`, + `delta_length_byte_array`, `delta_byte_array`, `rle_dictionary`, and + `byte_stream_split`. If None, uses the default parquet writer setting. + bloom_filter_on_write: Write bloom filters for all columns when creating + parquet files. + bloom_filter_fpp: Sets bloom filter false positive probability. If None, + uses the default parquet writer setting + bloom_filter_ndv: Sets bloom filter number of distinct values. If None, uses + the default parquet writer setting. + allow_single_file_parallelism: Controls whether DataFusion will attempt to + speed up writing parquet files by serializing them in parallel. Each + column in each row group in each output file are serialized in parallel + leveraging a maximum possible core count of n_files * n_row_groups * + n_columns. + maximum_parallel_row_group_writers: By default parallel parquet writer is + tuned for minimum memory usage in a streaming execution plan. You may + see a performance benefit when writing large parquet files by increasing + `maximum_parallel_row_group_writers` and + `maximum_buffered_record_batches_per_stream` if your system has idle + cores and can tolerate additional memory usage. Boosting these values is + likely worthwhile when writing out already in-memory data, such as from + a cached data frame. + maximum_buffered_record_batches_per_stream: See + `maximum_parallel_row_group_writers`. + column_specific_options: Overrides options for specific columns. If a column + is not a part of this dictionary, it will use the parameters provided here. 
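+
+    Example:
+        An illustrative sketch (assumes an existing :py:class:`DataFrame` ``df``)::
+
+            options = ParquetWriterOptions(
+                compression="zstd(6)", max_row_group_size=100_000
+            )
+            df.write_parquet_with_options("output.parquet", options)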
+ """ + + def __init__( + self, + data_pagesize_limit: int = 1024 * 1024, + write_batch_size: int = 1024, + writer_version: str = "1.0", + skip_arrow_metadata: bool = False, + compression: Optional[str] = "zstd(3)", + dictionary_enabled: Optional[bool] = True, + dictionary_page_size_limit: int = 1024 * 1024, + statistics_enabled: Optional[str] = "page", + max_row_group_size: int = 1024 * 1024, + created_by: str = "datafusion-python", + column_index_truncate_length: Optional[int] = 64, + statistics_truncate_length: Optional[int] = None, + data_page_row_count_limit: int = 20_000, + encoding: Optional[str] = None, + bloom_filter_on_write: bool = False, + bloom_filter_fpp: Optional[float] = None, + bloom_filter_ndv: Optional[int] = None, + allow_single_file_parallelism: bool = True, + maximum_parallel_row_group_writers: int = 1, + maximum_buffered_record_batches_per_stream: int = 2, + column_specific_options: Optional[dict[str, ParquetColumnOptions]] = None, + ) -> None: + """Initialize the ParquetWriterOptions.""" + self.data_pagesize_limit = data_pagesize_limit + self.write_batch_size = write_batch_size + self.writer_version = writer_version + self.skip_arrow_metadata = skip_arrow_metadata + self.compression = compression + self.dictionary_enabled = dictionary_enabled + self.dictionary_page_size_limit = dictionary_page_size_limit + self.statistics_enabled = statistics_enabled + self.max_row_group_size = max_row_group_size + self.created_by = created_by + self.column_index_truncate_length = column_index_truncate_length + self.statistics_truncate_length = statistics_truncate_length + self.data_page_row_count_limit = data_page_row_count_limit + self.encoding = encoding + self.bloom_filter_on_write = bloom_filter_on_write + self.bloom_filter_fpp = bloom_filter_fpp + self.bloom_filter_ndv = bloom_filter_ndv + self.allow_single_file_parallelism = allow_single_file_parallelism + self.maximum_parallel_row_group_writers = maximum_parallel_row_group_writers + self.maximum_buffered_record_batches_per_stream = ( + maximum_buffered_record_batches_per_stream + ) + self.column_specific_options = column_specific_options + + +class ParquetColumnOptions: + """Parquet options for individual columns. + + Contains the available options that can be applied for an individual Parquet column, + replacing the global options in `ParquetWriterOptions`. + + Attributes: + encoding: Sets encoding for the column path. Valid values are: `plain`, + `plain_dictionary`, `rle`, `bit_packed`, `delta_binary_packed`, + `delta_length_byte_array`, `delta_byte_array`, `rle_dictionary`, and + `byte_stream_split`. These values are not case-sensitive. If `None`, uses + the default parquet options + dictionary_enabled: Sets if dictionary encoding is enabled for the column path. + If `None`, uses the default parquet options + compression: Sets default parquet compression codec for the column path. Valid + values are `uncompressed`, `snappy`, `gzip(level)`, `lzo`, `brotli(level)`, + `lz4`, `zstd(level)`, and `lz4_raw`. These values are not case-sensitive. If + `None`, uses the default parquet options. + statistics_enabled: Sets if statistics are enabled for the column Valid values + are: `none`, `chunk`, and `page` These values are not case sensitive. If + `None`, uses the default parquet options. + bloom_filter_enabled: Sets if bloom filter is enabled for the column path. If + `None`, uses the default parquet options. + bloom_filter_fpp: Sets bloom filter false positive probability for the column + path. 
If `None`, uses the default parquet options. + bloom_filter_ndv: Sets bloom filter number of distinct values. If `None`, uses + the default parquet options. + """ + + def __init__( + self, + encoding: Optional[str] = None, + dictionary_enabled: Optional[bool] = None, + compression: Optional[str] = None, + statistics_enabled: Optional[str] = None, + bloom_filter_enabled: Optional[bool] = None, + bloom_filter_fpp: Optional[float] = None, + bloom_filter_ndv: Optional[int] = None, + ) -> None: + """Initialize the ParquetColumnOptions.""" + self.encoding = encoding + self.dictionary_enabled = dictionary_enabled + self.compression = compression + self.statistics_enabled = statistics_enabled + self.bloom_filter_enabled = bloom_filter_enabled + self.bloom_filter_fpp = bloom_filter_fpp + self.bloom_filter_ndv = bloom_filter_ndv + + class DataFrame: """Two dimensional table representation of data. @@ -737,6 +906,58 @@ def write_parquet( self.df.write_parquet(str(path), compression.value, compression_level) + def write_parquet_with_options( + self, path: str | pathlib.Path, options: ParquetWriterOptions + ) -> None: + """Execute the :py:class:`DataFrame` and write the results to a Parquet file. + + Allows advanced writer options to be set with `ParquetWriterOptions`. + + Args: + path: Path of the Parquet file to write. + options: Sets the writer parquet options (see `ParquetWriterOptions`). + """ + options_internal = ParquetWriterOptionsInternal( + options.data_pagesize_limit, + options.write_batch_size, + options.writer_version, + options.skip_arrow_metadata, + options.compression, + options.dictionary_enabled, + options.dictionary_page_size_limit, + options.statistics_enabled, + options.max_row_group_size, + options.created_by, + options.column_index_truncate_length, + options.statistics_truncate_length, + options.data_page_row_count_limit, + options.encoding, + options.bloom_filter_on_write, + options.bloom_filter_fpp, + options.bloom_filter_ndv, + options.allow_single_file_parallelism, + options.maximum_parallel_row_group_writers, + options.maximum_buffered_record_batches_per_stream, + ) + + column_specific_options_internal = {} + for column, opts in (options.column_specific_options or {}).items(): + column_specific_options_internal[column] = ParquetColumnOptionsInternal( + bloom_filter_enabled=opts.bloom_filter_enabled, + encoding=opts.encoding, + dictionary_enabled=opts.dictionary_enabled, + compression=opts.compression, + statistics_enabled=opts.statistics_enabled, + bloom_filter_fpp=opts.bloom_filter_fpp, + bloom_filter_ndv=opts.bloom_filter_ndv, + ) + + self.df.write_parquet_with_options( + str(path), + options_internal, + column_specific_options_internal, + ) + def write_json(self, path: str | pathlib.Path) -> None: """Execute the :py:class:`DataFrame` and write the results to a JSON file. 
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 64220ce9c..3c9b97f23 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -27,6 +27,8 @@ import pytest from datafusion import ( DataFrame, + ParquetColumnOptions, + ParquetWriterOptions, SessionContext, WindowFrame, column, @@ -66,6 +68,21 @@ def df(): return ctx.from_arrow(batch) +@pytest.fixture +def large_df(): + ctx = SessionContext() + + rows = 100000 + data = { + "a": list(range(rows)), + "b": [f"s-{i}" for i in range(rows)], + "c": [float(i + 0.1) for i in range(rows)], + } + batch = pa.record_batch(data) + + return ctx.from_arrow(batch) + + @pytest.fixture def struct_df(): ctx = SessionContext() @@ -1632,6 +1649,395 @@ def test_write_compressed_parquet_default_compression_level(df, tmp_path, compre df.write_parquet(str(path), compression=compression) +def test_write_parquet_with_options_default_compression(df, tmp_path): + """Test that the default compression is ZSTD.""" + df.write_parquet(tmp_path) + + for file in tmp_path.rglob("*.parquet"): + metadata = pq.ParquetFile(file).metadata.to_dict() + for row_group in metadata["row_groups"]: + for col in row_group["columns"]: + assert col["compression"].lower() == "zstd" + + +@pytest.mark.parametrize( + "compression", + ["gzip(6)", "brotli(7)", "zstd(15)", "snappy", "uncompressed"], +) +def test_write_parquet_with_options_compression(df, tmp_path, compression): + import re + + path = tmp_path + df.write_parquet_with_options( + str(path), ParquetWriterOptions(compression=compression) + ) + + # test that the actual compression scheme is the one written + for _root, _dirs, files in os.walk(path): + for file in files: + if file.endswith(".parquet"): + metadata = pq.ParquetFile(tmp_path / file).metadata.to_dict() + for row_group in metadata["row_groups"]: + for col in row_group["columns"]: + assert col["compression"].lower() == re.sub( + r"\(\d+\)", "", compression + ) + + result = pq.read_table(str(path)).to_pydict() + expected = df.to_pydict() + + assert result == expected + + +@pytest.mark.parametrize( + "compression", + ["gzip(12)", "brotli(15)", "zstd(23)"], +) +def test_write_parquet_with_options_wrong_compression_level(df, tmp_path, compression): + path = tmp_path + + with pytest.raises(Exception, match=r"valid compression range .*? exceeded."): + df.write_parquet_with_options( + str(path), ParquetWriterOptions(compression=compression) + ) + + +@pytest.mark.parametrize("compression", ["wrong", "wrong(12)"]) +def test_write_parquet_with_options_invalid_compression(df, tmp_path, compression): + path = tmp_path + + with pytest.raises(Exception, match="Unknown or unsupported parquet compression"): + df.write_parquet_with_options( + str(path), ParquetWriterOptions(compression=compression) + ) + + +@pytest.mark.parametrize( + ("writer_version", "format_version"), + [("1.0", "1.0"), ("2.0", "2.6"), (None, "1.0")], +) +def test_write_parquet_with_options_writer_version( + df, tmp_path, writer_version, format_version +): + """Test the Parquet writer version. 
Note that writer_version=2.0 results in + format_version=2.6""" + if writer_version is None: + df.write_parquet_with_options(tmp_path, ParquetWriterOptions()) + else: + df.write_parquet_with_options( + tmp_path, ParquetWriterOptions(writer_version=writer_version) + ) + + for file in tmp_path.rglob("*.parquet"): + parquet = pq.ParquetFile(file) + metadata = parquet.metadata.to_dict() + assert metadata["format_version"] == format_version + + +@pytest.mark.parametrize("writer_version", ["1.2.3", "custom-version", "0"]) +def test_write_parquet_with_options_wrong_writer_version(df, tmp_path, writer_version): + """Test that invalid writer versions in Parquet throw an exception.""" + with pytest.raises( + Exception, match="Unknown or unsupported parquet writer version" + ): + df.write_parquet_with_options( + tmp_path, ParquetWriterOptions(writer_version=writer_version) + ) + + +@pytest.mark.parametrize("dictionary_enabled", [True, False, None]) +def test_write_parquet_with_options_dictionary_enabled( + df, tmp_path, dictionary_enabled +): + """Test enabling/disabling the dictionaries in Parquet.""" + df.write_parquet_with_options( + tmp_path, ParquetWriterOptions(dictionary_enabled=dictionary_enabled) + ) + # by default, the dictionary is enabled, so None results in True + result = dictionary_enabled if dictionary_enabled is not None else True + + for file in tmp_path.rglob("*.parquet"): + parquet = pq.ParquetFile(file) + metadata = parquet.metadata.to_dict() + + for row_group in metadata["row_groups"]: + for col in row_group["columns"]: + assert col["has_dictionary_page"] == result + + +@pytest.mark.parametrize( + ("statistics_enabled", "has_statistics"), + [("page", True), ("chunk", True), ("none", False), (None, True)], +) +def test_write_parquet_with_options_statistics_enabled( + df, tmp_path, statistics_enabled, has_statistics +): + """Test configuring the statistics in Parquet. In pyarrow we can only check for + column-level statistics, so "page" and "chunk" are tested in the same way.""" + df.write_parquet_with_options( + tmp_path, ParquetWriterOptions(statistics_enabled=statistics_enabled) + ) + + for file in tmp_path.rglob("*.parquet"): + parquet = pq.ParquetFile(file) + metadata = parquet.metadata.to_dict() + + for row_group in metadata["row_groups"]: + for col in row_group["columns"]: + if has_statistics: + assert col["statistics"] is not None + else: + assert col["statistics"] is None + + +@pytest.mark.parametrize("max_row_group_size", [1000, 5000, 10000, 100000]) +def test_write_parquet_with_options_max_row_group_size( + large_df, tmp_path, max_row_group_size +): + """Test configuring the max number of rows per group in Parquet. 
These test cases + guarantee that the number of rows for each row group is max_row_group_size, given + the total number of rows is a multiple of max_row_group_size.""" + large_df.write_parquet_with_options( + tmp_path, ParquetWriterOptions(max_row_group_size=max_row_group_size) + ) + + for file in tmp_path.rglob("*.parquet"): + parquet = pq.ParquetFile(file) + metadata = parquet.metadata.to_dict() + for row_group in metadata["row_groups"]: + assert row_group["num_rows"] == max_row_group_size + + +@pytest.mark.parametrize("created_by", ["datafusion", "datafusion-python", "custom"]) +def test_write_parquet_with_options_created_by(df, tmp_path, created_by): + """Test configuring the created by metadata in Parquet.""" + df.write_parquet_with_options(tmp_path, ParquetWriterOptions(created_by=created_by)) + + for file in tmp_path.rglob("*.parquet"): + parquet = pq.ParquetFile(file) + metadata = parquet.metadata.to_dict() + assert metadata["created_by"] == created_by + + +@pytest.mark.parametrize("statistics_truncate_length", [5, 25, 50]) +def test_write_parquet_with_options_statistics_truncate_length( + df, tmp_path, statistics_truncate_length +): + """Test configuring the truncate limit in Parquet's row-group-level statistics.""" + ctx = SessionContext() + data = { + "a": [ + "a_the_quick_brown_fox_jumps_over_the_lazy_dog", + "m_the_quick_brown_fox_jumps_over_the_lazy_dog", + "z_the_quick_brown_fox_jumps_over_the_lazy_dog", + ], + "b": ["a_smaller", "m_smaller", "z_smaller"], + } + df = ctx.from_arrow(pa.record_batch(data)) + df.write_parquet_with_options( + tmp_path, + ParquetWriterOptions(statistics_truncate_length=statistics_truncate_length), + ) + + for file in tmp_path.rglob("*.parquet"): + parquet = pq.ParquetFile(file) + metadata = parquet.metadata.to_dict() + + for row_group in metadata["row_groups"]: + for col in row_group["columns"]: + statistics = col["statistics"] + assert len(statistics["min"]) <= statistics_truncate_length + assert len(statistics["max"]) <= statistics_truncate_length + + +def test_write_parquet_with_options_default_encoding(tmp_path): + """Test that, by default, Parquet files are written with dictionary encoding. 
+ Note that dictionary encoding is not used for boolean values, so it is not tested + here.""" + ctx = SessionContext() + data = { + "a": [1, 2, 3], + "b": ["1", "2", "3"], + "c": [1.01, 2.02, 3.03], + } + df = ctx.from_arrow(pa.record_batch(data)) + df.write_parquet_with_options(tmp_path, ParquetWriterOptions()) + + for file in tmp_path.rglob("*.parquet"): + parquet = pq.ParquetFile(file) + metadata = parquet.metadata.to_dict() + + for row_group in metadata["row_groups"]: + for col in row_group["columns"]: + assert col["encodings"] == ("PLAIN", "RLE", "RLE_DICTIONARY") + + +@pytest.mark.parametrize( + ("encoding", "data_types", "result"), + [ + ("plain", ["int", "float", "str", "bool"], ("PLAIN", "RLE")), + ("rle", ["bool"], ("RLE",)), + ("delta_binary_packed", ["int"], ("RLE", "DELTA_BINARY_PACKED")), + ("delta_length_byte_array", ["str"], ("RLE", "DELTA_LENGTH_BYTE_ARRAY")), + ("delta_byte_array", ["str"], ("RLE", "DELTA_BYTE_ARRAY")), + ("byte_stream_split", ["int", "float"], ("RLE", "BYTE_STREAM_SPLIT")), + ], +) +def test_write_parquet_with_options_encoding(tmp_path, encoding, data_types, result): + """Test different encodings in Parquet in their respective support column types.""" + ctx = SessionContext() + + data = {} + for data_type in data_types: + if data_type == "int": + data["int"] = [1, 2, 3] + elif data_type == "float": + data["float"] = [1.01, 2.02, 3.03] + elif data_type == "str": + data["str"] = ["a", "b", "c"] + elif data_type == "bool": + data["bool"] = [True, False, True] + + df = ctx.from_arrow(pa.record_batch(data)) + df.write_parquet_with_options( + tmp_path, ParquetWriterOptions(encoding=encoding, dictionary_enabled=False) + ) + + for file in tmp_path.rglob("*.parquet"): + parquet = pq.ParquetFile(file) + metadata = parquet.metadata.to_dict() + + for row_group in metadata["row_groups"]: + for col in row_group["columns"]: + assert col["encodings"] == result + + +@pytest.mark.parametrize("encoding", ["bit_packed"]) +def test_write_parquet_with_options_unsupported_encoding(df, tmp_path, encoding): + """Test that unsupported Parquet encodings do not work.""" + # BaseException is used since this throws a Rust panic: https://github.com/PyO3/pyo3/issues/3519 + with pytest.raises(BaseException, match="Encoding .*? is not supported"): + df.write_parquet_with_options(tmp_path, ParquetWriterOptions(encoding=encoding)) + + +@pytest.mark.parametrize("encoding", ["non_existent", "unknown", "plain123"]) +def test_write_parquet_with_options_invalid_encoding(df, tmp_path, encoding): + """Test that invalid Parquet encodings do not work.""" + with pytest.raises(Exception, match="Unknown or unsupported parquet encoding"): + df.write_parquet_with_options(tmp_path, ParquetWriterOptions(encoding=encoding)) + + +@pytest.mark.parametrize("encoding", ["plain_dictionary", "rle_dictionary"]) +def test_write_parquet_with_options_dictionary_encoding_fallback( + df, tmp_path, encoding +): + """Test that the dictionary encoding cannot be used as fallback in Parquet.""" + # BaseException is used since this throws a Rust panic: https://github.com/PyO3/pyo3/issues/3519 + with pytest.raises( + BaseException, match="Dictionary encoding can not be used as fallback encoding" + ): + df.write_parquet_with_options(tmp_path, ParquetWriterOptions(encoding=encoding)) + + +def test_write_parquet_with_options_bloom_filter(df, tmp_path): + """Test Parquet files with and without (default) bloom filters. 
Since pyarrow does + not expose any information about bloom filters, the easiest way to confirm that they + are actually written is to compare the file size.""" + path_no_bloom_filter = tmp_path / "1" + path_bloom_filter = tmp_path / "2" + + df.write_parquet_with_options(path_no_bloom_filter, ParquetWriterOptions()) + df.write_parquet_with_options( + path_bloom_filter, ParquetWriterOptions(bloom_filter_on_write=True) + ) + + size_no_bloom_filter = 0 + for file in path_no_bloom_filter.rglob("*.parquet"): + size_no_bloom_filter += os.path.getsize(file) + + size_bloom_filter = 0 + for file in path_bloom_filter.rglob("*.parquet"): + size_bloom_filter += os.path.getsize(file) + + assert size_no_bloom_filter < size_bloom_filter + + +def test_write_parquet_with_options_column_options(df, tmp_path): + """Test writing Parquet files with different options for each column, which replace + the global configs (when provided).""" + data = { + "a": [1, 2, 3], + "b": ["a", "b", "c"], + "c": [False, True, False], + "d": [1.01, 2.02, 3.03], + "e": [4, 5, 6], + } + + column_specific_options = { + "a": ParquetColumnOptions(statistics_enabled="none"), + "b": ParquetColumnOptions(encoding="plain", dictionary_enabled=False), + "c": ParquetColumnOptions( + compression="snappy", encoding="rle", dictionary_enabled=False + ), + "d": ParquetColumnOptions( + compression="zstd(6)", + encoding="byte_stream_split", + dictionary_enabled=False, + statistics_enabled="none", + ), + # column "e" will use the global configs + } + + results = { + "a": { + "statistics": False, + "compression": "brotli", + "encodings": ("PLAIN", "RLE", "RLE_DICTIONARY"), + }, + "b": { + "statistics": True, + "compression": "brotli", + "encodings": ("PLAIN", "RLE"), + }, + "c": { + "statistics": True, + "compression": "snappy", + "encodings": ("RLE",), + }, + "d": { + "statistics": False, + "compression": "zstd", + "encodings": ("RLE", "BYTE_STREAM_SPLIT"), + }, + "e": { + "statistics": True, + "compression": "brotli", + "encodings": ("PLAIN", "RLE", "RLE_DICTIONARY"), + }, + } + + ctx = SessionContext() + df = ctx.from_arrow(pa.record_batch(data)) + df.write_parquet_with_options( + tmp_path, + ParquetWriterOptions( + compression="brotli(8)", column_specific_options=column_specific_options + ), + ) + + for file in tmp_path.rglob("*.parquet"): + parquet = pq.ParquetFile(file) + metadata = parquet.metadata.to_dict() + + for row_group in metadata["row_groups"]: + for col in row_group["columns"]: + column_name = col["path_in_schema"] + result = results[column_name] + assert (col["statistics"] is not None) == result["statistics"] + assert col["compression"].lower() == result["compression"].lower() + assert col["encodings"] == result["encodings"] + + def test_dataframe_export(df) -> None: # Guarantees that we have the canonical implementation # reading our dataframe export diff --git a/src/dataframe.rs b/src/dataframe.rs index 7711a0782..3d68db279 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+use std::collections::HashMap; use std::ffi::CString; use std::sync::Arc; @@ -27,7 +28,7 @@ use datafusion::arrow::datatypes::Schema; use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow}; use datafusion::arrow::util::pretty; use datafusion::common::UnnestOptions; -use datafusion::config::{CsvOptions, TableParquetOptions}; +use datafusion::config::{CsvOptions, ParquetColumnOptions, ParquetOptions, TableParquetOptions}; use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; use datafusion::datasource::TableProvider; use datafusion::error::DataFusionError; @@ -185,6 +186,101 @@ fn build_formatter_config_from_python(formatter: &Bound<'_, PyAny>) -> PyResult< Ok(config) } +/// Python mapping of `ParquetOptions` (includes just the writer-related options). +#[pyclass(name = "ParquetWriterOptions", module = "datafusion", subclass)] +#[derive(Clone, Default)] +pub struct PyParquetWriterOptions { + options: ParquetOptions, +} + +#[pymethods] +impl PyParquetWriterOptions { + #[new] + #[allow(clippy::too_many_arguments)] + pub fn new( + data_pagesize_limit: usize, + write_batch_size: usize, + writer_version: String, + skip_arrow_metadata: bool, + compression: Option, + dictionary_enabled: Option, + dictionary_page_size_limit: usize, + statistics_enabled: Option, + max_row_group_size: usize, + created_by: String, + column_index_truncate_length: Option, + statistics_truncate_length: Option, + data_page_row_count_limit: usize, + encoding: Option, + bloom_filter_on_write: bool, + bloom_filter_fpp: Option, + bloom_filter_ndv: Option, + allow_single_file_parallelism: bool, + maximum_parallel_row_group_writers: usize, + maximum_buffered_record_batches_per_stream: usize, + ) -> Self { + Self { + options: ParquetOptions { + data_pagesize_limit, + write_batch_size, + writer_version, + skip_arrow_metadata, + compression, + dictionary_enabled, + dictionary_page_size_limit, + statistics_enabled, + max_row_group_size, + created_by, + column_index_truncate_length, + statistics_truncate_length, + data_page_row_count_limit, + encoding, + bloom_filter_on_write, + bloom_filter_fpp, + bloom_filter_ndv, + allow_single_file_parallelism, + maximum_parallel_row_group_writers, + maximum_buffered_record_batches_per_stream, + ..Default::default() + }, + } + } +} + +/// Python mapping of `ParquetColumnOptions`. +#[pyclass(name = "ParquetColumnOptions", module = "datafusion", subclass)] +#[derive(Clone, Default)] +pub struct PyParquetColumnOptions { + options: ParquetColumnOptions, +} + +#[pymethods] +impl PyParquetColumnOptions { + #[new] + pub fn new( + bloom_filter_enabled: Option, + encoding: Option, + dictionary_enabled: Option, + compression: Option, + statistics_enabled: Option, + bloom_filter_fpp: Option, + bloom_filter_ndv: Option, + ) -> Self { + Self { + options: ParquetColumnOptions { + bloom_filter_enabled, + encoding, + dictionary_enabled, + compression, + statistics_enabled, + bloom_filter_fpp, + bloom_filter_ndv, + ..Default::default() + }, + } + } +} + /// A PyDataFrame is a representation of a logical plan and an API to compose statements. /// Use it to build a plan and `.collect()` to execute the plan and collect the result. /// The actual execution of a plan runs natively on Rust and Arrow on a multi-threaded environment. @@ -689,6 +785,34 @@ impl PyDataFrame { Ok(()) } + /// Write a `DataFrame` to a Parquet file, using advanced options. 
+ fn write_parquet_with_options( + &self, + path: &str, + options: PyParquetWriterOptions, + column_specific_options: HashMap, + py: Python, + ) -> PyDataFusionResult<()> { + let table_options = TableParquetOptions { + global: options.options, + column_specific_options: column_specific_options + .into_iter() + .map(|(k, v)| (k, v.options)) + .collect(), + ..Default::default() + }; + + wait_for_future( + py, + self.df.as_ref().clone().write_parquet( + path, + DataFrameWriteOptions::new(), + Option::from(table_options), + ), + )??; + Ok(()) + } + /// Executes a query and writes the results to a partitioned JSON file. fn write_json(&self, path: &str, py: Python) -> PyDataFusionResult<()> { wait_for_future( diff --git a/src/lib.rs b/src/lib.rs index 7dced1fbd..1293eee3c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -86,6 +86,8 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; From 1812a0d3e88976f51b40ccfd1a02fddb17ddb8bb Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Tue, 24 Jun 2025 10:21:53 -0700 Subject: [PATCH 02/96] Fix signature of `__arrow_c_stream__` (#1168) --- python/datafusion/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 769271c7e..1fd63bdc6 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -1053,7 +1053,7 @@ def unnest_columns(self, *columns: str, preserve_nulls: bool = True) -> DataFram columns = list(columns) return DataFrame(self.df.unnest_columns(columns, preserve_nulls=preserve_nulls)) - def __arrow_c_stream__(self, requested_schema: pa.Schema) -> Any: + def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: """Export an Arrow PyCapsule Stream. This will execute and collect the DataFrame. We will attempt to respect the From 0d3c37f9379fd12406022c7edc3fee056866a694 Mon Sep 17 00:00:00 2001 From: kosiew Date: Wed, 25 Jun 2025 07:33:03 +0800 Subject: [PATCH 03/96] Consolidate DataFrame Docs: Merge HTML Rendering Section as Subpage (#1161) * docs: unify dataframe documentation (#2) * docs: update links in basics and rendering documentation * docs: add API reference section to main index * docs: add license information to rendering documentation * Move API Reference under User Guide > Dataframe * Merge data from dataframe api reference page into main dataframe page --------- Co-authored-by: Tim Saucer --- docs/source/api/dataframe.rst | 387 ------------------ docs/source/api/index.rst | 27 -- docs/source/index.rst | 4 +- docs/source/user-guide/basics.rst | 2 +- docs/source/user-guide/dataframe/index.rst | 209 ++++++++++ .../rendering.rst} | 103 ++--- 6 files changed, 265 insertions(+), 467 deletions(-) delete mode 100644 docs/source/api/dataframe.rst delete mode 100644 docs/source/api/index.rst create mode 100644 docs/source/user-guide/dataframe/index.rst rename docs/source/user-guide/{dataframe.rst => dataframe/rendering.rst} (72%) diff --git a/docs/source/api/dataframe.rst b/docs/source/api/dataframe.rst deleted file mode 100644 index a9e9e47c8..000000000 --- a/docs/source/api/dataframe.rst +++ /dev/null @@ -1,387 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. 
The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -================= -DataFrame API -================= - -Overview --------- - -The ``DataFrame`` class is the core abstraction in DataFusion that represents tabular data and operations -on that data. DataFrames provide a flexible API for transforming data through various operations such as -filtering, projection, aggregation, joining, and more. - -A DataFrame represents a logical plan that is lazily evaluated. The actual execution occurs only when -terminal operations like ``collect()``, ``show()``, or ``to_pandas()`` are called. - -Creating DataFrames -------------------- - -DataFrames can be created in several ways: - -* From SQL queries via a ``SessionContext``: - - .. code-block:: python - - from datafusion import SessionContext - - ctx = SessionContext() - df = ctx.sql("SELECT * FROM your_table") - -* From registered tables: - - .. code-block:: python - - df = ctx.table("your_table") - -* From various data sources: - - .. code-block:: python - - # From CSV files (see :ref:`io_csv` for detailed options) - df = ctx.read_csv("path/to/data.csv") - - # From Parquet files (see :ref:`io_parquet` for detailed options) - df = ctx.read_parquet("path/to/data.parquet") - - # From JSON files (see :ref:`io_json` for detailed options) - df = ctx.read_json("path/to/data.json") - - # From Avro files (see :ref:`io_avro` for detailed options) - df = ctx.read_avro("path/to/data.avro") - - # From Pandas DataFrame - import pandas as pd - pandas_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - df = ctx.from_pandas(pandas_df) - - # From Arrow data - import pyarrow as pa - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"] - ) - df = ctx.from_arrow(batch) - - For detailed information about reading from different data sources, see the :doc:`I/O Guide <../user-guide/io/index>`. - For custom data sources, see :ref:`io_custom_table_provider`. - -Common DataFrame Operations ---------------------------- - -DataFusion's DataFrame API offers a wide range of operations: - -.. 
code-block:: python - - from datafusion import column, literal - - # Select specific columns - df = df.select("col1", "col2") - - # Select with expressions - df = df.select(column("a") + column("b"), column("a") - column("b")) - - # Filter rows - df = df.filter(column("age") > literal(25)) - - # Add computed columns - df = df.with_column("full_name", column("first_name") + literal(" ") + column("last_name")) - - # Multiple column additions - df = df.with_columns( - (column("a") + column("b")).alias("sum"), - (column("a") * column("b")).alias("product") - ) - - # Sort data - df = df.sort(column("age").sort(ascending=False)) - - # Join DataFrames - df = df1.join(df2, on="user_id", how="inner") - - # Aggregate data - from datafusion import functions as f - df = df.aggregate( - [], # Group by columns (empty for global aggregation) - [f.sum(column("amount")).alias("total_amount")] - ) - - # Limit rows - df = df.limit(100) - - # Drop columns - df = df.drop("temporary_column") - -Terminal Operations -------------------- - -To materialize the results of your DataFrame operations: - -.. code-block:: python - - # Collect all data as PyArrow RecordBatches - result_batches = df.collect() - - # Convert to various formats - pandas_df = df.to_pandas() # Pandas DataFrame - polars_df = df.to_polars() # Polars DataFrame - arrow_table = df.to_arrow_table() # PyArrow Table - py_dict = df.to_pydict() # Python dictionary - py_list = df.to_pylist() # Python list of dictionaries - - # Display results - df.show() # Print tabular format to console - - # Count rows - count = df.count() - -HTML Rendering in Jupyter -------------------------- - -When working in Jupyter notebooks or other environments that support rich HTML display, -DataFusion DataFrames automatically render as nicely formatted HTML tables. This functionality -is provided by the ``_repr_html_`` method, which is automatically called by Jupyter. - -Basic HTML Rendering -~~~~~~~~~~~~~~~~~~~~ - -In a Jupyter environment, simply displaying a DataFrame object will trigger HTML rendering: - -.. code-block:: python - - # Will display as HTML table in Jupyter - df - - # Explicit display also uses HTML rendering - display(df) - -HTML Rendering Customization ----------------------------- - -DataFusion provides extensive customization options for HTML table rendering through the -``datafusion.html_formatter`` module. - -Configuring the HTML Formatter -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -You can customize how DataFrames are rendered by configuring the formatter: - -.. code-block:: python - - from datafusion.html_formatter import configure_formatter - - configure_formatter( - max_cell_length=30, # Maximum length of cell content before truncation - max_width=800, # Maximum width of table in pixels - max_height=400, # Maximum height of table in pixels - max_memory_bytes=2 * 1024 * 1024,# Maximum memory used for rendering (2MB) - min_rows_display=10, # Minimum rows to display - repr_rows=20, # Number of rows to display in representation - enable_cell_expansion=True, # Allow cells to be expandable on click - custom_css=None, # Custom CSS to apply - show_truncation_message=True, # Show message when data is truncated - style_provider=None, # Custom style provider class - use_shared_styles=True # Share styles across tables to reduce duplication - ) - -Custom Style Providers -~~~~~~~~~~~~~~~~~~~~~~ - -For advanced styling needs, you can create a custom style provider class: - -.. 
code-block:: python - - from datafusion.html_formatter import configure_formatter - - class CustomStyleProvider: - def get_cell_style(self) -> str: - return "background-color: #f5f5f5; color: #333; padding: 8px; border: 1px solid #ddd;" - - def get_header_style(self) -> str: - return "background-color: #4285f4; color: white; font-weight: bold; padding: 10px;" - - # Apply custom styling - configure_formatter(style_provider=CustomStyleProvider()) - -Custom Type Formatters -~~~~~~~~~~~~~~~~~~~~~~ - -You can register custom formatters for specific data types: - -.. code-block:: python - - from datafusion.html_formatter import get_formatter - - formatter = get_formatter() - - # Format integers with color based on value - def format_int(value): - return f' 100 else "blue"}">{value}' - - formatter.register_formatter(int, format_int) - - # Format date values - def format_date(value): - return f'{value.isoformat()}' - - formatter.register_formatter(datetime.date, format_date) - -Custom Cell Builders -~~~~~~~~~~~~~~~~~~~~ - -For complete control over cell rendering: - -.. code-block:: python - - formatter = get_formatter() - - def custom_cell_builder(value, row, col, table_id): - try: - num_value = float(value) - if num_value > 0: # Positive values get green - return f'{value}' - if num_value < 0: # Negative values get red - return f'{value}' - except (ValueError, TypeError): - pass - - # Default styling for non-numeric or zero values - return f'{value}' - - formatter.set_custom_cell_builder(custom_cell_builder) - -Custom Header Builders -~~~~~~~~~~~~~~~~~~~~~~ - -Similarly, you can customize the rendering of table headers: - -.. code-block:: python - - def custom_header_builder(field): - tooltip = f"Type: {field.type}" - return f'{field.name}' - - formatter.set_custom_header_builder(custom_header_builder) - -Managing Formatter State ------------------------~ - -The HTML formatter maintains global state that can be managed: - -.. code-block:: python - - from datafusion.html_formatter import reset_formatter, reset_styles_loaded_state, get_formatter - - # Reset the formatter to default settings - reset_formatter() - - # Reset only the styles loaded state (useful when styles were loaded but need reloading) - reset_styles_loaded_state() - - # Get the current formatter instance to make changes - formatter = get_formatter() - -Advanced Example: Dashboard-Style Formatting -------------------------------------------~~ - -This example shows how to create a dashboard-like styling for your DataFrames: - -.. 
code-block:: python - - from datafusion.html_formatter import configure_formatter, get_formatter - - # Define custom CSS - custom_css = """ - .datafusion-table { - font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; - border-collapse: collapse; - width: 100%; - box-shadow: 0 2px 3px rgba(0,0,0,0.1); - } - .datafusion-table th { - position: sticky; - top: 0; - z-index: 10; - } - .datafusion-table tr:hover td { - background-color: #f1f7fa !important; - } - .datafusion-table .numeric-positive { - color: #0a7c00; - } - .datafusion-table .numeric-negative { - color: #d13438; - } - """ - - class DashboardStyleProvider: - def get_cell_style(self) -> str: - return "padding: 8px 12px; border-bottom: 1px solid #e0e0e0;" - - def get_header_style(self) -> str: - return ("background-color: #0078d4; color: white; font-weight: 600; " - "padding: 12px; text-align: left; border-bottom: 2px solid #005a9e;") - - # Apply configuration - configure_formatter( - max_height=500, - enable_cell_expansion=True, - custom_css=custom_css, - style_provider=DashboardStyleProvider(), - max_cell_length=50 - ) - - # Add custom formatters for numbers - formatter = get_formatter() - - def format_number(value): - try: - num = float(value) - cls = "numeric-positive" if num > 0 else "numeric-negative" if num < 0 else "" - return f'{value:,}' if cls else f'{value:,}' - except (ValueError, TypeError): - return str(value) - - formatter.register_formatter(int, format_number) - formatter.register_formatter(float, format_number) - -Best Practices --------------- - -1. **Memory Management**: For large datasets, use ``max_memory_bytes`` to limit memory usage. - -2. **Responsive Design**: Set reasonable ``max_width`` and ``max_height`` values to ensure tables display well on different screens. - -3. **Style Optimization**: Use ``use_shared_styles=True`` to avoid duplicate style definitions when displaying multiple tables. - -4. **Reset When Needed**: Call ``reset_formatter()`` when you want to start fresh with default settings. - -5. **Cell Expansion**: Use ``enable_cell_expansion=True`` when cells might contain longer content that users may want to see in full. - -Additional Resources --------------------- - -* :doc:`../user-guide/dataframe` - Complete guide to using DataFrames -* :doc:`../user-guide/io/index` - I/O Guide for reading data from various sources -* :doc:`../user-guide/data-sources` - Comprehensive data sources guide -* :ref:`io_csv` - CSV file reading -* :ref:`io_parquet` - Parquet file reading -* :ref:`io_json` - JSON file reading -* :ref:`io_avro` - Avro file reading -* :ref:`io_custom_table_provider` - Custom table providers -* `API Reference `_ - Full API reference diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst deleted file mode 100644 index 7f58227ca..000000000 --- a/docs/source/api/index.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. 
KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -============= -API Reference -============= - -This section provides detailed API documentation for the DataFusion Python library. - -.. toctree:: - :maxdepth: 2 - - dataframe diff --git a/docs/source/index.rst b/docs/source/index.rst index ff1e47280..adec60f48 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -72,7 +72,7 @@ Example user-guide/introduction user-guide/basics user-guide/data-sources - user-guide/dataframe + user-guide/dataframe/index user-guide/common-operations/index user-guide/io/index user-guide/configuration @@ -93,5 +93,3 @@ Example :hidden: :maxdepth: 1 :caption: API - - api/index diff --git a/docs/source/user-guide/basics.rst b/docs/source/user-guide/basics.rst index 2975d9a6b..7c6820461 100644 --- a/docs/source/user-guide/basics.rst +++ b/docs/source/user-guide/basics.rst @@ -73,7 +73,7 @@ DataFrames are typically created by calling a method on :py:class:`~datafusion.c calling the transformation methods, such as :py:func:`~datafusion.dataframe.DataFrame.filter`, :py:func:`~datafusion.dataframe.DataFrame.select`, :py:func:`~datafusion.dataframe.DataFrame.aggregate`, and :py:func:`~datafusion.dataframe.DataFrame.limit` to build up a query definition. -For more details on working with DataFrames, including visualization options and conversion to other formats, see :doc:`dataframe`. +For more details on working with DataFrames, including visualization options and conversion to other formats, see :doc:`dataframe/index`. Expressions ----------- diff --git a/docs/source/user-guide/dataframe/index.rst b/docs/source/user-guide/dataframe/index.rst new file mode 100644 index 000000000..f69485af7 --- /dev/null +++ b/docs/source/user-guide/dataframe/index.rst @@ -0,0 +1,209 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +DataFrames +========== + +Overview +-------- + +The ``DataFrame`` class is the core abstraction in DataFusion that represents tabular data and operations +on that data. DataFrames provide a flexible API for transforming data through various operations such as +filtering, projection, aggregation, joining, and more. + +A DataFrame represents a logical plan that is lazily evaluated. The actual execution occurs only when +terminal operations like ``collect()``, ``show()``, or ``to_pandas()`` are called. + +Creating DataFrames +------------------- + +DataFrames can be created in several ways: + +* From SQL queries via a ``SessionContext``: + + .. code-block:: python + + from datafusion import SessionContext + + ctx = SessionContext() + df = ctx.sql("SELECT * FROM your_table") + +* From registered tables: + + .. 
code-block:: python + + df = ctx.table("your_table") + +* From various data sources: + + .. code-block:: python + + # From CSV files (see :ref:`io_csv` for detailed options) + df = ctx.read_csv("path/to/data.csv") + + # From Parquet files (see :ref:`io_parquet` for detailed options) + df = ctx.read_parquet("path/to/data.parquet") + + # From JSON files (see :ref:`io_json` for detailed options) + df = ctx.read_json("path/to/data.json") + + # From Avro files (see :ref:`io_avro` for detailed options) + df = ctx.read_avro("path/to/data.avro") + + # From Pandas DataFrame + import pandas as pd + pandas_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df = ctx.from_pandas(pandas_df) + + # From Arrow data + import pyarrow as pa + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"] + ) + df = ctx.from_arrow(batch) + +For detailed information about reading from different data sources, see the :doc:`I/O Guide <../io/index>`. +For custom data sources, see :ref:`io_custom_table_provider`. + +Common DataFrame Operations +--------------------------- + +DataFusion's DataFrame API offers a wide range of operations: + +.. code-block:: python + + from datafusion import column, literal + + # Select specific columns + df = df.select("col1", "col2") + + # Select with expressions + df = df.select(column("a") + column("b"), column("a") - column("b")) + + # Filter rows + df = df.filter(column("age") > literal(25)) + + # Add computed columns + df = df.with_column("full_name", column("first_name") + literal(" ") + column("last_name")) + + # Multiple column additions + df = df.with_columns( + (column("a") + column("b")).alias("sum"), + (column("a") * column("b")).alias("product") + ) + + # Sort data + df = df.sort(column("age").sort(ascending=False)) + + # Join DataFrames + df = df1.join(df2, on="user_id", how="inner") + + # Aggregate data + from datafusion import functions as f + df = df.aggregate( + [], # Group by columns (empty for global aggregation) + [f.sum(column("amount")).alias("total_amount")] + ) + + # Limit rows + df = df.limit(100) + + # Drop columns + df = df.drop("temporary_column") + +Terminal Operations +------------------- + +To materialize the results of your DataFrame operations: + +.. code-block:: python + + # Collect all data as PyArrow RecordBatches + result_batches = df.collect() + + # Convert to various formats + pandas_df = df.to_pandas() # Pandas DataFrame + polars_df = df.to_polars() # Polars DataFrame + arrow_table = df.to_arrow_table() # PyArrow Table + py_dict = df.to_pydict() # Python dictionary + py_list = df.to_pylist() # Python list of dictionaries + + # Display results + df.show() # Print tabular format to console + + # Count rows + count = df.count() + +HTML Rendering +-------------- + +When working in Jupyter notebooks or other environments that support HTML rendering, DataFrames will +automatically display as formatted HTML tables. For detailed information about customizing HTML +rendering, formatting options, and advanced styling, see :doc:`rendering`. + +Core Classes +------------ + +**DataFrame** + The main DataFrame class for building and executing queries. + + See: :py:class:`datafusion.DataFrame` + +**SessionContext** + The primary entry point for creating DataFrames from various data sources. 
+ + Key methods for DataFrame creation: + + * :py:meth:`~datafusion.SessionContext.read_csv` - Read CSV files + * :py:meth:`~datafusion.SessionContext.read_parquet` - Read Parquet files + * :py:meth:`~datafusion.SessionContext.read_json` - Read JSON files + * :py:meth:`~datafusion.SessionContext.read_avro` - Read Avro files + * :py:meth:`~datafusion.SessionContext.table` - Access registered tables + * :py:meth:`~datafusion.SessionContext.sql` - Execute SQL queries + * :py:meth:`~datafusion.SessionContext.from_pandas` - Create from Pandas DataFrame + * :py:meth:`~datafusion.SessionContext.from_arrow` - Create from Arrow data + + See: :py:class:`datafusion.SessionContext` + +Expression Classes +------------------ + +**Expr** + Represents expressions that can be used in DataFrame operations. + + See: :py:class:`datafusion.Expr` + +**Functions for creating expressions:** + +* :py:func:`datafusion.column` - Reference a column by name +* :py:func:`datafusion.literal` - Create a literal value expression + +Built-in Functions +------------------ + +DataFusion provides many built-in functions for data manipulation: + +* :py:mod:`datafusion.functions` - Mathematical, string, date/time, and aggregation functions + +For a complete list of available functions, see the :py:mod:`datafusion.functions` module documentation. + + +.. toctree:: + :maxdepth: 1 + + rendering diff --git a/docs/source/user-guide/dataframe.rst b/docs/source/user-guide/dataframe/rendering.rst similarity index 72% rename from docs/source/user-guide/dataframe.rst rename to docs/source/user-guide/dataframe/rendering.rst index 23c65b5f6..4c37c7471 100644 --- a/docs/source/user-guide/dataframe.rst +++ b/docs/source/user-guide/dataframe/rendering.rst @@ -15,59 +15,37 @@ .. specific language governing permissions and limitations .. under the License. -DataFrames -========== +HTML Rendering in Jupyter +========================= -Overview --------- +When working in Jupyter notebooks or other environments that support rich HTML display, +DataFusion DataFrames automatically render as nicely formatted HTML tables. This functionality +is provided by the ``_repr_html_`` method, which is automatically called by Jupyter to provide +a richer visualization than plain text output. -DataFusion's DataFrame API provides a powerful interface for building and executing queries against data sources. -It offers a familiar API similar to pandas and other DataFrame libraries, but with the performance benefits of Rust -and Arrow. +Basic HTML Rendering +-------------------- -A DataFrame represents a logical plan that can be composed through operations like filtering, projection, and aggregation. -The actual execution happens when terminal operations like ``collect()`` or ``show()`` are called. - -Basic Usage ------------ +In a Jupyter environment, simply displaying a DataFrame object will trigger HTML rendering: .. 
code-block:: python - import datafusion - from datafusion import col, lit + # Will display as HTML table in Jupyter + df - # Create a context and register a data source - ctx = datafusion.SessionContext() - ctx.register_csv("my_table", "path/to/data.csv") - - # Create and manipulate a DataFrame - df = ctx.sql("SELECT * FROM my_table") - - # Or use the DataFrame API directly - df = (ctx.table("my_table") - .filter(col("age") > lit(25)) - .select([col("name"), col("age")])) - - # Execute and collect results - result = df.collect() - - # Display the first few rows - df.show() + # Explicit display also uses HTML rendering + display(df) -HTML Rendering --------------- - -When working in Jupyter notebooks or other environments that support HTML rendering, DataFrames will -automatically display as formatted HTML tables, making it easier to visualize your data. +Customizing HTML Rendering +--------------------------- -The ``_repr_html_`` method is called automatically by Jupyter to render a DataFrame. This method -controls how DataFrames appear in notebook environments, providing a richer visualization than -plain text output. +DataFusion provides extensive customization options for HTML table rendering through the +``datafusion.html_formatter`` module. -Customizing HTML Rendering --------------------------- +Configuring the HTML Formatter +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -You can customize how DataFrames are rendered in HTML by configuring the formatter: +You can customize how DataFrames are rendered by configuring the formatter: .. code-block:: python @@ -91,7 +69,7 @@ You can customize how DataFrames are rendered in HTML by configuring the formatt The formatter settings affect all DataFrames displayed after configuration. Custom Style Providers ----------------------- +----------------------- For advanced styling needs, you can create a custom style provider: @@ -118,7 +96,8 @@ For advanced styling needs, you can create a custom style provider: configure_formatter(style_provider=MyStyleProvider()) Performance Optimization with Shared Styles -------------------------------------------- +-------------------------------------------- + The ``use_shared_styles`` parameter (enabled by default) optimizes performance when displaying multiple DataFrames in notebook environments: @@ -138,7 +117,7 @@ When ``use_shared_styles=True``: - Applies consistent styling across all DataFrames Creating a Custom Formatter ---------------------------- +---------------------------- For complete control over rendering, you can implement a custom formatter: @@ -184,7 +163,7 @@ Get the current formatter settings: print(formatter.theme) Contextual Formatting ---------------------- +---------------------- You can also use a context manager to temporarily change formatting settings: @@ -207,12 +186,38 @@ Memory and Display Controls You can control how much data is displayed and how much memory is used for rendering: - .. code-block:: python - +.. code-block:: python + configure_formatter( max_memory_bytes=4 * 1024 * 1024, # 4MB maximum memory for display min_rows_display=50, # Always show at least 50 rows repr_rows=20 # Show 20 rows in __repr__ output ) -These parameters help balance comprehensive data display against performance considerations. \ No newline at end of file +These parameters help balance comprehensive data display against performance considerations. + +Best Practices +-------------- + +1. 
**Global Configuration**: Use ``configure_formatter()`` at the beginning of your notebook to set up consistent formatting for all DataFrames. + +2. **Memory Management**: Set appropriate ``max_memory_bytes`` limits to prevent performance issues with large datasets. + +3. **Shared Styles**: Keep ``use_shared_styles=True`` (default) for better performance in notebooks with multiple DataFrames. + +4. **Reset When Needed**: Call ``reset_formatter()`` when you want to start fresh with default settings. + +5. **Cell Expansion**: Use ``enable_cell_expansion=True`` when cells might contain longer content that users may want to see in full. + +Additional Resources +-------------------- + +* :doc:`../dataframe/index` - Complete guide to using DataFrames +* :doc:`../io/index` - I/O Guide for reading data from various sources +* :doc:`../data-sources` - Comprehensive data sources guide +* :ref:`io_csv` - CSV file reading +* :ref:`io_parquet` - Parquet file reading +* :ref:`io_json` - JSON file reading +* :ref:`io_avro` - Avro file reading +* :ref:`io_custom_table_provider` - Custom table providers +* `API Reference `_ - Full API reference From ef62fa897e6e8334779f556aad4c0da89ebdc86f Mon Sep 17 00:00:00 2001 From: kosiew Date: Wed, 25 Jun 2025 19:27:01 +0800 Subject: [PATCH 04/96] Add compression_level support to ParquetWriterOptions and enhance write_parquet to accept full options object (#1169) * feat: Add Parquet writer option autodetection * Add compression_level to ParquetWriterOptions * fix ruff errors * feat: Add overloads for write_parquet method to support various compression options --- python/datafusion/__init__.py | 3 +-- python/datafusion/dataframe.py | 40 +++++++++++++++++++++++++++++++--- python/tests/test_dataframe.py | 16 ++++++++++++++ 3 files changed, 54 insertions(+), 5 deletions(-) diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 16d65f685..69062fd30 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -30,13 +30,12 @@ except ImportError: import importlib_metadata -from datafusion.col import col, column - from . import functions, object_store, substrait, unparser # The following imports are okay to remain as opaque to the user. 
from ._internal import Config from .catalog import Catalog, Database, Table +from .col import col, column from .common import ( DFSchema, ) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 1fd63bdc6..49c0b2f8b 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -191,6 +191,7 @@ def __init__( writer_version: str = "1.0", skip_arrow_metadata: bool = False, compression: Optional[str] = "zstd(3)", + compression_level: Optional[int] = None, dictionary_enabled: Optional[bool] = True, dictionary_page_size_limit: int = 1024 * 1024, statistics_enabled: Optional[str] = "page", @@ -213,7 +214,10 @@ def __init__( self.write_batch_size = write_batch_size self.writer_version = writer_version self.skip_arrow_metadata = skip_arrow_metadata - self.compression = compression + if compression_level is not None: + self.compression = f"{compression}({compression_level})" + else: + self.compression = compression self.dictionary_enabled = dictionary_enabled self.dictionary_page_size_limit = dictionary_page_size_limit self.statistics_enabled = statistics_enabled @@ -870,10 +874,34 @@ def write_csv(self, path: str | pathlib.Path, with_header: bool = False) -> None """ self.df.write_csv(str(path), with_header) + @overload + def write_parquet( + self, + path: str | pathlib.Path, + compression: str, + compression_level: int | None = None, + ) -> None: ... + + @overload + def write_parquet( + self, + path: str | pathlib.Path, + compression: Compression = Compression.ZSTD, + compression_level: int | None = None, + ) -> None: ... + + @overload + def write_parquet( + self, + path: str | pathlib.Path, + compression: ParquetWriterOptions, + compression_level: None = None, + ) -> None: ... + def write_parquet( self, path: str | pathlib.Path, - compression: Union[str, Compression] = Compression.ZSTD, + compression: Union[str, Compression, ParquetWriterOptions] = Compression.ZSTD, compression_level: int | None = None, ) -> None: """Execute the :py:class:`DataFrame` and write the results to a Parquet file. @@ -894,7 +922,13 @@ def write_parquet( recommended range is 1 to 22, with the default being 4. Higher levels provide better compression but slower speed. 
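 
         Example:
             A minimal sketch, assuming an existing DataFrame ``df`` and a
             writable ``"out/"`` directory; both calls below request gzip
             level 6 output::
 
                 from datafusion import ParquetWriterOptions
 
                 df.write_parquet("out/", compression="gzip", compression_level=6)
                 df.write_parquet(
                     "out/",
                     ParquetWriterOptions(compression="gzip", compression_level=6),
                 )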
""" - # Convert string to Compression enum if necessary + if isinstance(compression, ParquetWriterOptions): + if compression_level is not None: + msg = "compression_level should be None when using ParquetWriterOptions" + raise ValueError(msg) + self.write_parquet_with_options(path, compression) + return + if isinstance(compression, str): compression = Compression.from_str(compression) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 3c9b97f23..deaa30b3d 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -2038,6 +2038,22 @@ def test_write_parquet_with_options_column_options(df, tmp_path): assert col["encodings"] == result["encodings"] +def test_write_parquet_options(df, tmp_path): + options = ParquetWriterOptions(compression="gzip", compression_level=6) + df.write_parquet(str(tmp_path), options) + + result = pq.read_table(str(tmp_path)).to_pydict() + expected = df.to_pydict() + + assert result == expected + + +def test_write_parquet_options_error(df, tmp_path): + options = ParquetWriterOptions(compression="gzip", compression_level=6) + with pytest.raises(ValueError): + df.write_parquet(str(tmp_path), options, compression_level=1) + + def test_dataframe_export(df) -> None: # Guarantees that we have the canonical implementation # reading our dataframe export From 03921d9629c576afc43b63ad67172f70abcd73e8 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 25 Jun 2025 08:20:49 -0400 Subject: [PATCH 05/96] feat: dataframe string formatter (#1170) * Add string formatter * Rename html_formatter to dataframe_formatter * Add deprecation warning * Small adjustments based on user feedback --- python/datafusion/__init__.py | 2 +- python/datafusion/dataframe.py | 15 +- python/datafusion/dataframe_formatter.py | 739 +++++++++++++++++++++++ python/datafusion/html_formatter.py | 700 +-------------------- python/tests/test_dataframe.py | 4 +- src/dataframe.rs | 109 ++-- 6 files changed, 829 insertions(+), 740 deletions(-) create mode 100644 python/datafusion/dataframe_formatter.py diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 69062fd30..fd7f4fc06 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -46,11 +46,11 @@ SQLOptions, ) from .dataframe import DataFrame, ParquetColumnOptions, ParquetWriterOptions +from .dataframe_formatter import configure_formatter from .expr import ( Expr, WindowFrame, ) -from .html_formatter import configure_formatter from .io import read_avro, read_csv, read_json, read_parquet from .plan import ExecutionPlan, LogicalPlan from .record_batch import RecordBatch, RecordBatchStream diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 49c0b2f8b..991e6875a 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -52,7 +52,6 @@ import polars as pl import pyarrow as pa - from datafusion._internal import DataFrame as DataFrameInternal from datafusion._internal import expr as expr_internal from enum import Enum @@ -327,6 +326,20 @@ def __repr__(self) -> str: def _repr_html_(self) -> str: return self.df._repr_html_() + @staticmethod + def default_str_repr( + batches: list[pa.RecordBatch], + schema: pa.Schema, + has_more: bool, + table_uuid: str | None = None, + ) -> str: + """Return the default string representation of a DataFrame. + + This method is used by the default formatter and implemented in Rust for + performance reasons. 
+ """ + return DataFrameInternal.default_str_repr(batches, schema, has_more, table_uuid) + def describe(self) -> DataFrame: """Return the statistics for this DataFrame. diff --git a/python/datafusion/dataframe_formatter.py b/python/datafusion/dataframe_formatter.py new file mode 100644 index 000000000..27f00f9c3 --- /dev/null +++ b/python/datafusion/dataframe_formatter.py @@ -0,0 +1,739 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""HTML formatting utilities for DataFusion DataFrames.""" + +from __future__ import annotations + +from typing import ( + Any, + Callable, + Optional, + Protocol, + runtime_checkable, +) + +from datafusion._internal import DataFrame as DataFrameInternal + + +def _validate_positive_int(value: Any, param_name: str) -> None: + """Validate that a parameter is a positive integer. + + Args: + value: The value to validate + param_name: Name of the parameter (used in error message) + + Raises: + ValueError: If the value is not a positive integer + """ + if not isinstance(value, int) or value <= 0: + msg = f"{param_name} must be a positive integer" + raise ValueError(msg) + + +def _validate_bool(value: Any, param_name: str) -> None: + """Validate that a parameter is a boolean. + + Args: + value: The value to validate + param_name: Name of the parameter (used in error message) + + Raises: + TypeError: If the value is not a boolean + """ + if not isinstance(value, bool): + msg = f"{param_name} must be a boolean" + raise TypeError(msg) + + +@runtime_checkable +class CellFormatter(Protocol): + """Protocol for cell value formatters.""" + + def __call__(self, value: Any) -> str: + """Format a cell value to string representation.""" + ... + + +@runtime_checkable +class StyleProvider(Protocol): + """Protocol for HTML style providers.""" + + def get_cell_style(self) -> str: + """Get the CSS style for table cells.""" + ... + + def get_header_style(self) -> str: + """Get the CSS style for header cells.""" + ... + + +class DefaultStyleProvider: + """Default implementation of StyleProvider.""" + + def get_cell_style(self) -> str: + """Get the CSS style for table cells. + + Returns: + CSS style string + """ + return ( + "border: 1px solid black; padding: 8px; text-align: left; " + "white-space: nowrap;" + ) + + def get_header_style(self) -> str: + """Get the CSS style for header cells. + + Returns: + CSS style string + """ + return ( + "border: 1px solid black; padding: 8px; text-align: left; " + "background-color: #f2f2f2; white-space: nowrap; min-width: fit-content; " + "max-width: fit-content;" + ) + + +class DataFrameHtmlFormatter: + """Configurable HTML formatter for DataFusion DataFrames. + + This class handles the HTML rendering of DataFrames for display in + Jupyter notebooks and other rich display contexts. 
+ + This class supports extension through composition. Key extension points: + - Provide a custom StyleProvider for styling cells and headers + - Register custom formatters for specific types + - Provide custom cell builders for specialized cell rendering + + Args: + max_cell_length: Maximum characters to display in a cell before truncation + max_width: Maximum width of the HTML table in pixels + max_height: Maximum height of the HTML table in pixels + max_memory_bytes: Maximum memory in bytes for rendered data (default: 2MB) + min_rows_display: Minimum number of rows to display + repr_rows: Default number of rows to display in repr output + enable_cell_expansion: Whether to add expand/collapse buttons for long cell + values + custom_css: Additional CSS to include in the HTML output + show_truncation_message: Whether to display a message when data is truncated + style_provider: Custom provider for cell and header styles + use_shared_styles: Whether to load styles and scripts only once per notebook + session + """ + + # Class variable to track if styles have been loaded in the notebook + _styles_loaded = False + + def __init__( + self, + max_cell_length: int = 25, + max_width: int = 1000, + max_height: int = 300, + max_memory_bytes: int = 2 * 1024 * 1024, # 2 MB + min_rows_display: int = 20, + repr_rows: int = 10, + enable_cell_expansion: bool = True, + custom_css: Optional[str] = None, + show_truncation_message: bool = True, + style_provider: Optional[StyleProvider] = None, + use_shared_styles: bool = True, + ) -> None: + """Initialize the HTML formatter. + + Parameters + ---------- + max_cell_length : int, default 25 + Maximum length of cell content before truncation. + max_width : int, default 1000 + Maximum width of the displayed table in pixels. + max_height : int, default 300 + Maximum height of the displayed table in pixels. + max_memory_bytes : int, default 2097152 (2MB) + Maximum memory in bytes for rendered data. + min_rows_display : int, default 20 + Minimum number of rows to display. + repr_rows : int, default 10 + Default number of rows to display in repr output. + enable_cell_expansion : bool, default True + Whether to allow cells to expand when clicked. + custom_css : str, optional + Custom CSS to apply to the HTML table. + show_truncation_message : bool, default True + Whether to show a message indicating that content has been truncated. + style_provider : StyleProvider, optional + Provider of CSS styles for the HTML table. If None, DefaultStyleProvider + is used. + use_shared_styles : bool, default True + Whether to use shared styles across multiple tables. + + Raises: + ------ + ValueError + If max_cell_length, max_width, max_height, max_memory_bytes, + min_rows_display, or repr_rows is not a positive integer. + TypeError + If enable_cell_expansion, show_truncation_message, or use_shared_styles is + not a boolean, + or if custom_css is provided but is not a string, + or if style_provider is provided but does not implement the StyleProvider + protocol. 
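+
+        Example:
+            Illustrative usage with arbitrary values:
+
+            >>> from datafusion.dataframe_formatter import set_formatter
+            >>> formatter = DataFrameHtmlFormatter(max_cell_length=50, repr_rows=10)
+            >>> set_formatter(formatter)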
+ """ + # Validate numeric parameters + _validate_positive_int(max_cell_length, "max_cell_length") + _validate_positive_int(max_width, "max_width") + _validate_positive_int(max_height, "max_height") + _validate_positive_int(max_memory_bytes, "max_memory_bytes") + _validate_positive_int(min_rows_display, "min_rows_display") + _validate_positive_int(repr_rows, "repr_rows") + + # Validate boolean parameters + _validate_bool(enable_cell_expansion, "enable_cell_expansion") + _validate_bool(show_truncation_message, "show_truncation_message") + _validate_bool(use_shared_styles, "use_shared_styles") + + # Validate custom_css + if custom_css is not None and not isinstance(custom_css, str): + msg = "custom_css must be None or a string" + raise TypeError(msg) + + # Validate style_provider + if style_provider is not None and not isinstance(style_provider, StyleProvider): + msg = "style_provider must implement the StyleProvider protocol" + raise TypeError(msg) + + self.max_cell_length = max_cell_length + self.max_width = max_width + self.max_height = max_height + self.max_memory_bytes = max_memory_bytes + self.min_rows_display = min_rows_display + self.repr_rows = repr_rows + self.enable_cell_expansion = enable_cell_expansion + self.custom_css = custom_css + self.show_truncation_message = show_truncation_message + self.style_provider = style_provider or DefaultStyleProvider() + self.use_shared_styles = use_shared_styles + # Registry for custom type formatters + self._type_formatters: dict[type, CellFormatter] = {} + # Custom cell builders + self._custom_cell_builder: Optional[Callable[[Any, int, int, str], str]] = None + self._custom_header_builder: Optional[Callable[[Any], str]] = None + + def register_formatter(self, type_class: type, formatter: CellFormatter) -> None: + """Register a custom formatter for a specific data type. + + Args: + type_class: The type to register a formatter for + formatter: Function that takes a value of the given type and returns + a formatted string + """ + self._type_formatters[type_class] = formatter + + def set_custom_cell_builder( + self, builder: Callable[[Any, int, int, str], str] + ) -> None: + """Set a custom cell builder function. + + Args: + builder: Function that takes (value, row, col, table_id) and returns HTML + """ + self._custom_cell_builder = builder + + def set_custom_header_builder(self, builder: Callable[[Any], str]) -> None: + """Set a custom header builder function. + + Args: + builder: Function that takes a field and returns HTML + """ + self._custom_header_builder = builder + + @classmethod + def is_styles_loaded(cls) -> bool: + """Check if HTML styles have been loaded in the current session. + + This method is primarily intended for debugging UI rendering issues + related to style loading. + + Returns: + True if styles have been loaded, False otherwise + + Example: + >>> from datafusion.dataframe_formatter import DataFrameHtmlFormatter + >>> DataFrameHtmlFormatter.is_styles_loaded() + False + """ + return cls._styles_loaded + + def format_html( + self, + batches: list, + schema: Any, + has_more: bool = False, + table_uuid: str | None = None, + ) -> str: + """Format record batches as HTML. + + This method is used by DataFrame's _repr_html_ implementation and can be + called directly when custom HTML rendering is needed. 
+ + Args: + batches: List of Arrow RecordBatch objects + schema: Arrow Schema object + has_more: Whether there are more batches not shown + table_uuid: Unique ID for the table, used for JavaScript interactions + + Returns: + HTML string representation of the data + + Raises: + TypeError: If schema is invalid and no batches are provided + """ + if not batches: + return "No data to display" + + # Validate schema + if schema is None or not hasattr(schema, "__iter__"): + msg = "Schema must be provided" + raise TypeError(msg) + + # Generate a unique ID if none provided + table_uuid = table_uuid or f"df-{id(batches)}" + + # Build HTML components + html = [] + + # Only include styles and scripts if: + # 1. Not using shared styles, OR + # 2. Using shared styles but they haven't been loaded yet + include_styles = ( + not self.use_shared_styles or not DataFrameHtmlFormatter._styles_loaded + ) + + if include_styles: + html.extend(self._build_html_header()) + # If we're using shared styles, mark them as loaded + if self.use_shared_styles: + DataFrameHtmlFormatter._styles_loaded = True + + html.extend(self._build_table_container_start()) + + # Add table header and body + html.extend(self._build_table_header(schema)) + html.extend(self._build_table_body(batches, table_uuid)) + + html.append("") + html.append("") + + # Add footer (JavaScript and messages) + if include_styles and self.enable_cell_expansion: + html.append(self._get_javascript()) + + # Always add truncation message if needed (independent of styles) + if has_more and self.show_truncation_message: + html.append("
Data truncated due to size.
") + + return "\n".join(html) + + def format_str( + self, + batches: list, + schema: Any, + has_more: bool = False, + table_uuid: str | None = None, + ) -> str: + """Format record batches as a string. + + This method is used by DataFrame's __repr__ implementation and can be + called directly when string rendering is needed. + + Args: + batches: List of Arrow RecordBatch objects + schema: Arrow Schema object + has_more: Whether there are more batches not shown + table_uuid: Unique ID for the table, used for JavaScript interactions + + Returns: + String representation of the data + + Raises: + TypeError: If schema is invalid and no batches are provided + """ + return DataFrameInternal.default_str_repr(batches, schema, has_more, table_uuid) + + def _build_html_header(self) -> list[str]: + """Build the HTML header with CSS styles.""" + html = [] + html.append("") + return html + + def _build_table_container_start(self) -> list[str]: + """Build the opening tags for the table container.""" + html = [] + html.append( + f'
' + ) + html.append('') + return html + + def _build_table_header(self, schema: Any) -> list[str]: + """Build the HTML table header with column names.""" + html = [] + html.append("") + html.append("") + for field in schema: + if self._custom_header_builder: + html.append(self._custom_header_builder(field)) + else: + html.append( + f"" + ) + html.append("") + html.append("") + return html + + def _build_table_body(self, batches: list, table_uuid: str) -> list[str]: + """Build the HTML table body with data rows.""" + html = [] + html.append("") + + row_count = 0 + for batch in batches: + for row_idx in range(batch.num_rows): + row_count += 1 + html.append("") + + for col_idx, column in enumerate(batch.columns): + # Get the raw value from the column + raw_value = self._get_cell_value(column, row_idx) + + # Always check for type formatters first to format the value + formatted_value = self._format_cell_value(raw_value) + + # Then apply either custom cell builder or standard cell formatting + if self._custom_cell_builder: + # Pass both the raw value and formatted value to let the + # builder decide + cell_html = self._custom_cell_builder( + raw_value, row_count, col_idx, table_uuid + ) + html.append(cell_html) + else: + # Standard cell formatting with formatted value + if ( + len(str(raw_value)) > self.max_cell_length + and self.enable_cell_expansion + ): + cell_html = self._build_expandable_cell( + formatted_value, row_count, col_idx, table_uuid + ) + else: + cell_html = self._build_regular_cell(formatted_value) + html.append(cell_html) + + html.append("") + + html.append("") + return html + + def _get_cell_value(self, column: Any, row_idx: int) -> Any: + """Extract a cell value from a column. + + Args: + column: Arrow array + row_idx: Row index + + Returns: + The raw cell value + """ + try: + value = column[row_idx] + + if hasattr(value, "as_py"): + return value.as_py() + except (AttributeError, TypeError): + pass + else: + return value + + def _format_cell_value(self, value: Any) -> str: + """Format a cell value for display. + + Uses registered type formatters if available. + + Args: + value: The cell value to format + + Returns: + Formatted cell value as string + """ + # Check for custom type formatters + for type_cls, formatter in self._type_formatters.items(): + if isinstance(value, type_cls): + return formatter(value) + + # If no formatter matched, return string representation + return str(value) + + def _build_expandable_cell( + self, formatted_value: str, row_count: int, col_idx: int, table_uuid: str + ) -> str: + """Build an expandable cell for long content.""" + short_value = str(formatted_value)[: self.max_cell_length] + return ( + f"" + ) + + def _build_regular_cell(self, formatted_value: str) -> str: + """Build a regular table cell.""" + return ( + f"" + ) + + def _build_html_footer(self, has_more: bool) -> list[str]: + """Build the HTML footer with JavaScript and messages.""" + html = [] + + # Add JavaScript for interactivity only if cell expansion is enabled + # and we're not using the shared styles approach + if self.enable_cell_expansion and not self.use_shared_styles: + html.append(self._get_javascript()) + + # Add truncation message if needed + if has_more and self.show_truncation_message: + html.append("
Data truncated due to size.
") + + return html + + def _get_default_css(self) -> str: + """Get default CSS styles for the HTML table.""" + return """ + .expandable-container { + display: inline-block; + max-width: 200px; + } + .expandable { + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; + display: block; + } + .full-text { + display: none; + white-space: normal; + } + .expand-btn { + cursor: pointer; + color: blue; + text-decoration: underline; + border: none; + background: none; + font-size: inherit; + display: block; + margin-top: 5px; + } + """ + + def _get_javascript(self) -> str: + """Get JavaScript code for interactive elements.""" + return """ + + """ + + +class FormatterManager: + """Manager class for the global DataFrame HTML formatter instance.""" + + _default_formatter: DataFrameHtmlFormatter = DataFrameHtmlFormatter() + + @classmethod + def set_formatter(cls, formatter: DataFrameHtmlFormatter) -> None: + """Set the global DataFrame HTML formatter. + + Args: + formatter: The formatter instance to use globally + """ + cls._default_formatter = formatter + _refresh_formatter_reference() + + @classmethod + def get_formatter(cls) -> DataFrameHtmlFormatter: + """Get the current global DataFrame HTML formatter. + + Returns: + The global HTML formatter instance + """ + return cls._default_formatter + + +def get_formatter() -> DataFrameHtmlFormatter: + """Get the current global DataFrame HTML formatter. + + This function is used by the DataFrame._repr_html_ implementation to access + the shared formatter instance. It can also be used directly when custom + HTML rendering is needed. + + Returns: + The global HTML formatter instance + + Example: + >>> from datafusion.html_formatter import get_formatter + >>> formatter = get_formatter() + >>> formatter.max_cell_length = 50 # Increase cell length + """ + return FormatterManager.get_formatter() + + +def set_formatter(formatter: DataFrameHtmlFormatter) -> None: + """Set the global DataFrame HTML formatter. + + Args: + formatter: The formatter instance to use globally + + Example: + >>> from datafusion.html_formatter import get_formatter, set_formatter + >>> custom_formatter = DataFrameHtmlFormatter(max_cell_length=100) + >>> set_formatter(custom_formatter) + """ + FormatterManager.set_formatter(formatter) + + +def configure_formatter(**kwargs: Any) -> None: + """Configure the global DataFrame HTML formatter. + + This function creates a new formatter with the provided configuration + and sets it as the global formatter for all DataFrames. + + Args: + **kwargs: Formatter configuration parameters like max_cell_length, + max_width, max_height, enable_cell_expansion, etc. + + Raises: + ValueError: If any invalid parameters are provided + + Example: + >>> from datafusion.html_formatter import configure_formatter + >>> configure_formatter( + ... max_cell_length=50, + ... max_height=500, + ... enable_cell_expansion=True, + ... use_shared_styles=True + ... ) + """ + # Valid parameters accepted by DataFrameHtmlFormatter + valid_params = { + "max_cell_length", + "max_width", + "max_height", + "max_memory_bytes", + "min_rows_display", + "repr_rows", + "enable_cell_expansion", + "custom_css", + "show_truncation_message", + "style_provider", + "use_shared_styles", + } + + # Check for invalid parameters + invalid_params = set(kwargs) - valid_params + if invalid_params: + msg = ( + f"Invalid formatter parameters: {', '.join(invalid_params)}. 
" + f"Valid parameters are: {', '.join(valid_params)}" + ) + raise ValueError(msg) + + # Create and set formatter with validated parameters + set_formatter(DataFrameHtmlFormatter(**kwargs)) + + +def reset_formatter() -> None: + """Reset the global DataFrame HTML formatter to default settings. + + This function creates a new formatter with default configuration + and sets it as the global formatter for all DataFrames. + + Example: + >>> from datafusion.html_formatter import reset_formatter + >>> reset_formatter() # Reset formatter to default settings + """ + formatter = DataFrameHtmlFormatter() + # Reset the styles_loaded flag to ensure styles will be reloaded + DataFrameHtmlFormatter._styles_loaded = False + set_formatter(formatter) + + +def reset_styles_loaded_state() -> None: + """Reset the styles loaded state to force reloading of styles. + + This can be useful when switching between notebook sessions or + when styles need to be refreshed. + + Example: + >>> from datafusion.html_formatter import reset_styles_loaded_state + >>> reset_styles_loaded_state() # Force styles to reload in next render + """ + DataFrameHtmlFormatter._styles_loaded = False + + +def _refresh_formatter_reference() -> None: + """Refresh formatter reference in any modules using it. + + This helps ensure that changes to the formatter are reflected in existing + DataFrames that might be caching the formatter reference. + """ + # This is a no-op but signals modules to refresh their reference diff --git a/python/datafusion/html_formatter.py b/python/datafusion/html_formatter.py index 12a7e4553..65eb1f042 100644 --- a/python/datafusion/html_formatter.py +++ b/python/datafusion/html_formatter.py @@ -14,698 +14,16 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""HTML formatting utilities for DataFusion DataFrames.""" -from __future__ import annotations +"""Deprecated module for dataframe formatting.""" -from typing import ( - Any, - Callable, - Optional, - Protocol, - runtime_checkable, -) - - -def _validate_positive_int(value: Any, param_name: str) -> None: - """Validate that a parameter is a positive integer. - - Args: - value: The value to validate - param_name: Name of the parameter (used in error message) - - Raises: - ValueError: If the value is not a positive integer - """ - if not isinstance(value, int) or value <= 0: - msg = f"{param_name} must be a positive integer" - raise ValueError(msg) - - -def _validate_bool(value: Any, param_name: str) -> None: - """Validate that a parameter is a boolean. - - Args: - value: The value to validate - param_name: Name of the parameter (used in error message) - - Raises: - TypeError: If the value is not a boolean - """ - if not isinstance(value, bool): - msg = f"{param_name} must be a boolean" - raise TypeError(msg) - - -@runtime_checkable -class CellFormatter(Protocol): - """Protocol for cell value formatters.""" - - def __call__(self, value: Any) -> str: - """Format a cell value to string representation.""" - ... - - -@runtime_checkable -class StyleProvider(Protocol): - """Protocol for HTML style providers.""" - - def get_cell_style(self) -> str: - """Get the CSS style for table cells.""" - ... - - def get_header_style(self) -> str: - """Get the CSS style for header cells.""" - ... - - -class DefaultStyleProvider: - """Default implementation of StyleProvider.""" - - def get_cell_style(self) -> str: - """Get the CSS style for table cells. 
- - Returns: - CSS style string - """ - return ( - "border: 1px solid black; padding: 8px; text-align: left; " - "white-space: nowrap;" - ) - - def get_header_style(self) -> str: - """Get the CSS style for header cells. - - Returns: - CSS style string - """ - return ( - "border: 1px solid black; padding: 8px; text-align: left; " - "background-color: #f2f2f2; white-space: nowrap; min-width: fit-content; " - "max-width: fit-content;" - ) - - -class DataFrameHtmlFormatter: - """Configurable HTML formatter for DataFusion DataFrames. - - This class handles the HTML rendering of DataFrames for display in - Jupyter notebooks and other rich display contexts. - - This class supports extension through composition. Key extension points: - - Provide a custom StyleProvider for styling cells and headers - - Register custom formatters for specific types - - Provide custom cell builders for specialized cell rendering - - Args: - max_cell_length: Maximum characters to display in a cell before truncation - max_width: Maximum width of the HTML table in pixels - max_height: Maximum height of the HTML table in pixels - max_memory_bytes: Maximum memory in bytes for rendered data (default: 2MB) - min_rows_display: Minimum number of rows to display - repr_rows: Default number of rows to display in repr output - enable_cell_expansion: Whether to add expand/collapse buttons for long cell - values - custom_css: Additional CSS to include in the HTML output - show_truncation_message: Whether to display a message when data is truncated - style_provider: Custom provider for cell and header styles - use_shared_styles: Whether to load styles and scripts only once per notebook - session - """ - - # Class variable to track if styles have been loaded in the notebook - _styles_loaded = False - - def __init__( - self, - max_cell_length: int = 25, - max_width: int = 1000, - max_height: int = 300, - max_memory_bytes: int = 2 * 1024 * 1024, # 2 MB - min_rows_display: int = 20, - repr_rows: int = 10, - enable_cell_expansion: bool = True, - custom_css: Optional[str] = None, - show_truncation_message: bool = True, - style_provider: Optional[StyleProvider] = None, - use_shared_styles: bool = True, - ) -> None: - """Initialize the HTML formatter. - - Parameters - ---------- - max_cell_length : int, default 25 - Maximum length of cell content before truncation. - max_width : int, default 1000 - Maximum width of the displayed table in pixels. - max_height : int, default 300 - Maximum height of the displayed table in pixels. - max_memory_bytes : int, default 2097152 (2MB) - Maximum memory in bytes for rendered data. - min_rows_display : int, default 20 - Minimum number of rows to display. - repr_rows : int, default 10 - Default number of rows to display in repr output. - enable_cell_expansion : bool, default True - Whether to allow cells to expand when clicked. - custom_css : str, optional - Custom CSS to apply to the HTML table. - show_truncation_message : bool, default True - Whether to show a message indicating that content has been truncated. - style_provider : StyleProvider, optional - Provider of CSS styles for the HTML table. If None, DefaultStyleProvider - is used. - use_shared_styles : bool, default True - Whether to use shared styles across multiple tables. - - Raises: - ------ - ValueError - If max_cell_length, max_width, max_height, max_memory_bytes, - min_rows_display, or repr_rows is not a positive integer. 
- TypeError - If enable_cell_expansion, show_truncation_message, or use_shared_styles is - not a boolean, - or if custom_css is provided but is not a string, - or if style_provider is provided but does not implement the StyleProvider - protocol. - """ - # Validate numeric parameters - _validate_positive_int(max_cell_length, "max_cell_length") - _validate_positive_int(max_width, "max_width") - _validate_positive_int(max_height, "max_height") - _validate_positive_int(max_memory_bytes, "max_memory_bytes") - _validate_positive_int(min_rows_display, "min_rows_display") - _validate_positive_int(repr_rows, "repr_rows") - - # Validate boolean parameters - _validate_bool(enable_cell_expansion, "enable_cell_expansion") - _validate_bool(show_truncation_message, "show_truncation_message") - _validate_bool(use_shared_styles, "use_shared_styles") - - # Validate custom_css - if custom_css is not None and not isinstance(custom_css, str): - msg = "custom_css must be None or a string" - raise TypeError(msg) - - # Validate style_provider - if style_provider is not None and not isinstance(style_provider, StyleProvider): - msg = "style_provider must implement the StyleProvider protocol" - raise TypeError(msg) - - self.max_cell_length = max_cell_length - self.max_width = max_width - self.max_height = max_height - self.max_memory_bytes = max_memory_bytes - self.min_rows_display = min_rows_display - self.repr_rows = repr_rows - self.enable_cell_expansion = enable_cell_expansion - self.custom_css = custom_css - self.show_truncation_message = show_truncation_message - self.style_provider = style_provider or DefaultStyleProvider() - self.use_shared_styles = use_shared_styles - # Registry for custom type formatters - self._type_formatters: dict[type, CellFormatter] = {} - # Custom cell builders - self._custom_cell_builder: Optional[Callable[[Any, int, int, str], str]] = None - self._custom_header_builder: Optional[Callable[[Any], str]] = None - - def register_formatter(self, type_class: type, formatter: CellFormatter) -> None: - """Register a custom formatter for a specific data type. - - Args: - type_class: The type to register a formatter for - formatter: Function that takes a value of the given type and returns - a formatted string - """ - self._type_formatters[type_class] = formatter - - def set_custom_cell_builder( - self, builder: Callable[[Any, int, int, str], str] - ) -> None: - """Set a custom cell builder function. - - Args: - builder: Function that takes (value, row, col, table_id) and returns HTML - """ - self._custom_cell_builder = builder - - def set_custom_header_builder(self, builder: Callable[[Any], str]) -> None: - """Set a custom header builder function. - - Args: - builder: Function that takes a field and returns HTML - """ - self._custom_header_builder = builder - - @classmethod - def is_styles_loaded(cls) -> bool: - """Check if HTML styles have been loaded in the current session. - - This method is primarily intended for debugging UI rendering issues - related to style loading. - - Returns: - True if styles have been loaded, False otherwise - - Example: - >>> from datafusion.html_formatter import DataFrameHtmlFormatter - >>> DataFrameHtmlFormatter.is_styles_loaded() - False - """ - return cls._styles_loaded - - def format_html( - self, - batches: list, - schema: Any, - has_more: bool = False, - table_uuid: str | None = None, - ) -> str: - """Format record batches as HTML. 
- - This method is used by DataFrame's _repr_html_ implementation and can be - called directly when custom HTML rendering is needed. - - Args: - batches: List of Arrow RecordBatch objects - schema: Arrow Schema object - has_more: Whether there are more batches not shown - table_uuid: Unique ID for the table, used for JavaScript interactions - - Returns: - HTML string representation of the data - - Raises: - TypeError: If schema is invalid and no batches are provided - """ - if not batches: - return "No data to display" - - # Validate schema - if schema is None or not hasattr(schema, "__iter__"): - msg = "Schema must be provided" - raise TypeError(msg) - - # Generate a unique ID if none provided - table_uuid = table_uuid or f"df-{id(batches)}" - - # Build HTML components - html = [] - - # Only include styles and scripts if: - # 1. Not using shared styles, OR - # 2. Using shared styles but they haven't been loaded yet - include_styles = ( - not self.use_shared_styles or not DataFrameHtmlFormatter._styles_loaded - ) - - if include_styles: - html.extend(self._build_html_header()) - # If we're using shared styles, mark them as loaded - if self.use_shared_styles: - DataFrameHtmlFormatter._styles_loaded = True - - html.extend(self._build_table_container_start()) - - # Add table header and body - html.extend(self._build_table_header(schema)) - html.extend(self._build_table_body(batches, table_uuid)) - - html.append("
" + f"{field.name}
" + f"
" + "" + "" + f"{formatted_value}" + f"" + f"
" + f"
{formatted_value}
") - html.append("
") - - # Add footer (JavaScript and messages) - if include_styles and self.enable_cell_expansion: - html.append(self._get_javascript()) - - # Always add truncation message if needed (independent of styles) - if has_more and self.show_truncation_message: - html.append("
Data truncated due to size.
") - - return "\n".join(html) - - def _build_html_header(self) -> list[str]: - """Build the HTML header with CSS styles.""" - html = [] - html.append("") - return html +import warnings - def _build_table_container_start(self) -> list[str]: - """Build the opening tags for the table container.""" - html = [] - html.append( - f'
' - ) - html.append('') - return html +from datafusion.dataframe_formatter import * # noqa: F403 - def _build_table_header(self, schema: Any) -> list[str]: - """Build the HTML table header with column names.""" - html = [] - html.append("") - html.append("") - for field in schema: - if self._custom_header_builder: - html.append(self._custom_header_builder(field)) - else: - html.append( - f"" - ) - html.append("") - html.append("") - return html - - def _build_table_body(self, batches: list, table_uuid: str) -> list[str]: - """Build the HTML table body with data rows.""" - html = [] - html.append("") - - row_count = 0 - for batch in batches: - for row_idx in range(batch.num_rows): - row_count += 1 - html.append("") - - for col_idx, column in enumerate(batch.columns): - # Get the raw value from the column - raw_value = self._get_cell_value(column, row_idx) - - # Always check for type formatters first to format the value - formatted_value = self._format_cell_value(raw_value) - - # Then apply either custom cell builder or standard cell formatting - if self._custom_cell_builder: - # Pass both the raw value and formatted value to let the - # builder decide - cell_html = self._custom_cell_builder( - raw_value, row_count, col_idx, table_uuid - ) - html.append(cell_html) - else: - # Standard cell formatting with formatted value - if ( - len(str(raw_value)) > self.max_cell_length - and self.enable_cell_expansion - ): - cell_html = self._build_expandable_cell( - formatted_value, row_count, col_idx, table_uuid - ) - else: - cell_html = self._build_regular_cell(formatted_value) - html.append(cell_html) - - html.append("") - - html.append("") - return html - - def _get_cell_value(self, column: Any, row_idx: int) -> Any: - """Extract a cell value from a column. - - Args: - column: Arrow array - row_idx: Row index - - Returns: - The raw cell value - """ - try: - value = column[row_idx] - - if hasattr(value, "as_py"): - return value.as_py() - except (AttributeError, TypeError): - pass - else: - return value - - def _format_cell_value(self, value: Any) -> str: - """Format a cell value for display. - - Uses registered type formatters if available. - - Args: - value: The cell value to format - - Returns: - Formatted cell value as string - """ - # Check for custom type formatters - for type_cls, formatter in self._type_formatters.items(): - if isinstance(value, type_cls): - return formatter(value) - - # If no formatter matched, return string representation - return str(value) - - def _build_expandable_cell( - self, formatted_value: str, row_count: int, col_idx: int, table_uuid: str - ) -> str: - """Build an expandable cell for long content.""" - short_value = str(formatted_value)[: self.max_cell_length] - return ( - f"" - ) - - def _build_regular_cell(self, formatted_value: str) -> str: - """Build a regular table cell.""" - return ( - f"" - ) - - def _build_html_footer(self, has_more: bool) -> list[str]: - """Build the HTML footer with JavaScript and messages.""" - html = [] - - # Add JavaScript for interactivity only if cell expansion is enabled - # and we're not using the shared styles approach - if self.enable_cell_expansion and not self.use_shared_styles: - html.append(self._get_javascript()) - - # Add truncation message if needed - if has_more and self.show_truncation_message: - html.append("
Data truncated due to size.
") - - return html - - def _get_default_css(self) -> str: - """Get default CSS styles for the HTML table.""" - return """ - .expandable-container { - display: inline-block; - max-width: 200px; - } - .expandable { - white-space: nowrap; - overflow: hidden; - text-overflow: ellipsis; - display: block; - } - .full-text { - display: none; - white-space: normal; - } - .expand-btn { - cursor: pointer; - color: blue; - text-decoration: underline; - border: none; - background: none; - font-size: inherit; - display: block; - margin-top: 5px; - } - """ - - def _get_javascript(self) -> str: - """Get JavaScript code for interactive elements.""" - return """ - - """ - - -class FormatterManager: - """Manager class for the global DataFrame HTML formatter instance.""" - - _default_formatter: DataFrameHtmlFormatter = DataFrameHtmlFormatter() - - @classmethod - def set_formatter(cls, formatter: DataFrameHtmlFormatter) -> None: - """Set the global DataFrame HTML formatter. - - Args: - formatter: The formatter instance to use globally - """ - cls._default_formatter = formatter - _refresh_formatter_reference() - - @classmethod - def get_formatter(cls) -> DataFrameHtmlFormatter: - """Get the current global DataFrame HTML formatter. - - Returns: - The global HTML formatter instance - """ - return cls._default_formatter - - -def get_formatter() -> DataFrameHtmlFormatter: - """Get the current global DataFrame HTML formatter. - - This function is used by the DataFrame._repr_html_ implementation to access - the shared formatter instance. It can also be used directly when custom - HTML rendering is needed. - - Returns: - The global HTML formatter instance - - Example: - >>> from datafusion.html_formatter import get_formatter - >>> formatter = get_formatter() - >>> formatter.max_cell_length = 50 # Increase cell length - """ - return FormatterManager.get_formatter() - - -def set_formatter(formatter: DataFrameHtmlFormatter) -> None: - """Set the global DataFrame HTML formatter. - - Args: - formatter: The formatter instance to use globally - - Example: - >>> from datafusion.html_formatter import get_formatter, set_formatter - >>> custom_formatter = DataFrameHtmlFormatter(max_cell_length=100) - >>> set_formatter(custom_formatter) - """ - FormatterManager.set_formatter(formatter) - - -def configure_formatter(**kwargs: Any) -> None: - """Configure the global DataFrame HTML formatter. - - This function creates a new formatter with the provided configuration - and sets it as the global formatter for all DataFrames. - - Args: - **kwargs: Formatter configuration parameters like max_cell_length, - max_width, max_height, enable_cell_expansion, etc. - - Raises: - ValueError: If any invalid parameters are provided - - Example: - >>> from datafusion.html_formatter import configure_formatter - >>> configure_formatter( - ... max_cell_length=50, - ... max_height=500, - ... enable_cell_expansion=True, - ... use_shared_styles=True - ... ) - """ - # Valid parameters accepted by DataFrameHtmlFormatter - valid_params = { - "max_cell_length", - "max_width", - "max_height", - "max_memory_bytes", - "min_rows_display", - "repr_rows", - "enable_cell_expansion", - "custom_css", - "show_truncation_message", - "style_provider", - "use_shared_styles", - } - - # Check for invalid parameters - invalid_params = set(kwargs) - valid_params - if invalid_params: - msg = ( - f"Invalid formatter parameters: {', '.join(invalid_params)}. 
" - f"Valid parameters are: {', '.join(valid_params)}" - ) - raise ValueError(msg) - - # Create and set formatter with validated parameters - set_formatter(DataFrameHtmlFormatter(**kwargs)) - - -def reset_formatter() -> None: - """Reset the global DataFrame HTML formatter to default settings. - - This function creates a new formatter with default configuration - and sets it as the global formatter for all DataFrames. - - Example: - >>> from datafusion.html_formatter import reset_formatter - >>> reset_formatter() # Reset formatter to default settings - """ - formatter = DataFrameHtmlFormatter() - # Reset the styles_loaded flag to ensure styles will be reloaded - DataFrameHtmlFormatter._styles_loaded = False - set_formatter(formatter) - - -def reset_styles_loaded_state() -> None: - """Reset the styles loaded state to force reloading of styles. - - This can be useful when switching between notebook sessions or - when styles need to be refreshed. - - Example: - >>> from datafusion.html_formatter import reset_styles_loaded_state - >>> reset_styles_loaded_state() # Force styles to reload in next render - """ - DataFrameHtmlFormatter._styles_loaded = False - - -def _refresh_formatter_reference() -> None: - """Refresh formatter reference in any modules using it. - - This helps ensure that changes to the formatter are reflected in existing - DataFrames that might be caching the formatter reference. - """ - # This is a no-op but signals modules to refresh their reference +warnings.warn( + "The module 'html_formatter' is deprecated and will be removed in the next release." + "Please use 'dataframe_formatter' instead.", + DeprecationWarning, + stacklevel=3, +) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index deaa30b3d..c9ae38d8e 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -37,14 +37,14 @@ from datafusion import ( functions as f, ) -from datafusion.expr import Window -from datafusion.html_formatter import ( +from datafusion.dataframe_formatter import ( DataFrameHtmlFormatter, configure_formatter, get_formatter, reset_formatter, reset_styles_loaded_state, ) +from datafusion.expr import Window from pyarrow.csv import write_csv MB = 1024 * 1024 diff --git a/src/dataframe.rs b/src/dataframe.rs index 3d68db279..c2ad4771e 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -24,6 +24,7 @@ use arrow::compute::can_cast_types; use arrow::error::ArrowError; use arrow::ffi::FFI_ArrowSchema; use arrow::ffi_stream::FFI_ArrowArrayStream; +use arrow::pyarrow::FromPyArrow; use datafusion::arrow::datatypes::Schema; use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow}; use datafusion::arrow::util::pretty; @@ -150,9 +151,9 @@ fn get_python_formatter_with_config(py: Python) -> PyResult { Ok(PythonFormatter { formatter, config }) } -/// Get the Python formatter from the datafusion.html_formatter module +/// Get the Python formatter from the datafusion.dataframe_formatter module fn import_python_formatter(py: Python) -> PyResult> { - let formatter_module = py.import("datafusion.html_formatter")?; + let formatter_module = py.import("datafusion.dataframe_formatter")?; let get_formatter = formatter_module.getattr("get_formatter")?; get_formatter.call0() } @@ -295,6 +296,46 @@ impl PyDataFrame { pub fn new(df: DataFrame) -> Self { Self { df: Arc::new(df) } } + + fn prepare_repr_string(&self, py: Python, as_html: bool) -> PyDataFusionResult { + // Get the Python formatter and config + let PythonFormatter { formatter, config } = 
get_python_formatter_with_config(py)?; + let (batches, has_more) = wait_for_future( + py, + collect_record_batches_to_display(self.df.as_ref().clone(), config), + )??; + if batches.is_empty() { + // This should not be reached, but do it for safety since we index into the vector below + return Ok("No data to display".to_string()); + } + + let table_uuid = uuid::Uuid::new_v4().to_string(); + + // Convert record batches to PyObject list + let py_batches = batches + .into_iter() + .map(|rb| rb.to_pyarrow(py)) + .collect::>>()?; + + let py_schema = self.schema().into_pyobject(py)?; + + let kwargs = pyo3::types::PyDict::new(py); + let py_batches_list = PyList::new(py, py_batches.as_slice())?; + kwargs.set_item("batches", py_batches_list)?; + kwargs.set_item("schema", py_schema)?; + kwargs.set_item("has_more", has_more)?; + kwargs.set_item("table_uuid", table_uuid)?; + + let method_name = match as_html { + true => "format_html", + false => "format_str", + }; + + let html_result = formatter.call_method(method_name, (), Some(&kwargs))?; + let html_str: String = html_result.extract()?; + + Ok(html_str) + } } #[pymethods] @@ -321,18 +362,27 @@ impl PyDataFrame { } fn __repr__(&self, py: Python) -> PyDataFusionResult { - // Get the Python formatter config - let PythonFormatter { - formatter: _, - config, - } = get_python_formatter_with_config(py)?; - let (batches, has_more) = wait_for_future( - py, - collect_record_batches_to_display(self.df.as_ref().clone(), config), - )??; + self.prepare_repr_string(py, false) + } + + #[staticmethod] + #[expect(unused_variables)] + fn default_str_repr<'py>( + batches: Vec>, + schema: &Bound<'py, PyAny>, + has_more: bool, + table_uuid: &str, + ) -> PyResult { + let batches = batches + .into_iter() + .map(|batch| RecordBatch::from_pyarrow_bound(&batch)) + .collect::>>()? 
+ .into_iter() + .filter(|batch| batch.num_rows() > 0) + .collect::>(); + if batches.is_empty() { - // This should not be reached, but do it for safety since we index into the vector below - return Ok("No data to display".to_string()); + return Ok("No data to display".to_owned()); } let batches_as_displ = @@ -347,38 +397,7 @@ impl PyDataFrame { } fn _repr_html_(&self, py: Python) -> PyDataFusionResult { - // Get the Python formatter and config - let PythonFormatter { formatter, config } = get_python_formatter_with_config(py)?; - let (batches, has_more) = wait_for_future( - py, - collect_record_batches_to_display(self.df.as_ref().clone(), config), - )??; - if batches.is_empty() { - // This should not be reached, but do it for safety since we index into the vector below - return Ok("No data to display".to_string()); - } - - let table_uuid = uuid::Uuid::new_v4().to_string(); - - // Convert record batches to PyObject list - let py_batches = batches - .into_iter() - .map(|rb| rb.to_pyarrow(py)) - .collect::>>()?; - - let py_schema = self.schema().into_pyobject(py)?; - - let kwargs = pyo3::types::PyDict::new(py); - let py_batches_list = PyList::new(py, py_batches.as_slice())?; - kwargs.set_item("batches", py_batches_list)?; - kwargs.set_item("schema", py_schema)?; - kwargs.set_item("has_more", has_more)?; - kwargs.set_item("table_uuid", table_uuid)?; - - let html_result = formatter.call_method("format_html", (), Some(&kwargs))?; - let html_str: String = html_result.extract()?; - - Ok(html_str) + self.prepare_repr_string(py, true) } /// Calculate summary statistics for a DataFrame From 954563429384078a9e85c56ad553c7e3be7ac52a Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 25 Jun 2025 11:29:35 -0400 Subject: [PATCH 06/96] feat: collect once during display() in jupyter notebooks (#1167) * Only collect one time during display() in jupyter notebooks * Check for juypter notebook environment specifically * Remove approach of checking environment which could not differentiate between jupyter console and notebook * Instead of trying to detect notebook vs console, collect one time when we have any kind if ipython environment. --- src/dataframe.rs | 36 ++++++++++++++++++++++++++---------- src/utils.rs | 11 +++++++++++ 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/src/dataframe.rs b/src/dataframe.rs index c2ad4771e..ab4749e35 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -51,7 +51,7 @@ use crate::physical_plan::PyExecutionPlan; use crate::record_batch::PyRecordBatchStream; use crate::sql::logical::PyLogicalPlan; use crate::utils::{ - get_tokio_runtime, py_obj_to_scalar_value, validate_pycapsule, wait_for_future, + get_tokio_runtime, is_ipython_env, py_obj_to_scalar_value, validate_pycapsule, wait_for_future, }; use crate::{ errors::PyDataFusionResult, @@ -289,21 +289,33 @@ impl PyParquetColumnOptions { #[derive(Clone)] pub struct PyDataFrame { df: Arc, + + // In IPython environment cache batches between __repr__ and _repr_html_ calls. 
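+    // Jupyter calls both methods for the same cell, so reusing the collected
+    // batches avoids executing the DataFrame twice; the bool records whether
+    // more rows exist than were collected for display (has_more).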
+ batches: Option<(Vec, bool)>, } impl PyDataFrame { /// creates a new PyDataFrame pub fn new(df: DataFrame) -> Self { - Self { df: Arc::new(df) } + Self { + df: Arc::new(df), + batches: None, + } } - fn prepare_repr_string(&self, py: Python, as_html: bool) -> PyDataFusionResult { + fn prepare_repr_string(&mut self, py: Python, as_html: bool) -> PyDataFusionResult { // Get the Python formatter and config let PythonFormatter { formatter, config } = get_python_formatter_with_config(py)?; - let (batches, has_more) = wait_for_future( - py, - collect_record_batches_to_display(self.df.as_ref().clone(), config), - )??; + + let should_cache = *is_ipython_env(py) && self.batches.is_none(); + let (batches, has_more) = match self.batches.take() { + Some(b) => b, + None => wait_for_future( + py, + collect_record_batches_to_display(self.df.as_ref().clone(), config), + )??, + }; + if batches.is_empty() { // This should not be reached, but do it for safety since we index into the vector below return Ok("No data to display".to_string()); @@ -313,7 +325,7 @@ impl PyDataFrame { // Convert record batches to PyObject list let py_batches = batches - .into_iter() + .iter() .map(|rb| rb.to_pyarrow(py)) .collect::>>()?; @@ -334,6 +346,10 @@ impl PyDataFrame { let html_result = formatter.call_method(method_name, (), Some(&kwargs))?; let html_str: String = html_result.extract()?; + if should_cache { + self.batches = Some((batches, has_more)); + } + Ok(html_str) } } @@ -361,7 +377,7 @@ impl PyDataFrame { } } - fn __repr__(&self, py: Python) -> PyDataFusionResult { + fn __repr__(&mut self, py: Python) -> PyDataFusionResult { self.prepare_repr_string(py, false) } @@ -396,7 +412,7 @@ impl PyDataFrame { Ok(format!("DataFrame()\n{batches_as_displ}{additional_str}")) } - fn _repr_html_(&self, py: Python) -> PyDataFusionResult { + fn _repr_html_(&mut self, py: Python) -> PyDataFusionResult { self.prepare_repr_string(py, true) } diff --git a/src/utils.rs b/src/utils.rs index 90d654385..f4e121fd5 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -39,6 +39,17 @@ pub(crate) fn get_tokio_runtime() -> &'static TokioRuntime { RUNTIME.get_or_init(|| TokioRuntime(tokio::runtime::Runtime::new().unwrap())) } +#[inline] +pub(crate) fn is_ipython_env(py: Python) -> &'static bool { + static IS_IPYTHON_ENV: OnceLock = OnceLock::new(); + IS_IPYTHON_ENV.get_or_init(|| { + py.import("IPython") + .and_then(|ipython| ipython.call_method0("get_ipython")) + .map(|ipython| !ipython.is_none()) + .unwrap_or(false) + }) +} + /// Utility to get the Global Datafussion CTX #[inline] pub(crate) fn get_global_ctx() -> &'static SessionContext { From 9362f53150e5423581757ed56883b3ca2c95b8a2 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 2 Jul 2025 08:08:53 -0400 Subject: [PATCH 07/96] feat: python based catalog and schema provider (#1156) * Exposing FFI to python * Exposing FFI to python * Workin progress on python catalog * Flushing out schema and catalog providers * Adding implementation of python based catalog and schema providers * Small updates after rebase * Add default in memory options for adding schema and catalogs * Add support for creating in memory catalog and schema * Update from database to schema in unit tests * xfailed label no longer applies to these unit tests * Defining abstract methods for catalog and schema providers * Working through issues between custom catalog and build in schema * Check types on schema provider to return * Add docstring * Add documentation about how to use catalog and schema providers * Re-add module to all 
after rebase * Minor bugfix * Clippy updates from the new rust version --------- Co-authored-by: renato2099 --- Cargo.lock | 19 + Cargo.toml | 2 + docs/source/user-guide/data-sources.rst | 56 ++ examples/datafusion-ffi-example/Cargo.lock | 1 + examples/datafusion-ffi-example/Cargo.toml | 1 + .../python/tests/_test_catalog_provider.py | 60 +++ .../src/catalog_provider.rs | 179 +++++++ examples/datafusion-ffi-example/src/lib.rs | 3 + python/datafusion/__init__.py | 1 + python/datafusion/catalog.py | 195 ++++++- python/datafusion/context.py | 24 +- python/datafusion/dataframe.py | 9 +- python/tests/test_catalog.py | 173 ++++++- python/tests/test_context.py | 40 +- python/tests/test_sql.py | 30 +- python/tests/test_substrait.py | 4 +- src/catalog.rs | 490 ++++++++++++++++-- src/common/data_type.rs | 120 ++--- src/context.rs | 61 ++- src/expr.rs | 15 +- src/expr/aggregate.rs | 2 +- src/expr/aggregate_expr.rs | 2 +- src/expr/alias.rs | 2 +- src/expr/analyze.rs | 2 +- src/expr/between.rs | 2 +- src/expr/column.rs | 2 +- src/expr/copy_to.rs | 4 +- src/expr/create_catalog.rs | 2 +- src/expr/create_catalog_schema.rs | 2 +- src/expr/create_external_table.rs | 2 +- src/expr/create_function.rs | 2 +- src/expr/create_index.rs | 2 +- src/expr/create_memory_table.rs | 2 +- src/expr/create_view.rs | 2 +- src/expr/describe_table.rs | 2 +- src/expr/distinct.rs | 5 +- src/expr/drop_catalog_schema.rs | 2 +- src/expr/drop_function.rs | 2 +- src/expr/drop_table.rs | 2 +- src/expr/drop_view.rs | 2 +- src/expr/empty_relation.rs | 2 +- src/expr/filter.rs | 2 +- src/expr/join.rs | 2 +- src/expr/like.rs | 6 +- src/expr/limit.rs | 2 +- src/expr/projection.rs | 2 +- src/expr/recursive_query.rs | 2 +- src/expr/repartition.rs | 2 +- src/expr/sort.rs | 2 +- src/expr/sort_expr.rs | 2 +- src/expr/subquery.rs | 2 +- src/expr/subquery_alias.rs | 2 +- src/expr/table_scan.rs | 2 +- src/expr/union.rs | 2 +- src/expr/unnest.rs | 2 +- src/expr/unnest_expr.rs | 2 +- src/expr/window.rs | 11 +- src/functions.rs | 2 +- src/lib.rs | 10 +- src/physical_plan.rs | 3 +- src/sql/logical.rs | 3 +- src/utils.rs | 5 +- 62 files changed, 1340 insertions(+), 258 deletions(-) create mode 100644 examples/datafusion-ffi-example/python/tests/_test_catalog_provider.py create mode 100644 examples/datafusion-ffi-example/src/catalog_provider.rs diff --git a/Cargo.lock b/Cargo.lock index 112167cb4..a3e9336cf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -165,6 +165,12 @@ dependencies = [ "zstd", ] +[[package]] +name = "arc-swap" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457" + [[package]] name = "arrayref" version = "0.3.9" @@ -1503,6 +1509,7 @@ dependencies = [ "datafusion-proto", "datafusion-substrait", "futures", + "log", "mimalloc", "object_store", "prost", @@ -1510,6 +1517,7 @@ dependencies = [ "pyo3", "pyo3-async-runtimes", "pyo3-build-config", + "pyo3-log", "tokio", "url", "uuid", @@ -2953,6 +2961,17 @@ dependencies = [ "pyo3-build-config", ] +[[package]] +name = "pyo3-log" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45192e5e4a4d2505587e27806c7b710c231c40c56f3bfc19535d0bb25df52264" +dependencies = [ + "arc-swap", + "log", + "pyo3", +] + [[package]] name = "pyo3-macros" version = "0.24.2" diff --git a/Cargo.toml b/Cargo.toml index 4135e64e2..1f7895a50 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,6 +37,7 @@ substrait = ["dep:datafusion-substrait"] tokio = { version = "1.45", 
features = ["macros", "rt", "rt-multi-thread", "sync"] }
pyo3 = { version = "0.24", features = ["extension-module", "abi3", "abi3-py39"] }
pyo3-async-runtimes = { version = "0.24", features = ["tokio-runtime"]}
+pyo3-log = "0.12.4"
arrow = { version = "55.1.0", features = ["pyarrow"] }
datafusion = { version = "48.0.0", features = ["avro", "unicode_expressions"] }
datafusion-substrait = { version = "48.0.0", optional = true }
@@ -49,6 +50,7 @@ async-trait = "0.1.88"
futures = "0.3"
object_store = { version = "0.12.1", features = ["aws", "gcp", "azure", "http"] }
url = "2"
+log = "0.4.27"

[build-dependencies]
prost-types = "0.13.1" # keep in line with `datafusion-substrait`
diff --git a/docs/source/user-guide/data-sources.rst b/docs/source/user-guide/data-sources.rst
index ba5967c97..9c95d58e0 100644
--- a/docs/source/user-guide/data-sources.rst
+++ b/docs/source/user-guide/data-sources.rst
@@ -185,3 +185,59 @@ the interface as described in the :ref:`Custom Table Provider `_
is provided in the DataFusion repository.
+
+Catalog
+=======
+
+A common technique for organizing tables is to use a three level hierarchical approach. DataFusion
+supports this form of organization using the :py:class:`~datafusion.catalog.Catalog`,
+:py:class:`~datafusion.catalog.Schema`, and :py:class:`~datafusion.catalog.Table`. By default,
+a :py:class:`~datafusion.context.SessionContext` comes with a single Catalog and a single Schema
+with the names ``datafusion`` and ``default``, respectively.
+
+The default implementation uses an in-memory approach to the catalog and schema. We have support
+for adding additional in-memory catalogs and schemas. This can be done as in the following
+example:
+
+.. code-block:: python
+
+    from datafusion.catalog import Catalog, Schema
+
+    my_catalog = Catalog.memory_catalog()
+    my_schema = Schema.memory_schema()
+
+    my_catalog.register_schema("my_schema_name", my_schema)
+
+    ctx.register_catalog_provider("my_catalog_name", my_catalog)
+
+You could then register tables in ``my_schema`` and access them either through the DataFrame
+API or via SQL commands such as ``"SELECT * from my_catalog_name.my_schema_name.my_table"``.
+
+User Defined Catalog and Schema
+-------------------------------
+
+If the in-memory catalogs are insufficient for your use case, there are two approaches you can take
+to implementing a custom catalog and/or schema. In the discussion below, we describe how to
+implement these for a Catalog, but the approach to implementing for a Schema is nearly
+identical.
+
+DataFusion supports Catalogs written in either Rust or Python. If you write a Catalog in Rust,
+you will need to export it as a Python library via PyO3. There is a complete example of a
+catalog implemented this way in the
+`examples folder `_
+of our repository. Writing catalog providers in Rust typically leads to significant
+performance improvements over the Python based approach.
+
+To implement a Catalog in Python, you will need to inherit from the abstract base class
+:py:class:`~datafusion.catalog.CatalogProvider`. There are examples in the
+`unit tests `_ of
+implementing a basic Catalog in Python where we simply keep a dictionary of the
+registered Schemas.
+
+One important note for developers is that when we have a Catalog defined in Python, we have
+two different ways of accessing this Catalog. First, we register the catalog with a Rust
+wrapper. This allows for any Rust based code to call the Python functions as necessary.
+Second, if the user accesses the Catalog via the Python API, we identify this and return back
+the original Python object that implements the Catalog. This is an important distinction
+for developers because we do *not* return a Python wrapper around the Rust wrapper of the
+original Python object. A minimal sketch of such a provider is shown below.
diff --git a/examples/datafusion-ffi-example/Cargo.lock b/examples/datafusion-ffi-example/Cargo.lock
index 075ebd5a1..e5a1ca8d1 100644
--- a/examples/datafusion-ffi-example/Cargo.lock
+++ b/examples/datafusion-ffi-example/Cargo.lock
@@ -1448,6 +1448,7 @@ dependencies = [
 "arrow",
 "arrow-array",
 "arrow-schema",
+ "async-trait",
 "datafusion",
 "datafusion-ffi",
 "pyo3",
diff --git a/examples/datafusion-ffi-example/Cargo.toml b/examples/datafusion-ffi-example/Cargo.toml
index 0e17567b9..319163554 100644
--- a/examples/datafusion-ffi-example/Cargo.toml
+++ b/examples/datafusion-ffi-example/Cargo.toml
@@ -27,6 +27,7 @@ pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py39"]
 arrow = { version = "55.0.0" }
 arrow-array = { version = "55.0.0" }
 arrow-schema = { version = "55.0.0" }
+async-trait = "0.1.88"

 [build-dependencies]
 pyo3-build-config = "0.23"
diff --git a/examples/datafusion-ffi-example/python/tests/_test_catalog_provider.py b/examples/datafusion-ffi-example/python/tests/_test_catalog_provider.py
new file mode 100644
index 000000000..72aadf64c
--- /dev/null
+++ b/examples/datafusion-ffi-example/python/tests/_test_catalog_provider.py
@@ -0,0 +1,60 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
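For reference, here is a minimal, hedged sketch of the dictionary backed catalog described
in the user guide above. It assumes only the abstract base classes added in this patch;
``DictCatalogProvider`` and its attribute names are illustrative, not part of the library.

.. code-block:: python

    from __future__ import annotations

    from datafusion.catalog import CatalogProvider, Schema, SchemaProvider


    class DictCatalogProvider(CatalogProvider):
        """Hypothetical catalog that keeps registered schemas in a dict."""

        def __init__(self) -> None:
            self._schemas: dict[str, SchemaProvider | Schema] = {}

        def schema_names(self) -> set[str]:
            return set(self._schemas)

        def schema(self, name: str) -> SchemaProvider | Schema | None:
            return self._schemas.get(name)

        def register_schema(self, name: str, schema) -> None:
            self._schemas[name] = schema

        def deregister_schema(self, name: str, cascade: bool) -> None:
            self._schemas.pop(name, None)

Registering an instance with ``ctx.register_catalog_provider("my_catalog", DictCatalogProvider())``
makes any schemas it holds reachable from SQL, mirroring the ``CustomCatalogProvider`` used in the
unit tests later in this series.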
+ +from __future__ import annotations + +import pyarrow as pa +from datafusion import SessionContext +from datafusion_ffi_example import MyCatalogProvider + + +def test_catalog_provider(): + ctx = SessionContext() + + my_catalog_name = "my_catalog" + expected_schema_name = "my_schema" + expected_table_name = "my_table" + expected_table_columns = ["units", "price"] + + catalog_provider = MyCatalogProvider() + ctx.register_catalog_provider(my_catalog_name, catalog_provider) + my_catalog = ctx.catalog(my_catalog_name) + + my_catalog_schemas = my_catalog.names() + assert expected_schema_name in my_catalog_schemas + my_database = my_catalog.database(expected_schema_name) + assert expected_table_name in my_database.names() + my_table = my_database.table(expected_table_name) + assert expected_table_columns == my_table.schema.names + + result = ctx.table( + f"{my_catalog_name}.{expected_schema_name}.{expected_table_name}" + ).collect() + assert len(result) == 2 + + col0_result = [r.column(0) for r in result] + col1_result = [r.column(1) for r in result] + expected_col0 = [ + pa.array([10, 20, 30], type=pa.int32()), + pa.array([5, 7], type=pa.int32()), + ] + expected_col1 = [ + pa.array([1, 2, 5], type=pa.float64()), + pa.array([1.5, 2.5], type=pa.float64()), + ] + assert col0_result == expected_col0 + assert col1_result == expected_col1 diff --git a/examples/datafusion-ffi-example/src/catalog_provider.rs b/examples/datafusion-ffi-example/src/catalog_provider.rs new file mode 100644 index 000000000..54e61cf3e --- /dev/null +++ b/examples/datafusion-ffi-example/src/catalog_provider.rs @@ -0,0 +1,179 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
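The test above exercises the PyCapsule contract: ``register_catalog_provider`` accepts any
object exposing a ``__datafusion_catalog_provider__`` method returning a PyCapsule, which the
Rust side unwraps over FFI. A rough usage sketch, assuming the ``datafusion_ffi_example``
wheel from this example crate has been built (for instance with ``maturin develop``):

.. code-block:: python

    from datafusion import SessionContext
    from datafusion_ffi_example import MyCatalogProvider  # built separately via maturin

    ctx = SessionContext()
    ctx.register_catalog_provider("my_catalog", MyCatalogProvider())

    # The schema and table registered on the Rust side are now visible from SQL.
    batches = ctx.sql(
        "SELECT units, price FROM my_catalog.my_schema.my_table"
    ).collect()
    print(batches)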
+ +use pyo3::{pyclass, pymethods, Bound, PyResult, Python}; +use std::{any::Any, fmt::Debug, sync::Arc}; + +use arrow::datatypes::Schema; +use async_trait::async_trait; +use datafusion::{ + catalog::{ + CatalogProvider, MemoryCatalogProvider, MemorySchemaProvider, SchemaProvider, TableProvider, + }, + common::exec_err, + datasource::MemTable, + error::{DataFusionError, Result}, +}; +use datafusion_ffi::catalog_provider::FFI_CatalogProvider; +use pyo3::types::PyCapsule; + +pub fn my_table() -> Arc { + use arrow::datatypes::{DataType, Field}; + use datafusion::common::record_batch; + + let schema = Arc::new(Schema::new(vec![ + Field::new("units", DataType::Int32, true), + Field::new("price", DataType::Float64, true), + ])); + + let partitions = vec![ + record_batch!( + ("units", Int32, vec![10, 20, 30]), + ("price", Float64, vec![1.0, 2.0, 5.0]) + ) + .unwrap(), + record_batch!( + ("units", Int32, vec![5, 7]), + ("price", Float64, vec![1.5, 2.5]) + ) + .unwrap(), + ]; + + Arc::new(MemTable::try_new(schema, vec![partitions]).unwrap()) +} + +#[derive(Debug)] +pub struct FixedSchemaProvider { + inner: MemorySchemaProvider, +} + +impl Default for FixedSchemaProvider { + fn default() -> Self { + let inner = MemorySchemaProvider::new(); + + let table = my_table(); + + let _ = inner.register_table("my_table".to_string(), table).unwrap(); + + Self { inner } + } +} + +#[async_trait] +impl SchemaProvider for FixedSchemaProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + self.inner.table_names() + } + + async fn table(&self, name: &str) -> Result>, DataFusionError> { + self.inner.table(name).await + } + + fn register_table( + &self, + name: String, + table: Arc, + ) -> Result>> { + self.inner.register_table(name, table) + } + + fn deregister_table(&self, name: &str) -> Result>> { + self.inner.deregister_table(name) + } + + fn table_exist(&self, name: &str) -> bool { + self.inner.table_exist(name) + } +} + +/// This catalog provider is intended only for unit tests. It prepopulates with one +/// schema and only allows for schemas named after four types of fruit. 
+#[pyclass( + name = "MyCatalogProvider", + module = "datafusion_ffi_example", + subclass +)] +#[derive(Debug)] +pub(crate) struct MyCatalogProvider { + inner: MemoryCatalogProvider, +} + +impl Default for MyCatalogProvider { + fn default() -> Self { + let inner = MemoryCatalogProvider::new(); + + let schema_name: &str = "my_schema"; + let _ = inner.register_schema(schema_name, Arc::new(FixedSchemaProvider::default())); + + Self { inner } + } +} + +impl CatalogProvider for MyCatalogProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + self.inner.schema_names() + } + + fn schema(&self, name: &str) -> Option> { + self.inner.schema(name) + } + + fn register_schema( + &self, + name: &str, + schema: Arc, + ) -> Result>> { + self.inner.register_schema(name, schema) + } + + fn deregister_schema( + &self, + name: &str, + cascade: bool, + ) -> Result>> { + self.inner.deregister_schema(name, cascade) + } +} + +#[pymethods] +impl MyCatalogProvider { + #[new] + pub fn new() -> Self { + Self { + inner: Default::default(), + } + } + + pub fn __datafusion_catalog_provider__<'py>( + &self, + py: Python<'py>, + ) -> PyResult> { + let name = cr"datafusion_catalog_provider".into(); + let catalog_provider = + FFI_CatalogProvider::new(Arc::new(MyCatalogProvider::default()), None); + + PyCapsule::new(py, catalog_provider, Some(name)) + } +} diff --git a/examples/datafusion-ffi-example/src/lib.rs b/examples/datafusion-ffi-example/src/lib.rs index ae08c3b65..3a4cf2247 100644 --- a/examples/datafusion-ffi-example/src/lib.rs +++ b/examples/datafusion-ffi-example/src/lib.rs @@ -15,10 +15,12 @@ // specific language governing permissions and limitations // under the License. +use crate::catalog_provider::MyCatalogProvider; use crate::table_function::MyTableFunction; use crate::table_provider::MyTableProvider; use pyo3::prelude::*; +pub(crate) mod catalog_provider; pub(crate) mod table_function; pub(crate) mod table_provider; @@ -26,5 +28,6 @@ pub(crate) mod table_provider; fn datafusion_ffi_example(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; + m.add_class::()?; Ok(()) } diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index fd7f4fc06..e9d2dba75 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -92,6 +92,7 @@ "TableFunction", "WindowFrame", "WindowUDF", + "catalog", "col", "column", "common", diff --git a/python/datafusion/catalog.py b/python/datafusion/catalog.py index 67ab3ead2..536b3a790 100644 --- a/python/datafusion/catalog.py +++ b/python/datafusion/catalog.py @@ -19,18 +19,33 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Protocol import datafusion._internal as df_internal if TYPE_CHECKING: import pyarrow as pa +try: + from warnings import deprecated # Python 3.13+ +except ImportError: + from typing_extensions import deprecated # Python 3.12 + + +__all__ = [ + "Catalog", + "CatalogProvider", + "Schema", + "SchemaProvider", + "Table", +] + class Catalog: """DataFusion data catalog.""" - def __init__(self, catalog: df_internal.Catalog) -> None: + def __init__(self, catalog: df_internal.catalog.RawCatalog) -> None: """This constructor is not typically called by the end user.""" self.catalog = catalog @@ -38,39 +53,95 @@ def __repr__(self) -> str: """Print a string representation of the catalog.""" return self.catalog.__repr__() - def names(self) -> list[str]: - """Returns the list of 
databases in this catalog.""" - return self.catalog.names() + def names(self) -> set[str]: + """This is an alias for `schema_names`.""" + return self.schema_names() + + def schema_names(self) -> set[str]: + """Returns the list of schemas in this catalog.""" + return self.catalog.schema_names() + + @staticmethod + def memory_catalog() -> Catalog: + """Create an in-memory catalog provider.""" + catalog = df_internal.catalog.RawCatalog.memory_catalog() + return Catalog(catalog) - def database(self, name: str = "public") -> Database: + def schema(self, name: str = "public") -> Schema: """Returns the database with the given ``name`` from this catalog.""" - return Database(self.catalog.database(name)) + schema = self.catalog.schema(name) + + return ( + Schema(schema) + if isinstance(schema, df_internal.catalog.RawSchema) + else schema + ) + + @deprecated("Use `schema` instead.") + def database(self, name: str = "public") -> Schema: + """Returns the database with the given ``name`` from this catalog.""" + return self.schema(name) + + def register_schema(self, name, schema) -> Schema | None: + """Register a schema with this catalog.""" + if isinstance(schema, Schema): + return self.catalog.register_schema(name, schema._raw_schema) + return self.catalog.register_schema(name, schema) + + def deregister_schema(self, name: str, cascade: bool = True) -> Schema | None: + """Deregister a schema from this catalog.""" + return self.catalog.deregister_schema(name, cascade) -class Database: - """DataFusion Database.""" +class Schema: + """DataFusion Schema.""" - def __init__(self, db: df_internal.Database) -> None: + def __init__(self, schema: df_internal.catalog.RawSchema) -> None: """This constructor is not typically called by the end user.""" - self.db = db + self._raw_schema = schema def __repr__(self) -> str: - """Print a string representation of the database.""" - return self.db.__repr__() + """Print a string representation of the schema.""" + return self._raw_schema.__repr__() + + @staticmethod + def memory_schema() -> Schema: + """Create an in-memory schema provider.""" + schema = df_internal.catalog.RawSchema.memory_schema() + return Schema(schema) def names(self) -> set[str]: - """Returns the list of all tables in this database.""" - return self.db.names() + """This is an alias for `table_names`.""" + return self.table_names() + + def table_names(self) -> set[str]: + """Returns the list of all tables in this schema.""" + return self._raw_schema.table_names def table(self, name: str) -> Table: - """Return the table with the given ``name`` from this database.""" - return Table(self.db.table(name)) + """Return the table with the given ``name`` from this schema.""" + return Table(self._raw_schema.table(name)) + + def register_table(self, name, table) -> None: + """Register a table provider in this schema.""" + if isinstance(table, Table): + return self._raw_schema.register_table(name, table.table) + return self._raw_schema.register_table(name, table) + + def deregister_table(self, name: str) -> None: + """Deregister a table provider from this schema.""" + return self._raw_schema.deregister_table(name) + + +@deprecated("Use `Schema` instead.") +class Database(Schema): + """See `Schema`.""" class Table: """DataFusion table.""" - def __init__(self, table: df_internal.Table) -> None: + def __init__(self, table: df_internal.catalog.RawTable) -> None: """This constructor is not typically called by the end user.""" self.table = table @@ -78,6 +149,11 @@ def __repr__(self) -> str: """Print a string 
representation of the table."""
 return self.table.__repr__()

+ @staticmethod
+ def from_dataset(dataset: pa.dataset.Dataset) -> Table:
+ """Turn a pyarrow Dataset into a Table."""
+ return Table(df_internal.catalog.RawTable.from_dataset(dataset))
+
 @property
 def schema(self) -> pa.Schema:
 """Returns the schema associated with this table."""
@@ -87,3 +163,86 @@ def schema(self) -> pa.Schema:
 def kind(self) -> str:
 """Returns the kind of table."""
 return self.table.kind
+
+
+class CatalogProvider(ABC):
+ """Abstract class for defining a Python based Catalog Provider."""
+
+ @abstractmethod
+ def schema_names(self) -> set[str]:
+ """Set of the names of all schemas in this catalog."""
+ ...
+
+ @abstractmethod
+ def schema(self, name: str) -> Schema | None:
+ """Retrieve a specific schema from this catalog."""
+ ...
+
+ def register_schema(  # noqa: B027
+ self, name: str, schema: SchemaProviderExportable | SchemaProvider | Schema
+ ) -> None:
+ """Add a schema to this catalog.
+
+ This method is optional. If your catalog provides a fixed list of schemas, you
+ do not need to implement this method.
+ """
+
+ def deregister_schema(self, name: str, cascade: bool) -> None:  # noqa: B027
+ """Remove a schema from this catalog.
+
+ This method is optional. If your catalog provides a fixed list of schemas, you
+ do not need to implement this method.
+
+ Args:
+ name: The name of the schema to remove.
+ cascade: If true, deregister the tables within the schema.
+ """
+
+
+class SchemaProvider(ABC):
+ """Abstract class for defining a Python based Schema Provider."""
+
+ def owner_name(self) -> str | None:
+ """Returns the owner of the schema.
+
+ This is an optional method. The default return is None.
+ """
+ return None
+
+ @abstractmethod
+ def table_names(self) -> set[str]:
+ """Set of the names of all tables in this schema."""
+ ...
+
+ @abstractmethod
+ def table(self, name: str) -> Table | None:
+ """Retrieve a specific table from this schema."""
+ ...
+
+ def register_table(self, name: str, table: Table) -> None:  # noqa: B027
+ """Add a table to this schema.
+
+ This method is optional. If your schema provides a fixed list of tables, you do
+ not need to implement this method.
+ """
+
+ def deregister_table(self, name, cascade: bool) -> None:  # noqa: B027
+ """Remove a table from this schema.
+
+ This method is optional. If your schema provides a fixed list of tables, you do
+ not need to implement this method.
+ """
+
+ @abstractmethod
+ def table_exist(self, name: str) -> bool:
+ """Returns true if the table exists in this schema."""
+ ...
+
+
+class SchemaProviderExportable(Protocol):
+ """Type hint for object that has __datafusion_schema_provider__ PyCapsule.
+
+ https://docs.rs/datafusion/latest/datafusion/catalog/trait.SchemaProvider.html
+ """
+
+ def __datafusion_schema_provider__(self) -> object: ...
diff --git a/python/datafusion/context.py b/python/datafusion/context.py
index 5b99b0d26..bce51d644 100644
--- a/python/datafusion/context.py
+++ b/python/datafusion/context.py
@@ -29,7 +29,7 @@
 except ImportError:
 from typing_extensions import deprecated  # Python 3.12

-from datafusion.catalog import Catalog, Table
+from datafusion.catalog import Catalog, CatalogProvider, Table
 from datafusion.dataframe import DataFrame
 from datafusion.expr import Expr, SortExpr, sort_list_to_raw_sort_list
 from datafusion.record_batch import RecordBatchStream
@@ -80,6 +80,15 @@ class TableProviderExportable(Protocol):
 def __datafusion_table_provider__(self) -> object: ...
# noqa: D105


+class CatalogProviderExportable(Protocol):
+ """Type hint for object that has __datafusion_catalog_provider__ PyCapsule.
+
+ https://docs.rs/datafusion/latest/datafusion/catalog/trait.CatalogProvider.html
+ """
+
+ def __datafusion_catalog_provider__(self) -> object: ...  # noqa: D105
+
+
 class SessionConfig:
 """Session configuration options."""

@@ -749,6 +758,19 @@ def deregister_table(self, name: str) -> None:
 """Remove a table from the session."""
 self.ctx.deregister_table(name)

+ def catalog_names(self) -> set[str]:
+ """Returns the list of catalogs in this context."""
+ return self.ctx.catalog_names()
+
+ def register_catalog_provider(
+ self, name: str, provider: CatalogProviderExportable | CatalogProvider | Catalog
+ ) -> None:
+ """Register a catalog provider."""
+ if isinstance(provider, Catalog):
+ self.ctx.register_catalog_provider(name, provider.catalog)
+ else:
+ self.ctx.register_catalog_provider(name, provider)
+
 def register_table_provider(
 self, name: str, provider: TableProviderExportable
 ) -> None:
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
index 991e6875a..61cb09438 100644
--- a/python/datafusion/dataframe.py
+++ b/python/datafusion/dataframe.py
@@ -760,19 +760,16 @@ def join_on(
 exprs = [expr.expr for expr in on_exprs]
 return DataFrame(self.df.join_on(right.df, exprs, how))

- def explain(self, verbose: bool = False, analyze: bool = False) -> DataFrame:
- """Return a DataFrame with the explanation of its plan so far.
+ def explain(self, verbose: bool = False, analyze: bool = False) -> None:
+ """Print an explanation of the DataFrame's plan so far.

 If ``analyze`` is specified, runs the plan and reports metrics.

 Args:
 verbose: If ``True``, more details will be included.
 analyze: If ``True``, the plan will run and metrics reported.
-
- Returns:
- DataFrame with the explanation of its plan.
 """
- return DataFrame(self.df.explain(verbose, analyze))
+ self.df.explain(verbose, analyze)

 def logical_plan(self) -> LogicalPlan:
 """Return the unoptimized ``LogicalPlan``.
diff --git a/python/tests/test_catalog.py b/python/tests/test_catalog.py
index 23b328458..1f9ecbfc3 100644
--- a/python/tests/test_catalog.py
+++ b/python/tests/test_catalog.py
@@ -14,9 +14,13 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
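Note that ``register_catalog_provider`` above is deliberately permissive about what it
accepts. A short sketch of the three call styles implied by its type hint follows; the
``DictCatalogProvider`` and ``MyCatalogProvider`` names refer to the hypothetical and FFI
examples sketched earlier in this series.

.. code-block:: python

    from datafusion import SessionContext
    from datafusion.catalog import Catalog

    ctx = SessionContext()

    # 1. A Catalog built through the Python API, e.g. an in-memory one.
    ctx.register_catalog_provider("mem_catalog", Catalog.memory_catalog())

    # 2. Any subclass of datafusion.catalog.CatalogProvider.
    # ctx.register_catalog_provider("py_catalog", DictCatalogProvider())

    # 3. Any object exporting a __datafusion_catalog_provider__ PyCapsule.
    # ctx.register_catalog_provider("ffi_catalog", MyCatalogProvider())

    print(ctx.catalog_names())  # e.g. {"datafusion", "mem_catalog"}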
+from __future__ import annotations +import datafusion as dfn import pyarrow as pa +import pyarrow.dataset as ds import pytest +from datafusion import SessionContext, Table # Note we take in `database` as a variable even though we don't use @@ -27,9 +31,9 @@ def test_basic(ctx, database): ctx.catalog("non-existent") default = ctx.catalog() - assert default.names() == ["public"] + assert default.names() == {"public"} - for db in [default.database("public"), default.database()]: + for db in [default.schema("public"), default.schema()]: assert db.names() == {"csv1", "csv", "csv2"} table = db.table("csv") @@ -41,3 +45,168 @@ def test_basic(ctx, database): pa.field("float", pa.float64(), nullable=True), ] ) + + +def create_dataset() -> Table: + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"], + ) + dataset = ds.dataset([batch]) + return Table.from_dataset(dataset) + + +class CustomSchemaProvider(dfn.catalog.SchemaProvider): + def __init__(self): + self.tables = {"table1": create_dataset()} + + def table_names(self) -> set[str]: + return set(self.tables.keys()) + + def register_table(self, name: str, table: Table): + self.tables[name] = table + + def deregister_table(self, name, cascade: bool = True): + del self.tables[name] + + def table(self, name: str) -> Table | None: + return self.tables[name] + + def table_exist(self, name: str) -> bool: + return name in self.tables + + +class CustomCatalogProvider(dfn.catalog.CatalogProvider): + def __init__(self): + self.schemas = {"my_schema": CustomSchemaProvider()} + + def schema_names(self) -> set[str]: + return set(self.schemas.keys()) + + def schema(self, name: str): + return self.schemas[name] + + def register_schema(self, name: str, schema: dfn.catalog.Schema): + self.schemas[name] = schema + + def deregister_schema(self, name, cascade: bool): + del self.schemas[name] + + +def test_python_catalog_provider(ctx: SessionContext): + ctx.register_catalog_provider("my_catalog", CustomCatalogProvider()) + + # Check the default catalog provider + assert ctx.catalog("datafusion").names() == {"public"} + + my_catalog = ctx.catalog("my_catalog") + assert my_catalog.names() == {"my_schema"} + + my_catalog.register_schema("second_schema", CustomSchemaProvider()) + assert my_catalog.schema_names() == {"my_schema", "second_schema"} + + my_catalog.deregister_schema("my_schema") + assert my_catalog.schema_names() == {"second_schema"} + + +def test_in_memory_providers(ctx: SessionContext): + catalog = dfn.catalog.Catalog.memory_catalog() + ctx.register_catalog_provider("in_mem_catalog", catalog) + + assert ctx.catalog_names() == {"datafusion", "in_mem_catalog"} + + schema = dfn.catalog.Schema.memory_schema() + catalog.register_schema("in_mem_schema", schema) + + schema.register_table("my_table", create_dataset()) + + batches = ctx.sql("select * from in_mem_catalog.in_mem_schema.my_table").collect() + + assert len(batches) == 1 + assert batches[0].column(0) == pa.array([1, 2, 3]) + assert batches[0].column(1) == pa.array([4, 5, 6]) + + +def test_python_schema_provider(ctx: SessionContext): + catalog = ctx.catalog() + + catalog.deregister_schema("public") + + catalog.register_schema("test_schema1", CustomSchemaProvider()) + assert catalog.names() == {"test_schema1"} + + catalog.register_schema("test_schema2", CustomSchemaProvider()) + catalog.deregister_schema("test_schema1") + assert catalog.names() == {"test_schema2"} + + +def test_python_table_provider(ctx: SessionContext): + catalog = ctx.catalog() + + 
catalog.register_schema("custom_schema", CustomSchemaProvider()) + schema = catalog.schema("custom_schema") + + assert schema.table_names() == {"table1"} + + schema.deregister_table("table1") + schema.register_table("table2", create_dataset()) + assert schema.table_names() == {"table2"} + + # Use the default schema instead of our custom schema + + schema = catalog.schema() + + schema.register_table("table3", create_dataset()) + assert schema.table_names() == {"table3"} + + schema.deregister_table("table3") + schema.register_table("table4", create_dataset()) + assert schema.table_names() == {"table4"} + + +def test_in_end_to_end_python_providers(ctx: SessionContext): + """Test registering all python providers and running a query against them.""" + + all_catalog_names = [ + "datafusion", + "custom_catalog", + "in_mem_catalog", + ] + + all_schema_names = [ + "custom_schema", + "in_mem_schema", + ] + + ctx.register_catalog_provider(all_catalog_names[1], CustomCatalogProvider()) + ctx.register_catalog_provider( + all_catalog_names[2], dfn.catalog.Catalog.memory_catalog() + ) + + for catalog_name in all_catalog_names: + catalog = ctx.catalog(catalog_name) + + # Clean out previous schemas if they exist so we can start clean + for schema_name in catalog.schema_names(): + catalog.deregister_schema(schema_name, cascade=False) + + catalog.register_schema(all_schema_names[0], CustomSchemaProvider()) + catalog.register_schema(all_schema_names[1], dfn.catalog.Schema.memory_schema()) + + for schema_name in all_schema_names: + schema = catalog.schema(schema_name) + + for table_name in schema.table_names(): + schema.deregister_table(table_name) + + schema.register_table("test_table", create_dataset()) + + for catalog_name in all_catalog_names: + for schema_name in all_schema_names: + table_full_name = f"{catalog_name}.{schema_name}.test_table" + + batches = ctx.sql(f"select * from {table_full_name}").collect() + + assert len(batches) == 1 + assert batches[0].column(0) == pa.array([1, 2, 3]) + assert batches[0].column(1) == pa.array([4, 5, 6]) diff --git a/python/tests/test_context.py b/python/tests/test_context.py index 4a15ac9cf..6dbcc0d5e 100644 --- a/python/tests/test_context.py +++ b/python/tests/test_context.py @@ -57,7 +57,7 @@ def test_runtime_configs(tmp_path, path_to_str): ctx = SessionContext(config, runtime) assert ctx is not None - db = ctx.catalog("foo").database("bar") + db = ctx.catalog("foo").schema("bar") assert db is not None @@ -70,7 +70,7 @@ def test_temporary_files(tmp_path, path_to_str): ctx = SessionContext(config, runtime) assert ctx is not None - db = ctx.catalog("foo").database("bar") + db = ctx.catalog("foo").schema("bar") assert db is not None @@ -91,7 +91,7 @@ def test_create_context_with_all_valid_args(): ctx = SessionContext(config, runtime) # verify that at least some of the arguments worked - ctx.catalog("foo").database("bar") + ctx.catalog("foo").schema("bar") with pytest.raises(KeyError): ctx.catalog("datafusion") @@ -105,7 +105,7 @@ def test_register_record_batches(ctx): ctx.register_record_batches("t", [[batch]]) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} result = ctx.sql("SELECT a+b, a-b FROM t").collect() @@ -121,7 +121,7 @@ def test_create_dataframe_registers_unique_table_name(ctx): ) df = ctx.create_dataframe([[batch]]) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -141,7 +141,7 @@ def 
test_create_dataframe_registers_with_defined_table_name(ctx): ) df = ctx.create_dataframe([[batch]], name="tbl") - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -155,7 +155,7 @@ def test_from_arrow_table(ctx): # convert to DataFrame df = ctx.from_arrow(table) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -200,7 +200,7 @@ def test_from_arrow_table_with_name(ctx): # convert to DataFrame with optional name df = ctx.from_arrow(table, name="tbl") - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert tables[0] == "tbl" @@ -213,7 +213,7 @@ def test_from_arrow_table_empty(ctx): # convert to DataFrame df = ctx.from_arrow(table) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -228,7 +228,7 @@ def test_from_arrow_table_empty_no_schema(ctx): # convert to DataFrame df = ctx.from_arrow(table) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -246,7 +246,7 @@ def test_from_pylist(ctx): ] df = ctx.from_pylist(data) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -260,7 +260,7 @@ def test_from_pydict(ctx): data = {"a": [1, 2, 3], "b": [4, 5, 6]} df = ctx.from_pydict(data) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -276,7 +276,7 @@ def test_from_pandas(ctx): pandas_df = pd.DataFrame(data) df = ctx.from_pandas(pandas_df) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -292,7 +292,7 @@ def test_from_polars(ctx): polars_df = pd.DataFrame(data) df = ctx.from_polars(polars_df) - tables = list(ctx.catalog().database().names()) + tables = list(ctx.catalog().schema().names()) assert df assert len(tables) == 1 @@ -303,7 +303,7 @@ def test_from_polars(ctx): def test_register_table(ctx, database): default = ctx.catalog() - public = default.database("public") + public = default.schema("public") assert public.names() == {"csv", "csv1", "csv2"} table = public.table("csv") @@ -313,7 +313,7 @@ def test_register_table(ctx, database): def test_read_table(ctx, database): default = ctx.catalog() - public = default.database("public") + public = default.schema("public") assert public.names() == {"csv", "csv1", "csv2"} table = public.table("csv") @@ -323,7 +323,7 @@ def test_read_table(ctx, database): def test_deregister_table(ctx, database): default = ctx.catalog() - public = default.database("public") + public = default.schema("public") assert public.names() == {"csv", "csv1", "csv2"} ctx.deregister_table("csv") @@ -339,7 +339,7 @@ def test_register_dataset(ctx): dataset = ds.dataset([batch]) ctx.register_dataset("t", dataset) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} result = ctx.sql("SELECT a+b, a-b FROM t").collect() @@ -356,7 +356,7 @@ def test_dataset_filter(ctx, capfd): dataset = ds.dataset([batch]) ctx.register_dataset("t", dataset) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} df = ctx.sql("SELECT a+b, a-b FROM t WHERE a BETWEEN 2 and 3 AND b > 5") 
# Make sure the filter was pushed down in Physical Plan @@ -455,7 +455,7 @@ def test_dataset_filter_nested_data(ctx): dataset = ds.dataset([batch]) ctx.register_dataset("t", dataset) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} df = ctx.table("t") diff --git a/python/tests/test_sql.py b/python/tests/test_sql.py index 41cee4ef3..c383edc60 100644 --- a/python/tests/test_sql.py +++ b/python/tests/test_sql.py @@ -75,7 +75,7 @@ def test_register_csv(ctx, tmp_path): ) ctx.register_csv("csv3", path, schema=alternative_schema) - assert ctx.catalog().database().names() == { + assert ctx.catalog().schema().names() == { "csv", "csv1", "csv2", @@ -150,7 +150,7 @@ def test_register_parquet(ctx, tmp_path): path = helpers.write_parquet(tmp_path / "a.parquet", helpers.data()) ctx.register_parquet("t", path) ctx.register_parquet("t1", str(path)) - assert ctx.catalog().database().names() == {"t", "t1"} + assert ctx.catalog().schema().names() == {"t", "t1"} result = ctx.sql("SELECT COUNT(a) AS cnt FROM t").collect() result = pa.Table.from_batches(result) @@ -188,7 +188,7 @@ def test_register_parquet_partitioned(ctx, tmp_path, path_to_str, legacy_data_ty parquet_pruning=True, file_extension=".parquet", ) - assert ctx.catalog().database().names() == {"datapp"} + assert ctx.catalog().schema().names() == {"datapp"} result = ctx.sql("SELECT grp, COUNT(*) AS cnt FROM datapp GROUP BY grp").collect() result = pa.Table.from_batches(result) @@ -204,7 +204,7 @@ def test_register_dataset(ctx, tmp_path, path_to_str): dataset = ds.dataset(path, format="parquet") ctx.register_dataset("t", dataset) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} result = ctx.sql("SELECT COUNT(a) AS cnt FROM t").collect() result = pa.Table.from_batches(result) @@ -251,7 +251,7 @@ def test_register_json(ctx, tmp_path): ) ctx.register_json("json3", path, schema=alternative_schema) - assert ctx.catalog().database().names() == { + assert ctx.catalog().schema().names() == { "json", "json1", "json2", @@ -308,7 +308,7 @@ def test_execute(ctx, tmp_path): path = helpers.write_parquet(tmp_path / "a.parquet", pa.array(data)) ctx.register_parquet("t", path) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} # count result = ctx.sql("SELECT COUNT(a) AS cnt FROM t WHERE a IS NOT NULL").collect() @@ -451,18 +451,10 @@ def test_udf( id="datetime_ns", ), # Not writtable to parquet - pytest.param( - helpers.data_timedelta("s"), id="timedelta_s", marks=pytest.mark.xfail - ), - pytest.param( - helpers.data_timedelta("ms"), id="timedelta_ms", marks=pytest.mark.xfail - ), - pytest.param( - helpers.data_timedelta("us"), id="timedelta_us", marks=pytest.mark.xfail - ), - pytest.param( - helpers.data_timedelta("ns"), id="timedelta_ns", marks=pytest.mark.xfail - ), + pytest.param(helpers.data_timedelta("s"), id="timedelta_s"), + pytest.param(helpers.data_timedelta("ms"), id="timedelta_ms"), + pytest.param(helpers.data_timedelta("us"), id="timedelta_us"), + pytest.param(helpers.data_timedelta("ns"), id="timedelta_ns"), ], ) def test_simple_select(ctx, tmp_path, arr): @@ -524,7 +516,7 @@ def test_register_listing_table( schema=table.schema if pass_schema else None, file_sort_order=file_sort_order, ) - assert ctx.catalog().database().names() == {"my_table"} + assert ctx.catalog().schema().names() == {"my_table"} result = ctx.sql( "SELECT grp, COUNT(*) AS count FROM my_table GROUP BY grp" diff --git 
a/python/tests/test_substrait.py b/python/tests/test_substrait.py index f367a447d..43aa327d4 100644 --- a/python/tests/test_substrait.py +++ b/python/tests/test_substrait.py @@ -34,7 +34,7 @@ def test_substrait_serialization(ctx): ctx.register_record_batches("t", [[batch]]) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} # For now just make sure the method calls blow up substrait_plan = ss.Serde.serialize_to_plan("SELECT * FROM t", ctx) @@ -59,7 +59,7 @@ def test_substrait_file_serialization(ctx, tmp_path, path_to_str): ctx.register_record_batches("t", [[batch]]) - assert ctx.catalog().database().names() == {"t"} + assert ctx.catalog().schema().names() == {"t"} path = tmp_path / "substrait_plan" path = str(path) if path_to_str else path diff --git a/src/catalog.rs b/src/catalog.rs index 83f8d08cb..17d4ec3b8 100644 --- a/src/catalog.rs +++ b/src/catalog.rs @@ -15,44 +15,54 @@ // specific language governing permissions and limitations // under the License. -use std::collections::HashSet; -use std::sync::Arc; - -use pyo3::exceptions::PyKeyError; -use pyo3::prelude::*; - -use crate::errors::{PyDataFusionError, PyDataFusionResult}; -use crate::utils::wait_for_future; +use crate::dataset::Dataset; +use crate::errors::{py_datafusion_err, to_datafusion_err, PyDataFusionError, PyDataFusionResult}; +use crate::utils::{validate_pycapsule, wait_for_future}; +use async_trait::async_trait; +use datafusion::catalog::{MemoryCatalogProvider, MemorySchemaProvider}; +use datafusion::common::DataFusionError; use datafusion::{ arrow::pyarrow::ToPyArrow, catalog::{CatalogProvider, SchemaProvider}, datasource::{TableProvider, TableType}, }; +use datafusion_ffi::schema_provider::{FFI_SchemaProvider, ForeignSchemaProvider}; +use datafusion_ffi::table_provider::{FFI_TableProvider, ForeignTableProvider}; +use pyo3::exceptions::PyKeyError; +use pyo3::prelude::*; +use pyo3::types::PyCapsule; +use pyo3::IntoPyObjectExt; +use std::any::Any; +use std::collections::HashSet; +use std::sync::Arc; -#[pyclass(name = "Catalog", module = "datafusion", subclass)] +#[pyclass(name = "RawCatalog", module = "datafusion.catalog", subclass)] +#[derive(Clone)] pub struct PyCatalog { pub catalog: Arc, } -#[pyclass(name = "Database", module = "datafusion", subclass)] -pub struct PyDatabase { - pub database: Arc, +#[pyclass(name = "RawSchema", module = "datafusion.catalog", subclass)] +#[derive(Clone)] +pub struct PySchema { + pub schema: Arc, } -#[pyclass(name = "Table", module = "datafusion", subclass)] +#[pyclass(name = "RawTable", module = "datafusion.catalog", subclass)] +#[derive(Clone)] pub struct PyTable { pub table: Arc, } -impl PyCatalog { - pub fn new(catalog: Arc) -> Self { +impl From> for PyCatalog { + fn from(catalog: Arc) -> Self { Self { catalog } } } -impl PyDatabase { - pub fn new(database: Arc) -> Self { - Self { database } +impl From> for PySchema { + fn from(schema: Arc) -> Self { + Self { schema } } } @@ -68,36 +78,109 @@ impl PyTable { #[pymethods] impl PyCatalog { - fn names(&self) -> Vec { - self.catalog.schema_names() + #[new] + fn new(catalog: PyObject) -> Self { + let catalog_provider = + Arc::new(RustWrappedPyCatalogProvider::new(catalog)) as Arc; + catalog_provider.into() + } + + #[staticmethod] + fn memory_catalog() -> Self { + let catalog_provider = + Arc::new(MemoryCatalogProvider::default()) as Arc; + catalog_provider.into() + } + + fn schema_names(&self) -> HashSet { + self.catalog.schema_names().into_iter().collect() } #[pyo3(signature = 
(name="public"))] - fn database(&self, name: &str) -> PyResult { - match self.catalog.schema(name) { - Some(database) => Ok(PyDatabase::new(database)), - None => Err(PyKeyError::new_err(format!( - "Database with name {name} doesn't exist." - ))), - } + fn schema(&self, name: &str) -> PyResult { + let schema = self + .catalog + .schema(name) + .ok_or(PyKeyError::new_err(format!( + "Schema with name {name} doesn't exist." + )))?; + + Python::with_gil(|py| { + match schema + .as_any() + .downcast_ref::() + { + Some(wrapped_schema) => Ok(wrapped_schema.schema_provider.clone_ref(py)), + None => PySchema::from(schema).into_py_any(py), + } + }) + } + + fn register_schema(&self, name: &str, schema_provider: Bound<'_, PyAny>) -> PyResult<()> { + let provider = if schema_provider.hasattr("__datafusion_schema_provider__")? { + let capsule = schema_provider + .getattr("__datafusion_schema_provider__")? + .call0()?; + let capsule = capsule.downcast::().map_err(py_datafusion_err)?; + validate_pycapsule(capsule, "datafusion_schema_provider")?; + + let provider = unsafe { capsule.reference::() }; + let provider: ForeignSchemaProvider = provider.into(); + Arc::new(provider) as Arc + } else { + match schema_provider.extract::() { + Ok(py_schema) => py_schema.schema, + Err(_) => Arc::new(RustWrappedPySchemaProvider::new(schema_provider.into())) + as Arc, + } + }; + + let _ = self + .catalog + .register_schema(name, provider) + .map_err(py_datafusion_err)?; + + Ok(()) + } + + fn deregister_schema(&self, name: &str, cascade: bool) -> PyResult<()> { + let _ = self + .catalog + .deregister_schema(name, cascade) + .map_err(py_datafusion_err)?; + + Ok(()) } fn __repr__(&self) -> PyResult { - Ok(format!( - "Catalog(schema_names=[{}])", - self.names().join(";") - )) + let mut names: Vec = self.schema_names().into_iter().collect(); + names.sort(); + Ok(format!("Catalog(schema_names=[{}])", names.join(", "))) } } #[pymethods] -impl PyDatabase { - fn names(&self) -> HashSet { - self.database.table_names().into_iter().collect() +impl PySchema { + #[new] + fn new(schema_provider: PyObject) -> Self { + let schema_provider = + Arc::new(RustWrappedPySchemaProvider::new(schema_provider)) as Arc; + schema_provider.into() + } + + #[staticmethod] + fn memory_schema() -> Self { + let schema_provider = Arc::new(MemorySchemaProvider::default()) as Arc; + schema_provider.into() + } + + #[getter] + fn table_names(&self) -> HashSet { + self.schema.table_names().into_iter().collect() } fn table(&self, name: &str, py: Python) -> PyDataFusionResult { - if let Some(table) = wait_for_future(py, self.database.table(name))?? { + if let Some(table) = wait_for_future(py, self.schema.table(name))?? { Ok(PyTable::new(table)) } else { Err(PyDataFusionError::Common(format!( @@ -107,14 +190,49 @@ impl PyDatabase { } fn __repr__(&self) -> PyResult { - Ok(format!( - "Database(table_names=[{}])", - Vec::from_iter(self.names()).join(";") - )) + let mut names: Vec = self.table_names().into_iter().collect(); + names.sort(); + Ok(format!("Schema(table_names=[{}])", names.join(";"))) } - // register_table - // deregister_table + fn register_table(&self, name: &str, table_provider: Bound<'_, PyAny>) -> PyResult<()> { + let provider = if table_provider.hasattr("__datafusion_table_provider__")? { + let capsule = table_provider + .getattr("__datafusion_table_provider__")? 
+ .call0()?; + let capsule = capsule.downcast::().map_err(py_datafusion_err)?; + validate_pycapsule(capsule, "datafusion_table_provider")?; + + let provider = unsafe { capsule.reference::() }; + let provider: ForeignTableProvider = provider.into(); + Arc::new(provider) as Arc + } else { + match table_provider.extract::() { + Ok(py_table) => py_table.table, + Err(_) => { + let py = table_provider.py(); + let provider = Dataset::new(&table_provider, py)?; + Arc::new(provider) as Arc + } + } + }; + + let _ = self + .schema + .register_table(name.to_string(), provider) + .map_err(py_datafusion_err)?; + + Ok(()) + } + + fn deregister_table(&self, name: &str) -> PyResult<()> { + let _ = self + .schema + .deregister_table(name) + .map_err(py_datafusion_err)?; + + Ok(()) + } } #[pymethods] @@ -125,6 +243,14 @@ impl PyTable { self.table.schema().to_pyarrow(py) } + #[staticmethod] + fn from_dataset(py: Python<'_>, dataset: &Bound<'_, PyAny>) -> PyResult { + let ds = Arc::new(Dataset::new(dataset, py).map_err(py_datafusion_err)?) + as Arc; + + Ok(Self::new(ds)) + } + /// Get the type of this table for metadata/catalog purposes. #[getter] fn kind(&self) -> &str { @@ -145,3 +271,285 @@ impl PyTable { // fn has_exact_statistics // fn supports_filter_pushdown } + +#[derive(Debug)] +pub(crate) struct RustWrappedPySchemaProvider { + schema_provider: PyObject, + owner_name: Option, +} + +impl RustWrappedPySchemaProvider { + pub fn new(schema_provider: PyObject) -> Self { + let owner_name = Python::with_gil(|py| { + schema_provider + .bind(py) + .getattr("owner_name") + .ok() + .map(|name| name.to_string()) + }); + + Self { + schema_provider, + owner_name, + } + } + + fn table_inner(&self, name: &str) -> PyResult>> { + Python::with_gil(|py| { + let provider = self.schema_provider.bind(py); + let py_table_method = provider.getattr("table")?; + + let py_table = py_table_method.call((name,), None)?; + if py_table.is_none() { + return Ok(None); + } + + if py_table.hasattr("__datafusion_table_provider__")? 
{ + let capsule = provider.getattr("__datafusion_table_provider__")?.call0()?; + let capsule = capsule.downcast::().map_err(py_datafusion_err)?; + validate_pycapsule(capsule, "datafusion_table_provider")?; + + let provider = unsafe { capsule.reference::() }; + let provider: ForeignTableProvider = provider.into(); + + Ok(Some(Arc::new(provider) as Arc)) + } else { + if let Ok(inner_table) = py_table.getattr("table") { + if let Ok(inner_table) = inner_table.extract::() { + return Ok(Some(inner_table.table)); + } + } + + match py_table.extract::() { + Ok(py_table) => Ok(Some(py_table.table)), + Err(_) => { + let ds = Dataset::new(&py_table, py).map_err(py_datafusion_err)?; + Ok(Some(Arc::new(ds) as Arc)) + } + } + } + }) + } +} + +#[async_trait] +impl SchemaProvider for RustWrappedPySchemaProvider { + fn owner_name(&self) -> Option<&str> { + self.owner_name.as_deref() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + Python::with_gil(|py| { + let provider = self.schema_provider.bind(py); + + provider + .getattr("table_names") + .and_then(|names| names.extract::>()) + .unwrap_or_else(|err| { + log::error!("Unable to get table_names: {err}"); + Vec::default() + }) + }) + } + + async fn table( + &self, + name: &str, + ) -> datafusion::common::Result>, DataFusionError> { + self.table_inner(name).map_err(to_datafusion_err) + } + + fn register_table( + &self, + name: String, + table: Arc, + ) -> datafusion::common::Result>> { + let py_table = PyTable::new(table); + Python::with_gil(|py| { + let provider = self.schema_provider.bind(py); + let _ = provider + .call_method1("register_table", (name, py_table)) + .map_err(to_datafusion_err)?; + // Since the definition of `register_table` says that an error + // will be returned if the table already exists, there is no + // case where we want to return a table provider as output. + Ok(None) + }) + } + + fn deregister_table( + &self, + name: &str, + ) -> datafusion::common::Result>> { + Python::with_gil(|py| { + let provider = self.schema_provider.bind(py); + let table = provider + .call_method1("deregister_table", (name,)) + .map_err(to_datafusion_err)?; + if table.is_none() { + return Ok(None); + } + + // If we can turn this table provider into a `Dataset`, return it. + // Otherwise, return None. + let dataset = match Dataset::new(&table, py) { + Ok(dataset) => Some(Arc::new(dataset) as Arc), + Err(_) => None, + }; + + Ok(dataset) + }) + } + + fn table_exist(&self, name: &str) -> bool { + Python::with_gil(|py| { + let provider = self.schema_provider.bind(py); + provider + .call_method1("table_exist", (name,)) + .and_then(|pyobj| pyobj.extract()) + .unwrap_or(false) + }) + } +} + +#[derive(Debug)] +pub(crate) struct RustWrappedPyCatalogProvider { + pub(crate) catalog_provider: PyObject, +} + +impl RustWrappedPyCatalogProvider { + pub fn new(catalog_provider: PyObject) -> Self { + Self { catalog_provider } + } + + fn schema_inner(&self, name: &str) -> PyResult>> { + Python::with_gil(|py| { + let provider = self.catalog_provider.bind(py); + + let py_schema = provider.call_method1("schema", (name,))?; + if py_schema.is_none() { + return Ok(None); + } + + if py_schema.hasattr("__datafusion_schema_provider__")? { + let capsule = provider + .getattr("__datafusion_schema_provider__")? 
+ .call0()?; + let capsule = capsule.downcast::().map_err(py_datafusion_err)?; + validate_pycapsule(capsule, "datafusion_schema_provider")?; + + let provider = unsafe { capsule.reference::() }; + let provider: ForeignSchemaProvider = provider.into(); + + Ok(Some(Arc::new(provider) as Arc)) + } else { + if let Ok(inner_schema) = py_schema.getattr("schema") { + if let Ok(inner_schema) = inner_schema.extract::() { + return Ok(Some(inner_schema.schema)); + } + } + match py_schema.extract::() { + Ok(inner_schema) => Ok(Some(inner_schema.schema)), + Err(_) => { + let py_schema = RustWrappedPySchemaProvider::new(py_schema.into()); + + Ok(Some(Arc::new(py_schema) as Arc)) + } + } + } + }) + } +} + +#[async_trait] +impl CatalogProvider for RustWrappedPyCatalogProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + Python::with_gil(|py| { + let provider = self.catalog_provider.bind(py); + provider + .getattr("schema_names") + .and_then(|names| names.extract::>()) + .unwrap_or_else(|err| { + log::error!("Unable to get schema_names: {err}"); + Vec::default() + }) + }) + } + + fn schema(&self, name: &str) -> Option> { + self.schema_inner(name).unwrap_or_else(|err| { + log::error!("CatalogProvider schema returned error: {err}"); + None + }) + } + + fn register_schema( + &self, + name: &str, + schema: Arc, + ) -> datafusion::common::Result>> { + // JRIGHT HERE + // let py_schema: PySchema = schema.into(); + Python::with_gil(|py| { + let py_schema = match schema + .as_any() + .downcast_ref::() + { + Some(wrapped_schema) => wrapped_schema.schema_provider.as_any(), + None => &PySchema::from(schema) + .into_py_any(py) + .map_err(to_datafusion_err)?, + }; + + let provider = self.catalog_provider.bind(py); + let schema = provider + .call_method1("register_schema", (name, py_schema)) + .map_err(to_datafusion_err)?; + if schema.is_none() { + return Ok(None); + } + + let schema = Arc::new(RustWrappedPySchemaProvider::new(schema.into())) + as Arc; + + Ok(Some(schema)) + }) + } + + fn deregister_schema( + &self, + name: &str, + cascade: bool, + ) -> datafusion::common::Result>> { + Python::with_gil(|py| { + let provider = self.catalog_provider.bind(py); + let schema = provider + .call_method1("deregister_schema", (name, cascade)) + .map_err(to_datafusion_err)?; + if schema.is_none() { + return Ok(None); + } + + let schema = Arc::new(RustWrappedPySchemaProvider::new(schema.into())) + as Arc; + + Ok(Some(schema)) + }) + } +} + +pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + + Ok(()) +} diff --git a/src/common/data_type.rs b/src/common/data_type.rs index f5f8a6b06..5cf9d6e9f 100644 --- a/src/common/data_type.rs +++ b/src/common/data_type.rs @@ -172,7 +172,7 @@ impl DataTypeMap { SqlType::DATE, )), DataType::Duration(_) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), DataType::Interval(interval_unit) => Ok(DataTypeMap::new( DataType::Interval(*interval_unit), @@ -189,7 +189,7 @@ impl DataTypeMap { SqlType::BINARY, )), DataType::FixedSizeBinary(_) => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", arrow_type)), + DataFusionError::NotImplemented(format!("{arrow_type:?}")), )), DataType::LargeBinary => Ok(DataTypeMap::new( DataType::LargeBinary, @@ -207,23 +207,22 @@ impl DataTypeMap { SqlType::VARCHAR, )), DataType::List(_) => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - 
arrow_type + "{arrow_type:?}" )))), DataType::FixedSizeList(_, _) => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", arrow_type)), + DataFusionError::NotImplemented(format!("{arrow_type:?}")), )), DataType::LargeList(_) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), DataType::Struct(_) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), DataType::Union(_, _) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), DataType::Dictionary(_, _) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), DataType::Decimal128(precision, scale) => Ok(DataTypeMap::new( DataType::Decimal128(*precision, *scale), @@ -236,23 +235,22 @@ impl DataTypeMap { SqlType::DECIMAL, )), DataType::Map(_, _) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), DataType::RunEndEncoded(_, _) => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", arrow_type)), + DataFusionError::NotImplemented(format!("{arrow_type:?}")), )), DataType::BinaryView => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), DataType::Utf8View => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - arrow_type + "{arrow_type:?}" )))), DataType::ListView(_) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), DataType::LargeListView(_) => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", arrow_type), + format!("{arrow_type:?}"), ))), } } @@ -379,8 +377,7 @@ impl DataTypeMap { "double" => Ok(DataType::Float64), "byte_array" => Ok(DataType::Utf8), _ => Err(PyValueError::new_err(format!( - "Unable to determine Arrow Data Type from Parquet String type: {:?}", - parquet_str_type + "Unable to determine Arrow Data Type from Parquet String type: {parquet_str_type:?}" ))), }; DataTypeMap::map_from_arrow_type(&arrow_dtype?) 
@@ -404,12 +401,10 @@ impl DataTypeMap { pub fn py_map_from_sql_type(sql_type: &SqlType) -> PyResult { match sql_type { SqlType::ANY => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::ARRAY => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::BIGINT => Ok(DataTypeMap::new( DataType::Int64, @@ -432,11 +427,10 @@ impl DataTypeMap { SqlType::CHAR, )), SqlType::COLUMN_LIST => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::CURSOR => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::DATE => Ok(DataTypeMap::new( DataType::Date64, @@ -449,8 +443,7 @@ impl DataTypeMap { SqlType::DECIMAL, )), SqlType::DISTINCT => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::DOUBLE => Ok(DataTypeMap::new( DataType::Decimal256(1, 1), @@ -458,7 +451,7 @@ impl DataTypeMap { SqlType::DOUBLE, )), SqlType::DYNAMIC_STAR => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::FLOAT => Ok(DataTypeMap::new( DataType::Decimal128(1, 1), @@ -466,8 +459,7 @@ impl DataTypeMap { SqlType::FLOAT, )), SqlType::GEOMETRY => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::INTEGER => Ok(DataTypeMap::new( DataType::Int8, @@ -475,55 +467,52 @@ impl DataTypeMap { SqlType::INTEGER, )), SqlType::INTERVAL => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::INTERVAL_DAY => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::INTERVAL_DAY_HOUR => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::INTERVAL_DAY_MINUTE => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::INTERVAL_DAY_SECOND => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::INTERVAL_HOUR => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::INTERVAL_HOUR_MINUTE => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::INTERVAL_HOUR_SECOND => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::INTERVAL_MINUTE => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::INTERVAL_MINUTE_SECOND => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::INTERVAL_MONTH => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::INTERVAL_SECOND => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), 
SqlType::INTERVAL_YEAR => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::INTERVAL_YEAR_MONTH => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::MAP => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::MULTISET => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::NULL => Ok(DataTypeMap::new( DataType::Null, @@ -531,20 +520,16 @@ impl DataTypeMap { SqlType::NULL, )), SqlType::OTHER => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::REAL => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::ROW => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::SARG => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::SMALLINT => Ok(DataTypeMap::new( DataType::Int16, @@ -552,25 +537,22 @@ impl DataTypeMap { SqlType::SMALLINT, )), SqlType::STRUCTURED => Err(py_datafusion_err(DataFusionError::NotImplemented( - format!("{:?}", sql_type), + format!("{sql_type:?}"), ))), SqlType::SYMBOL => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::TIME => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::TIME_WITH_LOCAL_TIME_ZONE => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::TIMESTAMP => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::TIMESTAMP_WITH_LOCAL_TIME_ZONE => Err(py_datafusion_err( - DataFusionError::NotImplemented(format!("{:?}", sql_type)), + DataFusionError::NotImplemented(format!("{sql_type:?}")), )), SqlType::TINYINT => Ok(DataTypeMap::new( DataType::Int8, @@ -578,8 +560,7 @@ impl DataTypeMap { SqlType::TINYINT, )), SqlType::UNKNOWN => Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - sql_type + "{sql_type:?}" )))), SqlType::VARBINARY => Ok(DataTypeMap::new( DataType::LargeBinary, @@ -682,8 +663,7 @@ impl PyDataType { "datetime64" => Ok(DataType::Date64), "object" => Ok(DataType::Utf8), _ => Err(PyValueError::new_err(format!( - "Unable to determine Arrow Data Type from Arrow String type: {:?}", - arrow_str_type + "Unable to determine Arrow Data Type from Arrow String type: {arrow_str_type:?}" ))), }; Ok(PyDataType { diff --git a/src/context.rs b/src/context.rs index 6ce1f12bc..36133a33d 100644 --- a/src/context.rs +++ b/src/context.rs @@ -31,7 +31,7 @@ use uuid::Uuid; use pyo3::exceptions::{PyKeyError, PyValueError}; use pyo3::prelude::*; -use crate::catalog::{PyCatalog, PyTable}; +use crate::catalog::{PyCatalog, PyTable, RustWrappedPyCatalogProvider}; use crate::dataframe::PyDataFrame; use crate::dataset::Dataset; use crate::errors::{py_datafusion_err, to_datafusion_err, PyDataFusionResult}; @@ -49,6 +49,7 @@ use crate::utils::{get_global_ctx, get_tokio_runtime, validate_pycapsule, wait_f use datafusion::arrow::datatypes::{DataType, Schema, SchemaRef}; use datafusion::arrow::pyarrow::PyArrowType; 
 use datafusion::arrow::record_batch::RecordBatch;
+use datafusion::catalog::CatalogProvider;
 use datafusion::common::TableReference;
 use datafusion::common::{exec_err, ScalarValue};
 use datafusion::datasource::file_format::file_compression_type::FileCompressionType;
@@ -69,8 +70,10 @@ use datafusion::physical_plan::SendableRecordBatchStream;
 use datafusion::prelude::{
     AvroReadOptions, CsvReadOptions, DataFrame, NdJsonReadOptions, ParquetReadOptions,
 };
+use datafusion_ffi::catalog_provider::{FFI_CatalogProvider, ForeignCatalogProvider};
 use datafusion_ffi::table_provider::{FFI_TableProvider, ForeignTableProvider};
 use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple, PyType};
+use pyo3::IntoPyObjectExt;
 use tokio::task::JoinHandle;

 /// Configuration options for a SessionContext
@@ -365,7 +368,7 @@ impl PySessionContext {
         } else {
             &upstream_host
         };
-        let url_string = format!("{}{}", scheme, derived_host);
+        let url_string = format!("{scheme}{derived_host}");
         let url = Url::parse(&url_string).unwrap();
         self.ctx.runtime_env().register_object_store(&url, store);
         Ok(())
@@ -614,6 +617,34 @@ impl PySessionContext {
         Ok(())
     }

+    pub fn register_catalog_provider(
+        &mut self,
+        name: &str,
+        provider: Bound<'_, PyAny>,
+    ) -> PyDataFusionResult<()> {
+        // Prefer the FFI PyCapsule interface when the object exports one;
+        // otherwise fall back to a Python-implemented catalog provider.
+        let provider = if provider.hasattr("__datafusion_catalog_provider__")? {
+            let capsule = provider
+                .getattr("__datafusion_catalog_provider__")?
+                .call0()?;
+            let capsule = capsule.downcast::<PyCapsule>().map_err(py_datafusion_err)?;
+            validate_pycapsule(capsule, "datafusion_catalog_provider")?;
+
+            let provider = unsafe { capsule.reference::<FFI_CatalogProvider>() };
+            let provider: ForeignCatalogProvider = provider.into();
+            Arc::new(provider) as Arc<dyn CatalogProvider>
+        } else {
+            match provider.extract::<PyCatalog>() {
+                Ok(py_catalog) => py_catalog.catalog,
+                Err(_) => Arc::new(RustWrappedPyCatalogProvider::new(provider.into()))
+                    as Arc<dyn CatalogProvider>,
+            }
+        };
+
+        let _ = self.ctx.register_catalog(name, provider);
+
+        Ok(())
+    }
+
     /// Construct datafusion dataframe from Arrow Table
     pub fn register_table_provider(
         &mut self,
@@ -845,14 +876,24 @@ impl PySessionContext {
     }

     #[pyo3(signature = (name="datafusion"))]
-    pub fn catalog(&self, name: &str) -> PyResult<PyCatalog> {
-        match self.ctx.catalog(name) {
-            Some(catalog) => Ok(PyCatalog::new(catalog)),
-            None => Err(PyKeyError::new_err(format!(
-                "Catalog with name {} doesn't exist.",
-                &name,
-            ))),
-        }
+    pub fn catalog(&self, name: &str) -> PyResult<PyObject> {
+        let catalog = self.ctx.catalog(name).ok_or(PyKeyError::new_err(format!(
+            "Catalog with name {name} doesn't exist."
+        )))?;
+
+        Python::with_gil(|py| {
+            match catalog
+                .as_any()
+                .downcast_ref::<RustWrappedPyCatalogProvider>()
+            {
+                Some(wrapped_schema) => Ok(wrapped_schema.catalog_provider.clone_ref(py)),
+                None => PyCatalog::from(catalog).into_py_any(py),
+            }
+        })
+    }
+
+    pub fn catalog_names(&self) -> HashSet<String> {
+        self.ctx.catalog_names().into_iter().collect()
     }

     pub fn tables(&self) -> HashSet<String> {
diff --git a/src/expr.rs b/src/expr.rs
index 6b1d01d65..f1e002367 100644
--- a/src/expr.rs
+++ b/src/expr.rs
@@ -171,12 +171,10 @@ impl PyExpr {
         Expr::Cast(value) => Ok(cast::PyCast::from(value.clone()).into_bound_py_any(py)?),
         Expr::TryCast(value) => Ok(cast::PyTryCast::from(value.clone()).into_bound_py_any(py)?),
         Expr::ScalarFunction(value) => Err(py_unsupported_variant_err(format!(
-            "Converting Expr::ScalarFunction to a Python object is not implemented: {:?}",
-            value
+            "Converting Expr::ScalarFunction to a Python object is not implemented: {value:?}"
         ))),
         Expr::WindowFunction(value) => Err(py_unsupported_variant_err(format!(
-            "Converting Expr::WindowFunction to a Python object is not implemented: {:?}",
-            value
+            "Converting Expr::WindowFunction to a Python object is not implemented: {value:?}"
         ))),
         Expr::InList(value) => Ok(in_list::PyInList::from(value.clone()).into_bound_py_any(py)?),
         Expr::Exists(value) => Ok(exists::PyExists::from(value.clone()).into_bound_py_any(py)?),
@@ -188,8 +186,7 @@ impl PyExpr {
         }
         #[allow(deprecated)]
         Expr::Wildcard { qualifier, options } => Err(py_unsupported_variant_err(format!(
-            "Converting Expr::Wildcard to a Python object is not implemented : {:?} {:?}",
-            qualifier, options
+            "Converting Expr::Wildcard to a Python object is not implemented : {qualifier:?} {options:?}"
         ))),
         Expr::GroupingSet(value) => {
             Ok(grouping_set::PyGroupingSet::from(value.clone()).into_bound_py_any(py)?)
@@ -198,8 +195,7 @@ impl PyExpr {
             Ok(placeholder::PyPlaceholder::from(value.clone()).into_bound_py_any(py)?)
} Expr::OuterReferenceColumn(data_type, column) => Err(py_unsupported_variant_err(format!( - "Converting Expr::OuterReferenceColumn to a Python object is not implemented: {:?} - {:?}", - data_type, column + "Converting Expr::OuterReferenceColumn to a Python object is not implemented: {data_type:?} - {column:?}" ))), Expr::Unnest(value) => Ok(unnest_expr::PyUnnestExpr::from(value.clone()).into_bound_py_any(py)?), } @@ -755,8 +751,7 @@ impl PyExpr { Expr::Cast(Cast { expr: _, data_type }) => DataTypeMap::map_from_arrow_type(data_type), Expr::Literal(scalar_value, _) => DataTypeMap::map_from_scalar_value(scalar_value), _ => Err(py_type_err(format!( - "Non Expr::Literal encountered in types: {:?}", - expr + "Non Expr::Literal encountered in types: {expr:?}" ))), } } diff --git a/src/expr/aggregate.rs b/src/expr/aggregate.rs index a99d83d23..fd4393271 100644 --- a/src/expr/aggregate.rs +++ b/src/expr/aggregate.rs @@ -116,7 +116,7 @@ impl PyAggregate { } fn __repr__(&self) -> PyResult { - Ok(format!("Aggregate({})", self)) + Ok(format!("Aggregate({self})")) } } diff --git a/src/expr/aggregate_expr.rs b/src/expr/aggregate_expr.rs index c09f116e3..7c5d3d31f 100644 --- a/src/expr/aggregate_expr.rs +++ b/src/expr/aggregate_expr.rs @@ -75,6 +75,6 @@ impl PyAggregateFunction { /// Get a String representation of this column fn __repr__(&self) -> String { - format!("{}", self) + format!("{self}") } } diff --git a/src/expr/alias.rs b/src/expr/alias.rs index e8e03cfad..40746f200 100644 --- a/src/expr/alias.rs +++ b/src/expr/alias.rs @@ -64,6 +64,6 @@ impl PyAlias { /// Get a String representation of this column fn __repr__(&self) -> String { - format!("{}", self) + format!("{self}") } } diff --git a/src/expr/analyze.rs b/src/expr/analyze.rs index 62f93cd26..e8081e95b 100644 --- a/src/expr/analyze.rs +++ b/src/expr/analyze.rs @@ -69,7 +69,7 @@ impl PyAnalyze { } fn __repr__(&self) -> PyResult { - Ok(format!("Analyze({})", self)) + Ok(format!("Analyze({self})")) } } diff --git a/src/expr/between.rs b/src/expr/between.rs index a2cac1442..817f1baae 100644 --- a/src/expr/between.rs +++ b/src/expr/between.rs @@ -71,6 +71,6 @@ impl PyBetween { } fn __repr__(&self) -> String { - format!("{}", self) + format!("{self}") } } diff --git a/src/expr/column.rs b/src/expr/column.rs index 365dbc0d2..50f316f1c 100644 --- a/src/expr/column.rs +++ b/src/expr/column.rs @@ -45,7 +45,7 @@ impl PyColumn { /// Get the column relation fn relation(&self) -> Option { - self.col.relation.as_ref().map(|r| format!("{}", r)) + self.col.relation.as_ref().map(|r| format!("{r}")) } /// Get the fully-qualified column name diff --git a/src/expr/copy_to.rs b/src/expr/copy_to.rs index ebfcb8ebc..473dabfed 100644 --- a/src/expr/copy_to.rs +++ b/src/expr/copy_to.rs @@ -106,7 +106,7 @@ impl PyCopyTo { } fn __repr__(&self) -> PyResult { - Ok(format!("CopyTo({})", self)) + Ok(format!("CopyTo({self})")) } fn __name__(&self) -> PyResult { @@ -129,7 +129,7 @@ impl Display for PyFileType { #[pymethods] impl PyFileType { fn __repr__(&self) -> PyResult { - Ok(format!("FileType({})", self)) + Ok(format!("FileType({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/create_catalog.rs b/src/expr/create_catalog.rs index f4ea0f517..d2d2ee8f6 100644 --- a/src/expr/create_catalog.rs +++ b/src/expr/create_catalog.rs @@ -81,7 +81,7 @@ impl PyCreateCatalog { } fn __repr__(&self) -> PyResult { - Ok(format!("CreateCatalog({})", self)) + Ok(format!("CreateCatalog({self})")) } fn __name__(&self) -> PyResult { diff --git 
a/src/expr/create_catalog_schema.rs b/src/expr/create_catalog_schema.rs index 85f447e1e..e794962f5 100644 --- a/src/expr/create_catalog_schema.rs +++ b/src/expr/create_catalog_schema.rs @@ -81,7 +81,7 @@ impl PyCreateCatalogSchema { } fn __repr__(&self) -> PyResult { - Ok(format!("CreateCatalogSchema({})", self)) + Ok(format!("CreateCatalogSchema({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/create_external_table.rs b/src/expr/create_external_table.rs index 01ce7d0ca..3e35af006 100644 --- a/src/expr/create_external_table.rs +++ b/src/expr/create_external_table.rs @@ -164,7 +164,7 @@ impl PyCreateExternalTable { } fn __repr__(&self) -> PyResult { - Ok(format!("CreateExternalTable({})", self)) + Ok(format!("CreateExternalTable({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/create_function.rs b/src/expr/create_function.rs index 6f3c3f0ff..c02ceebb1 100644 --- a/src/expr/create_function.rs +++ b/src/expr/create_function.rs @@ -163,7 +163,7 @@ impl PyCreateFunction { } fn __repr__(&self) -> PyResult { - Ok(format!("CreateFunction({})", self)) + Ok(format!("CreateFunction({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/create_index.rs b/src/expr/create_index.rs index 13dadbc3f..0f4b5011a 100644 --- a/src/expr/create_index.rs +++ b/src/expr/create_index.rs @@ -110,7 +110,7 @@ impl PyCreateIndex { } fn __repr__(&self) -> PyResult { - Ok(format!("CreateIndex({})", self)) + Ok(format!("CreateIndex({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/create_memory_table.rs b/src/expr/create_memory_table.rs index 8872b2d47..37f4d3420 100644 --- a/src/expr/create_memory_table.rs +++ b/src/expr/create_memory_table.rs @@ -78,7 +78,7 @@ impl PyCreateMemoryTable { } fn __repr__(&self) -> PyResult { - Ok(format!("CreateMemoryTable({})", self)) + Ok(format!("CreateMemoryTable({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/create_view.rs b/src/expr/create_view.rs index 87bb76876..718e404d0 100644 --- a/src/expr/create_view.rs +++ b/src/expr/create_view.rs @@ -75,7 +75,7 @@ impl PyCreateView { } fn __repr__(&self) -> PyResult { - Ok(format!("CreateView({})", self)) + Ok(format!("CreateView({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/describe_table.rs b/src/expr/describe_table.rs index 5658a13f2..6c48f3c77 100644 --- a/src/expr/describe_table.rs +++ b/src/expr/describe_table.rs @@ -61,7 +61,7 @@ impl PyDescribeTable { } fn __repr__(&self) -> PyResult { - Ok(format!("DescribeTable({})", self)) + Ok(format!("DescribeTable({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/distinct.rs b/src/expr/distinct.rs index b62b776f8..889e7099d 100644 --- a/src/expr/distinct.rs +++ b/src/expr/distinct.rs @@ -48,8 +48,7 @@ impl Display for PyDistinct { Distinct::All(input) => write!( f, "Distinct ALL - \nInput: {:?}", - input, + \nInput: {input:?}", ), Distinct::On(distinct_on) => { write!( @@ -71,7 +70,7 @@ impl PyDistinct { } fn __repr__(&self) -> PyResult { - Ok(format!("Distinct({})", self)) + Ok(format!("Distinct({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/drop_catalog_schema.rs b/src/expr/drop_catalog_schema.rs index b7420a99c..b4a4c521c 100644 --- a/src/expr/drop_catalog_schema.rs +++ b/src/expr/drop_catalog_schema.rs @@ -101,7 +101,7 @@ impl PyDropCatalogSchema { } fn __repr__(&self) -> PyResult { - Ok(format!("DropCatalogSchema({})", self)) + Ok(format!("DropCatalogSchema({self})")) } } diff --git a/src/expr/drop_function.rs b/src/expr/drop_function.rs 
index 9fbd78fdc..fca9eb94b 100644 --- a/src/expr/drop_function.rs +++ b/src/expr/drop_function.rs @@ -76,7 +76,7 @@ impl PyDropFunction { } fn __repr__(&self) -> PyResult { - Ok(format!("DropFunction({})", self)) + Ok(format!("DropFunction({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/drop_table.rs b/src/expr/drop_table.rs index 96983c1cf..3f442539a 100644 --- a/src/expr/drop_table.rs +++ b/src/expr/drop_table.rs @@ -70,7 +70,7 @@ impl PyDropTable { } fn __repr__(&self) -> PyResult { - Ok(format!("DropTable({})", self)) + Ok(format!("DropTable({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/drop_view.rs b/src/expr/drop_view.rs index 1d1ab1e59..6196c8bb5 100644 --- a/src/expr/drop_view.rs +++ b/src/expr/drop_view.rs @@ -83,7 +83,7 @@ impl PyDropView { } fn __repr__(&self) -> PyResult { - Ok(format!("DropView({})", self)) + Ok(format!("DropView({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/empty_relation.rs b/src/expr/empty_relation.rs index a1534ac15..758213423 100644 --- a/src/expr/empty_relation.rs +++ b/src/expr/empty_relation.rs @@ -65,7 +65,7 @@ impl PyEmptyRelation { /// Get a String representation of this column fn __repr__(&self) -> String { - format!("{}", self) + format!("{self}") } fn __name__(&self) -> PyResult { diff --git a/src/expr/filter.rs b/src/expr/filter.rs index 9bdb667cd..4fcb600cd 100644 --- a/src/expr/filter.rs +++ b/src/expr/filter.rs @@ -72,7 +72,7 @@ impl PyFilter { } fn __repr__(&self) -> String { - format!("Filter({})", self) + format!("Filter({self})") } } diff --git a/src/expr/join.rs b/src/expr/join.rs index 76ec532e7..b8d1d9da7 100644 --- a/src/expr/join.rs +++ b/src/expr/join.rs @@ -177,7 +177,7 @@ impl PyJoin { } fn __repr__(&self) -> PyResult { - Ok(format!("Join({})", self)) + Ok(format!("Join({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/like.rs b/src/expr/like.rs index 2e1f060bd..f180f5d4c 100644 --- a/src/expr/like.rs +++ b/src/expr/like.rs @@ -75,7 +75,7 @@ impl PyLike { } fn __repr__(&self) -> String { - format!("Like({})", self) + format!("Like({self})") } } @@ -133,7 +133,7 @@ impl PyILike { } fn __repr__(&self) -> String { - format!("Like({})", self) + format!("Like({self})") } } @@ -191,6 +191,6 @@ impl PySimilarTo { } fn __repr__(&self) -> String { - format!("Like({})", self) + format!("Like({self})") } } diff --git a/src/expr/limit.rs b/src/expr/limit.rs index c2a33ff89..92552814e 100644 --- a/src/expr/limit.rs +++ b/src/expr/limit.rs @@ -81,7 +81,7 @@ impl PyLimit { } fn __repr__(&self) -> PyResult { - Ok(format!("Limit({})", self)) + Ok(format!("Limit({self})")) } } diff --git a/src/expr/projection.rs b/src/expr/projection.rs index dc7e5e3c1..b5a9ef34a 100644 --- a/src/expr/projection.rs +++ b/src/expr/projection.rs @@ -85,7 +85,7 @@ impl PyProjection { } fn __repr__(&self) -> PyResult { - Ok(format!("Projection({})", self)) + Ok(format!("Projection({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/recursive_query.rs b/src/expr/recursive_query.rs index 65181f7d3..2517b7417 100644 --- a/src/expr/recursive_query.rs +++ b/src/expr/recursive_query.rs @@ -89,7 +89,7 @@ impl PyRecursiveQuery { } fn __repr__(&self) -> PyResult { - Ok(format!("RecursiveQuery({})", self)) + Ok(format!("RecursiveQuery({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/repartition.rs b/src/expr/repartition.rs index 3e782d6af..48b5e7041 100644 --- a/src/expr/repartition.rs +++ b/src/expr/repartition.rs @@ -108,7 +108,7 @@ impl PyRepartition { } fn 
__repr__(&self) -> PyResult { - Ok(format!("Repartition({})", self)) + Ok(format!("Repartition({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/sort.rs b/src/expr/sort.rs index ed4947591..79a8aee50 100644 --- a/src/expr/sort.rs +++ b/src/expr/sort.rs @@ -87,7 +87,7 @@ impl PySort { } fn __repr__(&self) -> PyResult { - Ok(format!("Sort({})", self)) + Ok(format!("Sort({self})")) } } diff --git a/src/expr/sort_expr.rs b/src/expr/sort_expr.rs index 12f74e4d8..79e35d978 100644 --- a/src/expr/sort_expr.rs +++ b/src/expr/sort_expr.rs @@ -85,6 +85,6 @@ impl PySortExpr { } fn __repr__(&self) -> String { - format!("{}", self) + format!("{self}") } } diff --git a/src/expr/subquery.rs b/src/expr/subquery.rs index 5ebfe6927..77f56f9a9 100644 --- a/src/expr/subquery.rs +++ b/src/expr/subquery.rs @@ -62,7 +62,7 @@ impl PySubquery { } fn __repr__(&self) -> PyResult { - Ok(format!("Subquery({})", self)) + Ok(format!("Subquery({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/subquery_alias.rs b/src/expr/subquery_alias.rs index 267a4d485..3302e7f23 100644 --- a/src/expr/subquery_alias.rs +++ b/src/expr/subquery_alias.rs @@ -72,7 +72,7 @@ impl PySubqueryAlias { } fn __repr__(&self) -> PyResult { - Ok(format!("SubqueryAlias({})", self)) + Ok(format!("SubqueryAlias({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/table_scan.rs b/src/expr/table_scan.rs index 6a0d53f0f..329964687 100644 --- a/src/expr/table_scan.rs +++ b/src/expr/table_scan.rs @@ -136,7 +136,7 @@ impl PyTableScan { } fn __repr__(&self) -> PyResult { - Ok(format!("TableScan({})", self)) + Ok(format!("TableScan({self})")) } } diff --git a/src/expr/union.rs b/src/expr/union.rs index 5a08ccc13..e0b221398 100644 --- a/src/expr/union.rs +++ b/src/expr/union.rs @@ -66,7 +66,7 @@ impl PyUnion { } fn __repr__(&self) -> PyResult { - Ok(format!("Union({})", self)) + Ok(format!("Union({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/unnest.rs b/src/expr/unnest.rs index 8e70e0990..c8833347f 100644 --- a/src/expr/unnest.rs +++ b/src/expr/unnest.rs @@ -66,7 +66,7 @@ impl PyUnnest { } fn __repr__(&self) -> PyResult { - Ok(format!("Unnest({})", self)) + Ok(format!("Unnest({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/unnest_expr.rs b/src/expr/unnest_expr.rs index 2234d24b1..634186ed8 100644 --- a/src/expr/unnest_expr.rs +++ b/src/expr/unnest_expr.rs @@ -58,7 +58,7 @@ impl PyUnnestExpr { } fn __repr__(&self) -> PyResult { - Ok(format!("UnnestExpr({})", self)) + Ok(format!("UnnestExpr({self})")) } fn __name__(&self) -> PyResult { diff --git a/src/expr/window.rs b/src/expr/window.rs index 052d9eeb4..a408731c2 100644 --- a/src/expr/window.rs +++ b/src/expr/window.rs @@ -185,8 +185,7 @@ impl PyWindowFrame { "groups" => WindowFrameUnits::Groups, _ => { return Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - units, + "{units:?}", )))); } }; @@ -197,8 +196,7 @@ impl PyWindowFrame { WindowFrameUnits::Rows => WindowFrameBound::Preceding(ScalarValue::UInt64(None)), WindowFrameUnits::Groups => { return Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - units, + "{units:?}", )))); } }, @@ -210,8 +208,7 @@ impl PyWindowFrame { WindowFrameUnits::Range => WindowFrameBound::Following(ScalarValue::UInt64(None)), WindowFrameUnits::Groups => { return Err(py_datafusion_err(DataFusionError::NotImplemented(format!( - "{:?}", - units, + "{units:?}", )))); } }, @@ -236,7 +233,7 @@ impl PyWindowFrame { /// Get a String representation of this 
window frame fn __repr__(&self) -> String { - format!("{}", self) + format!("{self}") } } diff --git a/src/functions.rs b/src/functions.rs index b2bafcb65..b40500b8b 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -937,7 +937,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(left))?; m.add_wrapped(wrap_pyfunction!(length))?; m.add_wrapped(wrap_pyfunction!(ln))?; - m.add_wrapped(wrap_pyfunction!(log))?; + m.add_wrapped(wrap_pyfunction!(self::log))?; m.add_wrapped(wrap_pyfunction!(log10))?; m.add_wrapped(wrap_pyfunction!(log2))?; m.add_wrapped(wrap_pyfunction!(lower))?; diff --git a/src/lib.rs b/src/lib.rs index 1293eee3c..29d3f41da 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -77,10 +77,10 @@ pub(crate) struct TokioRuntime(tokio::runtime::Runtime); /// datafusion directory. #[pymodule] fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { + // Initialize logging + pyo3_log::init(); + // Register the python classes - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; @@ -98,6 +98,10 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; + let catalog = PyModule::new(py, "catalog")?; + catalog::init_module(&catalog)?; + m.add_submodule(&catalog)?; + // Register `common` as a submodule. Matching `datafusion-common` https://docs.rs/datafusion-common/latest/datafusion_common/ let common = PyModule::new(py, "common")?; common::init_module(&common)?; diff --git a/src/physical_plan.rs b/src/physical_plan.rs index f0be45c6a..49db643e1 100644 --- a/src/physical_plan.rs +++ b/src/physical_plan.rs @@ -78,8 +78,7 @@ impl PyExecutionPlan { let proto_plan = datafusion_proto::protobuf::PhysicalPlanNode::decode(bytes).map_err(|e| { PyRuntimeError::new_err(format!( - "Unable to decode logical node from serialized bytes: {}", - e + "Unable to decode logical node from serialized bytes: {e}" )) })?; diff --git a/src/sql/logical.rs b/src/sql/logical.rs index 198d68bdc..97d320470 100644 --- a/src/sql/logical.rs +++ b/src/sql/logical.rs @@ -201,8 +201,7 @@ impl PyLogicalPlan { let proto_plan = datafusion_proto::protobuf::LogicalPlanNode::decode(bytes).map_err(|e| { PyRuntimeError::new_err(format!( - "Unable to decode logical node from serialized bytes: {}", - e + "Unable to decode logical node from serialized bytes: {e}" )) })?; diff --git a/src/utils.rs b/src/utils.rs index f4e121fd5..3b30de5de 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -109,8 +109,7 @@ pub(crate) fn validate_pycapsule(capsule: &Bound, name: &str) -> PyRe let capsule_name = capsule_name.unwrap().to_str()?; if capsule_name != name { return Err(PyValueError::new_err(format!( - "Expected name '{}' in PyCapsule, instead got '{}'", - name, capsule_name + "Expected name '{name}' in PyCapsule, instead got '{capsule_name}'" ))); } @@ -127,7 +126,7 @@ pub(crate) fn py_obj_to_scalar_value(py: Python, obj: PyObject) -> PyResult Date: Wed, 2 Jul 2025 09:59:02 -0400 Subject: [PATCH 08/96] feat: add FFI support for user defined functions (#1145) * Intermediate work adding ffi scalar udf * Add scalar UDF and example * Add aggregate udf via ffi * Initial commit for window ffi integration * Remove unused import --- docs/source/contributor-guide/ffi.rst | 2 +- examples/datafusion-ffi-example/Cargo.lock | 217 ++++++++++-------- examples/datafusion-ffi-example/Cargo.toml | 8 +- .../python/tests/_test_aggregate_udf.py | 77 +++++++ .../python/tests/_test_scalar_udf.py | 70 ++++++ 
.../python/tests/_test_window_udf.py | 89 +++++++ .../src/aggregate_udf.rs | 81 +++++++ .../src/catalog_provider.rs | 1 - examples/datafusion-ffi-example/src/lib.rs | 9 + .../datafusion-ffi-example/src/scalar_udf.rs | 91 ++++++++ .../datafusion-ffi-example/src/window_udf.rs | 81 +++++++ python/datafusion/user_defined.py | 107 ++++++++- src/functions.rs | 2 +- src/udaf.rs | 31 ++- src/udf.rs | 25 +- src/udwf.rs | 27 ++- 16 files changed, 805 insertions(+), 113 deletions(-) create mode 100644 examples/datafusion-ffi-example/python/tests/_test_aggregate_udf.py create mode 100644 examples/datafusion-ffi-example/python/tests/_test_scalar_udf.py create mode 100644 examples/datafusion-ffi-example/python/tests/_test_window_udf.py create mode 100644 examples/datafusion-ffi-example/src/aggregate_udf.rs create mode 100644 examples/datafusion-ffi-example/src/scalar_udf.rs create mode 100644 examples/datafusion-ffi-example/src/window_udf.rs diff --git a/docs/source/contributor-guide/ffi.rst b/docs/source/contributor-guide/ffi.rst index c1f9806b3..a40af1234 100644 --- a/docs/source/contributor-guide/ffi.rst +++ b/docs/source/contributor-guide/ffi.rst @@ -176,7 +176,7 @@ By convention the ``datafusion-python`` library expects a Python object that has ``TableProvider`` PyCapsule to have this capsule accessible by calling a function named ``__datafusion_table_provider__``. You can see a complete working example of how to share a ``TableProvider`` from one python library to DataFusion Python in the -`repository examples folder `_. +`repository examples folder `_. This section has been written using ``TableProvider`` as an example. It is the first extension that has been written using this approach and the most thoroughly implemented. diff --git a/examples/datafusion-ffi-example/Cargo.lock b/examples/datafusion-ffi-example/Cargo.lock index e5a1ca8d1..1b4ca6bee 100644 --- a/examples/datafusion-ffi-example/Cargo.lock +++ b/examples/datafusion-ffi-example/Cargo.lock @@ -323,6 +323,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73a47aa0c771b5381de2b7f16998d351a6f4eb839f1e13d48353e17e873d969b" dependencies = [ "bitflags", + "serde", + "serde_json", ] [[package]] @@ -748,9 +750,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffe060b978f74ab446be722adb8a274e052e005bf6dfd171caadc3abaad10080" +checksum = "cc6cb8c2c81eada072059983657d6c9caf3fddefc43b4a65551d243253254a96" dependencies = [ "arrow", "arrow-ipc", @@ -775,7 +777,6 @@ dependencies = [ "datafusion-functions-nested", "datafusion-functions-table", "datafusion-functions-window", - "datafusion-macros", "datafusion-optimizer", "datafusion-physical-expr", "datafusion-physical-expr-common", @@ -790,7 +791,7 @@ dependencies = [ "object_store", "parking_lot", "parquet", - "rand", + "rand 0.9.1", "regex", "sqlparser", "tempfile", @@ -803,9 +804,9 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61fe34f401bd03724a1f96d12108144f8cd495a3cdda2bf5e091822fb80b7e66" +checksum = "b7be8d1b627843af62e447396db08fe1372d882c0eb8d0ea655fd1fbc33120ee" dependencies = [ "arrow", "async-trait", @@ -829,9 +830,9 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a4411b8e3bce5e0fc7521e44f201def2e2d5d1b5f176fb56e8cdc9942c890f00" +checksum = "38ab16c5ae43f65ee525fc493ceffbc41f40dee38b01f643dfcfc12959e92038" dependencies = [ "arrow", "async-trait", @@ -852,9 +853,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0734015d81c8375eb5d4869b7f7ecccc2ee8d6cb81948ef737cd0e7b743bd69c" +checksum = "d3d56b2ac9f476b93ca82e4ef5fb00769c8a3f248d12b4965af7e27635fa7e12" dependencies = [ "ahash", "arrow", @@ -876,9 +877,9 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5167bb1d2ccbb87c6bc36c295274d7a0519b14afcfdaf401d53cbcaa4ef4968b" +checksum = "16015071202d6133bc84d72756176467e3e46029f3ce9ad2cb788f9b1ff139b2" dependencies = [ "futures", "log", @@ -887,9 +888,9 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04e602dcdf2f50c2abf297cc2203c73531e6f48b29516af7695d338cf2a778b1" +checksum = "b77523c95c89d2a7eb99df14ed31390e04ab29b43ff793e562bdc1716b07e17b" dependencies = [ "arrow", "async-compression", @@ -912,7 +913,7 @@ dependencies = [ "log", "object_store", "parquet", - "rand", + "rand 0.9.1", "tempfile", "tokio", "tokio-util", @@ -923,9 +924,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-csv" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3bb2253952dc32296ed5b84077cb2e0257fea4be6373e1c376426e17ead4ef6" +checksum = "40d25c5e2c0ebe8434beeea997b8e88d55b3ccc0d19344293f2373f65bc524fc" dependencies = [ "arrow", "async-trait", @@ -948,9 +949,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b8c7f47a5d2fe03bfa521ec9bafdb8a5c82de8377f60967c3663f00c8790352" +checksum = "3dc6959e1155741ab35369e1dc7673ba30fc45ed568fad34c01b7cb1daeb4d4c" dependencies = [ "arrow", "async-trait", @@ -973,9 +974,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-parquet" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27d15868ea39ed2dc266728b554f6304acd473de2142281ecfa1294bb7415923" +checksum = "b7a6afdfe358d70f4237f60eaef26ae5a1ce7cb2c469d02d5fc6c7fd5d84e58b" dependencies = [ "arrow", "async-trait", @@ -998,21 +999,21 @@ dependencies = [ "object_store", "parking_lot", "parquet", - "rand", + "rand 0.9.1", "tokio", ] [[package]] name = "datafusion-doc" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a91f8c2c5788ef32f48ff56c68e5b545527b744822a284373ac79bba1ba47292" +checksum = "9bcd8a3e3e3d02ea642541be23d44376b5d5c37c2938cce39b3873cdf7186eea" [[package]] name = "datafusion-execution" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06f004d100f49a3658c9da6fb0c3a9b760062d96cd4ad82ccc3b7b69a9fb2f84" +checksum = "670da1d45d045eee4c2319b8c7ea57b26cf48ab77b630aaa50b779e406da476a" dependencies = [ "arrow", "dashmap", @@ -1022,16 +1023,16 @@ dependencies = [ "log", "object_store", "parking_lot", - "rand", + "rand 0.9.1", "tempfile", "url", ] [[package]] name = 
"datafusion-expr" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a4e4ce3802609be38eeb607ee72f6fe86c3091460de9dbfae9e18db423b3964" +checksum = "b3a577f64bdb7e2cc4043cd97f8901d8c504711fde2dbcb0887645b00d7c660b" dependencies = [ "arrow", "chrono", @@ -1050,9 +1051,9 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "422ac9cf3b22bbbae8cdf8ceb33039107fde1b5492693168f13bd566b1bcc839" +checksum = "51b7916806ace3e9f41884f230f7f38ebf0e955dfbd88266da1826f29a0b9a6a" dependencies = [ "arrow", "datafusion-common", @@ -1063,9 +1064,9 @@ dependencies = [ [[package]] name = "datafusion-ffi" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cf3fe9ab492c56daeb7beed526690d33622d388b8870472e0b7b7f55490338c" +checksum = "980cca31de37f5dadf7ea18e4ffc2b6833611f45bed5ef9de0831d2abb50f1ef" dependencies = [ "abi_stable", "arrow", @@ -1073,7 +1074,9 @@ dependencies = [ "async-ffi", "async-trait", "datafusion", + "datafusion-functions-aggregate-common", "datafusion-proto", + "datafusion-proto-common", "futures", "log", "prost", @@ -1081,11 +1084,25 @@ dependencies = [ "tokio", ] +[[package]] +name = "datafusion-ffi-example" +version = "0.2.0" +dependencies = [ + "arrow", + "arrow-array", + "arrow-schema", + "async-trait", + "datafusion", + "datafusion-ffi", + "pyo3", + "pyo3-build-config", +] + [[package]] name = "datafusion-functions" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ddf0a0a2db5d2918349c978d42d80926c6aa2459cd8a3c533a84ec4bb63479e" +checksum = "7fb31c9dc73d3e0c365063f91139dc273308f8a8e124adda9898db8085d68357" dependencies = [ "arrow", "arrow-buffer", @@ -1103,7 +1120,7 @@ dependencies = [ "itertools", "log", "md-5", - "rand", + "rand 0.9.1", "regex", "sha2", "unicode-segmentation", @@ -1112,9 +1129,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "408a05dafdc70d05a38a29005b8b15e21b0238734dab1e98483fcb58038c5aba" +checksum = "ebb72c6940697eaaba9bd1f746a697a07819de952b817e3fb841fb75331ad5d4" dependencies = [ "ahash", "arrow", @@ -1133,9 +1150,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "756d21da2dd6c9bef97af1504970ff56cbf35d03fbd4ffd62827f02f4d2279d4" +checksum = "d7fdc54656659e5ecd49bf341061f4156ab230052611f4f3609612a0da259696" dependencies = [ "ahash", "arrow", @@ -1146,9 +1163,9 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d8d50f6334b378930d992d801a10ac5b3e93b846b39e4a05085742572844537" +checksum = "fad94598e3374938ca43bca6b675febe557e7a14eb627d617db427d70d65118b" dependencies = [ "arrow", "arrow-ord", @@ -1167,9 +1184,9 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc9a97220736c8fff1446e936be90d57216c06f28969f9ffd3b72ac93c958c8a" +checksum = 
"de2fc6c2946da5cab8364fb28b5cac3115f0f3a87960b235ed031c3f7e2e639b" dependencies = [ "arrow", "async-trait", @@ -1183,10 +1200,11 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cefc2d77646e1aadd1d6a9c40088937aedec04e68c5f0465939912e1291f8193" +checksum = "3e5746548a8544870a119f556543adcd88fe0ba6b93723fe78ad0439e0fbb8b4" dependencies = [ + "arrow", "datafusion-common", "datafusion-doc", "datafusion-expr", @@ -1200,9 +1218,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd4aff082c42fa6da99ce0698c85addd5252928c908eb087ca3cfa64ff16b313" +checksum = "dcbe9404382cda257c434f22e13577bee7047031dfdb6216dd5e841b9465e6fe" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1210,9 +1228,9 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df6f88d7ee27daf8b108ba910f9015176b36fbc72902b1ca5c2a5f1d1717e1a1" +checksum = "8dce50e3b637dab0d25d04d2fe79dfdca2b257eabd76790bffd22c7f90d700c8" dependencies = [ "datafusion-expr", "quote", @@ -1221,9 +1239,9 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "084d9f979c4b155346d3c34b18f4256e6904ded508e9554d90fed416415c3515" +checksum = "03cfaacf06445dc3bbc1e901242d2a44f2cae99a744f49f3fefddcee46240058" dependencies = [ "arrow", "chrono", @@ -1240,9 +1258,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64c536062b0076f4e30084065d805f389f9fe38af0ca75bcbac86bc5e9fbab65" +checksum = "1908034a89d7b2630898e06863583ae4c00a0dd310c1589ca284195ee3f7f8a6" dependencies = [ "ahash", "arrow", @@ -1262,9 +1280,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8a92b53b3193fac1916a1c5b8e3f4347c526f6822e56b71faa5fb372327a863" +checksum = "47b7a12dd59ea07614b67dbb01d85254fbd93df45bcffa63495e11d3bdf847df" dependencies = [ "ahash", "arrow", @@ -1276,9 +1294,9 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fa0a5ac94c7cf3da97bedabd69d6bbca12aef84b9b37e6e9e8c25286511b5e2" +checksum = "4371cc4ad33978cc2a8be93bd54a232d3f2857b50401a14631c0705f3f910aae" dependencies = [ "arrow", "datafusion-common", @@ -1295,9 +1313,9 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "690c615db468c2e5fe5085b232d8b1c088299a6c63d87fd960a354a71f7acb55" +checksum = "dc47bc33025757a5c11f2cd094c5b6b5ed87f46fa33c023e6fdfa25fcbfade23" dependencies = [ "ahash", "arrow", @@ -1325,9 +1343,9 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a4a1afb2bdb05de7ff65be6883ebfd4ec027bd9f1f21c46aa3afd01927160a83" +checksum = "d8f5d9acd7d96e3bf2a7bb04818373cab6e51de0356e3694b94905fee7b4e8b6" dependencies = [ "arrow", "chrono", @@ -1341,9 +1359,9 @@ dependencies = [ [[package]] name = "datafusion-proto-common" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35b7a5876ebd6b564fb9a1fd2c3a2a9686b787071a256b47e4708f0916f9e46f" +checksum = "09ecb5ec152c4353b60f7a5635489834391f7a291d2b39a4820cd469e318b78e" dependencies = [ "arrow", "datafusion-common", @@ -1352,9 +1370,9 @@ dependencies = [ [[package]] name = "datafusion-session" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad229a134c7406c057ece00c8743c0c34b97f4e72f78b475fe17b66c5e14fa4f" +checksum = "d7485da32283985d6b45bd7d13a65169dcbe8c869e25d01b2cfbc425254b4b49" dependencies = [ "arrow", "async-trait", @@ -1376,9 +1394,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "47.0.0" +version = "48.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64f6ab28b72b664c21a27b22a2ff815fd390ed224c26e89a93b5a8154a4e8607" +checksum = "a466b15632befddfeac68c125f0260f569ff315c6831538cbb40db754134e0df" dependencies = [ "arrow", "bigdecimal", @@ -1441,20 +1459,6 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" -[[package]] -name = "ffi-table-provider" -version = "0.1.0" -dependencies = [ - "arrow", - "arrow-array", - "arrow-schema", - "async-trait", - "datafusion", - "datafusion-ffi", - "pyo3", - "pyo3-build-config", -] - [[package]] name = "fixedbitset" version = "0.5.7" @@ -1488,6 +1492,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -1666,6 +1676,11 @@ name = "hashbrown" version = "0.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] [[package]] name = "heck" @@ -2271,12 +2286,14 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "petgraph" -version = "0.7.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +checksum = "54acf3a685220b533e437e264e4d932cfbdc4cc7ec0cd232ed73c08d03b8a7ca" dependencies = [ "fixedbitset", + "hashbrown 0.15.3", "indexmap", + "serde", ] [[package]] @@ -2305,7 +2322,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" dependencies = [ "phf_shared", - "rand", + "rand 0.8.5", ] [[package]] @@ -2484,19 +2501,27 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ - "libc", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97" +dependencies = [ "rand_chacha", - "rand_core", + "rand_core 0.9.3", ] [[package]] name = "rand_chacha" -version = "0.3.1" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.9.3", ] [[package]] @@ -2504,8 +2529,14 @@ name = "rand_core" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" + +[[package]] +name = "rand_core" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" dependencies = [ - "getrandom 0.2.16", + "getrandom 0.3.3", ] [[package]] @@ -3032,9 +3063,9 @@ checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" [[package]] name = "uuid" -version = "1.16.0" +version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9" +checksum = "3cf4199d1e5d15ddd86a694e4d0dffa9c323ce759fea589f00fef9d81cc1931d" dependencies = [ "getrandom 0.3.3", "js-sys", diff --git a/examples/datafusion-ffi-example/Cargo.toml b/examples/datafusion-ffi-example/Cargo.toml index 319163554..b26ab48e3 100644 --- a/examples/datafusion-ffi-example/Cargo.toml +++ b/examples/datafusion-ffi-example/Cargo.toml @@ -16,13 +16,13 @@ # under the License. [package] -name = "ffi-table-provider" -version = "0.1.0" +name = "datafusion-ffi-example" +version = "0.2.0" edition = "2021" [dependencies] -datafusion = { version = "47.0.0" } -datafusion-ffi = { version = "47.0.0" } +datafusion = { version = "48.0.0" } +datafusion-ffi = { version = "48.0.0" } pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py39"] } arrow = { version = "55.0.0" } arrow-array = { version = "55.0.0" } diff --git a/examples/datafusion-ffi-example/python/tests/_test_aggregate_udf.py b/examples/datafusion-ffi-example/python/tests/_test_aggregate_udf.py new file mode 100644 index 000000000..7ea6b295c --- /dev/null +++ b/examples/datafusion-ffi-example/python/tests/_test_aggregate_udf.py @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from __future__ import annotations + +import pyarrow as pa +from datafusion import SessionContext, col, udaf +from datafusion_ffi_example import MySumUDF + + +def setup_context_with_table(): + ctx = SessionContext() + + # Pick numbers here so we get the same value in both groups + # since we cannot be certain of the output order of batches + batch = pa.RecordBatch.from_arrays( + [ + pa.array([1, 2, 3, None], type=pa.int64()), + pa.array([1, 1, 2, 2], type=pa.int64()), + ], + names=["a", "b"], + ) + ctx.register_record_batches("test_table", [[batch]]) + return ctx + + +def test_ffi_aggregate_register(): + ctx = setup_context_with_table() + my_udaf = udaf(MySumUDF()) + ctx.register_udaf(my_udaf) + + result = ctx.sql("select my_custom_sum(a) from test_table group by b").collect() + + assert len(result) == 2 + assert result[0].num_columns == 1 + + result = [r.column(0) for r in result] + expected = [ + pa.array([3], type=pa.int64()), + pa.array([3], type=pa.int64()), + ] + + assert result == expected + + +def test_ffi_aggregate_call_directly(): + ctx = setup_context_with_table() + my_udaf = udaf(MySumUDF()) + + result = ( + ctx.table("test_table").aggregate([col("b")], [my_udaf(col("a"))]).collect() + ) + + assert len(result) == 2 + assert result[0].num_columns == 2 + + result = [r.column(1) for r in result] + expected = [ + pa.array([3], type=pa.int64()), + pa.array([3], type=pa.int64()), + ] + + assert result == expected diff --git a/examples/datafusion-ffi-example/python/tests/_test_scalar_udf.py b/examples/datafusion-ffi-example/python/tests/_test_scalar_udf.py new file mode 100644 index 000000000..0c949c34a --- /dev/null +++ b/examples/datafusion-ffi-example/python/tests/_test_scalar_udf.py @@ -0,0 +1,70 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
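+# These tests exercise the scalar UDF FFI path end to end: IsNullUDF is
+# implemented in Rust, exported as an FFI_ScalarUDF PyCapsule, and wrapped
+# with `udf` so it can be registered for SQL use or called on a DataFrame.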
+ +from __future__ import annotations + +import pyarrow as pa +from datafusion import SessionContext, col, udf +from datafusion_ffi_example import IsNullUDF + + +def setup_context_with_table(): + ctx = SessionContext() + + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3, None])], + names=["a"], + ) + ctx.register_record_batches("test_table", [[batch]]) + return ctx + + +def test_ffi_scalar_register(): + ctx = setup_context_with_table() + my_udf = udf(IsNullUDF()) + ctx.register_udf(my_udf) + + result = ctx.sql("select my_custom_is_null(a) from test_table").collect() + + assert len(result) == 1 + assert result[0].num_columns == 1 + print(result) + + result = [r.column(0) for r in result] + expected = [ + pa.array([False, False, False, True], type=pa.bool_()), + ] + + assert result == expected + + +def test_ffi_scalar_call_directly(): + ctx = setup_context_with_table() + my_udf = udf(IsNullUDF()) + + result = ctx.table("test_table").select(my_udf(col("a"))).collect() + + assert len(result) == 1 + assert result[0].num_columns == 1 + print(result) + + result = [r.column(0) for r in result] + expected = [ + pa.array([False, False, False, True], type=pa.bool_()), + ] + + assert result == expected diff --git a/examples/datafusion-ffi-example/python/tests/_test_window_udf.py b/examples/datafusion-ffi-example/python/tests/_test_window_udf.py new file mode 100644 index 000000000..7d96994b9 --- /dev/null +++ b/examples/datafusion-ffi-example/python/tests/_test_window_udf.py @@ -0,0 +1,89 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
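+# These tests exercise the window UDF FFI path end to end: MyRankUDF is
+# implemented in Rust, exported over FFI as a PyCapsule, and wrapped with
+# `udwf`. Results are sorted before asserting because the output order of
+# batches is not guaranteed.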
+ +from __future__ import annotations + +import pyarrow as pa +from datafusion import SessionContext, col, udwf +from datafusion_ffi_example import MyRankUDF + + +def setup_context_with_table(): + ctx = SessionContext() + + # Pick numbers here so we get the same value in both groups + # since we cannot be certain of the output order of batches + batch = pa.RecordBatch.from_arrays( + [ + pa.array([40, 10, 30, 20], type=pa.int64()), + ], + names=["a"], + ) + ctx.register_record_batches("test_table", [[batch]]) + return ctx + + +def test_ffi_window_register(): + ctx = setup_context_with_table() + my_udwf = udwf(MyRankUDF()) + ctx.register_udwf(my_udwf) + + result = ctx.sql( + "select a, my_custom_rank() over (order by a) from test_table" + ).collect() + assert len(result) == 1 + assert result[0].num_columns == 2 + + results = [ + (result[0][0][idx].as_py(), result[0][1][idx].as_py()) for idx in range(4) + ] + results.sort() + + expected = [ + (10, 1), + (20, 2), + (30, 3), + (40, 4), + ] + assert results == expected + + +def test_ffi_window_call_directly(): + ctx = setup_context_with_table() + my_udwf = udwf(MyRankUDF()) + + result = ( + ctx.table("test_table") + .select(col("a"), my_udwf().order_by(col("a")).build()) + .collect() + ) + + assert len(result) == 1 + assert result[0].num_columns == 2 + + results = [ + (result[0][0][idx].as_py(), result[0][1][idx].as_py()) for idx in range(4) + ] + results.sort() + + expected = [ + (10, 1), + (20, 2), + (30, 3), + (40, 4), + ] + assert results == expected diff --git a/examples/datafusion-ffi-example/src/aggregate_udf.rs b/examples/datafusion-ffi-example/src/aggregate_udf.rs new file mode 100644 index 000000000..9481fe9c6 --- /dev/null +++ b/examples/datafusion-ffi-example/src/aggregate_udf.rs @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
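+
+// MySumUDF shows the minimal shape of an FFI-exported aggregate: wrap an
+// existing implementation (the built-in `Sum`), delegate each trait method to
+// it, and expose the result as an `FFI_AggregateUDF` inside a PyCapsule named
+// "datafusion_aggregate_udf". A rough sketch of the Python side, using the
+// names from the tests above:
+//
+//     from datafusion import SessionContext, udaf
+//     from datafusion_ffi_example import MySumUDF
+//
+//     ctx = SessionContext()
+//     ctx.register_udaf(udaf(MySumUDF()))
+//     ctx.sql("select my_custom_sum(a) from test_table group by b").collect()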
+
+use arrow_schema::DataType;
+use datafusion::error::Result as DataFusionResult;
+use datafusion::functions_aggregate::sum::Sum;
+use datafusion::logical_expr::function::AccumulatorArgs;
+use datafusion::logical_expr::{Accumulator, AggregateUDF, AggregateUDFImpl, Signature};
+use datafusion_ffi::udaf::FFI_AggregateUDF;
+use pyo3::types::PyCapsule;
+use pyo3::{pyclass, pymethods, Bound, PyResult, Python};
+use std::any::Any;
+use std::sync::Arc;
+
+#[pyclass(name = "MySumUDF", module = "datafusion_ffi_example", subclass)]
+#[derive(Debug, Clone)]
+pub(crate) struct MySumUDF {
+    inner: Arc<Sum>,
+}
+
+#[pymethods]
+impl MySumUDF {
+    #[new]
+    fn new() -> Self {
+        Self {
+            inner: Arc::new(Sum::new()),
+        }
+    }
+
+    fn __datafusion_aggregate_udf__<'py>(
+        &self,
+        py: Python<'py>,
+    ) -> PyResult<Bound<'py, PyCapsule>> {
+        let name = cr"datafusion_aggregate_udf".into();
+
+        let func = Arc::new(AggregateUDF::from(self.clone()));
+        let provider = FFI_AggregateUDF::from(func);
+
+        PyCapsule::new(py, provider, Some(name))
+    }
+}
+
+impl AggregateUDFImpl for MySumUDF {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "my_custom_sum"
+    }
+
+    fn signature(&self) -> &Signature {
+        self.inner.signature()
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> DataFusionResult<DataType> {
+        self.inner.return_type(arg_types)
+    }
+
+    fn accumulator(&self, acc_args: AccumulatorArgs) -> DataFusionResult<Box<dyn Accumulator>> {
+        self.inner.accumulator(acc_args)
+    }
+
+    fn coerce_types(&self, arg_types: &[DataType]) -> DataFusionResult<Vec<DataType>> {
+        self.inner.coerce_types(arg_types)
+    }
+}
diff --git a/examples/datafusion-ffi-example/src/catalog_provider.rs b/examples/datafusion-ffi-example/src/catalog_provider.rs
index 54e61cf3e..cd2616916 100644
--- a/examples/datafusion-ffi-example/src/catalog_provider.rs
+++ b/examples/datafusion-ffi-example/src/catalog_provider.rs
@@ -24,7 +24,6 @@ use datafusion::{
     catalog::{
         CatalogProvider, MemoryCatalogProvider, MemorySchemaProvider, SchemaProvider,
         TableProvider,
    },
-    common::exec_err,
     datasource::MemTable,
     error::{DataFusionError, Result},
 };
diff --git a/examples/datafusion-ffi-example/src/lib.rs b/examples/datafusion-ffi-example/src/lib.rs
index 3a4cf2247..79af276fd 100644
--- a/examples/datafusion-ffi-example/src/lib.rs
+++ b/examples/datafusion-ffi-example/src/lib.rs
@@ -16,18 +16,27 @@
 // under the License.
 
 use crate::catalog_provider::MyCatalogProvider;
+use crate::aggregate_udf::MySumUDF;
+use crate::scalar_udf::IsNullUDF;
 use crate::table_function::MyTableFunction;
 use crate::table_provider::MyTableProvider;
+use crate::window_udf::MyRankUDF;
 use pyo3::prelude::*;
 
 pub(crate) mod catalog_provider;
+pub(crate) mod aggregate_udf;
+pub(crate) mod scalar_udf;
 pub(crate) mod table_function;
 pub(crate) mod table_provider;
+pub(crate) mod window_udf;
 
 #[pymodule]
 fn datafusion_ffi_example(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<MyTableProvider>()?;
     m.add_class::<MyTableFunction>()?;
     m.add_class::<MyCatalogProvider>()?;
+    m.add_class::<IsNullUDF>()?;
+    m.add_class::<MySumUDF>()?;
+    m.add_class::<MyRankUDF>()?;
     Ok(())
 }
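For context, here is a minimal sketch of how the classes registered in `lib.rs` above are consumed from Python. It assumes the example crate has been built into the active environment (for instance with `maturin develop`); the module and class names come from this patch, while the `SessionContext` registration calls are the standard public API.

```python
# Minimal usage sketch, assuming `maturin develop` has built
# datafusion_ffi_example into the current Python environment.
from datafusion import SessionContext, udaf, udf, udwf
from datafusion_ffi_example import IsNullUDF, MyRankUDF, MySumUDF

ctx = SessionContext()
ctx.register_udf(udf(IsNullUDF()))    # SQL name: my_custom_is_null
ctx.register_udaf(udaf(MySumUDF()))   # SQL name: my_custom_sum
ctx.register_udwf(udwf(MyRankUDF()))  # SQL name: my_custom_rank
```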
diff --git a/examples/datafusion-ffi-example/src/scalar_udf.rs b/examples/datafusion-ffi-example/src/scalar_udf.rs
new file mode 100644
index 000000000..727666638
--- /dev/null
+++ b/examples/datafusion-ffi-example/src/scalar_udf.rs
@@ -0,0 +1,91 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_array::{Array, BooleanArray};
+use arrow_schema::DataType;
+use datafusion::common::ScalarValue;
+use datafusion::error::Result as DataFusionResult;
+use datafusion::logical_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, TypeSignature,
+    Volatility,
+};
+use datafusion_ffi::udf::FFI_ScalarUDF;
+use pyo3::types::PyCapsule;
+use pyo3::{pyclass, pymethods, Bound, PyResult, Python};
+use std::any::Any;
+use std::sync::Arc;
+
+#[pyclass(name = "IsNullUDF", module = "datafusion_ffi_example", subclass)]
+#[derive(Debug, Clone)]
+pub(crate) struct IsNullUDF {
+    signature: Signature,
+}
+
+#[pymethods]
+impl IsNullUDF {
+    #[new]
+    fn new() -> Self {
+        Self {
+            signature: Signature::new(TypeSignature::Any(1), Volatility::Immutable),
+        }
+    }
+
+    fn __datafusion_scalar_udf__<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyCapsule>> {
+        let name = cr"datafusion_scalar_udf".into();
+
+        let func = Arc::new(ScalarUDF::from(self.clone()));
+        let provider = FFI_ScalarUDF::from(func);
+
+        PyCapsule::new(py, provider, Some(name))
+    }
+}
+
+impl ScalarUDFImpl for IsNullUDF {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "my_custom_is_null"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> DataFusionResult<DataType> {
+        Ok(DataType::Boolean)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> DataFusionResult<ColumnarValue> {
+        let input = &args.args[0];
+
+        Ok(match input {
+            ColumnarValue::Array(arr) => match arr.is_nullable() {
+                true => {
+                    let nulls = arr.nulls().unwrap();
+                    let nulls = BooleanArray::from_iter(nulls.iter().map(|x| Some(!x)));
+                    ColumnarValue::Array(Arc::new(nulls))
+                }
+                false => ColumnarValue::Scalar(ScalarValue::Boolean(Some(false))),
+            },
+            ColumnarValue::Scalar(sv) => {
+                ColumnarValue::Scalar(ScalarValue::Boolean(Some(sv == &ScalarValue::Null)))
+            }
+        })
+    }
+}
diff --git a/examples/datafusion-ffi-example/src/window_udf.rs b/examples/datafusion-ffi-example/src/window_udf.rs
new file mode 100644
index 000000000..e0d397956
--- /dev/null
+++ b/examples/datafusion-ffi-example/src/window_udf.rs
@@ -0,0 +1,81 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_schema::{DataType, FieldRef};
+use datafusion::error::Result as DataFusionResult;
+use datafusion::functions_window::rank::rank_udwf;
+use datafusion::logical_expr::function::{PartitionEvaluatorArgs, WindowUDFFieldArgs};
+use datafusion::logical_expr::{PartitionEvaluator, Signature, WindowUDF, WindowUDFImpl};
+use datafusion_ffi::udwf::FFI_WindowUDF;
+use pyo3::types::PyCapsule;
+use pyo3::{pyclass, pymethods, Bound, PyResult, Python};
+use std::any::Any;
+use std::sync::Arc;
+
+#[pyclass(name = "MyRankUDF", module = "datafusion_ffi_example", subclass)]
+#[derive(Debug, Clone)]
+pub(crate) struct MyRankUDF {
+    inner: Arc<WindowUDF>,
+}
+
+#[pymethods]
+impl MyRankUDF {
+    #[new]
+    fn new() -> Self {
+        Self { inner: rank_udwf() }
+    }
+
+    fn __datafusion_window_udf__<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyCapsule>> {
+        let name = cr"datafusion_window_udf".into();
+
+        let func = Arc::new(WindowUDF::from(self.clone()));
+        let provider = FFI_WindowUDF::from(func);
+
+        PyCapsule::new(py, provider, Some(name))
+    }
+}
+
+impl WindowUDFImpl for MyRankUDF {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "my_custom_rank"
+    }
+
+    fn signature(&self) -> &Signature {
+        self.inner.signature()
+    }
+
+    fn partition_evaluator(
+        &self,
+        partition_evaluator_args: PartitionEvaluatorArgs,
+    ) -> DataFusionResult<Box<dyn PartitionEvaluator>> {
+        self.inner
+            .inner()
+            .partition_evaluator(partition_evaluator_args)
+    }
+
+    fn field(&self, field_args: WindowUDFFieldArgs) -> DataFusionResult<FieldRef> {
+        self.inner.inner().field(field_args)
+    }
+
+    fn coerce_types(&self, arg_types: &[DataType]) -> DataFusionResult<Vec<DataType>> {
+        self.inner.coerce_types(arg_types)
+    }
+}
diff --git a/python/datafusion/user_defined.py b/python/datafusion/user_defined.py
index dd634c7fb..bd686acbb 100644
--- a/python/datafusion/user_defined.py
+++ b/python/datafusion/user_defined.py
@@ -22,7 +22,7 @@
 import functools
 from abc import ABCMeta, abstractmethod
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar, overload
+from typing import TYPE_CHECKING, Any, Callable, Optional, Protocol, TypeVar, overload
 
 import pyarrow as pa
 
@@ -77,6 +77,12 @@ def __str__(self) -> str:
         return self.name.lower()
 
 
+class ScalarUDFExportable(Protocol):
+    """Type hint for object that has __datafusion_scalar_udf__ PyCapsule."""
+
+    def __datafusion_scalar_udf__(self) -> object: ...  # noqa: D105
+
+
 class ScalarUDF:
     """Class for performing scalar user-defined functions (UDF).
 
@@ -96,6 +102,9 @@ def __init__(
 
         See helper method :py:func:`udf` for argument details.
         """
+        if hasattr(func, "__datafusion_scalar_udf__"):
+            self._udf = df_internal.ScalarUDF.from_pycapsule(func)
+            return
         if isinstance(input_types, pa.DataType):
             input_types = [input_types]
         self._udf = df_internal.ScalarUDF(
@@ -134,6 +143,10 @@ def udf(
         name: Optional[str] = None,
     ) -> ScalarUDF: ...
 
+    @overload
+    @staticmethod
+    def udf(func: ScalarUDFExportable) -> ScalarUDF: ...
+
     @staticmethod
     def udf(*args: Any, **kwargs: Any):  # noqa: D417
         """Create a new User-Defined Function (UDF).
@@ -147,7 +160,10 @@ def udf(*args: Any, **kwargs: Any):  # noqa: D417
 
         Args:
             func (Callable, optional): Only needed when calling as a function.
-                Skip this argument when using ``udf`` as a decorator.
+                Skip this argument when using ``udf`` as a decorator. If you have a
+                Rust backed ScalarUDF within a PyCapsule, you can pass this parameter
+                and ignore the rest. They will be determined directly from the
+                underlying function. See the online documentation for more information.
             input_types (list[pa.DataType]): The data types of the arguments
                 to ``func``. This list must be of the same length as the number of
                 arguments.
@@ -215,12 +231,31 @@ def wrapper(*args: Any, **kwargs: Any):
 
         return decorator
 
+        if args and hasattr(args[0], "__datafusion_scalar_udf__"):
+            return ScalarUDF.from_pycapsule(args[0])
+
         if args and callable(args[0]):
             # Case 1: Used as a function, require the first parameter to be callable
             return _function(*args, **kwargs)
         # Case 2: Used as a decorator with parameters
         return _decorator(*args, **kwargs)
 
+    @staticmethod
+    def from_pycapsule(func: ScalarUDFExportable) -> ScalarUDF:
+        """Create a Scalar UDF from a ScalarUDF PyCapsule object.
+
+        This function will instantiate a Scalar UDF that uses a DataFusion
+        ScalarUDF that is exported via the FFI bindings.
+        """
+        name = str(func.__class__)
+        return ScalarUDF(
+            name=name,
+            func=func,
+            input_types=None,
+            return_type=None,
+            volatility=None,
+        )
+
 
 class Accumulator(metaclass=ABCMeta):
     """Defines how an :py:class:`AggregateUDF` accumulates values."""
 
@@ -242,6 +277,12 @@ def evaluate(self) -> pa.Scalar:
         """Return the resultant value."""
 
 
+class AggregateUDFExportable(Protocol):
+    """Type hint for object that has __datafusion_aggregate_udf__ PyCapsule."""
+
+    def __datafusion_aggregate_udf__(self) -> object: ...  # noqa: D105
+
+
 class AggregateUDF:
     """Class for performing scalar user-defined functions (UDF).
 
@@ -263,6 +304,9 @@ def __init__(
 
         See :py:func:`udaf` for a convenience function and argument descriptions.
         """
+        if hasattr(accumulator, "__datafusion_aggregate_udf__"):
+            self._udaf = df_internal.AggregateUDF.from_pycapsule(accumulator)
+            return
         self._udaf = df_internal.AggregateUDF(
             name,
             accumulator,
@@ -307,7 +351,7 @@ def udaf(
     ) -> AggregateUDF: ...
 
     @staticmethod
-    def udaf(*args: Any, **kwargs: Any):  # noqa: D417
+    def udaf(*args: Any, **kwargs: Any):  # noqa: D417, C901
         """Create a new User-Defined Aggregate Function (UDAF).
 
         This class allows you to define an aggregate function that can be used in
@@ -364,6 +408,10 @@ def udf4() -> Summarize:
         Args:
             accum: The accumulator python function. Only needed when calling as a
                 function. Skip this argument when using ``udaf`` as a decorator.
+                If you have a Rust backed AggregateUDF within a PyCapsule, you can
+                pass this parameter and ignore the rest. They will be determined
+                directly from the underlying function. See the online documentation
+                for more information.
             input_types: The data types of the arguments to ``accum``.
             return_type: The data type of the return value.
             state_type: The data types of the intermediate accumulation.
@@ -422,12 +470,32 @@ def wrapper(*args: Any, **kwargs: Any) -> Expr:
 
         return decorator
 
+        if args and hasattr(args[0], "__datafusion_aggregate_udf__"):
+            return AggregateUDF.from_pycapsule(args[0])
+
         if args and callable(args[0]):
             # Case 1: Used as a function, require the first parameter to be callable
             return _function(*args, **kwargs)
         # Case 2: Used as a decorator with parameters
         return _decorator(*args, **kwargs)
 
+    @staticmethod
+    def from_pycapsule(func: AggregateUDFExportable) -> AggregateUDF:
+        """Create an Aggregate UDF from an AggregateUDF PyCapsule object.
+
+        This function will instantiate an Aggregate UDF that uses a DataFusion
+        AggregateUDF that is exported via the FFI bindings.
+ """ + name = str(func.__class__) + return AggregateUDF( + name=name, + accumulator=func, + input_types=None, + return_type=None, + state_type=None, + volatility=None, + ) + class WindowEvaluator: """Evaluator class for user-defined window functions (UDWF). @@ -588,6 +656,12 @@ def include_rank(self) -> bool: return False +class WindowUDFExportable(Protocol): + """Type hint for object that has __datafusion_window_udf__ PyCapsule.""" + + def __datafusion_window_udf__(self) -> object: ... # noqa: D105 + + class WindowUDF: """Class for performing window user-defined functions (UDF). @@ -608,6 +682,9 @@ def __init__( See :py:func:`udwf` for a convenience function and argument descriptions. """ + if hasattr(func, "__datafusion_window_udf__"): + self._udwf = df_internal.WindowUDF.from_pycapsule(func) + return self._udwf = df_internal.WindowUDF( name, func, input_types, return_type, str(volatility) ) @@ -683,7 +760,10 @@ def biased_numbers() -> BiasedNumbers: Args: func: Only needed when calling as a function. Skip this argument when - using ``udwf`` as a decorator. + using ``udwf`` as a decorator. If you have a Rust backed WindowUDF + within a PyCapsule, you can pass this parameter and ignore the rest. + They will be determined directly from the underlying function. See + the online documentation for more information. input_types: The data types of the arguments. return_type: The data type of the return value. volatility: See :py:class:`Volatility` for allowed values. @@ -692,6 +772,9 @@ def biased_numbers() -> BiasedNumbers: Returns: A user-defined window function that can be used in window function calls. """ + if hasattr(args[0], "__datafusion_window_udf__"): + return WindowUDF.from_pycapsule(args[0]) + if args and callable(args[0]): # Case 1: Used as a function, require the first parameter to be callable return WindowUDF._create_window_udf(*args, **kwargs) @@ -759,6 +842,22 @@ def wrapper(*args: Any, **kwargs: Any) -> Expr: return decorator + @staticmethod + def from_pycapsule(func: WindowUDFExportable) -> WindowUDF: + """Create a Window UDF from WindowUDF PyCapsule object. + + This function will instantiate a Window UDF that uses a DataFusion + WindowUDF that is exported via the FFI bindings. + """ + name = str(func.__class__) + return WindowUDF( + name=name, + func=func, + input_types=None, + return_type=None, + volatility=None, + ) + class TableFunction: """Class for performing user-defined table functions (UDTF). 
diff --git a/src/functions.rs b/src/functions.rs
index b40500b8b..eeef48385 100644
--- a/src/functions.rs
+++ b/src/functions.rs
@@ -682,7 +682,7 @@ pub fn approx_percentile_cont_with_weight(
     add_builder_fns_to_aggregate(agg_fn, None, filter, None, None)
 }
 
-// We handle first_value explicitly because the signature expects an order_by
+// We handle last_value explicitly because the signature expects an order_by
 // https://github.com/apache/datafusion/issues/12376
 #[pyfunction]
 #[pyo3(signature = (expr, distinct=None, filter=None, order_by=None, null_treatment=None))]
diff --git a/src/udaf.rs b/src/udaf.rs
index 34a9cd51d..78f4e2b0c 100644
--- a/src/udaf.rs
+++ b/src/udaf.rs
@@ -19,6 +19,10 @@ use std::sync::Arc;
 
 use pyo3::{prelude::*, types::PyTuple};
 
+use crate::common::data_type::PyScalarValue;
+use crate::errors::{py_datafusion_err, to_datafusion_err, PyDataFusionResult};
+use crate::expr::PyExpr;
+use crate::utils::{parse_volatility, validate_pycapsule};
 use datafusion::arrow::array::{Array, ArrayRef};
 use datafusion::arrow::datatypes::DataType;
 use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow};
@@ -27,11 +31,8 @@ use datafusion::error::{DataFusionError, Result};
 use datafusion::logical_expr::{
     create_udaf, Accumulator, AccumulatorFactoryFunction, AggregateUDF,
 };
-
-use crate::common::data_type::PyScalarValue;
-use crate::errors::to_datafusion_err;
-use crate::expr::PyExpr;
-use crate::utils::parse_volatility;
+use datafusion_ffi::udaf::{FFI_AggregateUDF, ForeignAggregateUDF};
+use pyo3::types::PyCapsule;
 
 #[derive(Debug)]
 struct RustAccumulator {
@@ -183,6 +184,26 @@ impl PyAggregateUDF {
         Ok(Self { function })
     }
 
+    #[staticmethod]
+    pub fn from_pycapsule(func: Bound<'_, PyAny>) -> PyDataFusionResult<Self> {
+        if func.hasattr("__datafusion_aggregate_udf__")? {
+            let capsule = func.getattr("__datafusion_aggregate_udf__")?.call0()?;
+            let capsule = capsule.downcast::<PyCapsule>().map_err(py_datafusion_err)?;
+            validate_pycapsule(capsule, "datafusion_aggregate_udf")?;
+
+            let udaf = unsafe { capsule.reference::<FFI_AggregateUDF>() };
+            let udaf: ForeignAggregateUDF = udaf.try_into()?;
+
+            Ok(Self {
+                function: udaf.into(),
+            })
+        } else {
+            Err(crate::errors::PyDataFusionError::Common(
+                "__datafusion_aggregate_udf__ does not exist on AggregateUDF object.".to_string(),
+            ))
+        }
+    }
+
     /// creates a new PyExpr with the call of the udf
     #[pyo3(signature = (*args))]
     fn __call__(&self, args: Vec<PyExpr>) -> PyResult<PyExpr> {
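Once wrapped, an FFI-backed aggregate behaves like any other UDAF. A hedged end-to-end sketch follows; the table setup mirrors the example tests earlier in this series, and `MySumUDF` comes from the example crate in this patch.

```python
# End-to-end sketch: querying the FFI-backed aggregate through SQL.
import pyarrow as pa
from datafusion import SessionContext, udaf
from datafusion_ffi_example import MySumUDF

ctx = SessionContext()
batch = pa.RecordBatch.from_arrays(
    [pa.array([1, 2, 3], type=pa.int64())], names=["a"]
)
ctx.register_record_batches("test_table", [[batch]])
ctx.register_udaf(udaf(MySumUDF()))

result = ctx.sql("select my_custom_sum(a) from test_table").collect()
# Expect a single batch with one value: 6
```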
diff --git a/src/udf.rs b/src/udf.rs
index 574c9d7b5..de1e3f18c 100644
--- a/src/udf.rs
+++ b/src/udf.rs
@@ -17,6 +17,8 @@
 
 use std::sync::Arc;
 
+use datafusion_ffi::udf::{FFI_ScalarUDF, ForeignScalarUDF};
+use pyo3::types::PyCapsule;
 use pyo3::{prelude::*, types::PyTuple};
 
 use datafusion::arrow::array::{make_array, Array, ArrayData, ArrayRef};
@@ -29,8 +31,9 @@ use datafusion::logical_expr::ScalarUDF;
 use datafusion::logical_expr::{create_udf, ColumnarValue};
 
 use crate::errors::to_datafusion_err;
+use crate::errors::{py_datafusion_err, PyDataFusionResult};
 use crate::expr::PyExpr;
-use crate::utils::parse_volatility;
+use crate::utils::{parse_volatility, validate_pycapsule};
 
 /// Create a Rust callable function from a python function that expects pyarrow arrays
 fn pyarrow_function_to_rust(
@@ -105,6 +108,26 @@ impl PyScalarUDF {
         Ok(Self { function })
     }
 
+    #[staticmethod]
+    pub fn from_pycapsule(func: Bound<'_, PyAny>) -> PyDataFusionResult<Self> {
+        if func.hasattr("__datafusion_scalar_udf__")? {
+            let capsule = func.getattr("__datafusion_scalar_udf__")?.call0()?;
+            let capsule = capsule.downcast::<PyCapsule>().map_err(py_datafusion_err)?;
+            validate_pycapsule(capsule, "datafusion_scalar_udf")?;
+
+            let udf = unsafe { capsule.reference::<FFI_ScalarUDF>() };
+            let udf: ForeignScalarUDF = udf.try_into()?;
+
+            Ok(Self {
+                function: udf.into(),
+            })
+        } else {
+            Err(crate::errors::PyDataFusionError::Common(
+                "__datafusion_scalar_udf__ does not exist on ScalarUDF object.".to_string(),
+            ))
+        }
+    }
+
     /// creates a new PyExpr with the call of the udf
     #[pyo3(signature = (*args))]
     fn __call__(&self, args: Vec<PyExpr>) -> PyResult<PyExpr> {
diff --git a/src/udwf.rs b/src/udwf.rs
index a0c8cc59a..4fb98916b 100644
--- a/src/udwf.rs
+++ b/src/udwf.rs
@@ -27,16 +27,17 @@ use pyo3::exceptions::PyValueError;
 use pyo3::prelude::*;
 
 use crate::common::data_type::PyScalarValue;
-use crate::errors::to_datafusion_err;
+use crate::errors::{py_datafusion_err, to_datafusion_err, PyDataFusionResult};
 use crate::expr::PyExpr;
-use crate::utils::parse_volatility;
+use crate::utils::{parse_volatility, validate_pycapsule};
 use datafusion::arrow::datatypes::DataType;
 use datafusion::arrow::pyarrow::{FromPyArrow, PyArrowType, ToPyArrow};
 use datafusion::error::{DataFusionError, Result};
 use datafusion::logical_expr::{
     PartitionEvaluator, PartitionEvaluatorFactory, Signature, Volatility, WindowUDF, WindowUDFImpl,
 };
-use pyo3::types::{PyList, PyTuple};
+use datafusion_ffi::udwf::{FFI_WindowUDF, ForeignWindowUDF};
+use pyo3::types::{PyCapsule, PyList, PyTuple};
 
 #[derive(Debug)]
 struct RustPartitionEvaluator {
@@ -245,6 +246,26 @@ impl PyWindowUDF {
         Ok(self.function.call(args).into())
     }
 
+    #[staticmethod]
+    pub fn from_pycapsule(func: Bound<'_, PyAny>) -> PyDataFusionResult<Self> {
+        if func.hasattr("__datafusion_window_udf__")? {
+            let capsule = func.getattr("__datafusion_window_udf__")?.call0()?;
+            let capsule = capsule.downcast::<PyCapsule>().map_err(py_datafusion_err)?;
+            validate_pycapsule(capsule, "datafusion_window_udf")?;
+
+            let udwf = unsafe { capsule.reference::<FFI_WindowUDF>() };
+            let udwf: ForeignWindowUDF = udwf.try_into()?;
+
+            Ok(Self {
+                function: udwf.into(),
+            })
+        } else {
+            Err(crate::errors::PyDataFusionError::Common(
+                "__datafusion_window_udf__ does not exist on WindowUDF object.".to_string(),
+            ))
+        }
+    }
+
     fn __repr__(&self) -> PyResult<String> {
         Ok(format!("WindowUDF({})", self.function.name()))
     }

From 2e1b71369eefc97c22b82be84bbabb414f748fb9 Mon Sep 17 00:00:00 2001
From: kosiew
Date: Fri, 4 Jul 2025 20:36:05 +0800
Subject: [PATCH 09/96] refactor: style loading logic in DataFrameHtmlFormatter
 (#1177)

---
 python/datafusion/dataframe_formatter.py | 117 ++++++++---------------
 python/tests/test_dataframe.py           |  60 +++++-------
 2 files changed, 63 insertions(+), 114 deletions(-)

diff --git a/python/datafusion/dataframe_formatter.py b/python/datafusion/dataframe_formatter.py
index 27f00f9c3..2323224b8 100644
--- a/python/datafusion/dataframe_formatter.py
+++ b/python/datafusion/dataframe_formatter.py
@@ -135,9 +135,6 @@ class DataFrameHtmlFormatter:
             session
     """
 
-    # Class variable to track if styles have been loaded in the notebook
-    _styles_loaded = False
-
     def __init__(
         self,
         max_cell_length: int = 25,
@@ -260,23 +257,6 @@ def set_custom_header_builder(self, builder: Callable[[Any], str]) -> None:
         """
         self._custom_header_builder = builder
 
-    @classmethod
-    def is_styles_loaded(cls) -> bool:
-        """Check if HTML styles have been loaded in the current session.
-
-        This method is primarily intended for debugging UI rendering issues
-        related to style loading.
- - Returns: - True if styles have been loaded, False otherwise - - Example: - >>> from datafusion.dataframe_formatter import DataFrameHtmlFormatter - >>> DataFrameHtmlFormatter.is_styles_loaded() - False - """ - return cls._styles_loaded - def format_html( self, batches: list, @@ -315,18 +295,7 @@ def format_html( # Build HTML components html = [] - # Only include styles and scripts if: - # 1. Not using shared styles, OR - # 2. Using shared styles but they haven't been loaded yet - include_styles = ( - not self.use_shared_styles or not DataFrameHtmlFormatter._styles_loaded - ) - - if include_styles: - html.extend(self._build_html_header()) - # If we're using shared styles, mark them as loaded - if self.use_shared_styles: - DataFrameHtmlFormatter._styles_loaded = True + html.extend(self._build_html_header()) html.extend(self._build_table_container_start()) @@ -338,7 +307,7 @@ def format_html( html.append("") # Add footer (JavaScript and messages) - if include_styles and self.enable_cell_expansion: + if self.enable_cell_expansion: html.append(self._get_javascript()) # Always add truncation message if needed (independent of styles) @@ -375,14 +344,20 @@ def format_str( def _build_html_header(self) -> list[str]: """Build the HTML header with CSS styles.""" - html = [] - html.append("") + html.append(f"") return html def _build_table_container_start(self) -> list[str]: @@ -570,28 +545,31 @@ def _get_default_css(self) -> str: def _get_javascript(self) -> str: """Get JavaScript code for interactive elements.""" return """ - - """ + +""" class FormatterManager: @@ -712,24 +690,9 @@ def reset_formatter() -> None: >>> reset_formatter() # Reset formatter to default settings """ formatter = DataFrameHtmlFormatter() - # Reset the styles_loaded flag to ensure styles will be reloaded - DataFrameHtmlFormatter._styles_loaded = False set_formatter(formatter) -def reset_styles_loaded_state() -> None: - """Reset the styles loaded state to force reloading of styles. - - This can be useful when switching between notebook sessions or - when styles need to be refreshed. - - Example: - >>> from datafusion.html_formatter import reset_styles_loaded_state - >>> reset_styles_loaded_state() # Force styles to reload in next render - """ - DataFrameHtmlFormatter._styles_loaded = False - - def _refresh_formatter_reference() -> None: """Refresh formatter reference in any modules using it. diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index c9ae38d8e..a3870ead8 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -42,7 +42,6 @@ configure_formatter, get_formatter, reset_formatter, - reset_styles_loaded_state, ) from datafusion.expr import Window from pyarrow.csv import write_csv @@ -2177,27 +2176,15 @@ def test_html_formatter_shared_styles(df, clean_formatter_state): # First, ensure we're using shared styles configure_formatter(use_shared_styles=True) - # Get HTML output for first table - should include styles html_first = df._repr_html_() - - # Verify styles are included in first render - assert "
" - f"{field.name}
" - f"
" - "" - "" - f"{formatted_value}" - f"" - f"
" - f"
{formatted_value}