diff --git a/README.md b/README.md index 63c2d41..7955fa2 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,16 @@ We provide script to check codes. ./dev/lint-python.sh -h # run this to see more usages ``` +## Build + +We provide a script to build source distribution package. + +```shell +./dev/build-source-distribution-package.sh +``` + +The package is under `dist/`. + # Usage See Apache Paimon Python API [Doc](https://paimon.apache.org/docs/master/program-api/python-api/). diff --git a/dev/build-source-distribution-package.sh b/dev/build-source-distribution-package.sh new file mode 100755 index 0000000..461a52f --- /dev/null +++ b/dev/build-source-distribution-package.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +CURR_DIR=`pwd` +BASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" +PROJECT_ROOT="${BASE_DIR}/../" + +# prepare bridge jar + +DEPS_DIR="${PROJECT_ROOT}/deps/jars" +rm -rf "${DEPS_DIR}" +mkdir -p "${DEPS_DIR}" + +cd "${PROJECT_ROOT}/paimon-python-java-bridge" || exit 1 + +# get bridge jar version (text of the first <version> element found in pom.xml) +JAR_VERSION=$(sed -n 's/.*<version>\(.*\)<\/version>.*/\1/p' pom.xml | head -n 1) + +mvn clean install -DskipTests +cp "target/paimon-python-java-bridge-${JAR_VERSION}.jar" "${DEPS_DIR}" + +cd "${PROJECT_ROOT}" || exit 1 + +# build source distribution package (run from the project root so the script works from any directory) + +python setup.py sdist + +rm -rf "${DEPS_DIR}" +cd "${CURR_DIR}" diff --git a/pypaimon/api/table_read.py b/pypaimon/api/table_read.py index 9fcb78c..60b31e7 100644 --- a/pypaimon/api/table_read.py +++ b/pypaimon/api/table_read.py @@ -18,12 +18,14 @@ import pandas as pd import pyarrow as pa -import ray from abc import ABC, abstractmethod -from duckdb.duckdb import DuckDBPyConnection from pypaimon.api import Split -from typing import List, Optional +from typing import List, Optional, TYPE_CHECKING + +if TYPE_CHECKING: + import ray + from duckdb.duckdb import DuckDBPyConnection class TableRead(ABC): @@ -46,9 +48,9 @@ def to_duckdb( self, splits: List[Split], table_name: str, - connection: Optional[DuckDBPyConnection] = None) -> DuckDBPyConnection: + connection: Optional["DuckDBPyConnection"] = None) -> "DuckDBPyConnection": """Convert splits into an in-memory DuckDB table which can be queried.""" @abstractmethod - def to_ray(self, splits: List[Split]) -> ray.data.dataset.Dataset: + def to_ray(self, splits: List[Split]) -> "ray.data.dataset.Dataset": """Convert splits into a Ray dataset format.""" diff --git a/pypaimon/py4j/java_implementation.py b/pypaimon/py4j/java_implementation.py index 17c6eda..ce90bc5 100644 --- a/pypaimon/py4j/java_implementation.py +++ b/pypaimon/py4j/java_implementation.py @@ -18,19 +18,20 @@ # pypaimon.api implementation based on Java code & py4j lib -import duckdb import pandas as pd import pyarrow as pa -import 
ray -from duckdb.duckdb import DuckDBPyConnection from pypaimon.py4j.java_gateway import get_gateway from pypaimon.py4j.util import java_utils, constants from pypaimon.api import \ (catalog, table, read_builder, table_scan, split, table_read, write_builder, table_write, commit_message, table_commit, Schema, predicate) -from typing import List, Iterator, Optional, Any +from typing import List, Iterator, Optional, Any, TYPE_CHECKING + +if TYPE_CHECKING: + import ray + from duckdb.duckdb import DuckDBPyConnection class Catalog(catalog.Catalog): @@ -171,12 +172,16 @@ def to_duckdb( self, splits: List[Split], table_name: str, - connection: Optional[DuckDBPyConnection] = None) -> DuckDBPyConnection: + connection: Optional["DuckDBPyConnection"] = None) -> "DuckDBPyConnection": + import duckdb + con = connection or duckdb.connect(database=":memory:") con.register(table_name, self.to_arrow(splits)) return con - def to_ray(self, splits: List[Split]) -> ray.data.dataset.Dataset: + def to_ray(self, splits: List[Split]) -> "ray.data.dataset.Dataset": + import ray + return ray.data.from_arrow(self.to_arrow(splits)) def _init(self):