googleapis
diff --git a/Collapse file
‎bigframes/core/array_value.py‎
Copy file name to clipboardExpand all lines: bigframes/core/array_value.py
+6-7Lines changed: 6 additions & 7 deletions b/Collapse file
‎bigframes/core/array_value.py‎
Copy file name to clipboardExpand all lines: bigframes/core/array_value.py
+6-7Lines changed: 6 additions & 7 deletions
diff --git a/Collapse file
‎bigframes/core/bq_data.py‎
Copy file name to clipboardExpand all lines: bigframes/core/bq_data.py
+173-13Lines changed: 173 additions & 13 deletions b/Collapse file
‎bigframes/core/bq_data.py‎
Copy file name to clipboardExpand all lines: bigframes/core/bq_data.py
+173-13Lines changed: 173 additions & 13 deletions
diff --git a/Collapse file
‎bigframes/core/compile/ibis_compiler/ibis_compiler.py‎
Copy file name to clipboardExpand all lines: bigframes/core/compile/ibis_compiler/ibis_compiler.py
+1-3Lines changed: 1 addition & 3 deletions b/Collapse file
‎bigframes/core/compile/ibis_compiler/ibis_compiler.py‎
Copy file name to clipboardExpand all lines: bigframes/core/compile/ibis_compiler/ibis_compiler.py
+1-3Lines changed: 1 addition & 3 deletions
diff --git a/Collapse file
‎bigframes/core/nodes.py‎
Copy file name to clipboardExpand all lines: bigframes/core/nodes.py
+1-3Lines changed: 1 addition & 3 deletions b/Collapse file
‎bigframes/core/nodes.py‎
Copy file name to clipboardExpand all lines: bigframes/core/nodes.py
+1-3Lines changed: 1 addition & 3 deletions
diff --git a/Collapse file
‎bigframes/core/schema.py‎
Copy file name to clipboardExpand all lines: bigframes/core/schema.py
+6-21Lines changed: 6 additions & 21 deletions b/Collapse file
‎bigframes/core/schema.py‎
Copy file name to clipboardExpand all lines: bigframes/core/schema.py
+6-21Lines changed: 6 additions & 21 deletions
@@ -17,9 +17,8 @@
 import datetime
 import functools
 import typing
-from typing import Iterable, List, Mapping, Optional, Sequence, Tuple
+from typing import Iterable, List, Mapping, Optional, Sequence, Tuple, Union
 
-import google.cloud.bigquery
 import pandas
 import pyarrow as pa
 
@@ -91,7 +90,7 @@ def from_range(cls, start, end, step):
     @classmethod
     def from_table(
         cls,
-        table: google.cloud.bigquery.Table,
+        table: Union[bq_data.BiglakeIcebergTable, bq_data.GbqNativeTable],
         session: Session,
         *,
         columns: Optional[Sequence[str]] = None,
@@ -103,8 +102,6 @@ def from_table(
     ):
         if offsets_col and primary_key:
             raise ValueError("must set at most one of 'offests', 'primary_key'")
-        # define data source only for needed columns, this makes row-hashing cheaper
-        table_def = bq_data.GbqTable.from_table(table, columns=columns or ())
 
         # create ordering from info
         ordering = None
@@ -115,7 +112,9 @@ def from_table(
                 [ids.ColumnId(key_part) for key_part in primary_key]
             )
 
-        bf_schema = schemata.ArraySchema.from_bq_table(table, columns=columns)
+        bf_schema = schemata.ArraySchema.from_bq_schema(
+            table.physical_schema, columns=columns
+        )
         # Scan all columns by default, we define this list as it can be pruned while preserving source_def
         scan_list = nodes.ScanList(
             tuple(
@@ -124,7 +123,7 @@ def from_table(
             )
         )
         source_def = bq_data.BigqueryDataSource(
-            table=table_def,
+            table=table,
             schema=bf_schema,
             at_time=at_time,
             sql_predicate=predicate,
 
@@ -22,74 +22,214 @@
 import queue
 import threading
 import typing
-from typing import Any, Iterator, Optional, Sequence, Tuple
+from typing import Any, Iterator, List, Literal, Optional, Sequence, Tuple, Union
 
 from google.cloud import bigquery_storage_v1
 import google.cloud.bigquery as bq
 import google.cloud.bigquery_storage_v1.types as bq_storage_types
 from google.protobuf import timestamp_pb2
 import pyarrow as pa
 
+import bigframes.constants
 from bigframes.core import pyarrow_utils
 import bigframes.core.schema
 
 if typing.TYPE_CHECKING:
     import bigframes.core.ordering as orderings
 
 
+def _resolve_standard_gcp_region(bq_region: str):
+    """
+    Resolve bq regions to standardized
+    """
+    if bq_region.casefold() == "US":
+        return "us-central1"
+    elif bq_region.casefold() == "EU":
+        return "europe-west4"
+    return bq_region
+
+
+def is_irc_table(table_id: str):
+    """
+    Determines if a table id should be resolved through the iceberg rest catalog.
+    """
+    return len(table_id.split(".")) == 4
+
+
+def is_compatible(
+    data_region: Union[GcsRegion, BigQueryRegion], session_location: str
+) -> bool:
+    # based on https://docs.cloud.google.com/bigquery/docs/locations#storage-location-considerations
+    if isinstance(data_region, BigQueryRegion):
+        return data_region.name == session_location
+    else:
+        assert isinstance(data_region, GcsRegion)
+        # TODO(b/463675088): Multi-regions don't yet support rest catalog tables
+        if session_location in bigframes.constants.BIGQUERY_MULTIREGIONS:
+            return False
+        return _resolve_standard_gcp_region(session_location) in data_region.included
+
+
+def get_default_bq_region(data_region: Union[GcsRegion, BigQueryRegion]) -> str:
+    if isinstance(data_region, BigQueryRegion):
+        return data_region.name
+    elif isinstance(data_region, GcsRegion):
+        # should maybe try to track and prefer primary replica?
+        return data_region.included[0]
+
+
+@dataclasses.dataclass(frozen=True)
+class BigQueryRegion:
+    name: str
+
+
+@dataclasses.dataclass(frozen=True)
+class GcsRegion:
+    # this is the name of gcs regions, which may be names for multi-regions, so shouldn't be compared with non-gcs locations
+    storage_regions: tuple[str, ...]
+    # this tracks all the included standard, specific regions (eg us-east1), and should be comparable to bq regions (except non-standard US, EU, omni regions)
+    included: tuple[str, ...]
+
+
+# what is the line between metadata and core fields? Mostly metadata fields are optional or unreliable, but its fuzzy
 @dataclasses.dataclass(frozen=True)
-class GbqTable:
+class TableMetadata:
+    # this size metadata might be stale, don't use where strict correctness is needed
+    location: Union[BigQueryRegion, GcsRegion]
+    type: Literal["TABLE", "EXTERNAL", "VIEW", "MATERIALIZE_VIEW", "SNAPSHOT"]
+    numBytes: Optional[int] = None
+    numRows: Optional[int] = None
+    created_time: Optional[datetime.datetime] = None
+    modified_time: Optional[datetime.datetime] = None
+
+
+@dataclasses.dataclass(frozen=True)
+class GbqNativeTable:
     project_id: str = dataclasses.field()
     dataset_id: str = dataclasses.field()
     table_id: str = dataclasses.field()
     physical_schema: Tuple[bq.SchemaField, ...] = dataclasses.field()
-    is_physically_stored: bool = dataclasses.field()
-    cluster_cols: typing.Optional[Tuple[str, ...]]
+    metadata: TableMetadata = dataclasses.field()
+    partition_col: Optional[str] = None
+    cluster_cols: typing.Optional[Tuple[str, ...]] = None
+    primary_key: Optional[Tuple[str, ...]] = None
 
     @staticmethod
-    def from_table(table: bq.Table, columns: Sequence[str] = ()) -> GbqTable:
+    def from_table(table: bq.Table, columns: Sequence[str] = ()) -> GbqNativeTable:
         # Subsetting fields with columns can reduce cost of row-hash default ordering
         if columns:
             schema = tuple(item for item in table.schema if item.name in columns)
         else:
             schema = tuple(table.schema)
-        return GbqTable(
+
+        metadata = TableMetadata(
+            numBytes=table.num_bytes,
+            numRows=table.num_rows,
+            location=BigQueryRegion(table.location),  # type: ignore
+            type=table.table_type or "TABLE",  # type: ignore
+            created_time=table.created,
+            modified_time=table.modified,
+        )
+        partition_col = None
+        if table.range_partitioning:
+            partition_col = table.range_partitioning.field
+        elif table.time_partitioning:
+            partition_col = table.time_partitioning.field
+
+        return GbqNativeTable(
             project_id=table.project,
             dataset_id=table.dataset_id,
             table_id=table.table_id,
             physical_schema=schema,
-            is_physically_stored=(table.table_type in ["TABLE", "MATERIALIZED_VIEW"]),
+            partition_col=partition_col,
             cluster_cols=None
-            if table.clustering_fields is None
+            if (table.clustering_fields is None)
             else tuple(table.clustering_fields),
+            primary_key=tuple(_get_primary_keys(table)),
+            metadata=metadata,
         )
 
     @staticmethod
     def from_ref_and_schema(
         table_ref: bq.TableReference,
         schema: Sequence[bq.SchemaField],
+        location: str,
+        table_type: Literal["TABLE"] = "TABLE",
         cluster_cols: Optional[Sequence[str]] = None,
-    ) -> GbqTable:
-        return GbqTable(
+    ) -> GbqNativeTable:
+        return GbqNativeTable(
             project_id=table_ref.project,
             dataset_id=table_ref.dataset_id,
             table_id=table_ref.table_id,
+            metadata=TableMetadata(location=BigQueryRegion(location), type=table_type),
             physical_schema=tuple(schema),
-            is_physically_stored=True,
             cluster_cols=tuple(cluster_cols) if cluster_cols else None,
         )
 
+    @property
+    def is_physically_stored(self) -> bool:
+        return self.metadata.type in ["TABLE", "MATERIALIZED_VIEW"]
+
     def get_table_ref(self) -> bq.TableReference:
         return bq.TableReference(
             bq.DatasetReference(self.project_id, self.dataset_id), self.table_id
         )
 
+    def get_full_id(self, quoted: bool = False) -> str:
+        if quoted:
+            return f"`{self.project_id}`.`{self.dataset_id}`.`{self.table_id}`"
+        return f"{self.project_id}.{self.dataset_id}.{self.table_id}"
+
     @property
     @functools.cache
     def schema_by_id(self):
         return {col.name: col for col in self.physical_schema}
 
 
+@dataclasses.dataclass(frozen=True)
+class BiglakeIcebergTable:
+    project_id: str = dataclasses.field()
+    catalog_id: str = dataclasses.field()
+    namespace_id: str = dataclasses.field()
+    table_id: str = dataclasses.field()
+    physical_schema: Tuple[bq.SchemaField, ...] = dataclasses.field()
+    cluster_cols: typing.Optional[Tuple[str, ...]]
+    metadata: TableMetadata
+
+    def get_full_id(self, quoted: bool = False) -> str:
+        if quoted:
+            return f"`{self.project_id}`.`{self.catalog_id}`.`{self.namespace_id}`.`{self.table_id}`"
+        return (
+            f"{self.project_id}.{self.catalog_id}.{self.namespace_id}.{self.table_id}"
+        )
+
+    @property
+    @functools.cache
+    def schema_by_id(self):
+        return {col.name: col for col in self.physical_schema}
+
+    @property
+    def partition_col(self) -> Optional[str]:
+        # TODO: Use iceberg partition metadata
+        return None
+
+    @property
+    def dataset_id(self) -> str:
+        """
+        Not a true dataset, but serves as the dataset component of the identifer in sql queries
+        """
+        return f"{self.catalog_id}.{self.namespace_id}"
+
+    @property
+    def primary_key(self) -> Optional[Tuple[str, ...]]:
+        return None
+
+    def get_table_ref(self) -> bq.TableReference:
+        return bq.TableReference(
+            bq.DatasetReference(self.project_id, self.dataset_id), self.table_id
+        )
+
+
 @dataclasses.dataclass(frozen=True)
 class BigqueryDataSource:
     """
@@ -104,13 +244,13 @@ def __post_init__(self):
             self.schema.names
         )
 
-    table: GbqTable
+    table: Union[GbqNativeTable, BiglakeIcebergTable]
     schema: bigframes.core.schema.ArraySchema
     at_time: typing.Optional[datetime.datetime] = None
     # Added for backwards compatibility, not validated
     sql_predicate: typing.Optional[str] = None
     ordering: typing.Optional[orderings.RowOrdering] = None
-    # Optimization field
+    # Optimization field, must be correct if set, don't put maybe-stale number here
     n_rows: Optional[int] = None
 
 
@@ -188,6 +328,8 @@ def get_arrow_batches(
     project_id: str,
     sample_rate: Optional[float] = None,
 ) -> ReadResult:
+    assert isinstance(data.table, GbqNativeTable)
+
     table_mod_options = {}
     read_options_dict: dict[str, Any] = {"selected_fields": list(columns)}
 
@@ -245,3 +387,21 @@ def process_batch(pa_batch):
     return ReadResult(
         batches, session.estimated_row_count, session.estimated_total_bytes_scanned
     )
+
+
+def _get_primary_keys(
+    table: bq.Table,
+) -> List[str]:
+    """Get primary keys from table if they are set."""
+
+    primary_keys: List[str] = []
+    if (
+        (table_constraints := getattr(table, "table_constraints", None)) is not None
+        and (primary_key := table_constraints.primary_key) is not None
+        # This will be False for either None or empty list.
+        # We want primary_keys = None if no primary keys are set.
+        and (columns := primary_key.columns)
+    ):
+        primary_keys = columns if columns is not None else []
+
+    return primary_keys
@@ -215,9 +215,7 @@ def _table_to_ibis(
     source: bq_data.BigqueryDataSource,
     scan_cols: typing.Sequence[str],
 ) -> ibis_types.Table:
-    full_table_name = (
-        f"{source.table.project_id}.{source.table.dataset_id}.{source.table.table_id}"
-    )
+    full_table_name = source.table.get_full_id(quoted=False)
     # Physical schema might include unused columns, unsupported datatypes like JSON
     physical_schema = ibis_bigquery.BigQuerySchema.to_ibis(
         list(source.table.physical_schema)
 
@@ -825,9 +825,7 @@ def variables_introduced(self) -> int:
 
     @property
     def row_count(self) -> typing.Optional[int]:
-        if self.source.sql_predicate is None and self.source.table.is_physically_stored:
-            return self.source.n_rows
-        return None
+        return self.source.n_rows
 
     @property
     def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]:
 
@@ -17,7 +17,7 @@
 from dataclasses import dataclass
 import functools
 import typing
-from typing import Dict, List, Optional, Sequence
+from typing import Dict, Optional, Sequence
 
 import google.cloud.bigquery
 import pyarrow
@@ -40,31 +40,16 @@ class ArraySchema:
     def __iter__(self):
         yield from self.items
 
-    @classmethod
-    def from_bq_table(
-        cls,
-        table: google.cloud.bigquery.Table,
-        column_type_overrides: Optional[
-            typing.Dict[str, bigframes.dtypes.Dtype]
-        ] = None,
-        columns: Optional[Sequence[str]] = None,
-    ):
-        if not columns:
-            fields = table.schema
-        else:
-            lookup = {field.name: field for field in table.schema}
-            fields = [lookup[col] for col in columns]
-
-        return ArraySchema.from_bq_schema(
-            fields, column_type_overrides=column_type_overrides
-        )
-
     @classmethod
     def from_bq_schema(
         cls,
-        schema: List[google.cloud.bigquery.SchemaField],
+        schema: Sequence[google.cloud.bigquery.SchemaField],
         column_type_overrides: Optional[Dict[str, bigframes.dtypes.Dtype]] = None,
+        columns: Optional[Sequence[str]] = None,
     ):
+        if columns:
+            lookup = {field.name: field for field in schema}
+            schema = [lookup[col] for col in columns]
         if column_type_overrides is None:
             column_type_overrides = {}
         items = tuple(