# Source code for cytotable.warehouse.iceberg

"""
Utilities for reading and writing local Iceberg warehouses with CytoTable.
"""

from __future__ import annotations

import json
import logging
import shutil
import tempfile
from pathlib import Path
from typing import Any, Dict, Optional, Tuple, Union, cast

import pandas as pd
import parsl
import pyarrow as pa
import pyarrow.parquet as parquet

from cytotable.constants import CYTOTABLE_DEFAULT_PARQUET_METADATA
from cytotable.convert import _run_export_workflow
from cytotable.exceptions import CytoTableException
from cytotable.presets import config
from cytotable.sources import _build_path
from cytotable.utils import _default_parsl_config, _expand_path, _parsl_loaded

from .images import (
    IMAGE_TABLE_NAME,
    SOURCE_IMAGE_TABLE_NAME,
    add_object_id_to_profiles_frame,
    image_crop_table_from_joined_chunk,
    profile_with_images_frame,
    source_image_table_from_joined_chunk,
)

logger = logging.getLogger(__name__)

DEFAULT_NAMESPACE = "profiles"
DEFAULT_IMAGES_NAMESPACE = "images"
DEFAULT_REGISTRY_FILE = "catalog.json"
DEFAULT_WAREHOUSE_DIR = "warehouse"
DEFAULT_PROFILES_TABLE = "joined_profiles"
DEFAULT_PROFILE_WITH_IMAGES_VIEW = "profile_with_images"


def _cytotable_iceberg_properties() -> dict[str, str]:
    """
    Build a fresh mapping of CytoTable provenance properties.

    Returns a shallow copy of the shared metadata constant so callers may
    mutate the result without affecting the module-level default.
    """

    return {**CYTOTABLE_DEFAULT_PARQUET_METADATA}


try:
    # pyiceberg is an optional dependency; import the names this module needs
    # up front so later definitions can reference them unconditionally.
    from pyiceberg.catalog import Catalog, MetastoreCatalog, PropertiesUpdateSummary
    from pyiceberg.exceptions import NoSuchNamespaceError, NoSuchTableError
    from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
    from pyiceberg.schema import Schema
    from pyiceberg.serializers import FromInputFile
    from pyiceberg.table import CommitTableResponse, Table
    from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
    from pyiceberg.table.update import TableRequirement, TableUpdate
    from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties
except ImportError as import_error:
    # Record the failure instead of raising, so `import cytotable` still works
    # for parquet-only users; `_require_pyiceberg` raises lazily when needed.
    _PYICEBERG_IMPORT_ERROR: Optional[ImportError] = import_error
else:
    # Sentinel meaning pyiceberg imported successfully.
    _PYICEBERG_IMPORT_ERROR = None


def _require_pyiceberg() -> None:
    """
    Guard entry points that depend on the optional pyiceberg package.

    Raises:
        ImportError: when pyiceberg failed to import at module load time,
            chaining the original import failure for context.
    """

    if _PYICEBERG_IMPORT_ERROR is None:
        return
    raise ImportError(
        "Using CytoTable with iceberg/OME-arrow support requires the optional 'pyiceberg' dependency."
    ) from _PYICEBERG_IMPORT_ERROR


def _qualify(name: str, namespace: str) -> str:
    """
    Return a fully qualified Iceberg identifier such as
    `profiles.joined_profiles` from a bare name and namespace.

    This matters for Iceberg because tables and views live within namespaces,
    unlike standalone table files where a single filename can identify the
    dataset directly.
    """

    return name if "." in name else f"{namespace}.{name}"


def _resolve_unqualified_name(
    bundle: TinyCatalog,
    name: str,
) -> str:
    """
    Resolve an unqualified table/view name across namespaces when unique.
    """

    if "." in name:
        return name

    default_qualified = _qualify(name, bundle.default_namespace)
    identifier = tuple(default_qualified.split("."))
    if bundle.table_exists(identifier) or bundle.view_exists(identifier):
        return default_qualified

    matches: list[str] = []
    for namespace in bundle.list_namespaces():
        qualified = _qualify(name, ".".join(namespace))
        candidate = tuple(qualified.split("."))
        if bundle.table_exists(candidate) or bundle.view_exists(candidate):
            matches.append(qualified)

    if len(matches) == 1:
        return matches[0]
    if len(matches) > 1:
        raise CytoTableException(
            f"Ambiguous unqualified Iceberg name '{name}'. "
            f"Use a fully qualified name such as '{matches[0]}'."
        )
    return default_qualified


def _warehouse_dir(path: Union[str, Path], registry_file: str) -> Path:
    """
    Return the directory that stores Iceberg metadata and data files.

    Args:
        path:
            Warehouse root path or an internal warehouse data directory.
        registry_file:
            Name of the CytoTable registry file that records warehouse tables
            and views, used to determine whether `path` already points at the
            warehouse root.
    """

    root = Path(path)
    return root if (root / registry_file).exists() else root / DEFAULT_WAREHOUSE_DIR


def _rewrite_join_sql_for_warehouse(joins: str, source_names: Dict[str, str]) -> str:
    """
    Replace parquet reads in join SQL with registered DuckDB relation names.
    """

    rewritten = joins
    for source_name in source_names:
        rewritten = rewritten.replace(
            f"read_parquet('{source_name}.parquet')",
            source_names[source_name],
        )
    return rewritten


def _apply_preset_defaults_to_convert_config(
    *,
    preset: Optional[str],
    metadata: Optional[Tuple[str, ...] | list[str]],
    compartments: Optional[Tuple[str, ...] | list[str]],
    identifying_columns: Optional[Tuple[str, ...] | list[str]],
    joins: Optional[str],
    chunk_size: Optional[int],
    page_keys: Optional[Dict[str, str]],
) -> Dict[str, Any]:
    """
    Return convert() configuration with preset defaults applied.
    """

    if preset is not None:
        metadata = (
            cast(Tuple[str, ...], config[preset]["CONFIG_NAMES_METADATA"])
            if metadata is None
            else metadata
        )
        compartments = (
            cast(Tuple[str, ...], config[preset]["CONFIG_NAMES_COMPARTMENTS"])
            if compartments is None
            else compartments
        )
        identifying_columns = (
            cast(Tuple[str, ...], config[preset]["CONFIG_IDENTIFYING_COLUMNS"])
            if identifying_columns is None
            else identifying_columns
        )
        joins = cast(str, config[preset]["CONFIG_JOINS"]) if joins is None else joins
        chunk_size = (
            cast(int, config[preset]["CONFIG_CHUNK_SIZE"])
            if chunk_size is None
            else chunk_size
        )
        page_keys = (
            cast(Dict[str, str], config[preset]["CONFIG_PAGE_KEYS"])
            if page_keys is None
            else page_keys
        )

    return {
        "metadata": tuple(metadata or ()),
        "compartments": tuple(compartments or ()),
        "identifying_columns": tuple(identifying_columns or ()),
        "joins": joins or "",
        "chunk_size": chunk_size,
        "page_keys": dict(page_keys or {}),
        "preset": preset,
    }


def _validate_image_export_prerequisites(
    *,
    image_dir: Optional[str],
    mask_dir: Optional[str],
    outline_dir: Optional[str],
    bbox_column_map: Optional[Dict[str, str]],
    segmentation_file_regex: Optional[Dict[str, str]],
    joins: str,
    page_keys: Dict[str, str],
    path_kwargs: Optional[Dict[str, Any]] = None,
) -> bool:
    """
    Validate that image export configuration includes required join settings.
    """

    ancillary_image_config = any(
        (
            mask_dir is not None,
            outline_dir is not None,
            bool(bbox_column_map),
            bool(segmentation_file_regex),
        )
    )
    image_export_requested = image_dir is not None or ancillary_image_config

    if not image_export_requested:
        return False

    if image_dir is None:
        raise CytoTableException(
            "Image export options require 'image_dir' to be provided."
        )

    for label, path_value in (
        ("image_dir", image_dir),
        ("mask_dir", mask_dir),
        ("outline_dir", outline_dir),
    ):
        if path_value is None:
            continue
        built_path = _build_path(path_value, **(path_kwargs or {}))
        path_exists = (
            built_path.is_dir() if isinstance(built_path, Path) else built_path.exists()
        )
        if not path_exists:
            raise CytoTableException(
                f"Image export requires '{label}' to reference an existing directory: "
                f"'{path_value}'."
            )

    if not joins.strip():
        raise CytoTableException(
            "Image export requires join SQL. Provide 'joins' directly or use a "
            "preset that defines them."
        )

    if not page_keys.get("join"):
        raise CytoTableException(
            "Image export requires page_keys to include a non-empty 'join' entry."
        )

    return True


def _validate_iceberg_join_prerequisites(
    *, joins: str, page_keys: Dict[str, str]
) -> None:
    """
    Validate that Iceberg export has the join configuration it requires.
    """

    if not joins.strip():
        raise ValueError(
            "Iceberg export requires non-empty join SQL. Provide 'joins' directly "
            "or use a preset that defines them."
        )

    if not page_keys.get("join"):
        raise ValueError(
            "Iceberg export requires page_keys to include a non-empty 'join' entry."
        )


# Define Iceberg catalog helpers only when the optional pyiceberg dependency
# is available, so importing cytotable does not fail for parquet-only users.
if _PYICEBERG_IMPORT_ERROR is None:

    class TinyCatalog(MetastoreCatalog):
        """
        Tiny filesystem-backed catalog for local CytoTable Iceberg warehouses.

        Namespaces, tables, and views are tracked in a single JSON registry
        file at the warehouse root; table state itself lives in standard
        Iceberg metadata files whose locations the registry records.
        Destructive operations (drop/rename) are intentionally unsupported.
        """

        def __init__(
            self,
            warehouse_root: Path,
            *,
            default_namespace: str = DEFAULT_NAMESPACE,
            registry_file: str = DEFAULT_REGISTRY_FILE,
        ) -> None:
            # Namespace assumed when callers pass unqualified names.
            self.default_namespace = default_namespace
            # JSON file recording namespaces, table metadata locations, views.
            self.registry_path = warehouse_root / registry_file
            warehouse_root.mkdir(parents=True, exist_ok=True)
            super().__init__("local", warehouse=warehouse_root.resolve().as_uri())

        def _read_registry(self) -> dict[str, object]:
            """
            Load the registry JSON, supplying defaults for a new warehouse.
            """
            if not self.registry_path.exists():
                # Fresh warehouse: seed the default namespace and provenance
                # properties without writing anything to disk yet.
                return {
                    "namespaces": [self.default_namespace],
                    "properties": _cytotable_iceberg_properties(),
                    "tables": {},
                    "views": {},
                }
            registry = json.loads(self.registry_path.read_text())
            # Backfill keys missing from registries written by older versions.
            registry.setdefault("properties", _cytotable_iceberg_properties())
            registry.setdefault("views", {})
            return registry

        def _write_registry(self, registry: dict[str, object]) -> None:
            """
            Persist the registry JSON with stable formatting for diffs.
            """
            self.registry_path.write_text(
                json.dumps(registry, indent=2, sort_keys=True)
            )

        def create_namespace(
            self, namespace: str | Identifier, properties: Properties = EMPTY_DICT
        ) -> None:
            """
            Add a namespace to the registry (idempotent; properties ignored).
            """
            del properties
            registry = self._read_registry()
            names = set(cast(list[str], registry["namespaces"]))
            names.add(Catalog.namespace_to_string(namespace))
            registry["namespaces"] = sorted(names)
            self._write_registry(registry)

        def load_namespace_properties(
            self, namespace: str | Identifier
        ) -> dict[str, str]:
            """
            Return namespace properties (always empty here), validating that
            the namespace exists in the registry.
            """
            name = Catalog.namespace_to_string(namespace)
            if name not in cast(list[str], self._read_registry()["namespaces"]):
                raise NoSuchNamespaceError(name)
            return {}

        def list_namespaces(
            self, namespace: str | Identifier = ()
        ) -> list[tuple[str, ...]]:
            """
            List all registered namespaces as identifier tuples.

            The `namespace` filter argument is ignored: this flat registry has
            no nested namespaces.
            """
            del namespace
            return [
                tuple(name.split("."))
                for name in cast(list[str], self._read_registry()["namespaces"])
            ]

        def list_tables(self, namespace: str | Identifier) -> list[tuple[str, ...]]:
            """
            List identifier tuples of tables registered under `namespace`.
            """
            prefix = f"{Catalog.namespace_to_string(namespace)}."
            return [
                tuple(name.split("."))
                for name in sorted(
                    cast(dict[str, str], self._read_registry()["tables"])
                )
                if name.startswith(prefix)
            ]

        def load_table(self, identifier: str | Identifier) -> Table:
            """
            Load a table by reading its Iceberg metadata file from the
            location recorded in the registry.

            Raises:
                NoSuchTableError: when the identifier is not registered.
            """
            name = ".".join(Catalog.identifier_to_tuple(identifier))
            metadata_location = cast(
                dict[str, str], self._read_registry()["tables"]
            ).get(name)
            if metadata_location is None:
                raise NoSuchTableError(name)
            io = self._load_file_io(location=metadata_location)
            metadata = FromInputFile.table_metadata(io.new_input(metadata_location))
            return Table(
                Catalog.identifier_to_tuple(identifier),
                metadata,
                metadata_location,
                io,
                self,
            )

        def register_table(
            self, identifier: str | Identifier, metadata_location: str
        ) -> Table:
            """
            Record an existing metadata file under `identifier` and return
            the loaded table.
            """
            registry = self._read_registry()
            cast(dict[str, str], registry["tables"])[
                ".".join(Catalog.identifier_to_tuple(identifier))
            ] = metadata_location
            self._write_registry(registry)
            return self.load_table(identifier)

        def commit_table(
            self,
            table: Table,
            requirements: tuple[TableRequirement, ...],
            updates: tuple[TableUpdate, ...],
        ) -> CommitTableResponse:
            """
            Apply table updates, write the new metadata file, and point the
            registry entry at it.

            Loading the current table may fail for a first commit (create
            path), in which case staging starts from no prior state.
            """
            identifier = Catalog.identifier_to_tuple(table.name())
            try:
                current = self.load_table(identifier)
            except NoSuchTableError:
                current = None
            staged = self._update_and_stage_table(
                current, identifier, requirements, updates
            )
            self._write_metadata(staged.metadata, staged.io, staged.metadata_location)
            registry = self._read_registry()
            cast(dict[str, str], registry["tables"])[
                ".".join(identifier)
            ] = staged.metadata_location
            self._write_registry(registry)
            return CommitTableResponse(
                metadata=staged.metadata, metadata_location=staged.metadata_location
            )

        def create_table(  # noqa: PLR0913
            self,
            identifier: str | Identifier,
            schema: Schema | pa.Schema,
            location: str | None = None,
            partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC,
            sort_order: SortOrder = UNSORTED_SORT_ORDER,
            properties: Properties = EMPTY_DICT,
        ) -> Table:
            """
            Create a new table by committing a create-table transaction.
            """
            return self.create_table_transaction(
                identifier, schema, location, partition_spec, sort_order, properties
            ).commit_transaction()

        def table_exists(self, identifier: str | Identifier) -> bool:
            """
            Return True when `identifier` is registered as a table.
            """
            return ".".join(Catalog.identifier_to_tuple(identifier)) in cast(
                dict[str, str], self._read_registry()["tables"]
            )

        def view_exists(self, identifier: str | Identifier) -> bool:
            """
            Return True when `identifier` is registered as a saved view.
            """
            return ".".join(Catalog.identifier_to_tuple(identifier)) in cast(
                dict[str, dict[str, object]], self._read_registry()["views"]
            )

        def list_views(self, namespace: str | Identifier) -> list[tuple[str, ...]]:
            """
            List identifier tuples of saved views under `namespace`.
            """
            prefix = f"{Catalog.namespace_to_string(namespace)}."
            return [
                tuple(name.split("."))
                for name in sorted(cast(dict[str, str], self._read_registry()["views"]))
                if name.startswith(prefix)
            ]

        def drop_view(self, _identifier: str | Identifier) -> None:
            """Unsupported: this catalog is append-only."""
            raise NotImplementedError

        def drop_table(self, _identifier: str | Identifier) -> None:
            """Unsupported: this catalog is append-only."""
            raise NotImplementedError

        def rename_table(
            self, _from_identifier: str | Identifier, _to_identifier: str | Identifier
        ) -> Table:
            """Unsupported: this catalog is append-only."""
            raise NotImplementedError

        def drop_namespace(self, _namespace: str | Identifier) -> None:
            """Unsupported: this catalog is append-only."""
            raise NotImplementedError

        def update_namespace_properties(
            self,
            _namespace: str | Identifier,
            _removals: set[str] | None = None,
            _updates: Properties = EMPTY_DICT,
        ) -> PropertiesUpdateSummary:
            """Unsupported: namespace properties are not tracked."""
            raise NotImplementedError

else:

[docs] class TinyCatalog: # type: ignore[no-redef] """ Placeholder catalog when pyiceberg is unavailable. """
def catalog(
    warehouse_path: Union[str, Path],
    *,
    default_namespace: str = DEFAULT_NAMESPACE,
    registry_file: str = DEFAULT_REGISTRY_FILE,
) -> TinyCatalog:
    """
    Open a local Iceberg warehouse and return its tiny catalog.

    Args:
        warehouse_path:
            Warehouse root (or internal warehouse data directory).
        default_namespace:
            Namespace assumed for unqualified table/view names.
        registry_file:
            Name of the registry JSON file within the warehouse.

    Raises:
        ImportError: when the optional pyiceberg dependency is missing.
    """

    _require_pyiceberg()
    return TinyCatalog(
        _warehouse_dir(warehouse_path, registry_file),
        default_namespace=default_namespace,
        registry_file=registry_file,
    )
def write_iceberg_warehouse(  # noqa: PLR0913
    source_path: str,
    warehouse_path: Union[str, Path],
    source_datatype: Optional[str] = None,
    metadata: Optional[Tuple[str, ...] | list[str]] = None,
    compartments: Optional[Tuple[str, ...] | list[str]] = None,
    identifying_columns: Optional[Tuple[str, ...] | list[str]] = None,
    joins: Optional[str] = None,
    chunk_size: Optional[int] = None,
    infer_common_schema: bool = True,
    data_type_cast_map: Optional[Dict[str, str]] = None,
    add_tablenumber: Optional[bool] = None,
    page_keys: Optional[Dict[str, str]] = None,
    sort_output: bool = True,
    preset: Optional[str] = "cellprofiler_csv",
    image_dir: Optional[str] = None,
    mask_dir: Optional[str] = None,
    outline_dir: Optional[str] = None,
    bbox_column_map: Optional[Dict[str, str]] = None,
    segmentation_file_regex: Optional[Dict[str, str]] = None,
    include_source_images: bool = False,
    default_namespace: str = DEFAULT_NAMESPACE,
    images_namespace: str = DEFAULT_IMAGES_NAMESPACE,
    registry_file: str = DEFAULT_REGISTRY_FILE,
    profiles_table_name: str = DEFAULT_PROFILES_TABLE,
    profile_with_images_view_name: Optional[str] = DEFAULT_PROFILE_WITH_IMAGES_VIEW,
    parsl_config: Optional[parsl.Config] = None,
    **kwargs,
) -> str:
    """
    Write a CytoTable Iceberg warehouse from raw source data.

    This helper powers `convert(..., dest_backend="iceberg")` and accepts the
    same core conversion arguments for source selection, joins, chunking, and
    image export. See `cytotable.convert.convert` for the shared argument
    semantics; this function adds Iceberg-specific options such as
    `default_namespace`, `images_namespace`, `registry_file`,
    `profiles_table_name`, and `profile_with_images_view_name`.

    Returns:
        Path to the created Iceberg warehouse root.

    Raises:
        CytoTableException:
            If the warehouse path already exists or image export
            prerequisites are invalid.
        ValueError:
            If required join SQL or join pagination keys are missing.
    """

    _require_pyiceberg()

    root = Path(_expand_path(str(warehouse_path)))
    if root.exists():
        raise CytoTableException(
            f"An existing file or directory was provided as warehouse_path: '{root}'."
        )
    root.parent.mkdir(parents=True, exist_ok=True)

    resolved = _apply_preset_defaults_to_convert_config(
        preset=preset,
        metadata=metadata,
        compartments=compartments,
        identifying_columns=identifying_columns,
        joins=joins,
        chunk_size=chunk_size,
        page_keys=page_keys,
    )

    _validate_iceberg_join_prerequisites(
        joins=cast(str, resolved["joins"]),
        page_keys=cast(Dict[str, str], resolved["page_keys"]),
    )

    image_export_enabled = _validate_image_export_prerequisites(
        image_dir=image_dir,
        mask_dir=mask_dir,
        outline_dir=outline_dir,
        bbox_column_map=bbox_column_map,
        segmentation_file_regex=segmentation_file_regex,
        joins=cast(str, resolved["joins"]),
        page_keys=cast(Dict[str, str], resolved["page_keys"]),
        path_kwargs=kwargs,
    )

    root.mkdir(parents=True, exist_ok=False)
    # While `build_root` is non-None the warehouse is considered partially
    # built and will be removed on any failure path (see `finally`).
    build_root: Optional[Path] = root
    stage_dir = Path(tempfile.mkdtemp(prefix="cytotable-iceberg-", dir=str(root)))

    parsl_was_loaded = _parsl_loaded()
    parsl_loaded_here = False
    try:
        if not parsl_was_loaded:
            parsl.load(parsl_config or _default_parsl_config())
            parsl_loaded_here = True
        else:
            logger.info(
                "Reusing the already loaded Parsl configuration; "
                "write_iceberg_warehouse will not replace it with a new one."
            )

        # First materialize the analysis-ready joined profiles output as a
        # single parquet artifact, then import that artifact into Iceberg.
        profiles_path = cast(
            str,
            _run_export_workflow(
                source_path=source_path,
                dest_path=str(stage_dir / f"{profiles_table_name}.parquet"),
                source_datatype=source_datatype,
                metadata=list(cast(Tuple[str, ...], resolved["metadata"])),
                compartments=list(cast(Tuple[str, ...], resolved["compartments"])),
                identifying_columns=list(
                    cast(Tuple[str, ...], resolved["identifying_columns"])
                ),
                concat=True,
                join=True,
                joins=cast(str, resolved["joins"]),
                chunk_size=cast(Optional[int], resolved["chunk_size"]),
                infer_common_schema=infer_common_schema,
                drop_null=False,
                sort_output=sort_output,
                page_keys=cast(Dict[str, str], resolved["page_keys"]),
                dest_datatype="parquet",
                data_type_cast_map=data_type_cast_map,
                add_tablenumber=add_tablenumber,
                **kwargs,
            ),
        )

        bundle = catalog(
            root,
            default_namespace=default_namespace,
            registry_file=registry_file,
        )
        bundle.create_namespace(default_namespace)
        if image_export_enabled:
            bundle.create_namespace(images_namespace)

        profiles_table_exists = False
        if profiles_path and Path(profiles_path).exists():
            # Stamp stable object identifiers onto the materialized profile
            # rows before persisting them as the warehouse's primary table.
            profiles_arrow_table = pa.Table.from_pandas(
                add_object_id_to_profiles_frame(
                    parquet.read_table(Path(profiles_path)).to_pandas(),
                    bbox_column_map=bbox_column_map,
                ),
                preserve_index=False,
            )
            if bundle.table_exists((default_namespace, profiles_table_name)):
                table = bundle.load_table((default_namespace, profiles_table_name))
            else:
                table = bundle.create_table(
                    (default_namespace, profiles_table_name),
                    profiles_arrow_table.schema,
                    properties=_cytotable_iceberg_properties(),
                )
            table.append(profiles_arrow_table)
            profiles_table_exists = True

        if image_export_enabled:
            # Run the same join in chunked mode for image work so crops and
            # full source-image rows can be produced lazily per chunk.
            joined_chunk_paths = cast(
                list[str],
                _run_export_workflow(
                    source_path=source_path,
                    dest_path=str(stage_dir / "joined"),
                    source_datatype=source_datatype,
                    metadata=list(cast(Tuple[str, ...], resolved["metadata"])),
                    compartments=list(
                        cast(Tuple[str, ...], resolved["compartments"])
                    ),
                    identifying_columns=list(
                        cast(Tuple[str, ...], resolved["identifying_columns"])
                    ),
                    concat=False,
                    join=True,
                    joins=cast(str, resolved["joins"]),
                    chunk_size=cast(Optional[int], resolved["chunk_size"]),
                    infer_common_schema=infer_common_schema,
                    drop_null=False,
                    sort_output=sort_output,
                    page_keys=cast(Dict[str, str], resolved["page_keys"]),
                    data_type_cast_map=data_type_cast_map,
                    add_tablenumber=add_tablenumber,
                    **kwargs,
                ),
            )

            image_table: Optional[Table] = None
            source_images_table: Optional[Table] = None
            seen_source_image_ids: set[str] = set()
            if bundle.table_exists((images_namespace, SOURCE_IMAGE_TABLE_NAME)):
                # Source images are image-level assets, so deduplicate them
                # across joined chunks by the stable image identifier.
                existing_source_images = (
                    bundle.load_table((images_namespace, SOURCE_IMAGE_TABLE_NAME))
                    .scan()
                    .to_arrow()
                )
                if "Metadata_ImageID" in existing_source_images.column_names:
                    seen_source_image_ids.update(
                        image_id
                        for image_id in existing_source_images[
                            "Metadata_ImageID"
                        ].to_pylist()
                        if image_id is not None
                    )

            for chunk_path in joined_chunk_paths:
                crop_table = image_crop_table_from_joined_chunk(
                    chunk_path=chunk_path,
                    image_dir=cast(str, image_dir),
                    mask_dir=mask_dir,
                    outline_dir=outline_dir,
                    bbox_column_map=bbox_column_map,
                    segmentation_file_regex=segmentation_file_regex,
                    path_kwargs=kwargs,
                )
                if crop_table.num_rows == 0:
                    continue
                if image_table is None:
                    image_table = (
                        bundle.load_table((images_namespace, IMAGE_TABLE_NAME))
                        if bundle.table_exists((images_namespace, IMAGE_TABLE_NAME))
                        else bundle.create_table(
                            (images_namespace, IMAGE_TABLE_NAME),
                            crop_table.schema,
                            properties=_cytotable_iceberg_properties(),
                        )
                    )
                image_table.append(crop_table)

                if include_source_images:
                    source_image_table = source_image_table_from_joined_chunk(
                        chunk_path=chunk_path,
                        image_dir=cast(str, image_dir),
                        mask_dir=mask_dir,
                        outline_dir=outline_dir,
                        segmentation_file_regex=segmentation_file_regex,
                        path_kwargs=kwargs,
                    )
                    if source_image_table.num_rows != 0:
                        source_image_frame = source_image_table.to_pandas()
                        source_image_frame = source_image_frame[
                            ~source_image_frame["Metadata_ImageID"].isin(
                                seen_source_image_ids
                            )
                        ]
                        if source_image_frame.empty:
                            continue
                        filtered_source_image_table = pa.Table.from_pandas(
                            source_image_frame,
                            schema=source_image_table.schema,
                            preserve_index=False,
                        )
                        if source_images_table is None:
                            source_images_table = (
                                bundle.load_table(
                                    (images_namespace, SOURCE_IMAGE_TABLE_NAME)
                                )
                                if bundle.table_exists(
                                    (images_namespace, SOURCE_IMAGE_TABLE_NAME)
                                )
                                else bundle.create_table(
                                    (images_namespace, SOURCE_IMAGE_TABLE_NAME),
                                    filtered_source_image_table.schema,
                                    properties=_cytotable_iceberg_properties(),
                                )
                            )
                        source_images_table.append(filtered_source_image_table)
                        seen_source_image_ids.update(
                            image_id
                            for image_id in source_image_frame[
                                "Metadata_ImageID"
                            ].tolist()
                            if image_id is not None
                        )

            if (
                profiles_table_exists
                and profile_with_images_view_name
                and image_table is not None
            ):
                # Persist the cross-namespace analytical view only when both
                # the base profile table and image crop table exist.
                registry = bundle._read_registry()
                cast(dict[str, dict[str, object]], registry["views"])[
                    _qualify(profile_with_images_view_name, default_namespace)
                ] = {
                    "kind": "profile_with_images",
                    "base_table": _qualify(profiles_table_name, default_namespace),
                    "image_table": _qualify(IMAGE_TABLE_NAME, images_namespace),
                    "bbox_column_map": bbox_column_map or {},
                }
                bundle._write_registry(registry)

        # Drop transient parquet staging after the warehouse contents have
        # been committed successfully.
        shutil.rmtree(stage_dir, ignore_errors=True)
        build_root = None
    finally:
        if parsl_loaded_here:
            parsl.dfk().cleanup()
        if build_root is not None:
            # Clean up partially built warehouse state on any failure path.
            shutil.rmtree(build_root, ignore_errors=True)

    return str(root)
def _read_sql_view(bundle: TinyCatalog, view_name: str) -> pd.DataFrame:
    """
    Read a saved SQL view by materializing Iceberg tables into DuckDB.
    """

    from cytotable.utils import _duckdb_reader

    registry = bundle._read_registry()
    spec = cast(dict[str, Any], cast(dict[str, Any], registry["views"])[view_name])
    sql = cast(str, spec["sql"])
    with _duckdb_reader() as reader:
        # Register every referenced table as an in-memory Arrow relation
        # before running the saved SQL against them.
        for table_name in cast(list[str], spec["tables"]):
            qualified = _qualify(table_name, bundle.default_namespace)
            arrow_table = (
                bundle.load_table(tuple(qualified.split("."))).scan().to_arrow()
            )
            reader.register(table_name, arrow_table)
        return reader.execute(sql).fetch_arrow_table().to_pandas()


def _read_profile_with_images_view(bundle: TinyCatalog, view_name: str) -> pd.DataFrame:
    """
    Read a saved profile/image manifest view from warehouse tables.
    """

    registry = bundle._read_registry()
    spec = cast(dict[str, Any], cast(dict[str, Any], registry["views"])[view_name])
    joined_frame = (
        bundle.load_table(tuple(cast(str, spec["base_table"]).split(".")))
        .scan()
        .to_arrow()
        .to_pandas()
    )
    image_frame = (
        bundle.load_table(tuple(cast(str, spec["image_table"]).split(".")))
        .scan()
        .to_arrow()
        .to_pandas()
    )
    return profile_with_images_frame(
        joined_frame=joined_frame,
        image_frame=image_frame,
        bbox_column_map=cast(Dict[str, str], spec.get("bbox_column_map") or {}),
    )


def _read_registered_view(bundle: TinyCatalog, view_name: str) -> pd.DataFrame:
    """
    Read a saved registry-backed warehouse view.

    Dispatches on the view spec's "kind" field to the appropriate reader.

    Raises:
        CytoTableException: for an unrecognized view kind.
    """

    registry = bundle._read_registry()
    spec = cast(dict[str, Any], cast(dict[str, Any], registry["views"])[view_name])
    kind = cast(str, spec["kind"])
    if kind == "sql":
        return _read_sql_view(bundle, view_name)
    if kind == "profile_with_images":
        return _read_profile_with_images_view(bundle, view_name)
    raise CytoTableException(f"Unsupported warehouse view kind: {kind}")
def read_iceberg_table(
    warehouse_path: Union[str, Path],
    table_name: str,
    *,
    default_namespace: str = DEFAULT_NAMESPACE,
    registry_file: str = DEFAULT_REGISTRY_FILE,
) -> pd.DataFrame:
    """
    Read an Iceberg table or saved SQL view from a local warehouse.

    Unqualified names are resolved across namespaces; registered views take
    precedence over tables of the same qualified name.
    """

    bundle = catalog(
        warehouse_path,
        default_namespace=default_namespace,
        registry_file=registry_file,
    )
    qualified_name = _resolve_unqualified_name(bundle, table_name)
    if bundle.view_exists(tuple(qualified_name.split("."))):
        return _read_registered_view(bundle, qualified_name)
    return (
        bundle.load_table(tuple(qualified_name.split(".")))
        .scan()
        .to_arrow()
        .to_pandas()
    )
def list_iceberg_tables(
    warehouse_path: Union[str, Path],
    include_views: bool = True,
    *,
    default_namespace: str = DEFAULT_NAMESPACE,
    registry_file: str = DEFAULT_REGISTRY_FILE,
) -> list[str]:
    """
    List fully qualified tables and optional views in a local Iceberg warehouse.
    """

    bundle = catalog(
        warehouse_path,
        default_namespace=default_namespace,
        registry_file=registry_file,
    )
    names = [
        ".".join(identifier)
        for namespace in bundle.list_namespaces()
        for identifier in bundle.list_tables(namespace)
    ]
    if include_views:
        names.extend(
            ".".join(identifier)
            for namespace in bundle.list_namespaces()
            for identifier in bundle.list_views(namespace)
        )
    return sorted(names)
def describe_iceberg_warehouse(
    warehouse_path: Union[str, Path],
    include_views: bool = True,
    *,
    default_namespace: str = DEFAULT_NAMESPACE,
    registry_file: str = DEFAULT_REGISTRY_FILE,
) -> pd.DataFrame:
    """
    Summarize tables and saved views within a local Iceberg warehouse.

    Returns a DataFrame with one row per table/view: qualified name, row
    count, data file count, current snapshot id (tables only), and kind.
    """

    bundle = catalog(
        warehouse_path,
        default_namespace=default_namespace,
        registry_file=registry_file,
    )
    rows: list[dict[str, object]] = []
    for namespace in bundle.list_namespaces():
        for identifier in bundle.list_tables(namespace):
            table = bundle.load_table(identifier)
            files = table.inspect.files().to_pandas()
            current_snapshot = table.current_snapshot()
            rows.append(
                {
                    "table": ".".join(identifier),
                    "rows": int(files["record_count"].sum()),
                    "data_files": len(files),
                    "snapshot_id": (
                        current_snapshot.snapshot_id
                        if current_snapshot is not None
                        else None
                    ),
                    "kind": "table",
                }
            )
        if include_views:
            for identifier in bundle.list_views(namespace):
                view_name = ".".join(identifier)
                rows.append(
                    {
                        "table": view_name,
                        # Views are materialized to count rows; they carry no
                        # data files or snapshots of their own.
                        "rows": len(_read_registered_view(bundle, view_name)),
                        "data_files": 0,
                        "snapshot_id": None,
                        "kind": "view",
                    }
                )
    return pd.DataFrame(rows).sort_values("table").reset_index(drop=True)
__all__ = [
    "DEFAULT_IMAGES_NAMESPACE",
    "DEFAULT_NAMESPACE",
    "DEFAULT_PROFILE_WITH_IMAGES_VIEW",
    "DEFAULT_PROFILES_TABLE",
    "DEFAULT_REGISTRY_FILE",
    "TinyCatalog",
    "catalog",
    "describe_iceberg_warehouse",
    "list_iceberg_tables",
    "read_iceberg_table",
    "write_iceberg_warehouse",
]