CytoDataFrame at a Glance

This notebook demonstrates various capabilities of CytoDataFrame using examples.

CytoDataFrame is intended to provide a Pandas-like DataFrame experience, enhanced with single-cell visual information that can be viewed directly in a Jupyter notebook.

import pathlib

import pandas as pd

from cytodataframe.frame import CytoDataFrame

# Directories holding the example datasets used by the CytoDataFrames below.
# Every path is written without a trailing slash so that f-string joins such
# as f"{path}/file" never produce a doubled "//" separator.
jump_data_path = "../../../tests/data/cytotable/JUMP_plate_BR00117006"
nf1_cellpainting_path = "../../../tests/data/cytotable/NF1_cellpainting_data_shrunken"
nuclear_speckles_path = "../../../tests/data/cytotable/nuclear_speckles"
pediatric_cancer_atlas_path = (
    "../../../tests/data/cytotable/pediatric_cancer_atlas_profiling"
)
%%time
# JUMP plate BR00117006 with images: keep a handful of columns and the
# first three rows, store the result in `frame`, then display it.
display_columns = [
    "Metadata_ImageNumber",
    "Cells_Number_Object_Number",
    "Image_FileName_OrigAGP",
    "Image_FileName_OrigDNA",
    "Image_FileName_OrigRNA",
]
frame = CytoDataFrame(
    data=f"{jump_data_path}/BR00117006_shrunken.parquet",
    data_context_dir=f"{jump_data_path}/images/orig",
)[display_columns][:3]
frame
CPU times: user 877 ms, sys: 542 ms, total: 1.42 s
Wall time: 585 ms

%%time
# JUMP plate BR00117006 with images plus overlaid segmentation outlines
# (the outline directory is supplied via data_outline_context_dir).
# The three-row selection is stored in `frame` and then displayed.
display_columns = [
    "Metadata_ImageNumber",
    "Cells_Number_Object_Number",
    "Image_FileName_OrigAGP",
    "Image_FileName_OrigDNA",
    "Image_FileName_OrigRNA",
]
frame = CytoDataFrame(
    data=f"{jump_data_path}/BR00117006_shrunken.parquet",
    data_context_dir=f"{jump_data_path}/images/orig",
    data_outline_context_dir=f"{jump_data_path}/images/outlines",
)[display_columns][:3]
frame
CPU times: user 845 ms, sys: 567 ms, total: 1.41 s
Wall time: 482 ms

%%time
# JUMP plate BR00117006 with images and segmentation outlines, drawing the
# outlines in a custom color instead of the default (default is green).
outline_display_options = {"outline_color": (200, 100, 255)}
display_columns = [
    "Metadata_ImageNumber",
    "Cells_Number_Object_Number",
    "Image_FileName_OrigAGP",
    "Image_FileName_OrigDNA",
    "Image_FileName_OrigRNA",
]
CytoDataFrame(
    data=f"{jump_data_path}/BR00117006_shrunken.parquet",
    data_context_dir=f"{jump_data_path}/images/orig",
    data_outline_context_dir=f"{jump_data_path}/images/outlines",
    display_options=outline_display_options,
)[display_columns][:3]
CPU times: user 844 ms, sys: 530 ms, total: 1.37 s
Wall time: 485 ms

%%time
# JUMP plate BR00117006 with images and segmentation outlines, adding scale
# bars which show how micrometers scale to the pixels displayed.
scale_bar_display_options = {
    "um_per_pixel": 0.1550,
    "scale_bar": {
        "length_um": 5,
        "location": "lower right",
        "color": (255, 255, 255),
        "thickness_px": 2,
        "margin_px": 5,
    },
}
display_columns = [
    "Metadata_ImageNumber",
    "Cells_Number_Object_Number",
    "Image_FileName_OrigAGP",
    "Image_FileName_OrigDNA",
    "Image_FileName_OrigRNA",
]
CytoDataFrame(
    data=f"{jump_data_path}/BR00117006_shrunken.parquet",
    data_context_dir=f"{jump_data_path}/images/orig",
    data_outline_context_dir=f"{jump_data_path}/images/outlines",
    display_options=scale_bar_display_options,
)[display_columns][:3]
CPU times: user 850 ms, sys: 532 ms, total: 1.38 s
Wall time: 507 ms

%%time
# JUMP plate BR00117006 with images (no outlines), adjusting the image
# brightness through display_options.
brightness_display_options = {"brightness": 10}
display_columns = [
    "Metadata_ImageNumber",
    "Cells_Number_Object_Number",
    "Image_FileName_OrigAGP",
    "Image_FileName_OrigDNA",
    "Image_FileName_OrigRNA",
]
CytoDataFrame(
    data=f"{jump_data_path}/BR00117006_shrunken.parquet",
    data_context_dir=f"{jump_data_path}/images/orig",
    display_options=brightness_display_options,
)[display_columns][:3]
CPU times: user 873 ms, sys: 573 ms, total: 1.45 s
Wall time: 492 ms

%%time
# JUMP plate BR00117006 with images and segmentation outlines, with the
# optional red center dot switched off.
center_dot_display_options = {"center_dot": False}
display_columns = [
    "Metadata_ImageNumber",
    "Cells_Number_Object_Number",
    "Image_FileName_OrigAGP",
    "Image_FileName_OrigDNA",
    "Image_FileName_OrigRNA",
]
CytoDataFrame(
    data=f"{jump_data_path}/BR00117006_shrunken.parquet",
    data_context_dir=f"{jump_data_path}/images/orig",
    data_outline_context_dir=f"{jump_data_path}/images/outlines",
    display_options=center_dot_display_options,
)[display_columns][:3]
CPU times: user 829 ms, sys: 527 ms, total: 1.36 s
Wall time: 485 ms

%%time
# JUMP plate BR00117006 with images and segmentation outlines, changing the
# display width of the images.
# NOTE(review): the width here is "100" with no unit, while a later example
# passes "200px" — confirm which forms CytoDataFrame accepts.
width_display_options = {"width": "100"}
display_columns = [
    "Metadata_ImageNumber",
    "Cells_Number_Object_Number",
    "Image_FileName_OrigAGP",
    "Image_FileName_OrigDNA",
    "Image_FileName_OrigRNA",
]
CytoDataFrame(
    data=f"{jump_data_path}/BR00117006_shrunken.parquet",
    data_context_dir=f"{jump_data_path}/images/orig",
    data_outline_context_dir=f"{jump_data_path}/images/outlines",
    display_options=width_display_options,
)[display_columns][:3]
CPU times: user 868 ms, sys: 536 ms, total: 1.4 s
Wall time: 507 ms

%%time
# JUMP plate BR00117006 with images and outlines, setting both the display
# width and height, then transposing the five-row selection for a
# different view of things.
size_display_options = {"width": "200px", "height": "auto"}
display_columns = [
    "Metadata_ImageNumber",
    "Cells_Number_Object_Number",
    "Image_FileName_OrigAGP",
    "Image_FileName_OrigDNA",
    "Image_FileName_OrigRNA",
]
CytoDataFrame(
    data=f"{jump_data_path}/BR00117006_shrunken.parquet",
    data_context_dir=f"{jump_data_path}/images/orig",
    data_outline_context_dir=f"{jump_data_path}/images/outlines",
    display_options=size_display_options,
)[display_columns][:5].T
CPU times: user 826 ms, sys: 480 ms, total: 1.31 s
Wall time: 498 ms

%%time
# Export `frame` (assigned in an earlier cell) to OME Parquet, a format
# which uses OME Arrow to store OME-spec images as values within the table.
ome_parquet_file = "example.ome.parquet"
frame.to_ome_parquet(file_path=ome_parquet_file)

# Load the exported OME Parquet file back into a CytoDataFrame.
CytoDataFrame(data=ome_parquet_file)
CPU times: user 2.09 s, sys: 473 ms, total: 2.56 s
Wall time: 5.45 s

%%time
# JUMP plate BR00117006 with images and outlines, changing the bounding box
# using offsets so each image has roughly the same size.
bounding_box_display_options = {
    "offset_bounding_box": {
        "x_min": -20,
        "y_min": -20,
        "x_max": 20,
        "y_max": 20,
    },
}
display_columns = [
    "Metadata_ImageNumber",
    "Cells_Number_Object_Number",
    "Image_FileName_OrigAGP",
    "Image_FileName_OrigDNA",
    "Image_FileName_OrigRNA",
]
CytoDataFrame(
    data=f"{jump_data_path}/BR00117006_shrunken.parquet",
    data_context_dir=f"{jump_data_path}/images/orig",
    data_outline_context_dir=f"{jump_data_path}/images/outlines",
    display_options=bounding_box_display_options,
)[display_columns][:5]
CPU times: user 881 ms, sys: 550 ms, total: 1.43 s
Wall time: 514 ms

%%time
# NF1 Cell Painting data with images: keep a handful of columns and the
# first three rows.
display_columns = [
    "Metadata_ImageNumber",
    "Metadata_Cells_Number_Object_Number",
    "Image_FileName_GFP",
    "Image_FileName_RFP",
    "Image_FileName_DAPI",
]
CytoDataFrame(
    data=f"{nf1_cellpainting_path}/Plate_2_with_image_data_shrunken.parquet",
    data_context_dir=f"{nf1_cellpainting_path}/Plate_2_images",
)[display_columns][:3]
CPU times: user 244 ms, sys: 162 ms, total: 406 ms
Wall time: 148 ms

%%time
# NF1 Cell Painting data with images and overlaid outlines from masks
# (the mask directory is supplied via data_mask_context_dir).
# The three-row selection is stored in `frame` and then displayed.
display_columns = [
    "Metadata_ImageNumber",
    "Metadata_Cells_Number_Object_Number",
    "Image_FileName_GFP",
    "Image_FileName_RFP",
    "Image_FileName_DAPI",
]
frame = CytoDataFrame(
    data=f"{nf1_cellpainting_path}/Plate_2_with_image_data_shrunken.parquet",
    data_context_dir=f"{nf1_cellpainting_path}/Plate_2_images",
    data_mask_context_dir=f"{nf1_cellpainting_path}/Plate_2_masks",
)[display_columns][:3]
frame
CPU times: user 326 ms, sys: 182 ms, total: 508 ms
Wall time: 239 ms

%%time
# Demonstrate CytoDataFrame without a context directory for images by
# writing active local paths into the image path columns.
# Note: normally these paths are local to the system where the profile data
# was generated, which often is not the same as the system which will be
# used to analyze the data.
parquet_path = f"{nf1_cellpainting_path}/Plate_2_with_image_data_shrunken.parquet"
nf1_dataset_with_modified_image_paths = pd.read_parquet(path=parquet_path)

# Point every image path column at the images directory that sits next to
# the parquet file.
image_path_columns = ["Image_PathName_DAPI", "Image_PathName_GFP", "Image_PathName_RFP"]
local_image_dir = f"{pathlib.Path(parquet_path).parent}/Plate_2_images"
nf1_dataset_with_modified_image_paths.loc[:, image_path_columns] = local_image_dir

# NF1 Cell Painting data with images and overlaid outlines from masks;
# note that the data is passed as an existing Pandas DataFrame here.
display_columns = [
    "Metadata_ImageNumber",
    "Metadata_Cells_Number_Object_Number",
    "Image_FileName_GFP",
    "Image_FileName_RFP",
    "Image_FileName_DAPI",
]
CytoDataFrame(
    data=nf1_dataset_with_modified_image_paths,
    data_mask_context_dir=f"{nf1_cellpainting_path}/Plate_2_masks",
)[display_columns][:3]
CPU times: user 261 ms, sys: 177 ms, total: 437 ms
Wall time: 149 ms

%%time
# Export `frame` (assigned in an earlier cell) to OME Parquet, a format
# which uses OME Arrow to store OME-spec images as values within the table.
ome_parquet_file = "example.ome.parquet"
frame.to_ome_parquet(file_path=ome_parquet_file)

# Load the exported OME Parquet file back into a CytoDataFrame.
CytoDataFrame(data=ome_parquet_file)
CPU times: user 958 ms, sys: 182 ms, total: 1.14 s
Wall time: 1.15 s

%%time
# Nuclear speckles data with images and overlaid outlines from masks:
# keep a handful of columns and the first three rows.
display_columns = [
    "Metadata_ImageNumber",
    "Nuclei_Number_Object_Number",
    "Image_FileName_A647",
    "Image_FileName_DAPI",
    "Image_FileName_GOLD",
]
CytoDataFrame(
    data=f"{nuclear_speckles_path}/test_slide1_converted.parquet",
    data_context_dir=f"{nuclear_speckles_path}/images/plate1",
    data_mask_context_dir=f"{nuclear_speckles_path}/masks/plate1",
)[display_columns][:3]
CPU times: user 92.2 ms, sys: 37.9 ms, total: 130 ms
Wall time: 66.1 ms

%%time
# ALSF pediatric cancer atlas plate BR00143976 with images. All rows of the
# selected columns are kept; the result is stored in `cdf` and displayed.
# NOTE(review): segmentation_file_regex presumably maps outline-file patterns
# to the image-file patterns they belong to — confirm against the
# CytoDataFrame documentation.
segmentation_patterns = {
    r"CellsOutlines_BR(\d+)_C(\d{2})_\d+\.tiff": r".*ch3.*\.tiff",
    r"NucleiOutlines_BR(\d+)_C(\d{2})_\d+\.tiff": r".*ch5.*\.tiff",
}
display_columns = [
    "Metadata_ImageNumber",
    "Metadata_Nuclei_Number_Object_Number",
    "Image_FileName_OrigAGP",
    "Image_FileName_OrigDNA",
]
cdf = CytoDataFrame(
    data=f"{pediatric_cancer_atlas_path}/BR00143976_shrunken.parquet",
    data_context_dir=f"{pediatric_cancer_atlas_path}/images/orig",
    data_outline_context_dir=f"{pediatric_cancer_atlas_path}/images/outlines",
    segmentation_file_regex=segmentation_patterns,
)[display_columns]
cdf
CPU times: user 336 ms, sys: 233 ms, total: 570 ms
Wall time: 185 ms

%%time
# Show that an existing CytoDataFrame can be displayed again simply by
# referencing its variable (`cdf` is assigned in an earlier cell).
cdf
CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.1 μs

%%time
# Export `cdf` (assigned in an earlier cell) to OME Parquet, a format
# which uses OME Arrow to store OME-spec images as values within the table.
ome_parquet_file = "example.ome.parquet"
cdf.to_ome_parquet(file_path=ome_parquet_file)

# Load the exported OME Parquet file back into a CytoDataFrame.
CytoDataFrame(data=ome_parquet_file)
CPU times: user 895 ms, sys: 236 ms, total: 1.13 s
Wall time: 1.05 s