CytoDataFrame at a Glance
This notebook demonstrates various capabilities of CytoDataFrame using examples.
CytoDataFrame is intended to provide a Pandas-like DataFrame experience, enhanced with single-cell visual information that can be viewed directly in a Jupyter notebook.
import logging
import pathlib
import warnings
import pandas as pd
from cytodataframe.frame import CytoDataFrame
# Show INFO-level log messages in the notebook.
logging.basicConfig(level=logging.INFO)  # or logging.DEBUG, WARNING, etc.
# basicConfig is a no-op when the root logger already has handlers,
# so also set the root logger level explicitly.
logging.getLogger().setLevel(logging.INFO)

# filter warnings from skimage about imageio
warnings.filterwarnings(
    "ignore",
    message=r"The plugin infrastructure.*",
    category=FutureWarning,
)

# create paths for use with CytoDataFrames below
# note: none of these end with a slash because the cells below
# append sub-paths using "/", which would otherwise produce "//".
jump_data_path = "../../../tests/data/cytotable/JUMP_plate_BR00117006"
nf1_cellpainting_path = "../../../tests/data/cytotable/NF1_cellpainting_data_shrunken"
nuclear_speckles_path = "../../../tests/data/cytotable/nuclear_speckles"
pediatric_cancer_atlas_path = (
    "../../../tests/data/cytotable/pediatric_cancer_atlas_profiling"
)
%%time
# view JUMP plate BR00117006 with images
frame = CytoDataFrame(
data=f"{jump_data_path}/BR00117006_shrunken.parquet",
data_context_dir=f"{jump_data_path}/images/orig",
)[
[
"Metadata_ImageNumber",
"Cells_Number_Object_Number",
"Image_FileName_OrigAGP",
"Image_FileName_OrigDNA",
"Image_FileName_OrigRNA",
]
][:3]
frame
CPU times: user 978 ms, sys: 701 ms, total: 1.68 s
Wall time: 640 ms
Metadata_ImageNumber | Cells_Number_Object_Number | Image_FileName_OrigAGP | Image_FileName_OrigDNA | Image_FileName_OrigRNA | |
---|---|---|---|---|---|
0 | 1 | 1 | |||
1 | 1 | 2 | |||
2 | 1 | 3 |
%%time
# view JUMP plate BR00117006 with images and overlaid outlines for segmentation
CytoDataFrame(
data=f"{jump_data_path}/BR00117006_shrunken.parquet",
data_context_dir=f"{jump_data_path}/images/orig",
data_outline_context_dir=f"{jump_data_path}/images/outlines",
)[
[
"Metadata_ImageNumber",
"Cells_Number_Object_Number",
"Image_FileName_OrigAGP",
"Image_FileName_OrigDNA",
"Image_FileName_OrigRNA",
]
][:3]
CPU times: user 952 ms, sys: 727 ms, total: 1.68 s
Wall time: 507 ms
Metadata_ImageNumber | Cells_Number_Object_Number | Image_FileName_OrigAGP | Image_FileName_OrigDNA | Image_FileName_OrigRNA | |
---|---|---|---|---|---|
0 | 1 | 1 | |||
1 | 1 | 2 | |||
2 | 1 | 3 |
%%time
# view JUMP plate BR00117006 with images and overlaid outlines for segmentation
# and changing the color to something besides the default (default is green).
CytoDataFrame(
data=f"{jump_data_path}/BR00117006_shrunken.parquet",
data_context_dir=f"{jump_data_path}/images/orig",
data_outline_context_dir=f"{jump_data_path}/images/outlines",
display_options={"outline_color": (200, 100, 255)},
)[
[
"Metadata_ImageNumber",
"Cells_Number_Object_Number",
"Image_FileName_OrigAGP",
"Image_FileName_OrigDNA",
"Image_FileName_OrigRNA",
]
][:3]
CPU times: user 932 ms, sys: 715 ms, total: 1.65 s
Wall time: 491 ms
Metadata_ImageNumber | Cells_Number_Object_Number | Image_FileName_OrigAGP | Image_FileName_OrigDNA | Image_FileName_OrigRNA | |
---|---|---|---|---|---|
0 | 1 | 1 | |||
1 | 1 | 2 | |||
2 | 1 | 3 |
%%time
# view JUMP plate BR00117006 with images and adjust the brightness
CytoDataFrame(
data=f"{jump_data_path}/BR00117006_shrunken.parquet",
data_context_dir=f"{jump_data_path}/images/orig",
display_options={"brightness": 10},
)[
[
"Metadata_ImageNumber",
"Cells_Number_Object_Number",
"Image_FileName_OrigAGP",
"Image_FileName_OrigDNA",
"Image_FileName_OrigRNA",
]
][:3]
CPU times: user 959 ms, sys: 676 ms, total: 1.64 s
Wall time: 522 ms
Metadata_ImageNumber | Cells_Number_Object_Number | Image_FileName_OrigAGP | Image_FileName_OrigDNA | Image_FileName_OrigRNA | |
---|---|---|---|---|---|
0 | 1 | 1 | |||
1 | 1 | 2 | |||
2 | 1 | 3 |
%%time
# view JUMP plate BR00117006 with images and overlaid outlines for segmentation
# and removing the optional red center dot.
CytoDataFrame(
data=f"{jump_data_path}/BR00117006_shrunken.parquet",
data_context_dir=f"{jump_data_path}/images/orig",
data_outline_context_dir=f"{jump_data_path}/images/outlines",
display_options={"center_dot": False},
)[
[
"Metadata_ImageNumber",
"Cells_Number_Object_Number",
"Image_FileName_OrigAGP",
"Image_FileName_OrigDNA",
"Image_FileName_OrigRNA",
]
][:3]
CPU times: user 835 ms, sys: 415 ms, total: 1.25 s
Wall time: 593 ms
Metadata_ImageNumber | Cells_Number_Object_Number | Image_FileName_OrigAGP | Image_FileName_OrigDNA | Image_FileName_OrigRNA | |
---|---|---|---|---|---|
0 | 1 | 1 | |||
1 | 1 | 2 | |||
2 | 1 | 3 |
%%time
# view JUMP plate BR00117006 with images and change the display width
CytoDataFrame(
data=f"{jump_data_path}/BR00117006_shrunken.parquet",
data_context_dir=f"{jump_data_path}/images/orig",
data_outline_context_dir=f"{jump_data_path}/images/outlines",
display_options={"width": "100"},
)[
[
"Metadata_ImageNumber",
"Cells_Number_Object_Number",
"Image_FileName_OrigAGP",
"Image_FileName_OrigDNA",
"Image_FileName_OrigRNA",
]
][:3]
CPU times: user 901 ms, sys: 654 ms, total: 1.56 s
Wall time: 482 ms
Metadata_ImageNumber | Cells_Number_Object_Number | Image_FileName_OrigAGP | Image_FileName_OrigDNA | Image_FileName_OrigRNA | |
---|---|---|---|---|---|
0 | 1 | 1 | |||
1 | 1 | 2 | |||
2 | 1 | 3 |
%%time
# view JUMP plate BR00117006 with images, change the display height and width
# and also transpose for a different view of things.
CytoDataFrame(
data=f"{jump_data_path}/BR00117006_shrunken.parquet",
data_context_dir=f"{jump_data_path}/images/orig",
data_outline_context_dir=f"{jump_data_path}/images/outlines",
display_options={"width": "200px", "height": "auto"},
)[
[
"Metadata_ImageNumber",
"Cells_Number_Object_Number",
"Image_FileName_OrigAGP",
"Image_FileName_OrigDNA",
"Image_FileName_OrigRNA",
]
][:5].T
CPU times: user 950 ms, sys: 688 ms, total: 1.64 s
Wall time: 507 ms
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
Metadata_ImageNumber | 1 | 1 | 1 | 1 | 1 |
Cells_Number_Object_Number | 1 | 2 | 3 | 4 | 5 |
Image_FileName_OrigAGP | |||||
Image_FileName_OrigDNA | |||||
Image_FileName_OrigRNA |
%%time
# view JUMP plate BR00117006 with images, changing the bounding box
# using offsets so each image has roughly the same size.
CytoDataFrame(
data=f"{jump_data_path}/BR00117006_shrunken.parquet",
data_context_dir=f"{jump_data_path}/images/orig",
data_outline_context_dir=f"{jump_data_path}/images/outlines",
display_options={
"offset_bounding_box": {
"x_min": -20,
"y_min": -20,
"x_max": 20,
"y_max": 20,
},
},
)[
[
"Metadata_ImageNumber",
"Cells_Number_Object_Number",
"Image_FileName_OrigAGP",
"Image_FileName_OrigDNA",
"Image_FileName_OrigRNA",
]
][:5]
CPU times: user 917 ms, sys: 541 ms, total: 1.46 s
Wall time: 733 ms
Metadata_ImageNumber | Cells_Number_Object_Number | Image_FileName_OrigAGP | Image_FileName_OrigDNA | Image_FileName_OrigRNA | |
---|---|---|---|---|---|
0 | 1 | 1 | |||
1 | 1 | 2 | |||
2 | 1 | 3 | |||
3 | 1 | 4 | |||
4 | 1 | 5 |
%%time
# view NF1 Cell Painting data with images
CytoDataFrame(
data=f"{nf1_cellpainting_path}/Plate_2_with_image_data_shrunken.parquet",
data_context_dir=f"{nf1_cellpainting_path}/Plate_2_images",
)[
[
"Metadata_ImageNumber",
"Metadata_Cells_Number_Object_Number",
"Image_FileName_GFP",
"Image_FileName_RFP",
"Image_FileName_DAPI",
]
][:3]
CPU times: user 246 ms, sys: 157 ms, total: 403 ms
Wall time: 149 ms
Metadata_ImageNumber | Metadata_Cells_Number_Object_Number | Image_FileName_GFP | Image_FileName_RFP | Image_FileName_DAPI | |
---|---|---|---|---|---|
353 | 31 | 4 | |||
1564 | 113 | 17 | |||
1275 | 94 | 5 |
%%time
# view NF1 Cell Painting data with images and overlaid outlines from masks
CytoDataFrame(
data=f"{nf1_cellpainting_path}/Plate_2_with_image_data_shrunken.parquet",
data_context_dir=f"{nf1_cellpainting_path}/Plate_2_images",
data_mask_context_dir=f"{nf1_cellpainting_path}/Plate_2_masks",
)[
[
"Metadata_ImageNumber",
"Metadata_Cells_Number_Object_Number",
"Image_FileName_GFP",
"Image_FileName_RFP",
"Image_FileName_DAPI",
]
][:3]
CPU times: user 309 ms, sys: 167 ms, total: 476 ms
Wall time: 280 ms
Metadata_ImageNumber | Metadata_Cells_Number_Object_Number | Image_FileName_GFP | Image_FileName_RFP | Image_FileName_DAPI | |
---|---|---|---|---|---|
353 | 31 | 4 | |||
1564 | 113 | 17 | |||
1275 | 94 | 5 |
%%time
# add active paths on the local system to show how CytoDataFrame
# may be used without specifying a context directory for images.
# Note: normally these paths are local to the system where the
# profile data was generated, which often is not the same as the
# system which will be used to analyze the data.
parquet_path = f"{nf1_cellpainting_path}/Plate_2_with_image_data_shrunken.parquet"
nf1_dataset_with_modified_image_paths = pd.read_parquet(path=parquet_path)
nf1_dataset_with_modified_image_paths.loc[
:, ["Image_PathName_DAPI", "Image_PathName_GFP", "Image_PathName_RFP"]
] = f"{pathlib.Path(parquet_path).parent}/Plate_2_images"
# view NF1 Cell Painting data with images and overlaid outlines from masks
CytoDataFrame(
# note: we can read directly from an existing Pandas DataFrame
data=nf1_dataset_with_modified_image_paths,
data_mask_context_dir=f"{nf1_cellpainting_path}/Plate_2_masks",
)[
[
"Metadata_ImageNumber",
"Metadata_Cells_Number_Object_Number",
"Image_FileName_GFP",
"Image_FileName_RFP",
"Image_FileName_DAPI",
]
][:3]
CPU times: user 268 ms, sys: 190 ms, total: 458 ms
Wall time: 152 ms
Metadata_ImageNumber | Metadata_Cells_Number_Object_Number | Image_FileName_GFP | Image_FileName_RFP | Image_FileName_DAPI | |
---|---|---|---|---|---|
353 | 31 | 4 | |||
1564 | 113 | 17 | |||
1275 | 94 | 5 |
%%time
# view nuclear speckles data with images and overlaid outlines from masks
CytoDataFrame(
data=f"{nuclear_speckles_path}/test_slide1_converted.parquet",
data_context_dir=f"{nuclear_speckles_path}/images/plate1",
data_mask_context_dir=f"{nuclear_speckles_path}/masks/plate1",
)[
[
"Metadata_ImageNumber",
"Nuclei_Number_Object_Number",
"Image_FileName_A647",
"Image_FileName_DAPI",
"Image_FileName_GOLD",
]
][:3]
CPU times: user 123 ms, sys: 73.8 ms, total: 197 ms
Wall time: 75.8 ms
Metadata_ImageNumber | Nuclei_Number_Object_Number | Image_FileName_A647 | Image_FileName_DAPI | Image_FileName_GOLD | |
---|---|---|---|---|---|
0 | 1 | 1 | slide1_A1_M10_CH1_Z09_illumcorrect.tiff | slide1_A1_M10_CH2_Z09_illumcorrect.tiff | |
1 | 1 | 2 | slide1_A1_M10_CH1_Z09_illumcorrect.tiff | slide1_A1_M10_CH2_Z09_illumcorrect.tiff | |
2 | 1 | 3 | slide1_A1_M10_CH1_Z09_illumcorrect.tiff | slide1_A1_M10_CH2_Z09_illumcorrect.tiff |
%%time
# view ALSF pediatric cancer atlas plate BR00143976 with images
cdf = CytoDataFrame(
data=f"{pediatric_cancer_atlas_path}/BR00143976_shrunken.parquet",
data_context_dir=f"{pediatric_cancer_atlas_path}/images/orig",
data_outline_context_dir=f"{pediatric_cancer_atlas_path}/images/outlines",
segmentation_file_regex={
r"CellsOutlines_BR(\d+)_C(\d{2})_\d+\.tiff": r".*ch3.*\.tiff",
r"NucleiOutlines_BR(\d+)_C(\d{2})_\d+\.tiff": r".*ch5.*\.tiff",
},
)[
[
"Metadata_ImageNumber",
"Metadata_Nuclei_Number_Object_Number",
"Image_FileName_OrigAGP",
"Image_FileName_OrigDNA",
]
]
cdf
CPU times: user 344 ms, sys: 199 ms, total: 543 ms
Wall time: 228 ms
Metadata_ImageNumber | Metadata_Nuclei_Number_Object_Number | Image_FileName_OrigAGP | Image_FileName_OrigDNA | |
---|---|---|---|---|
0 | 3 | 3 | ||
1 | 3 | 4 | ||
2 | 3 | 6 | ||
3 | 3 | 7 | ||
4 | 3 | 8 |
%%time
# Show that an existing CytoDataFrame can be displayed again simply by
# referencing its variable (here `cdf`, assigned in an earlier cell).
# NOTE(review): the reported time appears to cover only evaluating the
# name, not rendering the images — confirm before citing it as a
# display benchmark.
cdf
CPU times: user 1 µs, sys: 1e+03 ns, total: 2 µs
Wall time: 3.1 µs
Metadata_ImageNumber | Metadata_Nuclei_Number_Object_Number | Image_FileName_OrigAGP | Image_FileName_OrigDNA | |
---|---|---|---|---|
0 | 3 | 3 | ||
1 | 3 | 4 | ||
2 | 3 | 6 | ||
3 | 3 | 7 | ||
4 | 3 | 8 |