You're reading the documentation for a development version. For the latest released version, please have a look at v1.0.1.

coSMicQC in a nutshell¶

This notebook demonstrates various capabilities of coSMicQC using examples.

import pathlib

import pandas as pd
from cytodataframe import CytoDataFrame

import cosmicqc

# Location of the parquet-based dataset used throughout this notebook
# (CellProfiler SQLite data that was processed by CytoTable).
data_path = (
    "../../../tests/data/cytotable/NF1_cellpainting_data/"
    "Plate_2_with_image_data.parquet"
)

# The image and mask directories sit next to the parquet file, so resolve
# the shared parent directory once and derive both context paths from it.
dataset_dir = pathlib.Path(data_path).parent
image_context_dir = dataset_dir / "Plate_2_images"
mask_context_dir = dataset_dir / "Plate_2_masks"

# Build the CytoDataFrame (single-cell DataFrame) from the dataset and
# point it at the image and mask context directories.
scdf = CytoDataFrame(
    data=data_path,
    data_context_dir=image_context_dir,
    data_mask_context_dir=mask_context_dir,
)

# Leave the dataframe as the final expression so the notebook displays it.
scdf
Static snapshot (for non-interactive view)
Metadata_ImageNumber Image_Metadata_Plate_x Metadata_number_of_singlecells Image_Metadata_Site_x Image_Metadata_Well_x Metadata_Cells_Number_Object_Number Metadata_Cytoplasm_Parent_Cells Metadata_Cytoplasm_Parent_Nuclei Metadata_Nuclei_Number_Object_Number Cytoplasm_AreaShape_Area ... Image_Threshold_SumOfEntropies_Cells Image_Threshold_SumOfEntropies_Nuclei Image_Threshold_WeightedVariance_Cells Image_Threshold_WeightedVariance_Nuclei Image_URL_DAPI Image_URL_GFP Image_URL_RFP Image_Width_DAPI Image_Width_GFP Image_Width_RFP
0 1 Plate_2 44 1 A12 1 1 2 2 21024.0 ... -12.181288 -11.699993 0.992624 0.657791 1224 1224 1224
1 1 Plate_2 44 1 A12 4 4 7 7 12754.0 ... -12.181288 -11.699993 0.992624 0.657791 1224 1224 1224
2 1 Plate_2 44 1 A12 7 7 10 10 23976.0 ... -12.181288 -11.699993 0.992624 0.657791 1224 1224 1224
3 1 Plate_2 44 1 A12 8 8 12 12 19374.0 ... -12.181288 -11.699993 0.992624 0.657791 1224 1224 1224
4 1 Plate_2 44 1 A12 9 9 13 13 27385.0 ... -12.181288 -11.699993 0.992624 0.657791 1224 1224 1224
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1709 128 Plate_2 59 4 H7 10 10 14 14 24942.0 ... -12.566582 -11.633043 1.624310 0.545186 1224 1224 1224
1710 128 Plate_2 59 4 H7 11 11 15 15 6627.0 ... -12.566582 -11.633043 1.624310 0.545186 1224 1224 1224
1711 128 Plate_2 59 4 H7 12 12 16 16 11216.0 ... -12.566582 -11.633043 1.624310 0.545186 1224 1224 1224
1712 128 Plate_2 59 4 H7 13 13 17 17 15279.0 ... -12.566582 -11.633043 1.624310 0.545186 1224 1224 1224
1713 128 Plate_2 59 4 H7 14 14 20 20 7106.0 ... -12.566582 -11.633043 1.624310 0.545186 1224 1224 1224

1714 rows × 2076 columns


# Flag, per row, whether the cell is an outlier under a custom threshold
# definition. `feature_thresholds` maps a column name to the z-score that
# is considered the limit for that column. The call yields one boolean per
# row (a Series named `Metadata_cqc_custom_is_outlier` in this run), which
# is then sorted so non-outliers (False) are listed before outliers (True).
cosmicqc.analyze.identify_outliers(
    df=scdf,
    feature_thresholds={"Nuclei_AreaShape_Area": -1},
).sort_values()
0       False
1085    False
1083    False
1082    False
1080    False
        ...  
572      True
571      True
567      True
280      True
856      True
Name: Metadata_cqc_custom_is_outlier, Length: 1714, dtype: bool
# Report the outliers for a column name and threshold given through the
# `feature_thresholds` parameter of the `find_outliers` function.
# In this run the call prints a summary (outlier count and percentage, plus
# the min/max of the thresholded feature among the outliers) and displays
# only the outlier rows, limited to the thresholded feature column and the
# requested `metadata_columns`.
cosmicqc.analyze.find_outliers(
    df=scdf,
    metadata_columns=["Metadata_ImageNumber", "Image_Metadata_Plate_x"],
    feature_thresholds={"Nuclei_AreaShape_Area": -1},
)
Number of outliers: 328 (19.14%)
Outliers Range:
Nuclei_AreaShape_Area Min: 734.0
Nuclei_AreaShape_Area Max: 1904.0
Clamping filter plot threshold for column 'Nuclei_AreaShape_Area' from 1908.5286283956243 to 1904.0 because it is outside data range [734.0, 1904.0].
Static snapshot (for non-interactive view)
Nuclei_AreaShape_Area Metadata_ImageNumber Image_Metadata_Plate_x
23 921.0 2 Plate_2
28 845.0 2 Plate_2
29 1024.0 2 Plate_2
32 787.0 2 Plate_2
37 1347.0 2 Plate_2
... ... ... ...
1682 1497.0 127 Plate_2
1689 1794.0 127 Plate_2
1692 1732.0 127 Plate_2
1699 1149.0 127 Plate_2
1707 1594.0 128 Plate_2

328 rows × 3 columns


# Create a labeled dataset which includes z-scores and whether those scores
# are interpreted as outliers or inliers. The string "large_nuclei" selects
# a pre-defined threshold set loaded from the defaults (cosmicqc can accept
# user-defined thresholds too!).
# With `include_threshold_scores=True` the result in this run carries one
# `..._zscore` column per thresholded feature alongside the
# `Metadata_cqc_large_nuclei_is_outlier` flag column.
labeled_scdf = cosmicqc.analyze.label_outliers(
    df=scdf, include_threshold_scores=True, feature_thresholds="large_nuclei"
)
# Leave the labeled dataframe as the final expression so it is displayed.
labeled_scdf
Static snapshot (for non-interactive view)
Metadata_ImageNumber Image_Metadata_Plate_x Metadata_number_of_singlecells Image_Metadata_Site_x Image_Metadata_Well_x Metadata_Cells_Number_Object_Number Metadata_Cytoplasm_Parent_Cells Metadata_Cytoplasm_Parent_Nuclei Metadata_Nuclei_Number_Object_Number Cytoplasm_AreaShape_Area ... Image_Threshold_WeightedVariance_Nuclei Image_URL_DAPI Image_URL_GFP Image_URL_RFP Image_Width_DAPI Image_Width_GFP Image_Width_RFP Metadata_cqc_large_nuclei_Nuclei_AreaShape_Area_zscore Metadata_cqc_large_nuclei_Nuclei_AreaShape_FormFactor_zscore Metadata_cqc_large_nuclei_is_outlier
0 1 Plate_2 44 1 A12 1 1 2 2 21024.0 ... 0.657791 1224 1224 1224 0.848820 0.219903 False
1 1 Plate_2 44 1 A12 4 4 7 7 12754.0 ... 0.657791 1224 1224 1224 -0.252521 -1.280795 False
2 1 Plate_2 44 1 A12 7 7 10 10 23976.0 ... 0.657791 1224 1224 1224 -0.402491 -0.325652 False
3 1 Plate_2 44 1 A12 8 8 12 12 19374.0 ... 0.657791 1224 1224 1224 0.329549 -0.268920 False
4 1 Plate_2 44 1 A12 9 9 13 13 27385.0 ... 0.657791 1224 1224 1224 1.153446 0.028845 False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1709 128 Plate_2 59 4 H7 10 10 14 14 24942.0 ... 0.545186 1224 1224 1224 0.598557 -0.280063 False
1710 128 Plate_2 59 4 H7 11 11 15 15 6627.0 ... 0.545186 1224 1224 1224 -0.716490 0.068293 False
1711 128 Plate_2 59 4 H7 12 12 16 16 11216.0 ... 0.545186 1224 1224 1224 1.187189 0.833264 False
1712 128 Plate_2 59 4 H7 13 13 17 17 15279.0 ... 0.545186 1224 1224 1224 -0.699619 0.534479 False
1713 128 Plate_2 59 4 H7 14 14 20 20 7106.0 ... 0.545186 1224 1224 1224 -0.990185 0.356614 False

1714 rows × 2079 columns


# Show cropped images through CytoDataFrame from the dataset to help analyze
# outliers. Sorting by the outlier flag in descending order places the rows
# labeled True (outliers) first; the selection then keeps a few identifying
# metadata columns, the outlier flag, and the three image filename columns.
# NOTE(review): the commented-out call below is spelled `_enbable_debug_mode`
# in the original notebook; this looks like a typo (likely
# `_enable_debug_mode`) - confirm against CytoDataFrame before uncommenting.
# labeled_scdf._enbable_debug_mode()
labeled_scdf.sort_values(by="Metadata_cqc_large_nuclei_is_outlier", ascending=False)[
    [
        "Metadata_ImageNumber",
        "Metadata_Cells_Number_Object_Number",
        "Metadata_cqc_large_nuclei_is_outlier",
        "Image_FileName_GFP",
        "Image_FileName_RFP",
        "Image_FileName_DAPI",
    ]
]

# A CytoDataFrame (imported from the `cytodataframe` package) can be
# converted to a plain pandas DataFrame when or if needed by passing it to
# the `pd.DataFrame` constructor.
df = pd.DataFrame(scdf)
# Print the resulting type to confirm the conversion.
print(type(df))
# Leave the plain DataFrame as the final expression so it is displayed.
df
<class 'pandas.core.frame.DataFrame'>
Metadata_ImageNumber Image_Metadata_Plate_x Metadata_number_of_singlecells Image_Metadata_Site_x Image_Metadata_Well_x Metadata_Cells_Number_Object_Number Metadata_Cytoplasm_Parent_Cells Metadata_Cytoplasm_Parent_Nuclei Metadata_Nuclei_Number_Object_Number Cytoplasm_AreaShape_Area ... Image_Threshold_SumOfEntropies_Cells Image_Threshold_SumOfEntropies_Nuclei Image_Threshold_WeightedVariance_Cells Image_Threshold_WeightedVariance_Nuclei Image_URL_DAPI Image_URL_GFP Image_URL_RFP Image_Width_DAPI Image_Width_GFP Image_Width_RFP
0 1 Plate_2 44 1 A12 1 1 2 2 21024.0 ... -12.181288 -11.699993 0.992624 0.657791 file:/home/jenna/nf1_cellpainting_data/1.cellp... file:/home/jenna/nf1_cellpainting_data/1.cellp... file:/home/jenna/nf1_cellpainting_data/1.cellp... 1224 1224 1224
1 1 Plate_2 44 1 A12 4 4 7 7 12754.0 ... -12.181288 -11.699993 0.992624 0.657791 file:/home/jenna/nf1_cellpainting_data/1.cellp... file:/home/jenna/nf1_cellpainting_data/1.cellp... file:/home/jenna/nf1_cellpainting_data/1.cellp... 1224 1224 1224
2 1 Plate_2 44 1 A12 7 7 10 10 23976.0 ... -12.181288 -11.699993 0.992624 0.657791 file:/home/jenna/nf1_cellpainting_data/1.cellp... file:/home/jenna/nf1_cellpainting_data/1.cellp... file:/home/jenna/nf1_cellpainting_data/1.cellp... 1224 1224 1224
3 1 Plate_2 44 1 A12 8 8 12 12 19374.0 ... -12.181288 -11.699993 0.992624 0.657791 file:/home/jenna/nf1_cellpainting_data/1.cellp... file:/home/jenna/nf1_cellpainting_data/1.cellp... file:/home/jenna/nf1_cellpainting_data/1.cellp... 1224 1224 1224
4 1 Plate_2 44 1 A12 9 9 13 13 27385.0 ... -12.181288 -11.699993 0.992624 0.657791 file:/home/jenna/nf1_cellpainting_data/1.cellp... file:/home/jenna/nf1_cellpainting_data/1.cellp... file:/home/jenna/nf1_cellpainting_data/1.cellp... 1224 1224 1224
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1709 128 Plate_2 59 4 H7 10 10 14 14 24942.0 ... -12.566582 -11.633043 1.624310 0.545186 file:/home/jenna/nf1_cellpainting_data/1.cellp... file:/home/jenna/nf1_cellpainting_data/1.cellp... file:/home/jenna/nf1_cellpainting_data/1.cellp... 1224 1224 1224
1710 128 Plate_2 59 4 H7 11 11 15 15 6627.0 ... -12.566582 -11.633043 1.624310 0.545186 file:/home/jenna/nf1_cellpainting_data/1.cellp... file:/home/jenna/nf1_cellpainting_data/1.cellp... file:/home/jenna/nf1_cellpainting_data/1.cellp... 1224 1224 1224
1711 128 Plate_2 59 4 H7 12 12 16 16 11216.0 ... -12.566582 -11.633043 1.624310 0.545186 file:/home/jenna/nf1_cellpainting_data/1.cellp... file:/home/jenna/nf1_cellpainting_data/1.cellp... file:/home/jenna/nf1_cellpainting_data/1.cellp... 1224 1224 1224
1712 128 Plate_2 59 4 H7 13 13 17 17 15279.0 ... -12.566582 -11.633043 1.624310 0.545186 file:/home/jenna/nf1_cellpainting_data/1.cellp... file:/home/jenna/nf1_cellpainting_data/1.cellp... file:/home/jenna/nf1_cellpainting_data/1.cellp... 1224 1224 1224
1713 128 Plate_2 59 4 H7 14 14 20 20 7106.0 ... -12.566582 -11.633043 1.624310 0.545186 file:/home/jenna/nf1_cellpainting_data/1.cellp... file:/home/jenna/nf1_cellpainting_data/1.cellp... file:/home/jenna/nf1_cellpainting_data/1.cellp... 1224 1224 1224

1714 rows × 2076 columns