{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# `coSMicQC` in a nutshell\n", "\n", "This notebook demonstrates various capabilities of `coSMicQC` using examples." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pathlib\n", "\n", "import pandas as pd\n", "from cytodataframe import CytoDataFrame\n", "\n", "import cosmicqc\n", "\n", "# set a path for the parquet-based dataset\n", "# (in this case, CellProfiler SQLite data processed by CytoTable)\n", "data_path = (\n", " \"../../../tests/data/cytotable/NF1_cellpainting_data/\"\n", " \"Plate_2_with_image_data.parquet\"\n", ")\n", "\n", "# set a context directory for images associated with the dataset\n", "image_context_dir = pathlib.Path(data_path).parent / \"Plate_2_images\"\n", "mask_context_dir = pathlib.Path(data_path).parent / \"Plate_2_masks\"\n", "\n", "# create a cosmicqc CytoDataFrame (single-cell DataFrame)\n", "scdf = CytoDataFrame(\n", " data=data_path,\n", " data_context_dir=image_context_dir,\n", " data_mask_context_dir=mask_context_dir,\n", ")\n", "\n", "# display the dataframe\n", "scdf" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 False\n", "1085 False\n", "1083 False\n", "1082 False\n", "1080 False\n", " ... 
\n", "572 True\n", "571 True\n", "567 True\n", "280 True\n", "856 True\n", "Name: cqc.custom.Z_Score.Nuclei_AreaShape_Area, Length: 1714, dtype: bool" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Flag each row as an outlier (True) or not (False) using a threshold\n", "# definition that maps a column name to the z-score treated as the limit\n", "# for that column.\n", "cosmicqc.analyze.identify_outliers(\n", " df=scdf,\n", " feature_thresholds={\"Nuclei_AreaShape_Area\": -1},\n", ").sort_values()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of outliers: 328 (19.14%)\n", "Outliers Range:\n", "Nuclei_AreaShape_Area Min: 734.0\n", "Nuclei_AreaShape_Area Max: 1904.0\n" ] }, { "data": { "text/plain": [ " Nuclei_AreaShape_Area Metadata_ImageNumber Image_Metadata_Plate_x\n", "23 921.0 2 Plate_2\n", "28 845.0 2 Plate_2\n", "29 1024.0 2 Plate_2\n", "32 787.0 2 Plate_2\n", "37 1347.0 2 Plate_2\n", "... ... ... ...\n", "1682 1497.0 127 Plate_2\n", "1689 1794.0 127 Plate_2\n", "1692 1732.0 127 Plate_2\n", "1699 1149.0 127 Plate_2\n", "1707 1594.0 128 Plate_2\n", "\n", "[328 rows x 3 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Report the number of outliers and return the outlier rows for a column name\n", "# and threshold passed to `find_outliers` via the `feature_thresholds` parameter.\n", "cosmicqc.analyze.find_outliers(\n", " df=scdf,\n", " metadata_columns=[\"Metadata_ImageNumber\", \"Image_Metadata_Plate_x\"],\n", " feature_thresholds={\"Nuclei_AreaShape_Area\": -1},\n", ")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | cqc.small_and_low_formfactor_nuclei.Z_Score.Nuclei_AreaShape_Area | \n", "cqc.small_and_low_formfactor_nuclei.Z_Score.Nuclei_AreaShape_FormFactor | \n", "cqc.small_and_low_formfactor_nuclei.is_outlier | \n", "cqc.elongated_nuclei.Z_Score.Nuclei_AreaShape_Eccentricity | \n", "cqc.elongated_nuclei.is_outlier | \n", "cqc.large_nuclei.Z_Score.Nuclei_AreaShape_Area | \n", "cqc.large_nuclei.Z_Score.Nuclei_AreaShape_FormFactor | \n", "cqc.large_nuclei.is_outlier | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "0.848820 | \n", "0.219903 | \n", "False | \n", "0.498274 | \n", "False | \n", "0.848820 | \n", "0.219903 | \n", "False | \n", "
1 | \n", "-0.252521 | \n", "-1.280795 | \n", "False | \n", "-0.659400 | \n", "False | \n", "-0.252521 | \n", "-1.280795 | \n", "False | \n", "
2 | \n", "-0.402491 | \n", "-0.325652 | \n", "False | \n", "0.819165 | \n", "False | \n", "-0.402491 | \n", "-0.325652 | \n", "False | \n", "
3 | \n", "0.329549 | \n", "-0.268920 | \n", "False | \n", "0.961218 | \n", "False | \n", "0.329549 | \n", "-0.268920 | \n", "False | \n", "
4 | \n", "1.153446 | \n", "0.028845 | \n", "False | \n", "-0.372891 | \n", "False | \n", "1.153446 | \n", "0.028845 | \n", "False | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1709 | \n", "0.598557 | \n", "-0.280063 | \n", "False | \n", "0.923075 | \n", "False | \n", "0.598557 | \n", "-0.280063 | \n", "False | \n", "
1710 | \n", "-0.716490 | \n", "0.068293 | \n", "False | \n", "0.650830 | \n", "False | \n", "-0.716490 | \n", "0.068293 | \n", "False | \n", "
1711 | \n", "1.187189 | \n", "0.833264 | \n", "False | \n", "-0.752359 | \n", "False | \n", "1.187189 | \n", "0.833264 | \n", "False | \n", "
1712 | \n", "-0.699619 | \n", "0.534479 | \n", "False | \n", "-0.747030 | \n", "False | \n", "-0.699619 | \n", "0.534479 | \n", "False | \n", "
1713 | \n", "-0.990185 | \n", "0.356614 | \n", "False | \n", "-1.309290 | \n", "False | \n", "-0.990185 | \n", "0.356614 | \n", "False | \n", "
1714 rows × 8 columns
\n", "\n", " | Metadata_ImageNumber | \n", "Metadata_Cells_Number_Object_Number | \n", "cqc.large_nuclei.is_outlier | \n", "Image_FileName_GFP | \n", "Image_FileName_RFP | \n", "Image_FileName_DAPI | \n", "
---|---|---|---|---|---|---|
699 | \n", "50 | \n", "2 | \n", "True | \n", "|||
1557 | \n", "113 | \n", "10 | \n", "True | \n", "|||
1677 | \n", "126 | \n", "9 | \n", "True | \n", "|||
457 | \n", "34 | \n", "6 | \n", "True | \n", "|||
882 | \n", "61 | \n", "6 | \n", "True | \n", "|||
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
570 | \n", "45 | \n", "13 | \n", "False | \n", "|||
569 | \n", "45 | \n", "10 | \n", "False | \n", "|||
568 | \n", "45 | \n", "9 | \n", "False | \n", "|||
567 | \n", "45 | \n", "8 | \n", "False | \n", "|||
1713 | \n", "128 | \n", "14 | \n", "False | \n", "
1714 rows × 6 columns
" ], "text/plain": [ " Metadata_ImageNumber Metadata_Cells_Number_Object_Number \\\n", "699 50 2 \n", "1557 113 10 \n", "1677 126 9 \n", "457 34 6 \n", "882 61 6 \n", "... ... ... \n", "570 45 13 \n", "569 45 10 \n", "568 45 9 \n", "567 45 8 \n", "1713 128 14 \n", "\n", " cqc.large_nuclei.is_outlier Image_FileName_GFP \\\n", "699 True D12_01_2_2_GFP_001.tif \n", "1557 True H12_01_2_1_GFP_001.tif \n", "1677 True H7_01_2_2_GFP_001.tif \n", "457 True C12_01_2_2_GFP_001.tif \n", "882 True D7_01_2_1_GFP_001.tif \n", "... ... ... \n", "570 False C7_01_2_1_GFP_001.tif \n", "569 False C7_01_2_1_GFP_001.tif \n", "568 False C7_01_2_1_GFP_001.tif \n", "567 False C7_01_2_1_GFP_001.tif \n", "1713 False H7_01_2_4_GFP_001.tif \n", "\n", " Image_FileName_RFP Image_FileName_DAPI \n", "699 D12_01_3_2_RFP_001.tif D12_01_1_2_DAPI_001.tif \n", "1557 H12_01_3_1_RFP_001.tif H12_01_1_1_DAPI_001.tif \n", "1677 H7_01_3_2_RFP_001.tif H7_01_1_2_DAPI_001.tif \n", "457 C12_01_3_2_RFP_001.tif C12_01_1_2_DAPI_001.tif \n", "882 D7_01_3_1_RFP_001.tif D7_01_1_1_DAPI_001.tif \n", "... ... ... 
\n", "570 C7_01_3_1_RFP_001.tif C7_01_1_1_DAPI_001.tif \n", "569 C7_01_3_1_RFP_001.tif C7_01_1_1_DAPI_001.tif \n", "568 C7_01_3_1_RFP_001.tif C7_01_1_1_DAPI_001.tif \n", "567 C7_01_3_1_RFP_001.tif C7_01_1_1_DAPI_001.tif \n", "1713 H7_01_3_4_RFP_001.tif H7_01_1_4_DAPI_001.tif \n", "\n", "[1714 rows x 6 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# show cropped images through CytoDataFrame from the dataset to help analyze outliers\n", "labeled_scdf.sort_values(by=\"cqc.large_nuclei.is_outlier\", ascending=False)[\n", " [\n", " \"Metadata_ImageNumber\",\n", " \"Metadata_Cells_Number_Object_Number\",\n", " \"cqc.large_nuclei.is_outlier\",\n", " \"Image_FileName_GFP\",\n", " \"Image_FileName_RFP\",\n", " \"Image_FileName_DAPI\",\n", " ]\n", "]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " | Metadata_ImageNumber | \n", "Image_Metadata_Plate_x | \n", "Metadata_number_of_singlecells | \n", "Image_Metadata_Site_x | \n", "Image_Metadata_Well_x | \n", "Metadata_Cells_Number_Object_Number | \n", "Metadata_Cytoplasm_Parent_Cells | \n", "Metadata_Cytoplasm_Parent_Nuclei | \n", "Metadata_Nuclei_Number_Object_Number | \n", "Cytoplasm_AreaShape_Area | \n", "... | \n", "Image_Threshold_SumOfEntropies_Cells | \n", "Image_Threshold_SumOfEntropies_Nuclei | \n", "Image_Threshold_WeightedVariance_Cells | \n", "Image_Threshold_WeightedVariance_Nuclei | \n", "Image_URL_DAPI | \n", "Image_URL_GFP | \n", "Image_URL_RFP | \n", "Image_Width_DAPI | \n", "Image_Width_GFP | \n", "Image_Width_RFP | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "Plate_2 | \n", "44 | \n", "1 | \n", "A12 | \n", "1 | \n", "1 | \n", "2 | \n", "2 | \n", "21024.0 | \n", "... | \n", "-12.181288 | \n", "-11.699993 | \n", "0.992624 | \n", "0.657791 | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "1224 | \n", "1224 | \n", "1224 | \n", "
1 | \n", "1 | \n", "Plate_2 | \n", "44 | \n", "1 | \n", "A12 | \n", "4 | \n", "4 | \n", "7 | \n", "7 | \n", "12754.0 | \n", "... | \n", "-12.181288 | \n", "-11.699993 | \n", "0.992624 | \n", "0.657791 | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "1224 | \n", "1224 | \n", "1224 | \n", "
2 | \n", "1 | \n", "Plate_2 | \n", "44 | \n", "1 | \n", "A12 | \n", "7 | \n", "7 | \n", "10 | \n", "10 | \n", "23976.0 | \n", "... | \n", "-12.181288 | \n", "-11.699993 | \n", "0.992624 | \n", "0.657791 | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "1224 | \n", "1224 | \n", "1224 | \n", "
3 | \n", "1 | \n", "Plate_2 | \n", "44 | \n", "1 | \n", "A12 | \n", "8 | \n", "8 | \n", "12 | \n", "12 | \n", "19374.0 | \n", "... | \n", "-12.181288 | \n", "-11.699993 | \n", "0.992624 | \n", "0.657791 | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "1224 | \n", "1224 | \n", "1224 | \n", "
4 | \n", "1 | \n", "Plate_2 | \n", "44 | \n", "1 | \n", "A12 | \n", "9 | \n", "9 | \n", "13 | \n", "13 | \n", "27385.0 | \n", "... | \n", "-12.181288 | \n", "-11.699993 | \n", "0.992624 | \n", "0.657791 | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "1224 | \n", "1224 | \n", "1224 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1709 | \n", "128 | \n", "Plate_2 | \n", "59 | \n", "4 | \n", "H7 | \n", "10 | \n", "10 | \n", "14 | \n", "14 | \n", "24942.0 | \n", "... | \n", "-12.566582 | \n", "-11.633043 | \n", "1.624310 | \n", "0.545186 | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "1224 | \n", "1224 | \n", "1224 | \n", "
1710 | \n", "128 | \n", "Plate_2 | \n", "59 | \n", "4 | \n", "H7 | \n", "11 | \n", "11 | \n", "15 | \n", "15 | \n", "6627.0 | \n", "... | \n", "-12.566582 | \n", "-11.633043 | \n", "1.624310 | \n", "0.545186 | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "1224 | \n", "1224 | \n", "1224 | \n", "
1711 | \n", "128 | \n", "Plate_2 | \n", "59 | \n", "4 | \n", "H7 | \n", "12 | \n", "12 | \n", "16 | \n", "16 | \n", "11216.0 | \n", "... | \n", "-12.566582 | \n", "-11.633043 | \n", "1.624310 | \n", "0.545186 | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "1224 | \n", "1224 | \n", "1224 | \n", "
1712 | \n", "128 | \n", "Plate_2 | \n", "59 | \n", "4 | \n", "H7 | \n", "13 | \n", "13 | \n", "17 | \n", "17 | \n", "15279.0 | \n", "... | \n", "-12.566582 | \n", "-11.633043 | \n", "1.624310 | \n", "0.545186 | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "1224 | \n", "1224 | \n", "1224 | \n", "
1713 | \n", "128 | \n", "Plate_2 | \n", "59 | \n", "4 | \n", "H7 | \n", "14 | \n", "14 | \n", "20 | \n", "20 | \n", "7106.0 | \n", "... | \n", "-12.566582 | \n", "-11.633043 | \n", "1.624310 | \n", "0.545186 | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "file:/home/jenna/nf1_cellpainting_data/1.cellp... | \n", "1224 | \n", "1224 | \n", "1224 | \n", "
1714 rows × 2076 columns
\n", "