{ "cells": [ { "attachments": {}, "cell_type": "markdown", "id": "ce6594c1-702f-4724-b190-5370e7396d3e", "metadata": {}, "source": [ "# CytoTable from the cloud (using cloud-based data sources)\n", "\n", "\n", "\n", "__Figure 1.__ _CytoTable is capable of reading data from cloud-based locations such as AWS S3._\n", "\n", "This notebook includes a quick demonstration of CytoTable with cloud-based data sources.\n", "For a more general overview of using CytoTable and the concepts behind the work, please see: [CytoTable mise en place (general overview)](https://cytomining.github.io/CytoTable/examples/cytotable_mise_en_place_general_overview.html)" ] }, { "cell_type": "code", "execution_count": 1, "id": "c529e38f-8784-4a17-955b-06ea0b2375ce", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/buntend/Library/Caches/pypoetry/virtualenvs/cytotable-Y1C43DIB-py3.11/lib/python3.11/site-packages/google_crc32c/__config__.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. 
Refrain from using this package or pin to Setuptools<81.\n", " import pkg_resources\n" ] } ], "source": [ "import pathlib\n", "from collections import Counter\n", "\n", "import pandas as pd\n", "import pyarrow.parquet as pq\n", "from cloudpathlib import S3Client, CloudPath\n", "\n", "# IPython's Image class is aliased so that PIL's Image module (imported next)\n", "# does not silently shadow it; `Image` refers to PIL throughout the notebook\n", "from IPython.display import Image as IPythonImage\n", "from IPython.display import display\n", "from PIL import Image\n", "\n", "import cytotable\n", "\n", "# setup variables for use throughout the notebook\n", "source_path = \"s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/workspace/analysis/2020_11_04_CPJUMP1/BR00116991/analysis/BR00116991-A01-1\"\n", "dest_path = \"./cloud_example.parquet\"" ] }, { "cell_type": "code", "execution_count": 2, "id": "8036f404-5519-4d0f-aeb7-92d39e0e5d44", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "S3Path('s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/workspace/analysis/2020_11_04_CPJUMP1/BR00116991/analysis/BR00116991-A01-1')" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# setup a source cloudpath using unsigned (anonymous) requests to AWS S3\n", "# to access publicly-available data using CytoTable\n", "source_cloud_path = S3Client(no_sign_request=True).CloudPath(source_path)\n", "source_cloud_path" ] }, { "cell_type": "code", "execution_count": 3, "id": "c6e14f33-1c7a-437f-9a0f-9cda7e1620b2", "metadata": {}, "outputs": [], "source": [ "# remove the dest_path if it's present\n", "if pathlib.Path(dest_path).is_file():\n", " pathlib.Path(dest_path).unlink()" ] }, { "cell_type": "code", "execution_count": 4, "id": "d45b25f0-c115-4f95-8521-addc45bb8a90", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[S3Path('s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/workspace/analysis/2020_11_04_CPJUMP1/BR00116991/analysis/BR00116991-A01-1/outlines'),\n", " S3Path('s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/workspace/analysis/2020_11_04_CPJUMP1/BR00116991/analysis/BR00116991-A01-1/Cells.csv'),\n", " 
S3Path('s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/workspace/analysis/2020_11_04_CPJUMP1/BR00116991/analysis/BR00116991-A01-1/Cytoplasm.csv'),\n", " S3Path('s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/workspace/analysis/2020_11_04_CPJUMP1/BR00116991/analysis/BR00116991-A01-1/Experiment.csv'),\n", " S3Path('s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/workspace/analysis/2020_11_04_CPJUMP1/BR00116991/analysis/BR00116991-A01-1/Image.csv'),\n", " S3Path('s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/workspace/analysis/2020_11_04_CPJUMP1/BR00116991/analysis/BR00116991-A01-1/Nuclei.csv')]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# show the files we will use as source data with CytoTable\n", "list(source_cloud_path.glob(\"*\"))" ] }, { "cell_type": "code", "execution_count": 5, "id": "78f1f734-16f5-4957-9aeb-ad421d3d1bae", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 657 ms, sys: 474 ms, total: 1.13 s\n", "Wall time: 23.1 s\n" ] }, { "data": { "text/plain": [ "'cloud_example.parquet'" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "# run cytotable convert\n", "result = cytotable.convert(\n", " source_path=source_path,\n", " dest_path=dest_path,\n", " # specify a destination data format type\n", " dest_datatype=\"parquet\",\n", " # specify a preset which enables quick use of common input file formats\n", " preset=\"cellprofiler_csv\",\n", " # use unsigned (anonymous) requests to AWS S3\n", " no_sign_request=True,\n", ")\n", "\n", "# end the cell with the output file's name so it is shown as the cell result\n", "# (matches the recorded execute_result rather than printing to stdout)\n", "pathlib.Path(result).name" ] }, { "cell_type": "code", "execution_count": 6, "id": "1601b045-2631-46d7-a001-39ae6cfb27fb", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Metadata_ImageNumber | \n", "Metadata_Cells_Parent_Nuclei | \n", "Metadata_Cytoplasm_Parent_Cells | \n", "Metadata_Cytoplasm_Parent_Nuclei | \n", "Metadata_ObjectNumber | \n", "Image_FileName_CellOutlines | \n", "Image_FileName_IllumAGP | \n", "Image_FileName_IllumBrightfield | \n", "Image_FileName_IllumDNA | \n", "Image_FileName_IllumER | \n", "... | \n", "Nuclei_Texture_Variance_RNA_10_02_256 | \n", "Nuclei_Texture_Variance_RNA_10_03_256 | \n", "Nuclei_Texture_Variance_RNA_3_00_256 | \n", "Nuclei_Texture_Variance_RNA_3_01_256 | \n", "Nuclei_Texture_Variance_RNA_3_02_256 | \n", "Nuclei_Texture_Variance_RNA_3_03_256 | \n", "Nuclei_Texture_Variance_RNA_5_00_256 | \n", "Nuclei_Texture_Variance_RNA_5_01_256 | \n", "Nuclei_Texture_Variance_RNA_5_02_256 | \n", "Nuclei_Texture_Variance_RNA_5_03_256 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "A01_s1--cell_outlines.png | \n", "BR00116991_IllumAGP.npy | \n", "BR00116991_IllumBrightfield.npy | \n", "BR00116991_IllumDNA.npy | \n", "BR00116991_IllumER.npy | \n", "... | \n", "123.254311 | \n", "97.515432 | \n", "104.416086 | \n", "102.542736 | \n", "97.846168 | \n", "103.858206 | \n", "106.430977 | \n", "108.100381 | \n", "103.027255 | \n", "114.144057 | \n", "
1 | \n", "1 | \n", "2 | \n", "2 | \n", "2 | \n", "2 | \n", "A01_s1--cell_outlines.png | \n", "BR00116991_IllumAGP.npy | \n", "BR00116991_IllumBrightfield.npy | \n", "BR00116991_IllumDNA.npy | \n", "BR00116991_IllumER.npy | \n", "... | \n", "113.730092 | \n", "124.395062 | \n", "110.407805 | \n", "112.069085 | \n", "118.502086 | \n", "110.968525 | \n", "109.129278 | \n", "112.849919 | \n", "118.488473 | \n", "109.671296 | \n", "
2 | \n", "1 | \n", "3 | \n", "3 | \n", "3 | \n", "3 | \n", "A01_s1--cell_outlines.png | \n", "BR00116991_IllumAGP.npy | \n", "BR00116991_IllumBrightfield.npy | \n", "BR00116991_IllumDNA.npy | \n", "BR00116991_IllumER.npy | \n", "... | \n", "32.055903 | \n", "33.261607 | \n", "29.508841 | \n", "29.782456 | \n", "31.286135 | \n", "30.040329 | \n", "30.164875 | \n", "29.909890 | \n", "30.905352 | \n", "31.879207 | \n", "
3 | \n", "1 | \n", "4 | \n", "4 | \n", "4 | \n", "4 | \n", "A01_s1--cell_outlines.png | \n", "BR00116991_IllumAGP.npy | \n", "BR00116991_IllumBrightfield.npy | \n", "BR00116991_IllumDNA.npy | \n", "BR00116991_IllumER.npy | \n", "... | \n", "98.994943 | \n", "93.852921 | \n", "86.758957 | \n", "89.990907 | \n", "84.579607 | \n", "84.336410 | \n", "87.629213 | \n", "91.608659 | \n", "85.393601 | \n", "90.367573 | \n", "
4 | \n", "1 | \n", "5 | \n", "5 | \n", "5 | \n", "5 | \n", "A01_s1--cell_outlines.png | \n", "BR00116991_IllumAGP.npy | \n", "BR00116991_IllumBrightfield.npy | \n", "BR00116991_IllumDNA.npy | \n", "BR00116991_IllumER.npy | \n", "... | \n", "63.190325 | \n", "75.623413 | \n", "69.075226 | \n", "71.936792 | \n", "68.527792 | \n", "65.938826 | \n", "68.118131 | \n", "64.434855 | \n", "65.367477 | \n", "69.311713 | \n", "
5 rows × 5812 columns
\n", "