WayScience · axiomcura · Mar 8, 2026 · Mar 8, 2026 · Mar 8, 2026 · Mar 8, 2026
diff --git a/notebooks/0.download-data/1.download-data.ipynb b/notebooks/0.download-data/1.download-data.ipynb
diff --git a/notebooks/0.download-data/2.preprocessing.ipynb b/notebooks/0.download-data/2.preprocessing.ipynb
@@ -31,17 +31,12 @@
     "import sys\n",
     "import json\n",
     "import pathlib\n",
-    "from typing import Optional\n",
     "\n",
     "import polars as pl\n",
     "\n",
     "sys.path.append(\"../../\")\n",
-    "from utils.data_utils import (\n",
-    "    split_meta_and_features,\n",
-    "    add_cell_id_hash,\n",
-    "    transform_ensg_to_gene_symbol,\n",
-    ")\n",
-    "from utils.io_utils import load_profiles"
+    "from utils.data_utils import split_meta_and_features, add_cell_id_hash\n",
+    "from utils.io_utils import load_and_concat_profiles"
    ]
   },
   {
@@ -61,64 +56,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def load_and_concat_profiles(\n",
-    "    profile_dir: str | pathlib.Path,\n",
-    "    shared_features: Optional[list[str]] = None,\n",
-    "    specific_plates: Optional[list[pathlib.Path]] = None,\n",
-    ") -> pl.DataFrame:\n",
-    "    \"\"\"\n",
-    "    Load all profile files from a directory and concatenate them into a single Polars DataFrame.\n",
-    "\n",
-    "    Parameters\n",
-    "    ----------\n",
-    "    profile_dir : str or pathlib.Path\n",
-    "        Directory containing the profile files (.parquet).\n",
-    "    shared_features : Optional[list[str]], optional\n",
-    "        List of shared feature names to filter the profiles. If None, all features are loaded.\n",
-    "    specific_plates : Optional[list[pathlib.Path]], optional\n",
-    "        List of specific plate file paths to load. If None, all profiles in the directory are loaded.\n",
-    "\n",
-    "    Returns\n",
-    "    -------\n",
-    "    pl.DataFrame\n",
-    "        Concatenated Polars DataFrame containing all loaded profiles.\n",
-    "    \"\"\"\n",
-    "    # Ensure profile_dir is a pathlib.Path\n",
-    "    if isinstance(profile_dir, str):\n",
-    "        profile_dir = pathlib.Path(profile_dir)\n",
-    "    elif not isinstance(profile_dir, pathlib.Path):\n",
-    "        raise TypeError(\"profile_dir must be a string or a pathlib.Path object\")\n",
-    "\n",
-    "    # Validate specific_plates\n",
-    "    if specific_plates is not None:\n",
-    "        if not isinstance(specific_plates, list):\n",
-    "            raise TypeError(\"specific_plates must be a list of pathlib.Path objects\")\n",
-    "        if not all(isinstance(path, pathlib.Path) for path in specific_plates):\n",
-    "            raise TypeError(\n",
-    "                \"All elements in specific_plates must be pathlib.Path objects\"\n",
-    "            )\n",
-    "\n",
-    "    # Use specific_plates if provided, otherwise gather all .parquet files\n",
-    "    if specific_plates is not None:\n",
-    "        # Validate that all specific plate files exist\n",
-    "        for plate_path in specific_plates:\n",
-    "            if not plate_path.exists():\n",
-    "                raise FileNotFoundError(f\"Profile file not found: {plate_path}\")\n",
-    "        files_to_load = specific_plates\n",
-    "    else:\n",
-    "        files_to_load = list(profile_dir.glob(\"*.parquet\"))\n",
-    "        if not files_to_load:\n",
-    "            raise FileNotFoundError(f\"No profile files found in {profile_dir}\")\n",
-    "\n",
-    "    # Load and concatenate profiles\n",
-    "    loaded_profiles = [\n",
-    "        load_profiles(f, shared_features=shared_features) for f in files_to_load\n",
-    "    ]\n",
-    "\n",
-    "    # Concatenate all loaded profiles\n",
-    "    return pl.concat(loaded_profiles, rechunk=True)\n",
-    "\n",
-    "\n",
     "def split_data(\n",
     "    pycytominer_output: pl.DataFrame, dataset: str = \"CP_and_DP\"\n",
     ") -> pl.DataFrame:\n",
@@ -193,6 +130,18 @@
   {
    "cell_type": "code",
    "execution_count": 3,
+   "id": "3dfe8d86",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define the type of perturbation for the dataset\n",
+    "# options are: \"compound\" or \"crispr\"\n",
+    "pert_type = \"crispr\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
    "id": "3ea207e4",
    "metadata": {},
    "outputs": [],
@@ -203,16 +152,20 @@
     "# Setting profiles directory\n",
     "profiles_dir = (data_dir / \"sc-profiles\").resolve(strict=True)\n",
     "\n",
-    "# setting connectivity map drug repurposing config\n",
-    "drug_repurposing_config_path = (data_dir / \"repurposing_drugs_20180907.txt\").resolve(\n",
-    "    strict=True\n",
-    ")\n",
     "\n",
     "# Experimental metadata\n",
     "exp_metadata_path = (\n",
-    "    profiles_dir / \"cpjump1\" / \"cpjump1_compound_experimental-metadata.csv\"\n",
+    "    profiles_dir / \"cpjump1\" / f\"cpjump1_{pert_type}_experimental-metadata.csv\"\n",
     ").resolve(strict=True)\n",
     "\n",
+    "# CPJUMP1 compound metadata (only used for compound perturbations)\n",
+    "if pert_type == \"compound\":\n",
+    "    cmp_metadata_path = (\n",
+    "        profiles_dir / \"cpjump1\" / \"cpjump1_compound_compound-metadata.tsv\"\n",
+    "    ).resolve(strict=True)\n",
+    "else:\n",
+    "    cmp_metadata_path = None\n",
+    "\n",
     "# Setting CFReT profiles directory\n",
     "cfret_profiles_dir = (profiles_dir / \"cfret\").resolve(strict=True)\n",
     "cfret_profiles_path = (\n",
@@ -244,12 +197,12 @@
    "id": "7168a71a",
    "metadata": {},
    "source": [
-    "Create a list of paths that only points compound treated plates and load the shared features config file that can be found in this [repo](https://github.com/WayScience/JUMP-single-cell)"
+    "Create a list of paths pointing to the selected CPJUMP1 plates and load the shared features configuration file from the [JUMP-single-cell](https://github.com/WayScience/JUMP-single-cell) repository."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "id": "c7944fc2",
    "metadata": {},
    "outputs": [],
@@ -260,7 +213,7 @@
     "compound_plate_names = (\n",
     "    exp_metadata.select(\"Assay_Plate_Barcode\").unique().to_series().to_list()\n",
     ")\n",
-    "compound_plate_paths = [\n",
+    "cpjump1_plate_paths = [\n",
     "    (profiles_dir / \"cpjump1\" / f\"{plate}_feature_selected_sc_qc.parquet\").resolve(\n",
     "        strict=True\n",
     "    )\n",
@@ -278,41 +231,41 @@
    "id": "c6bfd5c7",
    "metadata": {},
    "source": [
-    "## Preprocessing CPJUMP1 Compound data\n",
+    "## Preprocessing CPJUMP1 Data\n",
     "\n",
-    "Using the filtered compound plate file paths and shared features configuration, we load all individual profile files and concatenate them into a single comprehensive DataFrame. This step combines data from multiple experimental plates while maintaining the consistent feature space defined by the shared features list.\n",
+    "Using the filtered plate file paths and shared features configuration, we load all individual profile files and concatenate them into a single comprehensive DataFrame. This step combines data from multiple experimental plates, for either compound or CRISPR perturbation types, while maintaining a consistent feature space defined by the shared features list.\n",
     "\n",
     "The concatenation process ensures:\n",
     "- All profiles use the same feature set for downstream compatibility\n",
     "- Metadata columns are preserved across all plates\n",
     "- Data integrity is maintained during the merge operation\n",
-    "- Adding a unique cell id has column `Metadata_cell_id`"
+    "- A unique cell identifier is added via the `Metadata_cell_id` column"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "9ec882fa",
    "metadata": {},
    "source": [
-    "We are loading per-plate parquet profiles for compound-treated plates, selecting the shared feature set, concatenating them into a single Polars DataFrame while preserving metadata, and adding a unique Metadata_cell_id for each cell. The resulting cpjump1_profiles table is ready for downstream analysis."
+    "We load per-plate Parquet profiles for the selected perturbation type (compound or CRISPR), apply the shared feature set, and concatenate them into a single Polars DataFrame while preserving metadata. A unique `Metadata_cell_id` is added for each cell. The resulting `cpjump1_profiles` table is ready for downstream analysis."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "id": "f6f7e08d",
    "metadata": {},
    "outputs": [],
    "source": [
     "# Loading compound profiles with shared features and concat into a single DataFrame\n",
     "concat_output_path = (\n",
-    "    cpjump1_output_dir / \"cpjump1_compound_concat_profiles.parquet\"\n",
+    "    cpjump1_output_dir / f\"cpjump1_{pert_type}_concat_profiles.parquet\"\n",
     ").resolve()\n",
     "\n",
     "# loaded and concatenated profiles\n",
     "cpjump1_profiles = load_and_concat_profiles(\n",
     "    profile_dir=profiles_dir,\n",
-    "    specific_plates=compound_plate_paths,\n",
+    "    specific_plates=cpjump1_plate_paths,\n",
     "    shared_features=shared_features,\n",
     ")\n",
     "\n",
@@ -325,50 +278,75 @@
    "id": "3df9bbf5",
    "metadata": {},
    "source": [
-    "Next we annotate the compound treatments in the CPJUMP1 dataset, we annotate each cell with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status.\n"
+    "For compound-treated plates, we annotate each profile with Mechanism of Action (MoA) and target information loaded from the CPJUMP1 compound metadata file (`cpjump1_compound_compound-metadata.tsv`). See the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP) for background on drug and tool compound annotations, including target information and clinical development status. Cell type metadata is also merged in from the experimental metadata. This step is skipped for CRISPR-treated plates."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "id": "adfb9148",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Skipping this step since the dataset is CPJUMP1 crispr and not compound\n"
+     ]
+    }
+   ],
    "source": [
     "# load drug repurposing moa file and add prefix to metadata columns\n",
-    "rep_moa_df = pl.read_csv(\n",
-    "    drug_repurposing_config_path, separator=\"\\t\", skip_rows=9, encoding=\"utf8-lossy\"\n",
-    ").rename(lambda x: f\"Metadata_{x}\" if not x.startswith(\"Metadata_\") else x)\n",
+    "if pert_type == \"compound\":\n",
+    "    rep_moa_df = pl.read_csv(\n",
+    "        cmp_metadata_path,\n",
+    "        separator=\"\\t\",\n",
+    "        columns=[\"Metadata_pert_iname\", \"Metadata_target\", \"Metadata_moa\"],\n",
+    "    ).unique(subset=[\"Metadata_pert_iname\"])\n",
+    "\n",
+    "    # merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname\n",
+    "    cpjump1_profiles = cpjump1_profiles.join(\n",
+    "        rep_moa_df, on=\"Metadata_pert_iname\", how=\"left\"\n",
+    "    )\n",
     "\n",
-    "# merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname\n",
-    "cpjump1_profiles = cpjump1_profiles.join(\n",
-    "    rep_moa_df, on=\"Metadata_pert_iname\", how=\"left\"\n",
-    ")\n",
+    "    # merge cell type metadata with cpjump1_profiles on Metadata_Plate\n",
+    "    cell_type_metadata = exp_metadata.select(\n",
+    "        [\"Assay_Plate_Barcode\", \"Cell_type\"]\n",
+    "    ).rename(\n",
+    "        {\"Assay_Plate_Barcode\": \"Metadata_Plate\", \"Cell_type\": \"Metadata_cell_type\"}\n",
+    "    )\n",
+    "    cpjump1_profiles = cpjump1_profiles.join(\n",
+    "        cell_type_metadata, on=\"Metadata_Plate\", how=\"left\"\n",
+    "    )\n",
+    "else:\n",
+    "    print(\n",
+    "        f\"Skipping this step since the dataset is CPJUMP1 {pert_type} and not compound\"\n",
+    "    )\n",
     "\n",
     "# split meta and feature\n",
     "meta_cols, features_cols = split_meta_and_features(cpjump1_profiles)\n",
     "\n",
     "# save the feature space information into a json file\n",
     "meta_features_dict = {\n",
     "    \"concat-profiles\": {\n",
+    "        \"data-type\": f\"{pert_type}_plates\",\n",
     "        \"meta-features\": meta_cols,\n",
     "        \"shared-features\": features_cols,\n",
     "    }\n",
     "}\n",
-    "with open(cpjump1_output_dir / \"concat_profiles_meta_features.json\", \"w\") as f:\n",
+    "with open(\n",
+    "    cpjump1_output_dir / f\"{pert_type}_concat_profiles_meta_features.json\", \"w\"\n",
+    ") as f:\n",
     "    json.dump(meta_features_dict, f, indent=4)\n",
     "\n",
     "# save concatenated profiles\n",
     "# Loading compound profiles with shared features and concat into a single DataFrame\n",
-    "concat_output_path = (\n",
-    "    cpjump1_output_dir / \"cpjump1_compound_concat_profiles.parquet\"\n",
-    ").resolve()\n",
     "cpjump1_profiles.select(meta_cols + features_cols).write_parquet(concat_output_path)"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "4a0ba6ad",
+   "id": "92bacbc9",
    "metadata": {},
    "source": [
     "## Preprocessing MitoCheck Dataset\n",
@@ -391,7 +369,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 8,
    "id": "c5471d3e",
    "metadata": {},
    "outputs": [],
@@ -445,7 +423,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 9,
    "id": "c57da947",
    "metadata": {},
    "outputs": [],
@@ -478,7 +456,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 10,
    "id": "1d7ced04",
    "metadata": {},
    "outputs": [],
@@ -532,7 +510,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "id": "42108980",
    "metadata": {},
    "outputs": [],
@@ -568,11 +546,6 @@
     "    .to_list()\n",
     "    if gene.startswith(\"ENSG\")\n",
     "]\n",
-    "decoded_genes = transform_ensg_to_gene_symbol(ensg_ids)\n",
-    "\n",
-    "# save the mapping of ENSG IDs to gene symbols in a json file\n",
-    "with open(mitocheck_dir / \"mitocheck_ensg_to_gene_symbol_mapping.json\", \"w\") as f:\n",
-    "    json.dump(decoded_genes, f, indent=4)\n",
     "\n",
     "# save concatenated mitocheck profiles\n",
     "concat_mitocheck_profiles.write_parquet(\n",
@@ -594,14 +567,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "id": "1763d383",
    "metadata": {},
    "outputs": [],
    "source": [
     "# load in cfret profiles and add a unique cell ID\n",
     "cfret_profiles = pl.read_parquet(cfret_profiles_path)\n",
     "\n",
+    "\n",
     "# adding a unique cell ID based on all features\n",
     "cfret_profiles = add_cell_id_hash(cfret_profiles, force=True)\n",
     "\n",
@@ -643,7 +617,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.11"
+   "version": "3.12.9"
   }
  },
  "nbformat": 4,