From 50442035ab9516f537ac6bf7cb6b7af41cf03ea8 Mon Sep 17 00:00:00 2001 From: Erik Serrano Date: Sat, 7 Mar 2026 22:38:43 -0700 Subject: [PATCH 01/16] updated docs and removed functions that were no longer used --- .pre-commit-config.yaml | 2 +- utils/data_utils.py | 494 +++++++++------------------------------- utils/io_utils.py | 112 +++++++-- utils/validator.py | 133 ----------- 4 files changed, 192 insertions(+), 549 deletions(-) delete mode 100644 utils/validator.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bfa641a..755d4c5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -35,7 +35,7 @@ repos: # Ruff for linting and formatting Python files - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.15.4 + rev: v0.15.5 hooks: - id: ruff-check args: ["--fix"] diff --git a/utils/data_utils.py b/utils/data_utils.py index 64f24a5..c8ef769 100644 --- a/utils/data_utils.py +++ b/utils/data_utils.py @@ -1,8 +1,9 @@ """ -Module: utils.py +Module: data_utils.py -A collection of common utility functions for data processing, -as well as for saving, loading, and writing files. +Utility functions for processing image-based single-cell profiles, including +feature/metadata splitting, hash-based cell ID generation, data shuffling, +consensus signature generation, and profile loading/concatenation. """ import hashlib @@ -15,139 +16,17 @@ from pycytominer.cyto_utils import infer_cp_features -def _sort_features_by_compartment_organelles( - features: list[str], - compartment_pos: int = 0, - organelle_pos: int = 3, - organelles: list[str] = ["DNA", "RNA", "ER", "Mito", "AGP"], -) -> dict: - """Sort features by compartment and organelle. - - This function takes a list of feature names and organizes them into a nested dictionary - structure where the first level is compartments and the second level is organelles. - It filters out features that do not match the specified organelle list. 
- - Parameters - ---------- - features : list[str] - list of morpholgy features - compartment_pos : int, optional - position where the compartment name resides with the feature name - , by default 0 - organelle_pos : int, optional - position where the organelle name resides within the feature name - , by default 3 - organelles : list[str], optional - List of organelles that are measured in the feature space, - by default ["DNA", "RNA", "ER", "Mito", "AGP"] - - Returns - ------- - dict - Nested dictionary: compartment -> organelle -> features - """ - - result = defaultdict(list) - for feature in features: - # Skip AreaShape features as they don't contain organelle information - if "AreaShape" in feature: - continue - - # Split feature name and validate structure - split_feature = feature.split("_") - if len(split_feature) < 4: - continue - - # Extract compartment and organelle from feature name - compartment = split_feature[compartment_pos] - organelle = split_feature[organelle_pos] - - # Only include features with valid organelles - if organelle in organelles: - result[compartment].append(feature) - - # Create nested dictionary: compartment -> organelle -> features - compartment_organelle_dict = defaultdict(dict) - for compartment, features_list in result.items(): - organelle_dict = defaultdict(list) - - # Group features by organelle within each compartment - for feature in features_list: - organelle = feature.split("_")[organelle_pos] - organelle_dict[organelle].append(feature) - - compartment_organelle_dict[compartment] = organelle_dict - - return compartment_organelle_dict - - -def _generate_organelle_counts(compartment_organelle_dict: dict) -> dict: - """Generate a count of organelles per compartment for each gene. - - This function processes a nested dictionary containing gene signatures organized - by compartment and organelle, and returns the count of features for each - organelle within each compartment for every gene. 
- - Parameters - ---------- - compartment_organelle_dict : dict - Nested dictionary structure: - gene -> signature_type -> compartment -> organelle -> list of features - Where signature_type is 'on_morph_sig' or 'off_morph_sig' - - Returns - ------- - dict - Dictionary structure: gene -> signature_type -> compartment -> organelle -> count - Where count is the number of features for each organelle in each compartment - - Raises - ------ - TypeError - If the organelle_dict for any gene is not a dictionary - """ - # Initialize a nested dictionary to hold the counts - # This will be structured as: gene -> signature_type -> compartment -> organelle -> count - feature_count_per_organelle = defaultdict(lambda: defaultdict(dict)) - - # Iterate through every gene's on and off morphology signatures that are sorted by - # compartment and organelle - for gene, signature_dict in compartment_organelle_dict.items(): - if not isinstance(signature_dict, dict): - raise TypeError( - f"Expected signature_dict to be a dict for gene {gene}, got {type(signature_dict)}" - ) - - # Process each signature type (on_morph_sig, off_morph_sig) - counted_organelle_per_signature = defaultdict(dict) - for sig_type, compartment_dict in signature_dict.items(): - # For each compartment-organelle combination, count the number of features - counted_organelle_dict = defaultdict(dict) - for compartment, organelle_dict in compartment_dict.items(): - for organelle, features in organelle_dict.items(): - counted_organelle_dict[compartment][organelle] = len(features) - counted_organelle_per_signature[sig_type] = counted_organelle_dict - - # Store the counted organelle dictionary per gene and signature type - feature_count_per_organelle[gene] = counted_organelle_per_signature - - return feature_count_per_organelle - - def split_meta_and_features( profile: pd.DataFrame | pl.DataFrame, compartments: list[str] = ["Nuclei", "Cells", "Cytoplasm"], metadata_tag: bool | None = False, ) -> tuple[list[str], list[str]]: 
- """Splits metadata and feature column names - - This function takes a DataFrame containing image-based profiles and splits - the column names into metadata and feature columns. It uses the Pycytominer's - `infer_cp_features` function to identify feature columns based on the specified compartments. - If the `metadata_tag` is set to False, it assumes that metadata columns do not have a specific tag - and identifies them by excluding feature columns. If `metadata_tag` is True, it uses - the `infer_cp_features` function with the `metadata` argument set to True. + """Split column names of an image-based profile into metadata and feature lists. + Uses pycytominer's `infer_cp_features` to identify CellProfiler feature columns + based on the specified compartments. Metadata columns are identified as all + remaining columns not in the feature set (when `metadata_tag=False`), or via + `infer_cp_features(metadata=True)` when `metadata_tag=True`. Parameters ---------- @@ -167,8 +46,8 @@ def split_meta_and_features( Notes ----- - - If a polars DataFrame is provided, it will be converted to a pandas DataFrame in order - to maintain compatibility with the `infer_cp_features` function. + - If a polars DataFrame is provided, it will be converted to a pandas DataFrame in + order to maintain compatibility with the `infer_cp_features` function. """ # type checking @@ -183,7 +62,8 @@ def split_meta_and_features( # identify features names features_cols = infer_cp_features(profile, compartments=compartments) - # iteratively search metadata features and retain order if the Metadata tag is not added + # iteratively search metadata features and retain order if the Metadata tag is not + # added if metadata_tag is False: meta_cols = [ colname @@ -196,256 +76,22 @@ def split_meta_and_features( return (meta_cols, features_cols) -def group_signature_by_compartment(signatures: dict, compartment_pos: int = 0): - """Group gene features in each signature by their compartment. 
- - This function takes a dictionary of gene signatures and groups the features - by their compartment. The compartment is determined by the position in the - feature string, which is specified by the `compartment_pos` parameter. - - Parameters - ---------- - signatures : dict - A dictionary containing gene signatures. - compartment_pos : int, optional - The position of the compartment in the feature string, by default 0 - - Returns - ------- - dict - A dictionary with genes as keys and their grouped features as values. - The structure is: gene --> signature_type -> compartment -> features - """ - # Type validation - if not isinstance(signatures, dict): - raise TypeError("signatures must be a dictionary") - if not isinstance(compartment_pos, int): - raise TypeError("compartment_pos must be an integer") - - # Initialize the result dictionary - gene_signature_grouped_by_compartment = defaultdict(lambda: defaultdict(dict)) - - # Process each gene and its signatures - for gene, signature_dict in signatures.items(): - # get features from each signature type - for sig_type, features in signature_dict.items(): - # Group features by compartment for this signature type - compartment_groups = defaultdict(list) - for feature in features: - try: - compartment = feature.split("_")[compartment_pos] - compartment_groups[compartment].append(feature) - - # Handle features that don't have enough parts when split - except IndexError: - continue - - # Store the grouped features - gene_signature_grouped_by_compartment[gene][sig_type] = dict( - compartment_groups - ) - - return gene_signature_grouped_by_compartment - - -def group_features_by_compartment_organelle( - signatures: dict, - compartments: list[str] = ["Nuclei", "Cytoplasm", "Cells"], - organelles: list[str] = ["DNA", "RNA", "ER", "Mito", "AGP"], - compartment_pos: int = 0, - organelle_pos: int = 3, -) -> dict: - """Group features by compartment and organelle from gene on- and off-morphology - signatures. 
- - This function processes on- off- signatures of each gene to organize morphological - features into nested dictionaries based on compartment and organelle groupings. - It applies validation checks and uses the helper function `_sort_compartment_organelles` - to structure the data. - - Keep note that some features are removed since this function is solely looking - for features that contain organelle information. For example, features that have AreaShape - measurements do not contain organelle information and therefore are excluded. - - Parameters - ---------- - signatures : dict - Dictionary where keys are gene names and values are dictionaries containing - 'on_morph_sig' and 'off_morph_sig' lists of morphological features - compartments : list[str], optional - List of valid compartment names, by default ["Nuclei", "Cytoplasm", "Cells"] - organelles : list[str], optional - List of valid organelle names, by default ["DNA", "RNA", "ER", "Mito", "AGP"] - compartment_pos : int, optional - Position index for compartment name in feature string, by default 0 - organelle_pos : int, optional - Position index for organelle name in feature string, by default 3 - - Returns - ------- - dict - Nested dictionary structure: - gene -> {'on_morph_sig': {compartment: {organelle: [features]}}, - 'off_morph_sig': {compartment: {organelle: [features]}}} - - Raises - ------ - TypeError - If signatures is not a dict with proper structure, or if compartments/organelles - are not lists of strings, or if position parameters are not integers - ValueError - If position parameters are negative or equal to each other - """ - - # type checking for compartments and organelles - if not isinstance(signatures, dict): - raise TypeError("Signatures must be a dictionary.") - if not isinstance(compartments, list) or not isinstance(organelles, list): - raise TypeError("Compartments and organelles must be lists.") - if not all(isinstance(compartment, str) for compartment in compartments): - raise 
TypeError("All compartments must be strings.") - if not all(isinstance(organelle, str) for organelle in organelles): - raise TypeError("All organelles must be strings.") - if not isinstance(compartment_pos, int) or not isinstance(organelle_pos, int): - raise TypeError("Compartment and organelle positions must be integers.") - if compartment_pos < 0 or organelle_pos < 0: - raise ValueError("Compartment and organelle positions must be non-negative.") - if compartment_pos == organelle_pos: - raise ValueError("Compartment and organelle positions must be different.") - - # Group features by compartment that contain organelle information - sorted_compartment_and_organelle_per_gene = defaultdict(dict) - for gene, signature_dict in signatures.items(): - # extracting features from signatures - on_sig_features = _sort_features_by_compartment_organelles( - signature_dict["on_morph_sig"] - ) - off_sig_features = _sort_features_by_compartment_organelles( - signature_dict["off_morph_sig"] - ) - - # Combine the sorted features for the gene - sorted_compartment_and_organelle_per_gene[gene] = { - "on_morph_sig": on_sig_features, - "off_morph_sig": off_sig_features, - } - - return sorted_compartment_and_organelle_per_gene - - -def organelle_count_table_per_gene( - sorted_signatures: dict, stratify_by_compartment: bool = False -) -> pd.DataFrame: - """Generate a count table of organelles per gene from morphological signatures. - - This function processes gene signatures that have been organized by compartment - and organelle to create a summary table showing the count of features for each - organelle within each gene's on- and off-morphology signatures. - - Parameters - ---------- - sorted_signatures : dict - Nested dictionary structure containing gene signatures organized by compartment - and organelle. 
Expected format: - gene -> signature_type -> compartment -> organelle -> list of features - where signature_type is 'on_morph_sig' or 'off_morph_sig' - stratify_by_compartment : bool, optional - If True, creates separate columns for each compartment-organelle combination - (e.g., "Cyto_DNA", "Nuc_RNA"). If False, sums counts across all compartments - for each organelle, by default False - - Returns - ------- - pd.DataFrame - DataFrame with organelle counts per gene and signature type. Structure depends - on stratify_by_compartment parameter: - - If True: columns are compartment_organelle combinations (e.g., "Cyto_DNA") - - If False: columns are organelle names with counts summed across compartments - Index contains gene names, with 'sig_type' column indicating 'on' or 'off' - - Notes - ----- - - Each gene will have two rows in the output: one for 'on' signatures and one for 'off' - - Compartment names are abbreviated: "Cytoplasm" -> "Cyto", "Nuclei" -> "Nuc" - - Missing organelle counts are filled with 0 - - The function uses the helper function `_generate_organelle_counts` to process - the input data structure - - - """ - # count organelles per compartment - organelle_counts = _generate_organelle_counts(sorted_signatures) - - # initialize an empty DataFrame to hold the counts - organelle_counted_per_gene = pd.DataFrame() - - # iterate through each gene and its morphological signatures - for gene, morph_signatures in organelle_counts.items(): - # iterate through each signature type (on_morph_sig, off_morph_sig) - for sig_type, compartment_organelle_counts in morph_signatures.items(): - # convert nested dict to DataFrame with compartments as index and organelles as columns - count_table = ( - pd.DataFrame.from_dict(compartment_organelle_counts, orient="index") - .fillna(0) - .astype(int) - ) - - if stratify_by_compartment: - # create compartment-organelle combinations as columns - flattened_data = [] - column_names = [] - - for compartment in count_table.index: - # 
abbreviate compartment names - compartment_abbrev = ( - "Cyto" - if compartment == "Cytoplasm" - else "Nuc" - if compartment == "Nuclei" - else compartment - ) - - # add compartment-organelle combinations - for organelle in count_table.columns: - column_names.append(f"{compartment_abbrev}_{organelle}") - flattened_data.append(count_table.loc[compartment, organelle]) - - # create DataFrame with flattened structure - gene_row = pd.DataFrame( - [flattened_data], columns=column_names, index=[gene] - ) - else: - # sum counts across all compartments for each organelle - gene_row = count_table.sum().to_frame().T - gene_row.index = [gene] - - # add signature type column - gene_row.insert(0, "sig_type", sig_type.split("_")[0]) - - # concatenate to main DataFrame - organelle_counted_per_gene = pd.concat( - [organelle_counted_per_gene, gene_row] - ).fillna(0) - - return organelle_counted_per_gene - - def generate_consensus_signatures( signatures_dict, features: list[str], min_consensus_threshold=0.5 ) -> dict: """ - Generate consensus morphological signatures from multiple comparisons. + Generate consensus on/off morphological signatures across multiple comparisons. - This function aggregates on-morphology signatures across different negative control samples - for each positive control, finding features that consistently appear across multiple comparisons. - The off-morphology signatures are then defined as the complement of on-morphology features - from the full feature set. + For each positive control, aggregates on-morphology features across all + comparisons and retains only those features that appear in at least + `min_consensus_threshold` fraction of comparisons. Off-morphology features are + defined as all features NOT in the consensus on-set. 
Parameters ---------- signatures_dict : dict Dictionary containing signature results with structure: - {comparison_id: {"controls": {"positive": gene, "negative": seed}, + {comparison_id: {"controls": {"positive": label, "negative": seed}, "signatures": {"on": [...], "off": [...]}}} features : list[str] Complete list of all available morphological features @@ -457,8 +103,9 @@ def generate_consensus_signatures( ------- dict Dictionary with structure: - {gene: {"on": [feature1, feature2, ...], "off": [feature1, feature2, ...]}} - where "off" features are the complement of "on" features from the full feature set + {label: {"on": [feature1, feature2, ...], "off": [feature1, feature2, ...]}} + where "off" features are the complement of "on" features from the full feature + set Raises ------ @@ -471,29 +118,30 @@ def generate_consensus_signatures( # Input validation if not 0.0 <= min_consensus_threshold <= 1.0: raise ValueError( - f"min_consensus_threshold must be between 0.0 and 1.0, got {min_consensus_threshold}" + "min_consensus_threshold must be between 0.0 and 1.0, " + f"got {min_consensus_threshold}" ) if not signatures_dict: return {} - # Group on-morphology signatures by positive control gene - on_signatures_by_gene = defaultdict(list) + # Group on-morphology signatures by positive control label + on_signatures_by_label = defaultdict(list) try: for _, sig_results in signatures_dict.items(): positive_control = sig_results["controls"]["positive"] on_signature_features = sig_results["signatures"]["on"] - on_signatures_by_gene[positive_control].append(on_signature_features) + on_signatures_by_label[positive_control].append(on_signature_features) except KeyError as e: raise KeyError(f"Missing required key in signatures_dict: {e}") - # Generate consensus signatures for each gene + # Generate consensus signatures for each label consensus_signatures = {} full_features_set = set(features) - for gene, feature_lists in on_signatures_by_gene.items(): + for label, 
feature_lists in on_signatures_by_label.items(): # Calculate consensus on-features if not feature_lists: consensus_on_features = [] @@ -529,7 +177,7 @@ def generate_consensus_signatures( ) # Store results - consensus_signatures[gene] = { + consensus_signatures[label] = { "on": consensus_on_features, "off": consensus_off_features, } @@ -566,10 +214,6 @@ def add_cell_id_hash( If True, overwrites existing 'Metadata_cell_id' column. If False and the column exists, returns the DataFrame unchanged with a warning message, by default False. - null_replacement : str, optional - String to represent null values in the hash (does not modify original data), - by default "NULL". - Returns ------- pl.DataFrame @@ -603,7 +247,7 @@ def add_cell_id_hash( else: profiles = profiles.drop("Metadata_cell_id") - # Create hash column using temporary null-filled versionx + # Create hash column using a null-filled version of each column (nulls → "NULL") hash_column = ( pl.concat_str( [pl.col(col).cast(pl.Utf8).fill_null("NULL") for col in profiles.columns] @@ -627,11 +271,12 @@ def shuffle_feature_profiles( seed: int = 42, ) -> pl.DataFrame: """ - Create a shuffled version of the dataset where each morphological feature - column is independently shuffled (values permuted within each column). + Return a shuffled copy of the profiles DataFrame for use as a null baseline. - This breaks the correlation structure between features while preserving - the marginal distributions, creating a null baseline for comparison. + - ``method="row"``: shuffles entire rows, preserving feature correlations within + cells. + - ``method="column"``: shuffles each feature column independently, breaking + inter-feature correlations while preserving each feature's marginal distribution. 
Parameters ---------- @@ -685,3 +330,70 @@ def shuffle_feature_profiles( return shuffled_df else: raise ValueError(f"Unknown shuffle method: {method}") + + +def split_data( + pycytominer_output: pl.DataFrame, dataset: str = "CP_and_DP" +) -> pl.DataFrame: + """ + Filter a pycytominer output DataFrame to retain only metadata and the + selected feature modality columns. + + Parameters + ---------- + pycytominer_output : pl.DataFrame + Polars DataFrame from pycytominer containing both metadata and feature columns. + dataset : str, optional + Feature modality to retain. One of: + - ``"CP"`` — CellProfiler features only (columns containing ``"CP__"``) + - ``"DP"`` — DeepProfiler features only (columns containing ``"DP__"``) + - ``"CP_and_DP"`` — both modalities (default) + + Returns + ------- + pl.DataFrame + Polars DataFrame with metadata and selected features + """ + all_cols = pycytominer_output.columns + + # Get DP, CP, or both features from all columns depending on desired dataset + if dataset == "CP": + feature_cols = [col for col in all_cols if "CP__" in col] + elif dataset == "DP": + feature_cols = [col for col in all_cols if "DP__" in col] + elif dataset == "CP_and_DP": + feature_cols = [col for col in all_cols if "P__" in col] + else: + raise ValueError( + f"Invalid dataset '{dataset}'. Choose from 'CP', 'DP', or 'CP_and_DP'." + ) + + # Metadata columns is all columns except feature columns + metadata_cols = [col for col in all_cols if "P__" not in col] + + # Select metadata and feature columns + selected_cols = metadata_cols + feature_cols + + return pycytominer_output.select(selected_cols) + + +def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFrame: + """ + Strip a feature-modality prefix from all matching column names. + + For example, ``"CP__Cells_AreaShape_Area"`` becomes ``"Cells_AreaShape_Area"`` + when ``prefix="CP__"``. 
+ + Parameters + ---------- + df : pl.DataFrame + Input DataFrame whose column names may contain the prefix. + prefix : str, default ``"CP__"`` + Prefix string to strip from matching column names. + + Returns + ------- + pl.DataFrame + DataFrame with the prefix removed from all matching column names. + """ + return df.rename(lambda x: x.replace(prefix, "") if prefix in x else x) diff --git a/utils/io_utils.py b/utils/io_utils.py index 6d9f1a9..7f36002 100644 --- a/utils/io_utils.py +++ b/utils/io_utils.py @@ -1,3 +1,11 @@ +""" +Module: io_utils.py + +Utility functions for file I/O, including loading single-cell profiles, +configuration files (YAML/JSON/pickle), downloading files from URLs, +extracting compressed archives, and concatenating profile DataFrames. +""" + import gzip import json import pathlib @@ -48,7 +56,8 @@ def load_profiles( FileNotFoundError If the file at `fpath` does not exist. ValueError - If the file format is not supported. Supported formats are: .parquet, .pq, .arrow. + If the file format is not supported. Supported formats are: .parquet, .pq, + .arrow. """ # type checking @@ -61,7 +70,8 @@ def load_profiles( # check for supported file format if fpath.suffix.lower() not in [".parquet", ".pq", ".arrow"]: raise ValueError( - f"Unsupported file format: {fpath.suffix}. Supported formats are: .parquet, .pq, .arrow" + f"Unsupported file format: {fpath.suffix}. 
Supported formats are: ", + ".parquet, .pq, .arrow", ) # load profiles @@ -86,10 +96,12 @@ def load_profiles( if verbose: print(f"Loading profiles from {fpath}...") print( - f"Loaded profiles shape: rows: {loaded_profiles.shape[0]}, columns: {loaded_profiles.shape[1]}" + f"Loaded profiles shape: rows: {loaded_profiles.shape[0]}, " + f"columns: {loaded_profiles.shape[1]}" ) print( - f"Estimated loaded dataframe size: {round(loaded_profiles.estimated_size('mb'), 2)} MB" + "Estimated loaded dataframe size:", + f"{round(loaded_profiles.estimated_size('mb'), 2)} MB", ) return loaded_profiles @@ -97,6 +109,7 @@ def load_profiles( def load_configs(fpath: str | pathlib.Path) -> dict: """Load a configuration file and return its contents as a dictionary. + Parameters ---------- fpath : str or pathlib.Path @@ -143,7 +156,8 @@ def load_configs(fpath: str | pathlib.Path) -> dict: raise ValueError(f"Error parsing pickle file {fpath}: {e}") else: raise ValueError( - f"Unsupported file format: {fpath.suffix}. Expected .yaml, .json, .pkl, or .pickle" + f"Unsupported file format: {fpath.suffix}. Expected .yaml, .json, .pkl, or " + ".pickle" ) return config @@ -153,11 +167,9 @@ def download_file( output_path: pathlib.Path | str, chunk_size: int = 8192, ) -> pathlib.Path: - """Downloads a file from a URL with progress tracking. + """Download a file from a URL and save it to disk with progress tracking. - Downloads a file from the specified URL and saves it to the given output path. - The download is performed in chunks to handle large files efficiently, and the progress is displayed using - the `tqdm` library. + Downloads in chunks for memory efficiency. Progress is displayed via `tqdm`. 
Parameters ---------- @@ -198,17 +210,12 @@ def download_file( if output_path.exists() and not output_path.is_file(): raise FileExistsError(f"Output path {output_path} exists and is not a file.") - # starting downloading process try: - # sending GET request to the source URL with requests.get(source_url, stream=True) as response: - # raise an error if the request was unsuccessful response.raise_for_status() - # get the total size of the file from the response headers total_size = int(response.headers.get("content-length", 0)) - # using tqdm to track the download progress with ( open(output_path, "wb") as file, tqdm( @@ -219,12 +226,9 @@ def download_file( unit_divisor=1024, ) as pbar, ): - # iterating over the response content in chunks for chunk in response.iter_content(chunk_size=chunk_size): if chunk: file.write(chunk) - - # this updates the progress bar pbar.update(len(chunk)) return output_path @@ -300,28 +304,88 @@ def download_compressed_file( output_path: pathlib.Path | str, chunk_size: int = 8192, extract: bool = True, -) -> None: +) -> pathlib.Path: """ - Download and optionally extract a compressed file from a URL. + Download a file from a URL and optionally extract it. Parameters ---------- source_url : str - The URL of the compressed file to download. + URL of the file to download. output_path : pathlib.Path | str - The local path where the downloaded file should be saved. + Local path where the downloaded file will be saved. + Must include the correct file extension (e.g. ``.zip``) so the archive + format can be inferred during extraction. chunk_size : int, optional - The size of chunks to download in bytes, by default 8192. + Download chunk size in bytes, by default 8192. extract : bool, optional - Whether to extract the file after downloading, by default True. + If True, extracts the archive after downloading, by default True. Returns ------- pathlib.Path - The path to the downloaded (and possibly extracted) file. + Path to the downloaded file. 
""" downloaded_path = download_file(source_url, output_path, chunk_size) if extract: extract_file(downloaded_path) return downloaded_path + + +def load_and_concat_profiles( + profile_dir: str | pathlib.Path, + shared_features: list[str] | None = None, + specific_plates: list[pathlib.Path] | None = None, +) -> pl.DataFrame: + """ + Load all profile files from a directory and concatenate them into a single Polars DataFrame. + + Parameters + ---------- + profile_dir : str or pathlib.Path + Directory containing the profile files (.parquet). + shared_features : Optional[list[str]], optional + List of shared feature names to filter the profiles. If None, all features are loaded. + specific_plates : Optional[list[pathlib.Path]], optional + List of specific plate file paths to load. If None, all profiles in the directory are loaded. + + Returns + ------- + pl.DataFrame + Concatenated Polars DataFrame containing all loaded profiles. + """ + # Ensure profile_dir is a pathlib.Path + if isinstance(profile_dir, str): + profile_dir = pathlib.Path(profile_dir) + elif not isinstance(profile_dir, pathlib.Path): + raise TypeError("profile_dir must be a string or a pathlib.Path object") + + # Validate specific_plates + if specific_plates is not None: + if not isinstance(specific_plates, list): + raise TypeError("specific_plates must be a list of pathlib.Path objects") + if not all(isinstance(path, pathlib.Path) for path in specific_plates): + raise TypeError( + "All elements in specific_plates must be pathlib.Path objects" + ) + + # Use specific_plates if provided, otherwise gather all .parquet files + if specific_plates is not None: + # Validate that all specific plate files exist + for plate_path in specific_plates: + if not plate_path.exists(): + raise FileNotFoundError(f"Profile file not found: {plate_path}") + files_to_load = specific_plates + else: + files_to_load = list(profile_dir.glob("*.parquet")) + if not files_to_load: + raise FileNotFoundError(f"No profile files found in 
{profile_dir}") + + # Load and concatenate profiles + loaded_profiles = [ + load_profiles(f, shared_features=shared_features) for f in files_to_load + ] + + # Concatenate all loaded profiles + return pl.concat(loaded_profiles, rechunk=True) diff --git a/utils/validator.py b/utils/validator.py deleted file mode 100644 index a0d7e1b..0000000 --- a/utils/validator.py +++ /dev/null @@ -1,133 +0,0 @@ -""" -Utility functions for validating parameter grids for clustering optimization. -""" - -from typing import Any - -# Global static variables for valid parameter names and types -VALID_CLUSTER_PARAMS: set[str] = { - "cluster_method", - "cluster_resolution", - "dim_reduction", - "n_neighbors", - "neighbor_distance_metric", - "pca_variance_explained", - "pca_n_components_to_capture_variance", - "pca_svd_solver", -} - -VALID_PARAM_TYPES: set[str] = {"float", "int", "categorical"} - - -def _validate_param_grid(param_grid: dict[str, Any]) -> None: - """Validate the parameter grid for optimized_clustering function. - - This function checks that the provided param_grid contains valid parameter names - and types for the cluster_profiles function. It raises a ValueError if any invalid - parameters are found. - - Parameters - ---------- - param_grid : dict[str, Any] - Dictionary defining the parameter search space. Each key should be a parameter - name from cluster_profiles, and each value should be a dictionary with 'type' - and range info. - - Raises - ------ - ValueError - If param_grid contains unsupported parameter types or invalid parameter names. - TypeError - If param_grid structure is invalid (missing required keys, wrong value types). - """ - - for param_name, param_config in param_grid.items(): - # 1. Check if parameter name is valid - if param_name not in VALID_CLUSTER_PARAMS: - raise ValueError( - f"Invalid parameter name: '{param_name}'. " - f"Valid parameters are: {sorted(VALID_CLUSTER_PARAMS)}" - ) - - # 2. 
Check if param_config is a dictionary - if not isinstance(param_config, dict): - raise TypeError( - f"Parameter config for '{param_name}' must be a dictionary, " - f"got {type(param_config).__name__}" - ) - - # 3. Check if 'type' key exists - if "type" not in param_config: - raise TypeError( - f"Parameter config for '{param_name}' must contain a 'type' key" - ) - - param_type = param_config["type"] - - # 4. Check if type is valid - if param_type not in VALID_PARAM_TYPES: - raise ValueError( - f"Invalid parameter type '{param_type}' for '{param_name}'. " - f"Valid types are: {sorted(VALID_PARAM_TYPES)}" - ) - - # 5. Validate type-specific requirements - if param_type in ["float", "int"]: - # Check for 'low' and 'high' keys - if "low" not in param_config: - raise TypeError( - f"Parameter config for '{param_name}' with type '{param_type}' " - f"must contain a 'low' key" - ) - if "high" not in param_config: - raise TypeError( - f"Parameter config for '{param_name}' with type '{param_type}' " - f"must contain a 'high' key" - ) - - # Check that low and high are numbers - if not isinstance(param_config["low"], (int, float)): - raise TypeError( - f"'low' value for '{param_name}' must be a number, " - f"got {type(param_config['low']).__name__}" - ) - if not isinstance(param_config["high"], (int, float)): - raise TypeError( - f"'high' value for '{param_name}' must be a number, " - f"got {type(param_config['high']).__name__}" - ) - - # Check that low < high - if param_config["low"] >= param_config["high"]: - raise ValueError( - f"'low' must be less than 'high' for '{param_name}'. 
" - f"Got low={param_config['low']}, high={param_config['high']}" - ) - - # If 'log' is present, check it's a boolean - if "log" in param_config and not isinstance(param_config["log"], bool): - raise TypeError( - f"'log' value for '{param_name}' must be a boolean, " - f"got {type(param_config['log']).__name__}" - ) - - elif param_type == "categorical": - # Check for 'choices' key - if "choices" not in param_config: - raise TypeError( - f"Parameter config for '{param_name}' with type 'categorical' " - f"must contain a 'choices' key" - ) - - # Check that choices is a list or tuple - if not isinstance(param_config["choices"], (list, tuple)): - raise TypeError( - f"'choices' for '{param_name}' must be a list or tuple, " - f"got {type(param_config['choices']).__name__}" - ) - - # Check that choices is not empty - if len(param_config["choices"]) == 0: - raise ValueError( - f"'choices' for '{param_name}' must contain at least one option" - ) From ec78b567c2187e01af6576dd498ee7675c212c32 Mon Sep 17 00:00:00 2001 From: Erik Serrano Date: Sat, 7 Mar 2026 22:40:29 -0700 Subject: [PATCH 02/16] download module update --- .../0.download-data/1.download-data.ipynb | 113 ++++++++++++++---- .../0.download-data/2.preprocessing.ipynb | 31 +++-- .../3.subset-jump-controls.ipynb | 16 +-- notebooks/0.download-data/dl-configs.yaml | 8 ++ .../nbconverted/1.download-data.py | 77 ++++++++++-- .../nbconverted/2.preprocessing.py | 29 +++-- .../nbconverted/3.subset-jump-controls.py | 14 +-- 7 files changed, 209 insertions(+), 79 deletions(-) create mode 100644 notebooks/0.download-data/dl-configs.yaml diff --git a/notebooks/0.download-data/1.download-data.ipynb b/notebooks/0.download-data/1.download-data.ipynb index 6a6d223..b5e7563 100644 --- a/notebooks/0.download-data/1.download-data.ipynb +++ b/notebooks/0.download-data/1.download-data.ipynb @@ -7,7 +7,7 @@ "source": [ "# Downloading Single-Cell Profiles\n", "\n", - "This notebook focuses on downloading metadata and single-cell profiles 
from three key datasets:\n", + "This notebook downloading metadata and single-cell profiles from three key datasets:\n", "\n", "1. **CPJUMP1 Pilot Dataset** ([link](https://github.com/jump-cellpainting/2024_Chandrasekaran_NatureMethods_CPJUMP1)): Metadata is downloaded and processed to identify and organize plates containing wells treated with compound perturbations for downstream analysis.\n", "2. **MitoCheck Dataset**: Normalized and feature-selected single-cell profiles are downloaded for further analysis.\n", @@ -74,7 +74,7 @@ "outputs": [], "source": [ "# setting config path\n", - "config_path = pathlib.Path(\"../nb-configs.yaml\").resolve(strict=True)\n", + "config_path = pathlib.Path(\"dl-configs.yaml\").resolve(strict=True)\n", "\n", "# setting results setting a data directory\n", "data_dir = pathlib.Path(\"./data\").resolve()\n", @@ -114,7 +114,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "5b8bfe5f", "metadata": {}, "outputs": [ @@ -125,17 +125,17 @@ "plates that will be downloaded are: shape: (12,)\n", "Series: 'Assay_Plate_Barcode' [str]\n", "[\n", - "\t\"BR00117054\"\n", - "\t\"BR00117012\"\n", - "\t\"BR00117008\"\n", - "\t\"BR00117016\"\n", + "\t\"BR00117019\"\n", "\t\"BR00117055\"\n", - "\t…\n", - "\t\"BR00117011\"\n", "\t\"BR00117013\"\n", - "\t\"BR00117010\"\n", + "\t\"BR00117054\"\n", + "\t\"BR00117011\"\n", + "\t…\n", "\t\"BR00117017\"\n", - "\t\"BR00117019\"\n", + "\t\"BR00117009\"\n", + "\t\"BR00117016\"\n", + "\t\"BR00117010\"\n", + "\t\"BR00117012\"\n", "]\n", "shape: (12, 13)\n" ] @@ -241,6 +241,79 @@ "exp_metadata" ] }, + { + "cell_type": "markdown", + "id": "a4665c17", + "metadata": {}, + "source": [ + "\n", + "In this section, we download:\n", + "\n", + "1. **Compound metadata** from the CPJUMP1 repository \n", + "2. 
**Mechanism of action (MOA) metadata** from the Broad Repurposing Hub\n", + "\n", + "We then merge both datasets into a single compound metadata table.\n", + "\n", + "If a compound has missing MOA information, the value in `Metadata_moa` is replaced with `\"unknown\"`. This indicates that no MOA annotation is currently available for that compound." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "22e417e3", + "metadata": {}, + "outputs": [], + "source": [ + "# downloading compound metadata from cpjump1 repo\n", + "CPJUMP_compound_metadata = pl.read_csv(\n", + " nb_configs[\"links\"][\"CPJUMP1-compound-metadata-source\"],\n", + " separator=\"\\t\",\n", + " has_header=True,\n", + " encoding=\"utf-8\",\n", + ")\n", + "\n", + "# downloading compound moa metadata from broad institute drug repurposing hub\n", + "broad_compound_moa_metadata = pl.read_csv(\n", + " nb_configs[\"links\"][\"Broad-compounds-moa-source\"],\n", + " separator=\"\\t\",\n", + " skip_rows=9,\n", + " encoding=\"utf8-lossy\",\n", + ")\n", + "\n", + "# for both dataframes make sure that all columns have \"Metadata_\" in the column name\n", + "CPJUMP_compound_metadata = CPJUMP_compound_metadata.rename(\n", + " {col: f\"Metadata_{col}\" for col in CPJUMP_compound_metadata.columns}\n", + ")\n", + "broad_compound_moa_metadata = broad_compound_moa_metadata.rename(\n", + " {col: f\"Metadata_{col}\" for col in broad_compound_moa_metadata.columns}\n", + ")\n", + "\n", + "# replace null values in the boroad compound moa to \"unknown\"\n", + "broad_compound_moa_metadata = broad_compound_moa_metadata.with_columns(\n", + " pl.col(\"Metadata_moa\").fill_null(\"unknown\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "01db7db8", + "metadata": {}, + "outputs": [], + "source": [ + "complete_compound_metadata = CPJUMP_compound_metadata.join(\n", + " broad_compound_moa_metadata,\n", + " left_on=\"Metadata_pert_iname\",\n", + " right_on=\"Metadata_pert_iname\",\n", + " 
how=\"left\",\n", + ")\n", + "\n", + "# save the complete compound metadata as a tsv file\n", + "complete_compound_metadata.write_csv(\n", + " cpjump1_dir / f\"cpjump1_{pert_type}_compound-metadata.tsv\", separator=\"\\t\"\n", + ")" + ] + }, { "cell_type": "markdown", "id": "7021b414", @@ -255,7 +328,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "06783224", "metadata": {}, "outputs": [ @@ -263,7 +336,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "File /home/erikserrano/Projects/buscar/notebooks/0.download-data/data/sc-profiles/mitocheck/normalized_data already exists. Skipping download.\n" + "File /home/erikserrano/Projects/buscar/notebooks/_new_update0.download-data/data/sc-profiles/mitocheck/normalized_data already exists. Skipping download.\n" ] } ], @@ -284,16 +357,16 @@ "source": [ "## Downloading CFReT Data\n", "\n", - "In this section, we download feature-selected single-cell profiles from the CFReT plate `localhost230405150001`. This plate contains three treatments: DMSO (control), drug_x, and TGFRi. The dataset consists of high-content imaging data that has already undergone feature selection, making it suitable for downstream analysis.\n", + "This section downloads and saves feature-selected single-cell profiles from the CFReT plate `localhost230405150001`.\n", "\n", - "**Key Points:**\n", - "- Only the processed single-cell profiles are downloaded [here](https://github.com/WayScience/cellpainting_predicts_cardiac_fibrosis/tree/main/3.process_cfret_features/data/single_cell_profiles)\n", - "- The CFReT dataset was used and published in [this study](https://doi.org/10.1161/CIRCULATIONAHA.124.071956)." + "- Only processed single-cell profiles are downloaded (no raw data).\n", + "- Data is saved as a Parquet file for fast access.\n", + "- Used in published cardiac fibrosis research ([study link](https://doi.org/10.1161/CIRCULATIONAHA.124.071956))." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "4d9fd47c", "metadata": {}, "outputs": [ @@ -301,7 +374,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "File /home/erikserrano/Projects/buscar/notebooks/0.download-data/data/sc-profiles/cfret/localhost230405150001_sc_feature_selected.parquet already exists. Skipping download.\n" + "File /home/erikserrano/Projects/buscar/notebooks/_new_update0.download-data/data/sc-profiles/cfret/localhost230405150001_sc_feature_selected.parquet already exists. Skipping download.\n" ] } ], @@ -344,7 +417,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.11" + "version": "3.12.9" } }, "nbformat": 4, diff --git a/notebooks/0.download-data/2.preprocessing.ipynb b/notebooks/0.download-data/2.preprocessing.ipynb index d1178ff..8943c1c 100644 --- a/notebooks/0.download-data/2.preprocessing.ipynb +++ b/notebooks/0.download-data/2.preprocessing.ipynb @@ -199,16 +199,17 @@ "# Setting profiles directory\n", "profiles_dir = (data_dir / \"sc-profiles\").resolve(strict=True)\n", "\n", - "# setting connectivity map drug repurposing config\n", - "drug_repurposing_config_path = (data_dir / \"repurposing_drugs_20180907.txt\").resolve(\n", - " strict=True\n", - ")\n", "\n", "# Experimental metadata\n", "exp_metadata_path = (\n", " profiles_dir / \"cpjump1\" / \"cpjump1_compound_experimental-metadata.csv\"\n", ").resolve(strict=True)\n", "\n", + "# cpjump1 compound metadta\n", + "cmp_metadata_path = (\n", + " profiles_dir / \"cpjump1\" / \"cpjump1_compound_compound-metadata.tsv\"\n", + ").resolve(strict=True)\n", + "\n", "# Setting CFReT profiles directory\n", "cfret_profiles_dir = (profiles_dir / \"cfret\").resolve(strict=True)\n", "cfret_profiles_path = (\n", @@ -333,14 +334,17 @@ "source": [ "# load drug repurposing moa file and add prefix to metadata columns\n", "rep_moa_df = pl.read_csv(\n", - " drug_repurposing_config_path, separator=\"\\t\", 
skip_rows=9, encoding=\"utf8-lossy\"\n", - ").rename(lambda x: f\"Metadata_{x}\" if not x.startswith(\"Metadata_\") else x)\n", + " cmp_metadata_path,\n", + " separator=\"\\t\",\n", + " columns=[\"Metadata_pert_iname\", \"Metadata_target\", \"Metadata_moa\"],\n", + ").unique(subset=[\"Metadata_pert_iname\"])\n", "\n", "# merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname\n", "cpjump1_profiles = cpjump1_profiles.join(\n", " rep_moa_df, on=\"Metadata_pert_iname\", how=\"left\"\n", ")\n", "\n", + "\n", "# split meta and feature\n", "meta_cols, features_cols = split_meta_and_features(cpjump1_profiles)\n", "\n", @@ -364,7 +368,7 @@ }, { "cell_type": "markdown", - "id": "4a0ba6ad", + "id": "92bacbc9", "metadata": {}, "source": [ "## Preprocessing MitoCheck Dataset\n", @@ -387,7 +391,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "c5471d3e", "metadata": {}, "outputs": [], @@ -441,7 +445,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "c57da947", "metadata": {}, "outputs": [], @@ -474,7 +478,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "1d7ced04", "metadata": {}, "outputs": [], @@ -527,7 +531,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "42108980", "metadata": {}, "outputs": [], @@ -574,7 +578,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 15, "id": "1763d383", "metadata": {}, "outputs": [], @@ -582,6 +586,7 @@ "# load in cfret profiles and add a unique cell ID\n", "cfret_profiles = pl.read_parquet(cfret_profiles_path)\n", "\n", + "\n", "# adding a unique cell ID based on all features\n", "cfret_profiles = add_cell_id_hash(cfret_profiles, force=True)\n", "\n", @@ -623,7 +628,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.11" + "version": "3.12.9" } }, "nbformat": 4, diff --git 
a/notebooks/0.download-data/3.subset-jump-controls.ipynb b/notebooks/0.download-data/3.subset-jump-controls.ipynb index c736f80..39e1f46 100644 --- a/notebooks/0.download-data/3.subset-jump-controls.ipynb +++ b/notebooks/0.download-data/3.subset-jump-controls.ipynb @@ -142,13 +142,8 @@ "metadata": {}, "outputs": [], "source": [ - "# setting data path\n", - "data_dir = pathlib.Path(\"../0.download-data/data\").resolve(strict=True)\n", - "download_module_results_dir = pathlib.Path(\"../0.download-data/results\").resolve(\n", - " strict=True\n", - ")\n", - "\n", "# setting directory where all the single-cell profiles are stored\n", + "data_dir = pathlib.Path.cwd() / \"data\"\n", "profiles_dir = (data_dir / \"sc-profiles\").resolve(strict=True)\n", "\n", "cpjump1_data_path = (\n", @@ -161,11 +156,6 @@ " profiles_dir / \"cpjump1\" / \"feature_selected_sc_qc_features.json\"\n", ").resolve(strict=True)\n", "\n", - "# setting cpjump1 data dir\n", - "cpjump_crispr_data_dir = (data_dir / \"sc-profiles\" / \"cpjump1-crispr-negcon\").resolve()\n", - "cpjump_crispr_data_dir.mkdir(exist_ok=True)\n", - "\n", - "\n", "# setting negative control\n", "negcon_data_dir = (profiles_dir / \"cpjump1\" / \"negcon\").resolve()\n", "negcon_data_dir.mkdir(exist_ok=True)\n", @@ -224,7 +214,7 @@ "\n", " # save the file\n", " subsampled_df.write_parquet(\n", - " negcon_data_dir / f\"cpjump1_crispr_negcon_seed{seed_val}.parquet\"\n", + " negcon_data_dir / f\"cpjump1_compound_negcon_seed{seed_val}.parquet\"\n", " )" ] }, @@ -268,7 +258,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.11" + "version": "3.12.9" } }, "nbformat": 4, diff --git a/notebooks/0.download-data/dl-configs.yaml b/notebooks/0.download-data/dl-configs.yaml new file mode 100644 index 0000000..9a698f5 --- /dev/null +++ b/notebooks/0.download-data/dl-configs.yaml @@ -0,0 +1,8 @@ +links: + MitoCheck-profiles-source: 
https://zenodo.org/records/7967386/files/3.normalize_data__normalized_data.zip?download=1 + CFReT-profiles-source: https://github.com/WayScience/cellpainting_predicts_cardiac_fibrosis/raw/refs/heads/main/3.process_cfret_features/data/single_cell_profiles/localhost230405150001_sc_feature_selected.parquet?download= + CPJUMP1-experimental-metadata-source: https://github.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/raw/refs/heads/main/benchmark/output/experiment-metadata.tsv + CPJUMP1-compound-metadata-source: https://raw.githubusercontent.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/main/metadata/external_metadata/JUMP-Target-1_compound_metadata_targets.tsv + Broad-compounds-moa-source: https://s3.amazonaws.com/data.clue.io/repurposing/downloads/repurposing_drugs_20180907.txt + CPJUMP-plate-maps-source: https://raw.githubusercontent.com/jump-cellpainting/2024_Chandrasekaran_NatureMethods_CPJUMP1/refs/heads/main/metadata/platemaps/2020_11_04_CPJUMP1/platemap/JUMP-Target-1_crispr_platemap.txt + CPJUMP1-profiles-source: https://cellpainting-gallery.s3.amazonaws.com/cpg0000-jump-pilot/source_4/workspace/profiles diff --git a/notebooks/0.download-data/nbconverted/1.download-data.py b/notebooks/0.download-data/nbconverted/1.download-data.py index 092cc3c..f50bbe7 100644 --- a/notebooks/0.download-data/nbconverted/1.download-data.py +++ b/notebooks/0.download-data/nbconverted/1.download-data.py @@ -2,7 +2,7 @@ # # Downloading Single-Cell Profiles # -# This notebook focuses on downloading metadata and single-cell profiles from three key datasets: +# This notebook downloading metadata and single-cell profiles from three key datasets: # # 1. **CPJUMP1 Pilot Dataset** ([link](https://github.com/jump-cellpainting/2024_Chandrasekaran_NatureMethods_CPJUMP1)): Metadata is downloaded and processed to identify and organize plates containing wells treated with compound perturbations for downstream analysis. # 2. 
**MitoCheck Dataset**: Normalized and feature-selected single-cell profiles are downloaded for further analysis. @@ -37,7 +37,7 @@ # setting config path -config_path = pathlib.Path("../nb-configs.yaml").resolve(strict=True) +config_path = pathlib.Path("dl-configs.yaml").resolve(strict=True) # setting results setting a data directory data_dir = pathlib.Path("./data").resolve() @@ -69,7 +69,7 @@ # # For this notebook, we focus on plates containing both U2OS and A549 parental cell lines that have been treated with compounds for 48 hours. More information about the batch and plate metadata can be found in the [CPJUMP1 documentation](https://github.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/blob/main/README.md#batch-and-plate-metadata). -# In[ ]: +# In[4]: # loading config file and setting experimental metadata URL @@ -102,13 +102,72 @@ exp_metadata +# +# In this section, we download: +# +# 1. **Compound metadata** from the CPJUMP1 repository +# 2. **Mechanism of action (MOA) metadata** from the Broad Repurposing Hub +# +# We then merge both datasets into a single compound metadata table. +# +# If a compound has missing MOA information, the value in `Metadata_moa` is replaced with `"unknown"`. This indicates that no MOA annotation is currently available for that compound. 
+ +# In[5]: + + +# downloading compound metadata from cpjump1 repo +CPJUMP_compound_metadata = pl.read_csv( + nb_configs["links"]["CPJUMP1-compound-metadata-source"], + separator="\t", + has_header=True, + encoding="utf-8", +) + +# downloading compound moa metadata from broad institute drug repurposing hub +broad_compound_moa_metadata = pl.read_csv( + nb_configs["links"]["Broad-compounds-moa-source"], + separator="\t", + skip_rows=9, + encoding="utf8-lossy", +) + +# for both dataframes make sure that all columns have "Metadata_" in the column name +CPJUMP_compound_metadata = CPJUMP_compound_metadata.rename( + {col: f"Metadata_{col}" for col in CPJUMP_compound_metadata.columns} +) +broad_compound_moa_metadata = broad_compound_moa_metadata.rename( + {col: f"Metadata_{col}" for col in broad_compound_moa_metadata.columns} +) + +# replace null values in the boroad compound moa to "unknown" +broad_compound_moa_metadata = broad_compound_moa_metadata.with_columns( + pl.col("Metadata_moa").fill_null("unknown") +) + + +# In[6]: + + +complete_compound_metadata = CPJUMP_compound_metadata.join( + broad_compound_moa_metadata, + left_on="Metadata_pert_iname", + right_on="Metadata_pert_iname", + how="left", +) + +# save the complete compound metadata as a tsv file +complete_compound_metadata.write_csv( + cpjump1_dir / f"cpjump1_{pert_type}_compound-metadata.tsv", separator="\t" +) + + # ## Downloading MitoCheck Data # # In this section, we download the MitoCheck data generated in [this study](https://pmc.ncbi.nlm.nih.gov/articles/PMC3108885/). # # Specifically, we are downloading data that has already been normalized and feature-selected. The normalization and feature selection pipeline is available [here](https://github.com/WayScience/mitocheck_data/tree/main/3.normalize_data). 
-# In[5]: +# In[7]: # url source for the MitoCheck data @@ -122,13 +181,13 @@ # ## Downloading CFReT Data # -# In this section, we download feature-selected single-cell profiles from the CFReT plate `localhost230405150001`. This plate contains three treatments: DMSO (control), drug_x, and TGFRi. The dataset consists of high-content imaging data that has already undergone feature selection, making it suitable for downstream analysis. +# This section downloads and saves feature-selected single-cell profiles from the CFReT plate `localhost230405150001`. # -# **Key Points:** -# - Only the processed single-cell profiles are downloaded [here](https://github.com/WayScience/cellpainting_predicts_cardiac_fibrosis/tree/main/3.process_cfret_features/data/single_cell_profiles) -# - The CFReT dataset was used and published in [this study](https://doi.org/10.1161/CIRCULATIONAHA.124.071956). +# - Only processed single-cell profiles are downloaded (no raw data). +# - Data is saved as a Parquet file for fast access. +# - Used in published cardiac fibrosis research ([study link](https://doi.org/10.1161/CIRCULATIONAHA.124.071956)). 
-# In[ ]: +# In[8]: # setting the source for the CFReT data diff --git a/notebooks/0.download-data/nbconverted/2.preprocessing.py b/notebooks/0.download-data/nbconverted/2.preprocessing.py index 5004e89..7e4799a 100644 --- a/notebooks/0.download-data/nbconverted/2.preprocessing.py +++ b/notebooks/0.download-data/nbconverted/2.preprocessing.py @@ -167,16 +167,17 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Setting profiles directory profiles_dir = (data_dir / "sc-profiles").resolve(strict=True) -# setting connectivity map drug repurposing config -drug_repurposing_config_path = (data_dir / "repurposing_drugs_20180907.txt").resolve( - strict=True -) # Experimental metadata exp_metadata_path = ( profiles_dir / "cpjump1" / "cpjump1_compound_experimental-metadata.csv" ).resolve(strict=True) +# cpjump1 compound metadta +cmp_metadata_path = ( + profiles_dir / "cpjump1" / "cpjump1_compound_compound-metadata.tsv" +).resolve(strict=True) + # Setting CFReT profiles directory cfret_profiles_dir = (profiles_dir / "cfret").resolve(strict=True) cfret_profiles_path = ( @@ -258,7 +259,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr cpjump1_profiles = add_cell_id_hash(cpjump1_profiles) -# Next, we annotate the compound treatments in the CPJUMP1 dataset. We annotate each cell with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status. +# Next we annotate the compound treatments in the CPJUMP1 dataset, we annotate each cell with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status. 
# # In[6]: @@ -266,14 +267,17 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # load drug repurposing moa file and add prefix to metadata columns rep_moa_df = pl.read_csv( - drug_repurposing_config_path, separator="\t", skip_rows=9, encoding="utf8-lossy" -).rename(lambda x: f"Metadata_{x}" if not x.startswith("Metadata_") else x) + cmp_metadata_path, + separator="\t", + columns=["Metadata_pert_iname", "Metadata_target", "Metadata_moa"], +).unique(subset=["Metadata_pert_iname"]) # merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname cpjump1_profiles = cpjump1_profiles.join( rep_moa_df, on="Metadata_pert_iname", how="left" ) + # split meta and feature meta_cols, features_cols = split_meta_and_features(cpjump1_profiles) @@ -312,7 +316,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # # The preprocessing ensures that all MitoCheck datasets share a common feature space and are ready for comparative analysis with CPJUMP1 profiles. -# In[7]: +# In[8]: # load in mitocheck profiles and save as parquet @@ -356,7 +360,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Filter Cell Profiler (CP) features and preprocess columns by removing the "CP__" prefix to standardize feature names for downstream analysis. -# In[8]: +# In[9]: # Split profiles to only retain cell profiler features @@ -379,7 +383,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Splitting the metadata and feature columns for each dataset to enable targeted downstream analysis and ensure consistent data structure across all profiles. 
-# In[9]: +# In[10]: # manually selecting metadata features that are present across all 3 profiles @@ -428,7 +432,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr ) -# In[10]: +# In[11]: # create concatenated mitocheck profiles @@ -466,12 +470,13 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # - **Unique cell identification**: Adding `Metadata_cell_id` column with unique hash values based on all profile features to enable precise cell tracking and deduplication # -# In[11]: +# In[15]: # load in cfret profiles and add a unique cell ID cfret_profiles = pl.read_parquet(cfret_profiles_path) + # adding a unique cell ID based on all features cfret_profiles = add_cell_id_hash(cfret_profiles, force=True) diff --git a/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py b/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py index 0be0975..31283ad 100644 --- a/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py +++ b/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py @@ -113,13 +113,8 @@ def load_group_stratified_data( # In[3]: -# setting data path -data_dir = pathlib.Path("../0.download-data/data").resolve(strict=True) -download_module_results_dir = pathlib.Path("../0.download-data/results").resolve( - strict=True -) - # setting directory where all the single-cell profiles are stored +data_dir = pathlib.Path.cwd() / "data" profiles_dir = (data_dir / "sc-profiles").resolve(strict=True) cpjump1_data_path = ( @@ -132,11 +127,6 @@ def load_group_stratified_data( profiles_dir / "cpjump1" / "feature_selected_sc_qc_features.json" ).resolve(strict=True) -# setting cpjump1 data dir -cpjump_crispr_data_dir = (data_dir / "sc-profiles" / "cpjump1-crispr-negcon").resolve() -cpjump_crispr_data_dir.mkdir(exist_ok=True) - - # setting negative control negcon_data_dir = (profiles_dir / "cpjump1" / "negcon").resolve() negcon_data_dir.mkdir(exist_ok=True) @@ -175,7 +165,7 
@@ def load_group_stratified_data( # save the file subsampled_df.write_parquet( - negcon_data_dir / f"cpjump1_crispr_negcon_seed{seed_val}.parquet" + negcon_data_dir / f"cpjump1_compound_negcon_seed{seed_val}.parquet" ) From 5940d0e8ae13c2414d9bd98cc7d94b3158db6156 Mon Sep 17 00:00:00 2001 From: Erik Serrano Date: Sat, 7 Mar 2026 22:41:37 -0700 Subject: [PATCH 03/16] doc updates --- utils/io_utils.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/utils/io_utils.py b/utils/io_utils.py index 7f36002..3b2af25 100644 --- a/utils/io_utils.py +++ b/utils/io_utils.py @@ -37,7 +37,8 @@ def load_profiles( fpath : str | pathlib.Path Path to the file containing single-cell profiles. convert_to_f32 : bool, optional - If True, converts all Float64 columns to Float32 to save memory. Default is False + If True, converts all Float64 columns to Float32 to save memory. Default is + False verbose : bool, optional If True, prints information about the loaded profiles. Default is False. shared_features : list[str] | None, optional @@ -251,7 +252,8 @@ def extract_file( file_path : pathlib.Path | str Path to the compressed file. extract_dir : pathlib.Path | str, optional - Directory where the file should be extracted. If None, extracts to the same directory as the file. + Directory where the file should be extracted. If None, extracts to the same + directory as the file. Returns ------- @@ -339,16 +341,19 @@ def load_and_concat_profiles( specific_plates: list[pathlib.Path] | None = None, ) -> pl.DataFrame: """ - Load all profile files from a directory and concatenate them into a single Polars DataFrame. + Load all profile files from a directory and concatenate them into a single Polars + DataFrame. Parameters ---------- profile_dir : str or pathlib.Path Directory containing the profile files (.parquet). shared_features : Optional[list[str]], optional - List of shared feature names to filter the profiles. If None, all features are loaded. 
+ List of shared feature names to filter the profiles. If None, all features are + loaded. specific_plates : Optional[list[pathlib.Path]], optional - List of specific plate file paths to load. If None, all profiles in the directory are loaded. + List of specific plate file paths to load. If None, all profiles in the + directory are loaded. Returns ------- From 9f2f9f2795d0b31aea54d24fc25bb19a70b6c38b Mon Sep 17 00:00:00 2001 From: Erik Serrano Date: Sat, 7 Mar 2026 22:53:52 -0700 Subject: [PATCH 04/16] fixed execution call --- notebooks/0.download-data/2.preprocessing.ipynb | 10 +++++----- .../0.download-data/nbconverted/2.preprocessing.py | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/notebooks/0.download-data/2.preprocessing.ipynb b/notebooks/0.download-data/2.preprocessing.ipynb index 8943c1c..85af33f 100644 --- a/notebooks/0.download-data/2.preprocessing.ipynb +++ b/notebooks/0.download-data/2.preprocessing.ipynb @@ -391,7 +391,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "c5471d3e", "metadata": {}, "outputs": [], @@ -445,7 +445,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "c57da947", "metadata": {}, "outputs": [], @@ -478,7 +478,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "1d7ced04", "metadata": {}, "outputs": [], @@ -531,7 +531,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "42108980", "metadata": {}, "outputs": [], @@ -578,7 +578,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 11, "id": "1763d383", "metadata": {}, "outputs": [], diff --git a/notebooks/0.download-data/nbconverted/2.preprocessing.py b/notebooks/0.download-data/nbconverted/2.preprocessing.py index 7e4799a..26a1cfc 100644 --- a/notebooks/0.download-data/nbconverted/2.preprocessing.py +++ b/notebooks/0.download-data/nbconverted/2.preprocessing.py @@ -316,7 +316,7 @@ def 
remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # # The preprocessing ensures that all MitoCheck datasets share a common feature space and are ready for comparative analysis with CPJUMP1 profiles. -# In[8]: +# In[7]: # load in mitocheck profiles and save as parquet @@ -360,7 +360,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Filter Cell Profiler (CP) features and preprocess columns by removing the "CP__" prefix to standardize feature names for downstream analysis. -# In[9]: +# In[8]: # Split profiles to only retain cell profiler features @@ -383,7 +383,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Splitting the metadata and feature columns for each dataset to enable targeted downstream analysis and ensure consistent data structure across all profiles. -# In[10]: +# In[9]: # manually selecting metadata features that are present across all 3 profiles @@ -432,7 +432,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr ) -# In[11]: +# In[10]: # create concatenated mitocheck profiles @@ -470,7 +470,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # - **Unique cell identification**: Adding `Metadata_cell_id` column with unique hash values based on all profile features to enable precise cell tracking and deduplication # -# In[15]: +# In[11]: # load in cfret profiles and add a unique cell ID From cb44173fb3f10435192fd103c5e5b68aff9733ed Mon Sep 17 00:00:00 2001 From: Erik Serrano Date: Sun, 8 Mar 2026 13:45:46 -0600 Subject: [PATCH 05/16] updated module by adding cell type --- .../0.download-data/1.download-data.ipynb | 19 ++--- .../0.download-data/2.preprocessing.ipynb | 70 +++---------------- .../nbconverted/1.download-data.py | 1 + .../nbconverted/2.preprocessing.py | 69 +++--------------- utils/io_utils.py | 3 +- 5 files changed, 30 insertions(+), 132 deletions(-) diff --git 
a/notebooks/0.download-data/1.download-data.ipynb b/notebooks/0.download-data/1.download-data.ipynb index b5e7563..e7ea279 100644 --- a/notebooks/0.download-data/1.download-data.ipynb +++ b/notebooks/0.download-data/1.download-data.ipynb @@ -125,17 +125,17 @@ "plates that will be downloaded are: shape: (12,)\n", "Series: 'Assay_Plate_Barcode' [str]\n", "[\n", - "\t\"BR00117019\"\n", - "\t\"BR00117055\"\n", - "\t\"BR00117013\"\n", "\t\"BR00117054\"\n", + "\t\"BR00117055\"\n", + "\t\"BR00117010\"\n", + "\t\"BR00117009\"\n", "\t\"BR00117011\"\n", "\t…\n", - "\t\"BR00117017\"\n", - "\t\"BR00117009\"\n", - "\t\"BR00117016\"\n", - "\t\"BR00117010\"\n", + "\t\"BR00117013\"\n", + "\t\"BR00117008\"\n", "\t\"BR00117012\"\n", + "\t\"BR00117015\"\n", + "\t\"BR00117019\"\n", "]\n", "shape: (12, 13)\n" ] @@ -308,6 +308,7 @@ " how=\"left\",\n", ")\n", "\n", + "\n", "# save the complete compound metadata as a tsv file\n", "complete_compound_metadata.write_csv(\n", " cpjump1_dir / f\"cpjump1_{pert_type}_compound-metadata.tsv\", separator=\"\\t\"\n", @@ -336,7 +337,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "File /home/erikserrano/Projects/buscar/notebooks/_new_update0.download-data/data/sc-profiles/mitocheck/normalized_data already exists. Skipping download.\n" + "File /home/erikserrano/Projects/buscar/notebooks/0.download-data/data/sc-profiles/mitocheck/normalized_data already exists. Skipping download.\n" ] } ], @@ -374,7 +375,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "File /home/erikserrano/Projects/buscar/notebooks/_new_update0.download-data/data/sc-profiles/cfret/localhost230405150001_sc_feature_selected.parquet already exists. Skipping download.\n" + "File /home/erikserrano/Projects/buscar/notebooks/0.download-data/data/sc-profiles/cfret/localhost230405150001_sc_feature_selected.parquet already exists. 
Skipping download.\n" ] } ], diff --git a/notebooks/0.download-data/2.preprocessing.ipynb b/notebooks/0.download-data/2.preprocessing.ipynb index 85af33f..62405bc 100644 --- a/notebooks/0.download-data/2.preprocessing.ipynb +++ b/notebooks/0.download-data/2.preprocessing.ipynb @@ -31,13 +31,12 @@ "import sys\n", "import json\n", "import pathlib\n", - "from typing import Optional\n", "\n", "import polars as pl\n", "\n", "sys.path.append(\"../../\")\n", "from utils.data_utils import split_meta_and_features, add_cell_id_hash\n", - "from utils.io_utils import load_profiles" + "from utils.io_utils import load_and_concat_profiles" ] }, { @@ -57,64 +56,6 @@ "metadata": {}, "outputs": [], "source": [ - "def load_and_concat_profiles(\n", - " profile_dir: str | pathlib.Path,\n", - " shared_features: Optional[list[str]] = None,\n", - " specific_plates: Optional[list[pathlib.Path]] = None,\n", - ") -> pl.DataFrame:\n", - " \"\"\"\n", - " Load all profile files from a directory and concatenate them into a single Polars DataFrame.\n", - "\n", - " Parameters\n", - " ----------\n", - " profile_dir : str or pathlib.Path\n", - " Directory containing the profile files (.parquet).\n", - " shared_features : Optional[list[str]], optional\n", - " List of shared feature names to filter the profiles. If None, all features are loaded.\n", - " specific_plates : Optional[list[pathlib.Path]], optional\n", - " List of specific plate file paths to load. 
If None, all profiles in the directory are loaded.\n", - "\n", - " Returns\n", - " -------\n", - " pl.DataFrame\n", - " Concatenated Polars DataFrame containing all loaded profiles.\n", - " \"\"\"\n", - " # Ensure profile_dir is a pathlib.Path\n", - " if isinstance(profile_dir, str):\n", - " profile_dir = pathlib.Path(profile_dir)\n", - " elif not isinstance(profile_dir, pathlib.Path):\n", - " raise TypeError(\"profile_dir must be a string or a pathlib.Path object\")\n", - "\n", - " # Validate specific_plates\n", - " if specific_plates is not None:\n", - " if not isinstance(specific_plates, list):\n", - " raise TypeError(\"specific_plates must be a list of pathlib.Path objects\")\n", - " if not all(isinstance(path, pathlib.Path) for path in specific_plates):\n", - " raise TypeError(\n", - " \"All elements in specific_plates must be pathlib.Path objects\"\n", - " )\n", - "\n", - " # Use specific_plates if provided, otherwise gather all .parquet files\n", - " if specific_plates is not None:\n", - " # Validate that all specific plate files exist\n", - " for plate_path in specific_plates:\n", - " if not plate_path.exists():\n", - " raise FileNotFoundError(f\"Profile file not found: {plate_path}\")\n", - " files_to_load = specific_plates\n", - " else:\n", - " files_to_load = list(profile_dir.glob(\"*.parquet\"))\n", - " if not files_to_load:\n", - " raise FileNotFoundError(f\"No profile files found in {profile_dir}\")\n", - "\n", - " # Load and concatenate profiles\n", - " loaded_profiles = [\n", - " load_profiles(f, shared_features=shared_features) for f in files_to_load\n", - " ]\n", - "\n", - " # Concatenate all loaded profiles\n", - " return pl.concat(loaded_profiles, rechunk=True)\n", - "\n", - "\n", "def split_data(\n", " pycytominer_output: pl.DataFrame, dataset: str = \"CP_and_DP\"\n", ") -> pl.DataFrame:\n", @@ -322,7 +263,7 @@ "id": "3df9bbf5", "metadata": {}, "source": [ - "Next we annotate the compound treatments in the CPJUMP1 dataset, we annotate each cell 
with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status.\n" + "Next we annotate the compound treatments in the CPJUMP1 dataset, we annotate each row with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP) and cell type. This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status.\n" ] }, { @@ -344,6 +285,13 @@ " rep_moa_df, on=\"Metadata_pert_iname\", how=\"left\"\n", ")\n", "\n", + "# merge cell type metadata with cpjump1_profiles on Metadata_Plate\n", + "cell_type_metadata = exp_metadata.select([\"Assay_Plate_Barcode\", \"Cell_type\"]).rename(\n", + " {\"Assay_Plate_Barcode\": \"Metadata_Plate\", \"Cell_type\": \"Metadata_cell_type\"}\n", + ")\n", + "cpjump1_profiles = cpjump1_profiles.join(\n", + " cell_type_metadata, on=\"Metadata_Plate\", how=\"left\"\n", + ")\n", "\n", "# split meta and feature\n", "meta_cols, features_cols = split_meta_and_features(cpjump1_profiles)\n", diff --git a/notebooks/0.download-data/nbconverted/1.download-data.py b/notebooks/0.download-data/nbconverted/1.download-data.py index f50bbe7..a49bb30 100644 --- a/notebooks/0.download-data/nbconverted/1.download-data.py +++ b/notebooks/0.download-data/nbconverted/1.download-data.py @@ -155,6 +155,7 @@ how="left", ) + # save the complete compound metadata as a tsv file complete_compound_metadata.write_csv( cpjump1_dir / f"cpjump1_{pert_type}_compound-metadata.tsv", separator="\t" diff --git a/notebooks/0.download-data/nbconverted/2.preprocessing.py b/notebooks/0.download-data/nbconverted/2.preprocessing.py index 26a1cfc..5c20011 100644 --- a/notebooks/0.download-data/nbconverted/2.preprocessing.py +++ b/notebooks/0.download-data/nbconverted/2.preprocessing.py @@ -26,7 +26,7 @@ 
sys.path.append("../../") from utils.data_utils import add_cell_id_hash, split_meta_and_features -from utils.io_utils import load_profiles +from utils.io_utils import load_and_concat_profiles # ## Helper functions # @@ -35,64 +35,6 @@ # In[2]: -def load_and_concat_profiles( - profile_dir: str | pathlib.Path, - shared_features: list[str] | None = None, - specific_plates: list[pathlib.Path] | None = None, -) -> pl.DataFrame: - """ - Load all profile files from a directory and concatenate them into a single Polars DataFrame. - - Parameters - ---------- - profile_dir : str or pathlib.Path - Directory containing the profile files (.parquet). - shared_features : Optional[list[str]], optional - List of shared feature names to filter the profiles. If None, all features are loaded. - specific_plates : Optional[list[pathlib.Path]], optional - List of specific plate file paths to load. If None, all profiles in the directory are loaded. - - Returns - ------- - pl.DataFrame - Concatenated Polars DataFrame containing all loaded profiles. 
- """ - # Ensure profile_dir is a pathlib.Path - if isinstance(profile_dir, str): - profile_dir = pathlib.Path(profile_dir) - elif not isinstance(profile_dir, pathlib.Path): - raise TypeError("profile_dir must be a string or a pathlib.Path object") - - # Validate specific_plates - if specific_plates is not None: - if not isinstance(specific_plates, list): - raise TypeError("specific_plates must be a list of pathlib.Path objects") - if not all(isinstance(path, pathlib.Path) for path in specific_plates): - raise TypeError( - "All elements in specific_plates must be pathlib.Path objects" - ) - - # Use specific_plates if provided, otherwise gather all .parquet files - if specific_plates is not None: - # Validate that all specific plate files exist - for plate_path in specific_plates: - if not plate_path.exists(): - raise FileNotFoundError(f"Profile file not found: {plate_path}") - files_to_load = specific_plates - else: - files_to_load = list(profile_dir.glob("*.parquet")) - if not files_to_load: - raise FileNotFoundError(f"No profile files found in {profile_dir}") - - # Load and concatenate profiles - loaded_profiles = [ - load_profiles(f, shared_features=shared_features) for f in files_to_load - ] - - # Concatenate all loaded profiles - return pl.concat(loaded_profiles, rechunk=True) - - def split_data( pycytominer_output: pl.DataFrame, dataset: str = "CP_and_DP" ) -> pl.DataFrame: @@ -259,7 +201,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr cpjump1_profiles = add_cell_id_hash(cpjump1_profiles) -# Next we annotate the compound treatments in the CPJUMP1 dataset, we annotate each cell with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status. 
+# Next we annotate the compound treatments in the CPJUMP1 dataset, we annotate each row with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP) and cell type. This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status. # # In[6]: @@ -277,6 +219,13 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr rep_moa_df, on="Metadata_pert_iname", how="left" ) +# merge cell type metadata with cpjump1_profiles on Metadata_Plate +cell_type_metadata = exp_metadata.select(["Assay_Plate_Barcode", "Cell_type"]).rename( + {"Assay_Plate_Barcode": "Metadata_Plate", "Cell_type": "Metadata_cell_type"} +) +cpjump1_profiles = cpjump1_profiles.join( + cell_type_metadata, on="Metadata_Plate", how="left" +) # split meta and feature meta_cols, features_cols = split_meta_and_features(cpjump1_profiles) diff --git a/utils/io_utils.py b/utils/io_utils.py index 3b2af25..127a8e7 100644 --- a/utils/io_utils.py +++ b/utils/io_utils.py @@ -71,8 +71,7 @@ def load_profiles( # check for supported file format if fpath.suffix.lower() not in [".parquet", ".pq", ".arrow"]: raise ValueError( - f"Unsupported file format: {fpath.suffix}. Supported formats are: ", - ".parquet, .pq, .arrow", + f"Unsupported file format: {fpath.suffix}. 
Supported formats are: .parquet, .pq, .arrow"
         )
 
     # load profiles

From c086123dfb89b4f9fe94448c74af83512044b08d Mon Sep 17 00:00:00 2001
From: Erik Serrano
Date: Mon, 9 Mar 2026 15:13:42 -0600
Subject: [PATCH 06/16] updates

---
 buscar/metrics.py   | 206 ++++++++++++++++++++++++++------------------
 utils/data_utils.py |  70 +++++++++++----
 2 files changed, 173 insertions(+), 103 deletions(-)

diff --git a/buscar/metrics.py b/buscar/metrics.py
index e48d938..8e36b1c 100644
--- a/buscar/metrics.py
+++ b/buscar/metrics.py
@@ -16,7 +16,7 @@
 
 
 @beartype
-def _normalize_scores_if_emd(
+def _normalize_scores(
     scores_df: pl.DataFrame,
     target_state: str,
     on_method: bool = False,
@@ -98,6 +98,7 @@ def compute_earth_movers_distance(
     profile2: pl.DataFrame,
     subsample_size: int | None = None,
     seed: int | None = 0,
+    n_threads: int = 1,
 ) -> float:
     """Computing the earth mover's distance between two profiles
@@ -116,10 +117,23 @@
     float
         Earth Mover's Distance (Wasserstein distance) between the two profiles
     """
+
+    # if n_threads is -1, change variable to "max" (use all available threads)
+    # docs: https://pythonot.github.io/all.html#ot.emd2
+    if n_threads == -1:
+        n_threads = "max"
+    elif n_threads < 1:
+        raise ValueError("n_threads must be a positive integer or -1 for max threads.")
+
     # Convert the profiles to numpy arrays
     p1 = profile1.to_numpy()
     p2 = profile2.to_numpy()
 
+    # check if either profile is empty and raise an error if so
+    # this avoids division by zero errors when computing the EMD
+    if profile1.is_empty() or profile2.is_empty():
+        raise ValueError("Both profiles must contain at least one row.")
+
     # Subsample if requested
     if subsample_size is not None:
         rng = np.random.default_rng(seed)  # set random seed for reproducibility
@@ -137,7 +151,7 @@
         target_weights = np.ones(p2.shape[0]) / p2.shape[0]
 
     # Compute the Earth Mover's Distance (EMD)
-    emd_value = ot.emd2(ref_weights, target_weights, M)
+    emd_value = 
ot.emd2(ref_weights, target_weights, M, numThreads=n_threads) return emd_value @@ -175,28 +189,32 @@ def affected_off_features_ratio( # generate signatures for the off features and count how many are affected affected_off_sig, _, _ = get_signatures( - ref_profiles, target_profiles, morph_feats=off_signature, test_method=method + ref_profiles, + target_profiles, + morph_feats=off_signature, + test_method=method, ) return len(affected_off_sig) / len(off_signature) -def calculate_off_score( +@beartype +def calculate_score( ref_profile: pl.DataFrame, target_profile: pl.DataFrame, - off_signature: list[str], - method: Literal["affected_ratio", "emd"] = "affected_ratio", + signature: list[str], + signature_type: Literal["on", "off"], + on_calculation: Literal["emd"] = "emd", + off_calculation: Literal["ratio_affected", "emd"] = "ratio_affected", ratio_stats_method: str = "ks_test", + n_threads: int = 1, seed: int = 0, ) -> float: - """Calculating off scores - - To calculate the off scores, we search for features within the - off-morphological signatures that have become significant. If so, this indicates that - the treatment has affected some morphological features that were not affected prior. + """Calculate on or off score for a given morphological signature. - The equation is (true off-morphological signatures / total off-morphological signatures). - This ratio tracks whether the treatment induces changes in off-morphological features. + Depending on ``signature_type``, this function measures either the magnitude of + change in expected features ("on") or the unintended effects on features that should + remain unchanged ("off"). Parameters ---------- @@ -204,68 +222,56 @@ def calculate_off_score( DataFrame containing the reference morphological profile. target_profile : pl.DataFrame DataFrame containing the target morphological profile. - off_signature : list[str] - List of feature names that constitute the off-morphological signature. 
- method : str, optional - Statistical test method to use for determining significance, by default "ks_test" + signature : list[str] + List of feature names that constitute the morphological signature. + signature_type : Literal["on", "off"] + Whether to compute an on-score ("on") or off-score ("off"). + on_calculation : Literal["emd"], optional + Method used to compute the on-score. Only Earth Mover's Distance ("emd") is + currently supported, by default "emd". + off_calculation : Literal["ratio_affected", "emd"], optional + Method used to compute the off-score: + - "ratio_affected": proportion of off features that became significant. + - "emd": Earth Mover's Distance in off-feature space. + By default "ratio_affected". ratio_stats_method : str, optional - Statistical test used when ``method`` is set to ``"affected_ratio"`` to assess - significance of changes in off-signature features. + Statistical test used when ``off_calculation`` is ``"ratio_affected"`` to assess + significance of changes in off-signature features, by default "ks_test". + seed : int, optional + Random seed for reproducibility in stochastic methods, by default 0. Returns ------- float - Off score indicating the proportion of off features that have become significant. + Computed score for the given signature type and calculation method. 
""" - # apply earth movers distance - if method == "emd": - return compute_earth_movers_distance( - ref_profile.select(pl.col(off_signature)), - target_profile.select(pl.col(off_signature)), - seed=seed, - ) - - if method == "affected_ratio": - return affected_off_features_ratio( - ref_profile, target_profile, off_signature, method=ratio_stats_method - ) - - -@beartype -def calculate_on_score( - ref_profile: pl.DataFrame, - target_profile: pl.DataFrame, - on_signature: list[str], - method: Literal["emd"] = "emd", -) -> float: - """Calculate on score - - To calculate the on score, we measure the distance between the reference and target - profiles in the on-morphological signature space. A lower on score indicates that the - target profile is more similar to the reference profile in terms of the features that - are expected to change. + if signature_type == "on": + if on_calculation == "emd": + return compute_earth_movers_distance( + ref_profile.select(pl.col(signature)), + target_profile.select(pl.col(signature)), + n_threads=n_threads, + ) + else: + raise ValueError( + f"Invalid on_calculation '{on_calculation}'. Must be 'emd'." + ) - Parameters - ---------- - ref_profile : pl.DataFrame - Reference morphological profile. - target_profile : pl.DataFrame - Target morphological profile. - on_signature : list[str] - List of features that constitute the on-morphological signature. - method : Literal["emd"], optional - Method for calculating on scores, by default "emd" - Returns - ------- - float - On score indicating the magnitude of change in on features. - """ + elif signature_type == "off": + if off_calculation == "ratio_affected": + return affected_off_features_ratio( + ref_profile, target_profile, signature, method=ratio_stats_method + ) + else: + raise ValueError( + f"Invalid off_calculation '{off_calculation}'. Must be 'ratio_affected'" + " or 'emd'." 
+            )
 
-    if method == "emd":
-        return compute_earth_movers_distance(
-            ref_profile.select(pl.col(on_signature)),
-            target_profile.select(pl.col(on_signature)),
+    else:
+        raise ValueError(
+            f"Invalid signature_type '{signature_type}'. Must be 'on' or 'off'."
         )
 
 
@@ -278,10 +284,13 @@ def measure_phenotypic_activity(
     ref_state: str,
     target_state: str,
     treatment_col: str,
+    state_col: str | None = None,
     on_method: Literal["emd"] = "emd",
-    off_method: Literal["affected_ratio", "emd"] = "affected_ratio",
+    off_method: Literal["ratio_affected", "emd"] = "ratio_affected",
+    raw_emd_scores: bool = False,
     ratio_stats_method: str = "ks_test",
     seed: int = 0,
+    n_threads: int = 1,
 ) -> pl.DataFrame:
     """Measure phenotypic activity by comparing morphological profiles across conditions.
@@ -315,16 +324,19 @@
         Value in treatment_col representing the desired phenotypic state.
     treatment_col : str, optional
         Column name containing treatment identifiers, by default "Metadata_treatment"
+    state_col : str, optional
+        Column containing cell state or treatment identifier. If None, defaults to
+        treatment_col indicating the state of interest is within the treatment_col.
     on_method : Literal["emd"], optional
         Method for computing on-scores. Currently only Earth Mover's Distance (EMD)
         is supported, by default "emd"
-    off_method : Literal["affected_ratio", "emd"], optional
+    off_method : Literal["ratio_affected", "emd"], optional
         Method for computing off-scores:
-        - "affected_ratio": proportion of off features that became significant
+        - "ratio_affected": proportion of off features that became significant
         - "emd": Earth Mover's Distance in off-feature space
-        by default "affected_ratio"
+        by default "ratio_affected"
     ratio_stats_method : str, optional
-        Statistical test used when ``off_method`` is set to ``"affected_ratio"`` to
+        Statistical test used when ``off_method`` is set to ``"ratio_affected"`` to
         assess significance of changes in off-signature features. 
seed : int, optional Random seed for reproducibility in stochastic methods, by default 0 @@ -394,9 +406,7 @@ def measure_phenotypic_activity( continue # extract morphological features for reference condition (excluding metadata) - ref_profile = profiles.filter(pl.col(treatment_col) == ref_state).drop( - meta_cols - ) + ref_profile = profiles.filter(pl.col(state_col) == ref_state).drop(meta_cols) # extract morphological features for current treatment condition target_profile = profiles.filter(pl.col(treatment_col) == treatment).drop( @@ -411,19 +421,44 @@ def measure_phenotypic_activity( ) # compute distance in on-feature space (expected changes) - on_score = calculate_on_score(ref_profile, target_profile, on_signature) + on_score = calculate_score( + ref_profile, + target_profile, + on_signature, + signature_type="on", + on_calculation=on_method, + n_threads=n_threads, + seed=seed, + ) # compute distance in off-feature space (unintended changes) - off_score = calculate_off_score( - ref_profile, target_profile, off_signature, method=off_method, seed=seed + off_score = calculate_score( + ref_profile, + target_profile, + off_signature, + signature_type="off", + ratio_stats_method=ratio_stats_method, + off_calculation=off_method, + seed=seed, ) # store computed scores for this treatment - scores.append((ref_state, treatment, on_score, off_score)) + # print type of all outputs ( + print( + f"ref_state: {type(ref_state)}, treatment: {type(treatment)}, " + f"on_score: {type(on_score)}, off_score: {type(off_score)}" + ) + print( + f"ref_state: {ref_state}, treatment: {treatment}, " + f"on_score: {on_score}, off_score: {off_score}" + ) + scores.append([ref_state, treatment, on_score, off_score]) # construct dataframe from collected scores scores_df = pl.DataFrame( - scores, schema=["ref_profile", "treatment", "on_score", "off_score"] + scores, + schema=["ref_profile", "treatment", "on_score", "off_score"], + orient="row", ) # rank treatments: prioritize low on-scores, then 
low off-scores @@ -433,11 +468,12 @@ def measure_phenotypic_activity( # normalize scores if EMD method was used to enable comparison across different # feature sets - scores_df = _normalize_scores_if_emd( - scores_df, - target_state, - on_method=(on_method == "emd"), - off_method=(off_method == "emd"), - ) + if not raw_emd_scores: + return _normalize_scores( + scores_df, + target_state, + on_method=(on_method == "emd"), + off_method=(off_method == "emd"), + ) return scores_df diff --git a/utils/data_utils.py b/utils/data_utils.py index c8ef769..6eb8252 100644 --- a/utils/data_utils.py +++ b/utils/data_utils.py @@ -267,16 +267,16 @@ def add_cell_id_hash( def shuffle_feature_profiles( profiles: pl.DataFrame, feature_cols: list[str], - method: Literal["row", "column"] = "row", + method: Literal["row", "column", "label"] = "row", + label_col: str | None = None, seed: int = 42, ) -> pl.DataFrame: """ - Return a shuffled copy of the profiles DataFrame for use as a null baseline. + Create a shuffled version of the dataset where each morphological feature + column is independently shuffled (values permuted within each column). - - ``method="row"``: shuffles entire rows, preserving feature correlations within - cells. - - ``method="column"``: shuffles each feature column independently, breaking - inter-feature correlations while preserving each feature's marginal distribution. + This breaks the correlation structure between features while preserving + the marginal distributions, creating a null baseline for comparison. 
Parameters
     ----------
@@ -315,19 +315,17 @@
 
     # column-wise shuffling
     elif method == "column":
-        shuffled_features = {}
-        for col in feature_cols:
-            values = profiles[col].to_numpy().copy()
-            np.random.shuffle(values)
-            shuffled_features[col] = values
-
-        # Build the shuffled dataframe
-        shuffled_df = profiles.select(meta_cols)
-        for col in feature_cols:
-            shuffled_df = shuffled_df.with_columns(
-                pl.Series(name=col, values=shuffled_features[col])
+        return profiles.with_columns(
+            [pl.col(col).shuffle(seed=seed + i) for i, col in enumerate(feature_cols)]
+        )
+    elif method == "label":
+        if label_col is None:
+            raise ValueError(
+                "label_col must be specified when using 'label' shuffle method."
             )
-        return shuffled_df
+
+        # return the profiles with the label column shuffled
+        return profiles.with_columns(pl.col(label_col).shuffle(seed=seed))
     else:
         raise ValueError(f"Unknown shuffle method: {method}")
@@ -397,3 +395,39 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr
         DataFrame with the prefix removed from all matching column names.
     """
     return df.rename(lambda x: x.replace(prefix, "") if prefix in x else x)
+
+
+def shuffle_signatures(
+    on_sig: list[str], off_sig: list[str], all_features: list[str], seed: int = 0
+) -> tuple[list[str], list[str]]:
+    """
+    Breaks biological meaning of on/off signatures by randomly sampling
+    features from the full feature space, while preserving the original
+    on/off size ratio. 
+ + Preserves: + - len(on_sig) and len(off_sig) ← ratio intact + - Features drawn from same pool as real signatures + + Breaks: + - Which specific features are "on" vs "off" + - Any biological grouping derived from KS test + """ + rng = np.random.default_rng(seed) + + n_on = len(on_sig) + n_off = len(off_sig) + + # guard: need enough features to fill both without overlap + assert n_on + n_off <= len(all_features), ( + f"Not enough features ({len(all_features)}) to fill " + f"on ({n_on}) + off ({n_off}) without replacement" + ) + + # sample without replacement so on and off don't overlap + sampled = rng.choice(all_features, size=n_on + n_off, replace=False) + + shuffled_on = sampled[:n_on].tolist() + shuffled_off = sampled[n_on:].tolist() + + return shuffled_on, shuffled_off From ff9ad68333cba20215603a5eca4765db1d25e069 Mon Sep 17 00:00:00 2001 From: Erik Serrano <31600622+axiomcura@users.noreply.github.com> Date: Fri, 20 Mar 2026 09:30:27 -0600 Subject: [PATCH 07/16] Update notebooks/0.download-data/2.preprocessing.ipynb Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- notebooks/0.download-data/2.preprocessing.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/0.download-data/2.preprocessing.ipynb b/notebooks/0.download-data/2.preprocessing.ipynb index 62405bc..78d0f73 100644 --- a/notebooks/0.download-data/2.preprocessing.ipynb +++ b/notebooks/0.download-data/2.preprocessing.ipynb @@ -146,7 +146,7 @@ " profiles_dir / \"cpjump1\" / \"cpjump1_compound_experimental-metadata.csv\"\n", ").resolve(strict=True)\n", "\n", - "# cpjump1 compound metadta\n", + "# cpjump1 compound metadata\n", "cmp_metadata_path = (\n", " profiles_dir / \"cpjump1\" / \"cpjump1_compound_compound-metadata.tsv\"\n", ").resolve(strict=True)\n", From 98e10e8bb791c92c699619420f0b406a3b3977b4 Mon Sep 17 00:00:00 2001 From: Erik Serrano <31600622+axiomcura@users.noreply.github.com> Date: Fri, 20 Mar 2026 09:30:39 -0600 Subject: [PATCH 08/16] 
Update notebooks/0.download-data/nbconverted/2.preprocessing.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- notebooks/0.download-data/nbconverted/2.preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/0.download-data/nbconverted/2.preprocessing.py b/notebooks/0.download-data/nbconverted/2.preprocessing.py index 5c20011..0ba4dee 100644 --- a/notebooks/0.download-data/nbconverted/2.preprocessing.py +++ b/notebooks/0.download-data/nbconverted/2.preprocessing.py @@ -115,7 +115,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr profiles_dir / "cpjump1" / "cpjump1_compound_experimental-metadata.csv" ).resolve(strict=True) -# cpjump1 compound metadta +# cpjump1 compound metadata cmp_metadata_path = ( profiles_dir / "cpjump1" / "cpjump1_compound_compound-metadata.tsv" ).resolve(strict=True) From d5cf012dc5cbf0174765840645284faaa2f3ec10 Mon Sep 17 00:00:00 2001 From: Erik Serrano <31600622+axiomcura@users.noreply.github.com> Date: Fri, 20 Mar 2026 09:30:48 -0600 Subject: [PATCH 09/16] Update notebooks/0.download-data/nbconverted/1.download-data.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- notebooks/0.download-data/nbconverted/1.download-data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/0.download-data/nbconverted/1.download-data.py b/notebooks/0.download-data/nbconverted/1.download-data.py index a49bb30..c899b8e 100644 --- a/notebooks/0.download-data/nbconverted/1.download-data.py +++ b/notebooks/0.download-data/nbconverted/1.download-data.py @@ -2,7 +2,7 @@ # # Downloading Single-Cell Profiles # -# This notebook downloading metadata and single-cell profiles from three key datasets: +# This notebook downloads metadata and single-cell profiles from three key datasets: # # 1. 
**CPJUMP1 Pilot Dataset** ([link](https://github.com/jump-cellpainting/2024_Chandrasekaran_NatureMethods_CPJUMP1)): Metadata is downloaded and processed to identify and organize plates containing wells treated with compound perturbations for downstream analysis. # 2. **MitoCheck Dataset**: Normalized and feature-selected single-cell profiles are downloaded for further analysis. From edb1b86ba72dba09d7b8a55f499ed913b6029e55 Mon Sep 17 00:00:00 2001 From: Erik Serrano <31600622+axiomcura@users.noreply.github.com> Date: Fri, 20 Mar 2026 09:31:09 -0600 Subject: [PATCH 10/16] Update notebooks/0.download-data/1.download-data.ipynb Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- notebooks/0.download-data/1.download-data.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/0.download-data/1.download-data.ipynb b/notebooks/0.download-data/1.download-data.ipynb index e7ea279..c17269b 100644 --- a/notebooks/0.download-data/1.download-data.ipynb +++ b/notebooks/0.download-data/1.download-data.ipynb @@ -288,7 +288,7 @@ " {col: f\"Metadata_{col}\" for col in broad_compound_moa_metadata.columns}\n", ")\n", "\n", - "# replace null values in the boroad compound moa to \"unknown\"\n", + "# replace null values in the broad compound moa to \"unknown\"\n", "broad_compound_moa_metadata = broad_compound_moa_metadata.with_columns(\n", " pl.col(\"Metadata_moa\").fill_null(\"unknown\")\n", ")" From f0381c9b57a3e92d9c804b339da007c2351b1294 Mon Sep 17 00:00:00 2001 From: Erik Serrano <31600622+axiomcura@users.noreply.github.com> Date: Fri, 20 Mar 2026 09:31:17 -0600 Subject: [PATCH 11/16] Update notebooks/0.download-data/nbconverted/1.download-data.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- notebooks/0.download-data/nbconverted/1.download-data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/0.download-data/nbconverted/1.download-data.py 
b/notebooks/0.download-data/nbconverted/1.download-data.py index c899b8e..942a4b6 100644 --- a/notebooks/0.download-data/nbconverted/1.download-data.py +++ b/notebooks/0.download-data/nbconverted/1.download-data.py @@ -139,7 +139,7 @@ {col: f"Metadata_{col}" for col in broad_compound_moa_metadata.columns} ) -# replace null values in the boroad compound moa to "unknown" +# replace null values in the broad compound moa to "unknown" broad_compound_moa_metadata = broad_compound_moa_metadata.with_columns( pl.col("Metadata_moa").fill_null("unknown") ) From 3937330d36458d6a8f0558d538ecc99f34f33395 Mon Sep 17 00:00:00 2001 From: Erik Serrano <31600622+axiomcura@users.noreply.github.com> Date: Fri, 20 Mar 2026 09:31:25 -0600 Subject: [PATCH 12/16] Update notebooks/0.download-data/1.download-data.ipynb Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- notebooks/0.download-data/1.download-data.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/0.download-data/1.download-data.ipynb b/notebooks/0.download-data/1.download-data.ipynb index c17269b..5c0a9b6 100644 --- a/notebooks/0.download-data/1.download-data.ipynb +++ b/notebooks/0.download-data/1.download-data.ipynb @@ -7,7 +7,7 @@ "source": [ "# Downloading Single-Cell Profiles\n", "\n", - "This notebook downloading metadata and single-cell profiles from three key datasets:\n", + "This notebook downloads metadata and single-cell profiles from three key datasets:\n", "\n", "1. **CPJUMP1 Pilot Dataset** ([link](https://github.com/jump-cellpainting/2024_Chandrasekaran_NatureMethods_CPJUMP1)): Metadata is downloaded and processed to identify and organize plates containing wells treated with compound perturbations for downstream analysis.\n", "2. 
**MitoCheck Dataset**: Normalized and feature-selected single-cell profiles are downloaded for further analysis.\n", From a1ac7a6814d15d3997cd3353e1f967d2cb61325e Mon Sep 17 00:00:00 2001 From: Erik Serrano Date: Sun, 22 Mar 2026 11:18:06 -0600 Subject: [PATCH 13/16] updated notebook to download CPJUMP1 crispr data --- .pre-commit-config.yaml | 2 +- .../0.download-data/1.download-data.ipynb | 243 ++++++++++-------- 2 files changed, 139 insertions(+), 106 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 755d4c5..0d3f322 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -35,7 +35,7 @@ repos: # Ruff for linting and formatting Python files - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.15.5 + rev: v0.15.7 hooks: - id: ruff-check args: ["--fix"] diff --git a/notebooks/0.download-data/1.download-data.ipynb b/notebooks/0.download-data/1.download-data.ipynb index 5c0a9b6..fe3741b 100644 --- a/notebooks/0.download-data/1.download-data.ipynb +++ b/notebooks/0.download-data/1.download-data.ipynb @@ -54,8 +54,8 @@ "outputs": [], "source": [ "# setting perturbation type\n", - "# other options are \"compound\", \"orf\",\n", - "pert_type = \"compound\"" + "# other options are \"compound\", \"crispr\",\n", + "pert_type = \"crispr\"" ] }, { @@ -107,9 +107,14 @@ "source": [ "## Downloading CPJUMP1 Metadata\n", "\n", - "In this section, we download the [experimental metadata](https://github.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/blob/main/benchmark/output/experiment-metadata.tsv) for the CPJUMP1 dataset. This metadata provides detailed information about each experimental batch, including plate barcodes, cell lines, perturbation types, and incubation times. 
Access to this metadata is essential for selecting and organizing the relevant subset of CPJUMP1 data for downstream analysis.\n", + "In this section, we download the [experimental metadata](https://github.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/blob/main/benchmark/output/experiment-metadata.tsv) for the CPJUMP1 dataset. This metadata contains detailed information about each experimental batch, including plate barcodes, cell lines, perturbation types, and incubation times. More information about the batch and plate metadata can be found in the [CPJUMP1 documentation](https://github.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/blob/main/README.md#batch-and-plate-metadata).\n", "\n", - "For this notebook, we focus on plates containing both U2OS and A549 parental cell lines that have been treated with compounds for 48 hours. More information about the batch and plate metadata can be found in the [CPJUMP1 documentation](https://github.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/blob/main/README.md#batch-and-plate-metadata)." + "We apply perturbation-specific filters to select plates from the `2020_11_04_CPJUMP1` batch:\n", + "\n", + "- **Compound-treated plates**: Plates where U2OS or A549 parental cell lines were treated with compound perturbations for 48 hours, with no antibiotics.\n", + "- **CRISPR-treated plates**: Plates where U2OS or A549 cell lines were treated with CRISPR perturbations for 144 hours (long time point), with antibiotics absent.\n", + "\n", + "Note: Both datasets contain anomalies. In the compound plates, two U2OS plates show anomalies in the MitoTracker stain. In the CRISPR plates, all four U2OS plates exhibit anomalies in the WGA stain. Documentation states there shouldn't be an impact. 
" ] }, { @@ -122,22 +127,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "plates that will be downloaded are: shape: (12,)\n", + "plates that will be downloaded are: shape: (8,)\n", "Series: 'Assay_Plate_Barcode' [str]\n", "[\n", - "\t\"BR00117054\"\n", - "\t\"BR00117055\"\n", - "\t\"BR00117010\"\n", - "\t\"BR00117009\"\n", - "\t\"BR00117011\"\n", - "\t…\n", - "\t\"BR00117013\"\n", - "\t\"BR00117008\"\n", - "\t\"BR00117012\"\n", - "\t\"BR00117015\"\n", - "\t\"BR00117019\"\n", + "\t\"BR00117003\"\n", + "\t\"BR00117004\"\n", + "\t\"BR00116997\"\n", + "\t\"BR00117005\"\n", + "\t\"BR00116999\"\n", + "\t\"BR00116996\"\n", + "\t\"BR00117000\"\n", + "\t\"BR00116998\"\n", "]\n", - "shape: (12, 13)\n" + "shape: (8, 13)\n" ] }, { @@ -150,59 +152,50 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (12, 13)
BatchPlate_Map_NameAssay_Plate_BarcodePerturbationCell_typeTimeDensityAntibioticsCell_lineTime_delayTimes_imagedAnomalyNumber_of_images
strstrstrstrstri64i64strstrstri64stri64
"2020_11_04_CPJUMP1""JUMP-Target-1_compound_platema…"BR00117008""compound""A549"4880"absent""Parental""Day0"1"none"27648
"2020_11_04_CPJUMP1""JUMP-Target-1_compound_platema…"BR00117009""compound""A549"4880"absent""Parental""Day0"1"none"27648
"2020_11_04_CPJUMP1""JUMP-Target-1_compound_platema…"BR00117010""compound""U2OS"48100"absent""Parental""Day0"1"Mitotracker"27648
"2020_11_04_CPJUMP1""JUMP-Target-1_compound_platema…"BR00117011""compound""U2OS"48100"absent""Parental""Day0"1"none"27648
"2020_11_04_CPJUMP1""JUMP-Target-1_compound_platema…"BR00117012""compound""U2OS"48100"absent""Parental""Day0"1"none"27648
"2020_11_04_CPJUMP1""JUMP-Target-1_compound_platema…"BR00117016""compound""A549"48100"absent""Parental""Day0"1"none"49152
"2020_11_04_CPJUMP1""JUMP-Target-1_compound_platema…"BR00117017""compound""A549"48100"absent""Parental""Day0"1"none"49144
"2020_11_04_CPJUMP1""JUMP-Target-1_compound_platema…"BR00117019""compound""A549"48100"absent""Parental""Day0"1"none"49152
"2020_11_04_CPJUMP1""JUMP-Target-1_compound_platema…"BR00117054""compound""A549"48120"absent""Parental""Day0"1"none"27648
"2020_11_04_CPJUMP1""JUMP-Target-1_compound_platema…"BR00117055""compound""A549"48120"absent""Parental""Day0"1"none"27648
" + "shape: (8, 13)
BatchPlate_Map_NameAssay_Plate_BarcodePerturbationCell_typeTimeDensityAntibioticsCell_lineTime_delayTimes_imagedAnomalyNumber_of_images
strstrstrstrstri64i64strstrstri64stri64
"2020_11_04_CPJUMP1""JUMP-Target-1_crispr_platemap""BR00116996""crispr""U2OS"144100"absent""Cas9""Day0"1"WGA"27648
"2020_11_04_CPJUMP1""JUMP-Target-1_crispr_platemap""BR00116997""crispr""U2OS"144100"absent""Cas9""Day0"1"WGA"27648
"2020_11_04_CPJUMP1""JUMP-Target-1_crispr_platemap""BR00116998""crispr""U2OS"144100"absent""Cas9""Day0"1"WGA"27648
"2020_11_04_CPJUMP1""JUMP-Target-1_crispr_platemap""BR00116999""crispr""U2OS"144100"absent""Cas9""Day0"1"WGA"27648
"2020_11_04_CPJUMP1""JUMP-Target-1_crispr_platemap""BR00117000""crispr""A549"144100"absent""Cas9""Day0"1"none"27640
"2020_11_04_CPJUMP1""JUMP-Target-1_crispr_platemap""BR00117003""crispr""A549"144100"absent""Cas9""Day0"1"none"27632
"2020_11_04_CPJUMP1""JUMP-Target-1_crispr_platemap""BR00117004""crispr""A549"144100"absent""Cas9""Day0"1"none"27648
"2020_11_04_CPJUMP1""JUMP-Target-1_crispr_platemap""BR00117005""crispr""A549"144100"absent""Cas9""Day0"1"none"27568
" ], "text/plain": [ - "shape: (12, 13)\n", - "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", - "│ Batch ┆ Plate_Map ┆ Assay_Pla ┆ Perturbat ┆ … ┆ Time_dela ┆ Times_ima ┆ Anomaly ┆ Number_o │\n", - "│ --- ┆ _Name ┆ te_Barcod ┆ ion ┆ ┆ y ┆ ged ┆ --- ┆ f_images │\n", - "│ str ┆ --- ┆ e ┆ --- ┆ ┆ --- ┆ --- ┆ str ┆ --- │\n", - "│ ┆ str ┆ --- ┆ str ┆ ┆ str ┆ i64 ┆ ┆ i64 │\n", - "│ ┆ ┆ str ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", - "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011700 ┆ compound ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 27648 │\n", - "│ 4_CPJUMP1 ┆ et-1_comp ┆ 8 ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ound_plat ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ema… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011700 ┆ compound ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 27648 │\n", - "│ 4_CPJUMP1 ┆ et-1_comp ┆ 9 ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ound_plat ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ema… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011701 ┆ compound ┆ … ┆ Day0 ┆ 1 ┆ Mitotrack ┆ 27648 │\n", - "│ 4_CPJUMP1 ┆ et-1_comp ┆ 0 ┆ ┆ ┆ ┆ ┆ er ┆ │\n", - "│ ┆ ound_plat ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ema… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011701 ┆ compound ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 27648 │\n", - "│ 4_CPJUMP1 ┆ et-1_comp ┆ 1 ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ound_plat ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ema… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011701 ┆ compound ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 27648 │\n", - "│ 4_CPJUMP1 ┆ et-1_comp ┆ 2 ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ound_plat ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ema… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", - "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011701 ┆ compound ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 49152 │\n", - "│ 4_CPJUMP1 ┆ et-1_comp ┆ 6 ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ound_plat ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ema… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011701 ┆ compound ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 49144 │\n", - "│ 4_CPJUMP1 ┆ et-1_comp ┆ 7 ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ound_plat 
┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ema… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011701 ┆ compound ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 49152 │\n", - "│ 4_CPJUMP1 ┆ et-1_comp ┆ 9 ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ound_plat ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ema… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011705 ┆ compound ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 27648 │\n", - "│ 4_CPJUMP1 ┆ et-1_comp ┆ 4 ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ound_plat ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ema… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 2020_11_0 ┆ JUMP-Targ ┆ BR0011705 ┆ compound ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 27648 │\n", - "│ 4_CPJUMP1 ┆ et-1_comp ┆ 5 ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ound_plat ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ ┆ ema… ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘" + "shape: (8, 13)\n", + "┌────────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬─────────┬───────────┐\n", + "│ Batch ┆ Plate_Map ┆ Assay_Pla ┆ Perturbat ┆ … ┆ Time_dela ┆ Times_ima ┆ Anomaly ┆ Number_of │\n", + "│ --- ┆ _Name ┆ te_Barcod ┆ ion ┆ ┆ y ┆ ged ┆ --- ┆ _images │\n", + "│ str ┆ --- ┆ e ┆ --- ┆ ┆ --- ┆ --- ┆ str ┆ --- │\n", + "│ ┆ str ┆ --- ┆ str ┆ ┆ str ┆ i64 ┆ ┆ i64 │\n", + "│ ┆ ┆ str ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "╞════════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═════════╪═══════════╡\n", + "│ 2020_11_04 ┆ JUMP-Targ ┆ BR0011699 ┆ crispr ┆ … ┆ Day0 ┆ 1 ┆ WGA ┆ 27648 │\n", + "│ _CPJUMP1 ┆ et-1_cris ┆ 6 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ pr_platem ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ap ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020_11_04 ┆ JUMP-Targ ┆ BR0011699 ┆ crispr ┆ … ┆ Day0 ┆ 1 ┆ WGA ┆ 27648 │\n", + "│ _CPJUMP1 ┆ et-1_cris ┆ 7 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ pr_platem ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ap ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020_11_04 ┆ JUMP-Targ ┆ BR0011699 ┆ crispr ┆ … ┆ Day0 ┆ 1 ┆ WGA ┆ 27648 │\n", + "│ _CPJUMP1 ┆ et-1_cris ┆ 8 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ pr_platem ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ap ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020_11_04 ┆ JUMP-Targ ┆ BR0011699 ┆ crispr ┆ … ┆ Day0 ┆ 1 ┆ WGA ┆ 27648 │\n", 
+ "│ _CPJUMP1 ┆ et-1_cris ┆ 9 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ pr_platem ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ap ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020_11_04 ┆ JUMP-Targ ┆ BR0011700 ┆ crispr ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 27640 │\n", + "│ _CPJUMP1 ┆ et-1_cris ┆ 0 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ pr_platem ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ap ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020_11_04 ┆ JUMP-Targ ┆ BR0011700 ┆ crispr ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 27632 │\n", + "│ _CPJUMP1 ┆ et-1_cris ┆ 3 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ pr_platem ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ap ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020_11_04 ┆ JUMP-Targ ┆ BR0011700 ┆ crispr ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 27648 │\n", + "│ _CPJUMP1 ┆ et-1_cris ┆ 4 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ pr_platem ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ap ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020_11_04 ┆ JUMP-Targ ┆ BR0011700 ┆ crispr ┆ … ┆ Day0 ┆ 1 ┆ none ┆ 27568 │\n", + "│ _CPJUMP1 ┆ et-1_cris ┆ 5 ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ pr_platem ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ ap ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "└────────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴─────────┴───────────┘" ] }, "execution_count": 4, @@ -222,13 +215,30 @@ ")\n", "\n", "# apply a single filter to select only rows matching all criteria\n", - "exp_metadata = exp_metadata.filter(\n", - " (pl.col(\"Perturbation\").str.contains(pert_type)) # selecting based on pert type\n", - " & (pl.col(\"Time\") == 48) # time of incubation with compound\n", - " & (pl.col(\"Cell_type\").is_in([\"U2OS\", \"A549\"])) # selecting based on cell type\n", - " & (pl.col(\"Cell_line\") == \"Parental\") # selecting only the parental cell line\n", - " & (pl.col(\"Batch\") == \"2020_11_04_CPJUMP1\") # selecting only the specified batch\n", - ")\n", + "if pert_type == \"compound\":\n", + " exp_metadata = exp_metadata.filter(\n", + " (pl.col(\"Perturbation\").str.contains(pert_type)) # selecting based on pert type\n", + " & (\n", + " pl.col(\"Time\") == 48\n", + " ) # time of incubation with compound (select long time point)\n", + " & (pl.col(\"Cell_type\").is_in([\"U2OS\", \"A549\"])) # selecting 
based on cell type\n", + " & (pl.col(\"Antibiotics\") == \"absent\") # selecting only the\n", + " & (pl.col(\"Cell_line\") == \"Parental\") # selecting only the parental cell line\n", + " & (\n", + " pl.col(\"Batch\") == \"2020_11_04_CPJUMP1\"\n", + " ) # selecting only CellProfiler features\n", + " & (pl.col(\"Density\") == 100) # selecting only the baseline cell density\n", + " )\n", + "if pert_type == \"crispr\":\n", + " exp_metadata = exp_metadata.filter(\n", + " (pl.col(\"Perturbation\").str.contains(pert_type)) # selecting based on pert type\n", + " & (pl.col(\"Time\") == 144) # selecting the long time point\n", + " & (pl.col(\"Cell_type\").is_in([\"U2OS\", \"A549\"])) # selecting based on cell type\n", + " & (pl.col(\"Antibiotics\") == \"absent\") # selecting only the\n", + " & (\n", + " pl.col(\"Batch\") == \"2020_11_04_CPJUMP1\"\n", + " ) # selecting only CellProfiler features\n", + " )\n", "\n", "# save the experimental metadata as a csv file\n", "exp_metadata.write_csv(cpjump1_dir / f\"cpjump1_{pert_type}_experimental-metadata.csv\")\n", @@ -262,31 +272,40 @@ "execution_count": 5, "id": "22e417e3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Skipping this step:This is a crispr perturbation type, there's no moa info available.\n" + ] + } + ], "source": [ - "# downloading compound metadata from cpjump1 repo\n", - "CPJUMP_compound_metadata = pl.read_csv(\n", - " nb_configs[\"links\"][\"CPJUMP1-compound-metadata-source\"],\n", - " separator=\"\\t\",\n", - " has_header=True,\n", - " encoding=\"utf-8\",\n", - ")\n", - "\n", - "# downloading compound moa metadata from broad institute drug repurposing hub\n", - "broad_compound_moa_metadata = pl.read_csv(\n", - " nb_configs[\"links\"][\"Broad-compounds-moa-source\"],\n", - " separator=\"\\t\",\n", - " skip_rows=9,\n", - " encoding=\"utf8-lossy\",\n", - ")\n", - "\n", - "# for both dataframes make sure that all columns have \"Metadata_\" in the 
column name\n", - "CPJUMP_compound_metadata = CPJUMP_compound_metadata.rename(\n", - " {col: f\"Metadata_{col}\" for col in CPJUMP_compound_metadata.columns}\n", - ")\n", - "broad_compound_moa_metadata = broad_compound_moa_metadata.rename(\n", - " {col: f\"Metadata_{col}\" for col in broad_compound_moa_metadata.columns}\n", - ")\n", + "if pert_type == \"compound\":\n", + " # downloading compound metadata from cpjump1 repo\n", + " CPJUMP_compound_metadata = pl.read_csv(\n", + " nb_configs[\"links\"][\"CPJUMP1-compound-metadata-source\"],\n", + " separator=\"\\t\",\n", + " has_header=True,\n", + " encoding=\"utf-8\",\n", + " )\n", + "\n", + " # downloading compound moa metadata from broad institute drug repurposing hub\n", + " broad_compound_moa_metadata = pl.read_csv(\n", + " nb_configs[\"links\"][\"Broad-compounds-moa-source\"],\n", + " separator=\"\\t\",\n", + " skip_rows=9,\n", + " encoding=\"utf8-lossy\",\n", + " )\n", + "\n", + " # for both dataframes make sure that all columns have \"Metadata_\" in the column name\n", + " CPJUMP_compound_metadata = CPJUMP_compound_metadata.rename(\n", + " {col: f\"Metadata_{col}\" for col in CPJUMP_compound_metadata.columns}\n", + " )\n", + " broad_compound_moa_metadata = broad_compound_moa_metadata.rename(\n", + " {col: f\"Metadata_{col}\" for col in broad_compound_moa_metadata.columns}\n", + " )\n", "\n", "# replace null values in the broad compound moa to \"unknown\"\n", "broad_compound_moa_metadata = broad_compound_moa_metadata.with_columns(\n", @@ -308,11 +327,25 @@ " how=\"left\",\n", ")\n", "\n", + " # now merge moa metadata to the cpjump1 compound metadata\n", + " complete_compound_metadata = CPJUMP_compound_metadata.join(\n", + " broad_compound_moa_metadata,\n", + " left_on=\"Metadata_pert_iname\",\n", + " right_on=\"Metadata_pert_iname\",\n", + " how=\"left\",\n", + " )\n", "\n", - "# save the complete compound metadata as a tsv file\n", - "complete_compound_metadata.write_csv(\n", - " cpjump1_dir / 
f\"cpjump1_{pert_type}_compound-metadata.tsv\", separator=\"\\t\"\n", - ")" + " # save the complete compound metadata as a tsv file\n", + " complete_compound_metadata.write_csv(\n", + " cpjump1_dir / f\"cpjump1_{pert_type}_compound-metadata.tsv\", separator=\"\\t\"\n", + " )\n", + "\n", + "# if the pertrubations type is not compound, then skip it\n", + "else:\n", + " print(\n", + " \"Skipping this step:\"\n", + " f\"This is a {pert_type} perturbation type, there's no moa info available.\"\n", + " )" ] }, { @@ -329,7 +362,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "06783224", "metadata": {}, "outputs": [ @@ -367,7 +400,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "4d9fd47c", "metadata": {}, "outputs": [ From 44b904eef65f78305c5bfbd47397b562cf6fd4d1 Mon Sep 17 00:00:00 2001 From: Erik Serrano Date: Sun, 22 Mar 2026 11:50:05 -0600 Subject: [PATCH 14/16] updated preprocessing for CPJUMP1 CRISPR plates --- .../0.download-data/2.preprocessing.ipynb | 116 +++++++++++------- 1 file changed, 73 insertions(+), 43 deletions(-) diff --git a/notebooks/0.download-data/2.preprocessing.ipynb b/notebooks/0.download-data/2.preprocessing.ipynb index 78d0f73..341b859 100644 --- a/notebooks/0.download-data/2.preprocessing.ipynb +++ b/notebooks/0.download-data/2.preprocessing.ipynb @@ -130,6 +130,18 @@ { "cell_type": "code", "execution_count": 3, + "id": "3dfe8d86", + "metadata": {}, + "outputs": [], + "source": [ + "# Define the type of perturbation for the dataset\n", + "# options are: \"compound\" or \"crispr\"\n", + "pert_type = \"crispr\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, "id": "3ea207e4", "metadata": {}, "outputs": [], @@ -143,13 +155,16 @@ "\n", "# Experimental metadata\n", "exp_metadata_path = (\n", - " profiles_dir / \"cpjump1\" / \"cpjump1_compound_experimental-metadata.csv\"\n", + " profiles_dir / \"cpjump1\" / f\"cpjump1_{pert_type}_experimental-metadata.csv\"\n", 
").resolve(strict=True)\n", "\n", "# cpjump1 compound metadata\n", - "cmp_metadata_path = (\n", - " profiles_dir / \"cpjump1\" / \"cpjump1_compound_compound-metadata.tsv\"\n", - ").resolve(strict=True)\n", + "if pert_type == \"compound\":\n", + " cmp_metadata_path = (\n", + " profiles_dir / \"cpjump1\" / \"cpjump1_compound_compound-metadata.tsv\"\n", + " ).resolve(strict=True)\n", + "else:\n", + " cmp_metadata_path = None\n", "\n", "# Setting CFReT profiles directory\n", "cfret_profiles_dir = (profiles_dir / \"cfret\").resolve(strict=True)\n", @@ -182,12 +197,12 @@ "id": "7168a71a", "metadata": {}, "source": [ - "Create a list of paths that only points compound treated plates and load the shared features config file that can be found in this [repo](https://github.com/WayScience/JUMP-single-cell)" + "Create a list of paths pointing to the selected CPJUMP1 plates and load the shared features configuration file from the [JUMP-single-cell](https://github.com/WayScience/JUMP-single-cell) repository." ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "c7944fc2", "metadata": {}, "outputs": [], @@ -198,7 +213,7 @@ "compound_plate_names = (\n", " exp_metadata.select(\"Assay_Plate_Barcode\").unique().to_series().to_list()\n", ")\n", - "compound_plate_paths = [\n", + "cpjump1_plate_paths = [\n", " (profiles_dir / \"cpjump1\" / f\"{plate}_feature_selected_sc_qc.parquet\").resolve(\n", " strict=True\n", " )\n", @@ -216,15 +231,15 @@ "id": "c6bfd5c7", "metadata": {}, "source": [ - "## Preprocessing CPJUMP1 Compound data\n", + "## Preprocessing CPJUMP1 Data\n", "\n", - "Using the filtered compound plate file paths and shared features configuration, we load all individual profile files and concatenate them into a single comprehensive DataFrame. 
This step combines data from multiple experimental plates while maintaining the consistent feature space defined by the shared features list.\n", + "Using the filtered plate file paths and shared features configuration, we load all individual profile files and concatenate them into a single comprehensive DataFrame. This step combines data from multiple experimental plates — for either compound or CRISPR perturbation types — while maintaining a consistent feature space defined by the shared features list.\n", "\n", "The concatenation process ensures:\n", "- All profiles use the same feature set for downstream compatibility\n", "- Metadata columns are preserved across all plates\n", "- Data integrity is maintained during the merge operation\n", - "- Adding a unique cell id has column `Metadata_cell_id`" + "- A unique cell identifier is added via the `Metadata_cell_id` column" ] }, { @@ -232,25 +247,25 @@ "id": "9ec882fa", "metadata": {}, "source": [ - "We are loading per-plate parquet profiles for compound-treated plates, selecting the shared feature set, concatenating them into a single Polars DataFrame while preserving metadata, and adding a unique Metadata_cell_id for each cell. The resulting cpjump1_profiles table is ready for downstream analysis." + "We load per-plate Parquet profiles for the selected perturbation type (compound or CRISPR), apply the shared feature set, and concatenate them into a single Polars DataFrame while preserving metadata. A unique `Metadata_cell_id` is added for each cell. The resulting `cpjump1_profiles` table is ready for downstream analysis." 
] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "f6f7e08d", "metadata": {}, "outputs": [], "source": [ "# Loading compound profiles with shared features and concat into a single DataFrame\n", "concat_output_path = (\n", - " cpjump1_output_dir / \"cpjump1_compound_concat_profiles.parquet\"\n", + " cpjump1_output_dir / f\"cpjump1_{pert_type}_concat_profiles.parquet\"\n", ").resolve()\n", "\n", "# loaded and concatenated profiles\n", "cpjump1_profiles = load_and_concat_profiles(\n", " profile_dir=profiles_dir,\n", - " specific_plates=compound_plate_paths,\n", + " specific_plates=cpjump1_plate_paths,\n", " shared_features=shared_features,\n", ")\n", "\n", @@ -263,35 +278,50 @@ "id": "3df9bbf5", "metadata": {}, "source": [ - "Next we annotate the compound treatments in the CPJUMP1 dataset, we annotate each row with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP) and cell type. This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status.\n" + "For compound-treated plates, we annotate each profile with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP), which provides drug and tool compound annotations including target information and clinical development status. Cell type metadata is also merged in from the experimental metadata. This step is skipped for CRISPR-treated plates." 
] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "adfb9148", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Skipping this step since the dataset is CPJUMP1 crispr and not compound\n" + ] + } + ], "source": [ "# load drug repurposing moa file and add prefix to metadata columns\n", - "rep_moa_df = pl.read_csv(\n", - " cmp_metadata_path,\n", - " separator=\"\\t\",\n", - " columns=[\"Metadata_pert_iname\", \"Metadata_target\", \"Metadata_moa\"],\n", - ").unique(subset=[\"Metadata_pert_iname\"])\n", - "\n", - "# merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname\n", - "cpjump1_profiles = cpjump1_profiles.join(\n", - " rep_moa_df, on=\"Metadata_pert_iname\", how=\"left\"\n", - ")\n", + "if pert_type == \"compound\":\n", + " rep_moa_df = pl.read_csv(\n", + " cmp_metadata_path,\n", + " separator=\"\\t\",\n", + " columns=[\"Metadata_pert_iname\", \"Metadata_target\", \"Metadata_moa\"],\n", + " ).unique(subset=[\"Metadata_pert_iname\"])\n", + "\n", + " # merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname\n", + " cpjump1_profiles = cpjump1_profiles.join(\n", + " rep_moa_df, on=\"Metadata_pert_iname\", how=\"left\"\n", + " )\n", "\n", - "# merge cell type metadata with cpjump1_profiles on Metadata_Plate\n", - "cell_type_metadata = exp_metadata.select([\"Assay_Plate_Barcode\", \"Cell_type\"]).rename(\n", - " {\"Assay_Plate_Barcode\": \"Metadata_Plate\", \"Cell_type\": \"Metadata_cell_type\"}\n", - ")\n", - "cpjump1_profiles = cpjump1_profiles.join(\n", - " cell_type_metadata, on=\"Metadata_Plate\", how=\"left\"\n", - ")\n", + " # merge cell type metadata with cpjump1_profiles on Metadata_Plate\n", + " cell_type_metadata = exp_metadata.select(\n", + " [\"Assay_Plate_Barcode\", \"Cell_type\"]\n", + " ).rename(\n", + " {\"Assay_Plate_Barcode\": \"Metadata_Plate\", \"Cell_type\": \"Metadata_cell_type\"}\n", + " )\n", + " 
cpjump1_profiles = cpjump1_profiles.join(\n", + " cell_type_metadata, on=\"Metadata_Plate\", how=\"left\"\n", + " )\n", + "else:\n", + " print(\n", + " f\"Skipping this step since the dataset is CPJUMP1 {pert_type} and not compound\"\n", + " )\n", "\n", "# split meta and feature\n", "meta_cols, features_cols = split_meta_and_features(cpjump1_profiles)\n", @@ -299,18 +329,18 @@ "# save the feature space information into a json file\n", "meta_features_dict = {\n", " \"concat-profiles\": {\n", + " \"data-type\": f\"{pert_type}_plates\",\n", " \"meta-features\": meta_cols,\n", " \"shared-features\": features_cols,\n", " }\n", "}\n", - "with open(cpjump1_output_dir / \"concat_profiles_meta_features.json\", \"w\") as f:\n", + "with open(\n", + " cpjump1_output_dir / f\"{pert_type}_concat_profiles_meta_features.json\", \"w\"\n", + ") as f:\n", " json.dump(meta_features_dict, f, indent=4)\n", "\n", "# save concatenated profiles\n", "# Loading compound profiles with shared features and concat into a single DataFrame\n", - "concat_output_path = (\n", - " cpjump1_output_dir / \"cpjump1_compound_concat_profiles.parquet\"\n", - ").resolve()\n", "cpjump1_profiles.select(meta_cols + features_cols).write_parquet(concat_output_path)" ] }, @@ -339,7 +369,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "c5471d3e", "metadata": {}, "outputs": [], @@ -393,7 +423,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "c57da947", "metadata": {}, "outputs": [], @@ -426,7 +456,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "1d7ced04", "metadata": {}, "outputs": [], @@ -479,7 +509,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "42108980", "metadata": {}, "outputs": [], @@ -526,7 +556,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "1763d383", "metadata": {}, "outputs": [], From 
897b12b832ac8a24b6be198b78d31b5a567ee71e Mon Sep 17 00:00:00 2001 From: Erik Serrano Date: Sun, 22 Mar 2026 11:51:12 -0600 Subject: [PATCH 15/16] applied nbconvert to udpate the notebooks --- .../0.download-data/1.download-data.ipynb | 63 +++--- .../0.download-data/2.preprocessing.ipynb | 2 +- .../nbconverted/1.download-data.py | 193 ++++++++++-------- .../nbconverted/2.preprocessing.py | 144 +++++++------ .../nbconverted/3.subset-jump-controls.py | 16 +- 5 files changed, 231 insertions(+), 187 deletions(-) diff --git a/notebooks/0.download-data/1.download-data.ipynb b/notebooks/0.download-data/1.download-data.ipynb index fe3741b..9477817 100644 --- a/notebooks/0.download-data/1.download-data.ipynb +++ b/notebooks/0.download-data/1.download-data.ipynb @@ -9,14 +9,14 @@ "\n", "This notebook downloads metadata and single-cell profiles from three key datasets:\n", "\n", - "1. **CPJUMP1 Pilot Dataset** ([link](https://github.com/jump-cellpainting/2024_Chandrasekaran_NatureMethods_CPJUMP1)): Metadata is downloaded and processed to identify and organize plates containing wells treated with compound perturbations for downstream analysis.\n", + "1. **CPJUMP1 Pilot Dataset** ([link](https://github.com/jump-cellpainting/2024_Chandrasekaran_NatureMethods_CPJUMP1)): Experimental metadata is downloaded and processed to identify and organize plates containing wells treated with either **compound** or **CRISPR** perturbations for downstream analysis.\n", "2. **MitoCheck Dataset**: Normalized and feature-selected single-cell profiles are downloaded for further analysis.\n", "3. **CFReT Dataset**: Normalized and feature-selected single-cell profiles from the CFReT plate are downloaded for downstream analysis." 
] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 21, "id": "7748e2b0", "metadata": {}, "outputs": [], @@ -48,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 22, "id": "0420eb8e", "metadata": {}, "outputs": [], @@ -68,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 23, "id": "b7381913", "metadata": {}, "outputs": [], @@ -119,7 +119,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 24, "id": "5b8bfe5f", "metadata": {}, "outputs": [ @@ -130,14 +130,14 @@ "plates that will be downloaded are: shape: (8,)\n", "Series: 'Assay_Plate_Barcode' [str]\n", "[\n", - "\t\"BR00117003\"\n", + "\t\"BR00116999\"\n", + "\t\"BR00116998\"\n", "\t\"BR00117004\"\n", - "\t\"BR00116997\"\n", "\t\"BR00117005\"\n", - "\t\"BR00116999\"\n", + "\t\"BR00116997\"\n", + "\t\"BR00117003\"\n", "\t\"BR00116996\"\n", "\t\"BR00117000\"\n", - "\t\"BR00116998\"\n", "]\n", "shape: (8, 13)\n" ] @@ -198,7 +198,7 @@ "└────────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴─────────┴───────────┘" ] }, - "execution_count": 4, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -222,7 +222,7 @@ " pl.col(\"Time\") == 48\n", " ) # time of incubation with compound (select long time point)\n", " & (pl.col(\"Cell_type\").is_in([\"U2OS\", \"A549\"])) # selecting based on cell type\n", - " & (pl.col(\"Antibiotics\") == \"absent\") # selecting only the\n", + " & (pl.col(\"Antibiotics\") == \"absent\") # selecting only the plates without antibiotics\n", " & (pl.col(\"Cell_line\") == \"Parental\") # selecting only the parental cell line\n", " & (\n", " pl.col(\"Batch\") == \"2020_11_04_CPJUMP1\"\n", @@ -234,7 +234,7 @@ " (pl.col(\"Perturbation\").str.contains(pert_type)) # selecting based on pert type\n", " & (pl.col(\"Time\") == 144) # selecting the long time point\n", " & (pl.col(\"Cell_type\").is_in([\"U2OS\", \"A549\"])) # selecting based on cell type\n", 
- " & (pl.col(\"Antibiotics\") == \"absent\") # selecting only the\n", + " & (pl.col(\"Antibiotics\") == \"absent\") # selecting only the plates without antibiotics\n", " & (\n", " pl.col(\"Batch\") == \"2020_11_04_CPJUMP1\"\n", " ) # selecting only CellProfiler features\n", @@ -269,7 +269,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 25, "id": "22e417e3", "metadata": {}, "outputs": [ @@ -307,25 +307,16 @@ " {col: f\"Metadata_{col}\" for col in broad_compound_moa_metadata.columns}\n", " )\n", "\n", - "# replace null values in the broad compound moa to \"unknown\"\n", - "broad_compound_moa_metadata = broad_compound_moa_metadata.with_columns(\n", - " pl.col(\"Metadata_moa\").fill_null(\"unknown\")\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "01db7db8", - "metadata": {}, - "outputs": [], - "source": [ - "complete_compound_metadata = CPJUMP_compound_metadata.join(\n", - " broad_compound_moa_metadata,\n", - " left_on=\"Metadata_pert_iname\",\n", - " right_on=\"Metadata_pert_iname\",\n", - " how=\"left\",\n", - ")\n", + " # replace null values in the broad compound moa to \"unknown\"\n", + " broad_compound_moa_metadata = broad_compound_moa_metadata.with_columns(\n", + " pl.col(\"Metadata_moa\").fill_null(\"unknown\")\n", + " )\n", + " complete_compound_metadata = CPJUMP_compound_metadata.join(\n", + " broad_compound_moa_metadata,\n", + " left_on=\"Metadata_pert_iname\",\n", + " right_on=\"Metadata_pert_iname\",\n", + " how=\"left\",\n", + " )\n", "\n", " # now merge moa metadata to the cpjump1 compound metadata\n", " complete_compound_metadata = CPJUMP_compound_metadata.join(\n", @@ -337,10 +328,8 @@ "\n", " # save the complete compound metadata as a tsv file\n", " complete_compound_metadata.write_csv(\n", - " cpjump1_dir / f\"cpjump1_{pert_type}_compound-metadata.tsv\", separator=\"\\t\"\n", + " cpjump1_dir / \"cpjump1_compound_compound-metadata.tsv\", separator=\"\\t\"\n", " )\n", - "\n", - "# if the 
pertrubations type is not compound, then skip it\n", "else:\n", " print(\n", " \"Skipping this step:\"\n", @@ -362,7 +351,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 26, "id": "06783224", "metadata": {}, "outputs": [ @@ -400,7 +389,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 27, "id": "4d9fd47c", "metadata": {}, "outputs": [ diff --git a/notebooks/0.download-data/2.preprocessing.ipynb b/notebooks/0.download-data/2.preprocessing.ipynb index 341b859..e8648f6 100644 --- a/notebooks/0.download-data/2.preprocessing.ipynb +++ b/notebooks/0.download-data/2.preprocessing.ipynb @@ -233,7 +233,7 @@ "source": [ "## Preprocessing CPJUMP1 Data\n", "\n", - "Using the filtered plate file paths and shared features configuration, we load all individual profile files and concatenate them into a single comprehensive DataFrame. This step combines data from multiple experimental plates — for either compound or CRISPR perturbation types — while maintaining a consistent feature space defined by the shared features list.\n", + "Using the filtered plate file paths and shared features configuration, we load all individual profile files and concatenate them into a single comprehensive DataFrame. 
This step combines data from multiple experimental plates, for either compound or CRISPR perturbation types, while maintaining a consistent feature space defined by the shared features list.\n", "\n", "The concatenation process ensures:\n", "- All profiles use the same feature set for downstream compatibility\n", diff --git a/notebooks/0.download-data/nbconverted/1.download-data.py b/notebooks/0.download-data/nbconverted/1.download-data.py index 942a4b6..ed58818 100644 --- a/notebooks/0.download-data/nbconverted/1.download-data.py +++ b/notebooks/0.download-data/nbconverted/1.download-data.py @@ -1,39 +1,41 @@ #!/usr/bin/env python +# coding: utf-8 # # Downloading Single-Cell Profiles -# +# # This notebook downloads metadata and single-cell profiles from three key datasets: -# -# 1. **CPJUMP1 Pilot Dataset** ([link](https://github.com/jump-cellpainting/2024_Chandrasekaran_NatureMethods_CPJUMP1)): Metadata is downloaded and processed to identify and organize plates containing wells treated with compound perturbations for downstream analysis. +# +# 1. **CPJUMP1 Pilot Dataset** ([link](https://github.com/jump-cellpainting/2024_Chandrasekaran_NatureMethods_CPJUMP1)): Experimental metadata is downloaded and processed to identify and organize plates containing wells treated with either **compound** or **CRISPR** perturbations for downstream analysis. # 2. **MitoCheck Dataset**: Normalized and feature-selected single-cell profiles are downloaded for further analysis. # 3. **CFReT Dataset**: Normalized and feature-selected single-cell profiles from the CFReT plate are downloaded for downstream analysis. 
-# In[1]: +# In[21]: -import pathlib import sys +import pathlib import polars as pl sys.path.append("../../") from utils.io_utils import download_compressed_file, load_configs + # ## Downloading data # Parameters used in this notebook -# In[2]: +# In[22]: # setting perturbation type -# other options are "compound", "orf", -pert_type = "compound" +# other options are "compound", "crispr", +pert_type = "crispr" # setting input and output paths -# In[3]: +# In[23]: # setting config path @@ -64,12 +66,17 @@ # ## Downloading CPJUMP1 Metadata -# -# In this section, we download the [experimental metadata](https://github.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/blob/main/benchmark/output/experiment-metadata.tsv) for the CPJUMP1 dataset. This metadata provides detailed information about each experimental batch, including plate barcodes, cell lines, perturbation types, and incubation times. Access to this metadata is essential for selecting and organizing the relevant subset of CPJUMP1 data for downstream analysis. -# -# For this notebook, we focus on plates containing both U2OS and A549 parental cell lines that have been treated with compounds for 48 hours. More information about the batch and plate metadata can be found in the [CPJUMP1 documentation](https://github.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/blob/main/README.md#batch-and-plate-metadata). +# +# In this section, we download the [experimental metadata](https://github.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/blob/main/benchmark/output/experiment-metadata.tsv) for the CPJUMP1 dataset. This metadata contains detailed information about each experimental batch, including plate barcodes, cell lines, perturbation types, and incubation times. More information about the batch and plate metadata can be found in the [CPJUMP1 documentation](https://github.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods/blob/main/README.md#batch-and-plate-metadata). 
+# +# We apply perturbation-specific filters to select plates from the `2020_11_04_CPJUMP1` batch: +# +# - **Compound-treated plates**: Plates where U2OS or A549 parental cell lines were treated with compound perturbations for 48 hours, with no antibiotics. +# - **CRISPR-treated plates**: Plates where U2OS or A549 cell lines were treated with CRISPR perturbations for 144 hours (long time point), with antibiotics absent. +# +# Note: Both datasets contain anomalies. In the compound plates, two U2OS plates show anomalies in the MitoTracker stain. In the CRISPR plates, all four U2OS plates exhibit anomalies in the WGA stain. Documentation states there shouldn't be an impact. -# In[4]: +# In[24]: # loading config file and setting experimental metadata URL @@ -83,13 +90,30 @@ ) # apply a single filter to select only rows matching all criteria -exp_metadata = exp_metadata.filter( - (pl.col("Perturbation").str.contains(pert_type)) # selecting based on pert type - & (pl.col("Time") == 48) # time of incubation with compound - & (pl.col("Cell_type").is_in(["U2OS", "A549"])) # selecting based on cell type - & (pl.col("Cell_line") == "Parental") # selecting only the parental cell line - & (pl.col("Batch") == "2020_11_04_CPJUMP1") # selecting only the specified batch -) +if pert_type == "compound": + exp_metadata = exp_metadata.filter( + (pl.col("Perturbation").str.contains(pert_type)) # selecting based on pert type + & ( + pl.col("Time") == 48 + ) # time of incubation with compound (select long time point) + & (pl.col("Cell_type").is_in(["U2OS", "A549"])) # selecting based on cell type + & (pl.col("Antibiotics") == "absent") # selecting only the plates without antibiotics + & (pl.col("Cell_line") == "Parental") # selecting only the parental cell line + & ( + pl.col("Batch") == "2020_11_04_CPJUMP1" + ) # selecting only the specified batch + & (pl.col("Density") == 100) # selecting only the baseline cell density + ) +if pert_type == "crispr": + exp_metadata = 
exp_metadata.filter( + (pl.col("Perturbation").str.contains(pert_type)) # selecting based on pert type + & (pl.col("Time") == 144) # selecting the long time point + & (pl.col("Cell_type").is_in(["U2OS", "A549"])) # selecting based on cell type + & (pl.col("Antibiotics") == "absent") # selecting only the plates without antibiotics + & ( + pl.col("Batch") == "2020_11_04_CPJUMP1" + ) # selecting only the specified batch + ) # save the experimental metadata as a csv file exp_metadata.write_csv(cpjump1_dir / f"cpjump1_{pert_type}_experimental-metadata.csv") @@ -102,73 +126,81 @@ exp_metadata -# # In this section, we download: -# -# 1. **Compound metadata** from the CPJUMP1 repository +# +# 1. **Compound metadata** from the CPJUMP1 repository # 2. **Mechanism of action (MOA) metadata** from the Broad Repurposing Hub -# +# # We then merge both datasets into a single compound metadata table. -# +# # If a compound has missing MOA information, the value in `Metadata_moa` is replaced with `"unknown"`. This indicates that no MOA annotation is currently available for that compound. 
-# In[5]: - - -# downloading compound metadata from cpjump1 repo -CPJUMP_compound_metadata = pl.read_csv( - nb_configs["links"]["CPJUMP1-compound-metadata-source"], - separator="\t", - has_header=True, - encoding="utf-8", -) - -# downloading compound moa metadata from broad institute drug repurposing hub -broad_compound_moa_metadata = pl.read_csv( - nb_configs["links"]["Broad-compounds-moa-source"], - separator="\t", - skip_rows=9, - encoding="utf8-lossy", -) - -# for both dataframes make sure that all columns have "Metadata_" in the column name -CPJUMP_compound_metadata = CPJUMP_compound_metadata.rename( - {col: f"Metadata_{col}" for col in CPJUMP_compound_metadata.columns} -) -broad_compound_moa_metadata = broad_compound_moa_metadata.rename( - {col: f"Metadata_{col}" for col in broad_compound_moa_metadata.columns} -) - -# replace null values in the broad compound moa to "unknown" -broad_compound_moa_metadata = broad_compound_moa_metadata.with_columns( - pl.col("Metadata_moa").fill_null("unknown") -) - - -# In[6]: - - -complete_compound_metadata = CPJUMP_compound_metadata.join( - broad_compound_moa_metadata, - left_on="Metadata_pert_iname", - right_on="Metadata_pert_iname", - how="left", -) - - -# save the complete compound metadata as a tsv file -complete_compound_metadata.write_csv( - cpjump1_dir / f"cpjump1_{pert_type}_compound-metadata.tsv", separator="\t" -) +# In[25]: + + +if pert_type == "compound": + # downloading compound metadata from cpjump1 repo + CPJUMP_compound_metadata = pl.read_csv( + nb_configs["links"]["CPJUMP1-compound-metadata-source"], + separator="\t", + has_header=True, + encoding="utf-8", + ) + + # downloading compound moa metadata from broad institute drug repurposing hub + broad_compound_moa_metadata = pl.read_csv( + nb_configs["links"]["Broad-compounds-moa-source"], + separator="\t", + skip_rows=9, + encoding="utf8-lossy", + ) + + # for both dataframes make sure that all columns have "Metadata_" in the column name + 
CPJUMP_compound_metadata = CPJUMP_compound_metadata.rename( + {col: f"Metadata_{col}" for col in CPJUMP_compound_metadata.columns} + ) + broad_compound_moa_metadata = broad_compound_moa_metadata.rename( + {col: f"Metadata_{col}" for col in broad_compound_moa_metadata.columns} + ) + + # replace null values in the broad compound moa to "unknown" + broad_compound_moa_metadata = broad_compound_moa_metadata.with_columns( + pl.col("Metadata_moa").fill_null("unknown") + ) + complete_compound_metadata = CPJUMP_compound_metadata.join( + broad_compound_moa_metadata, + left_on="Metadata_pert_iname", + right_on="Metadata_pert_iname", + how="left", + ) + + # now merge moa metadata to the cpjump1 compound metadata + complete_compound_metadata = CPJUMP_compound_metadata.join( + broad_compound_moa_metadata, + left_on="Metadata_pert_iname", + right_on="Metadata_pert_iname", + how="left", + ) + + # save the complete compound metadata as a tsv file + complete_compound_metadata.write_csv( + cpjump1_dir / "cpjump1_compound_compound-metadata.tsv", separator="\t" + ) +else: + print( + "Skipping this step:" + f"This is a {pert_type} perturbation type, there's no moa info available." + ) # ## Downloading MitoCheck Data -# +# # In this section, we download the MitoCheck data generated in [this study](https://pmc.ncbi.nlm.nih.gov/articles/PMC3108885/). -# +# # Specifically, we are downloading data that has already been normalized and feature-selected. The normalization and feature selection pipeline is available [here](https://github.com/WayScience/mitocheck_data/tree/main/3.normalize_data). -# In[7]: +# In[26]: # url source for the MitoCheck data @@ -181,14 +213,14 @@ # ## Downloading CFReT Data -# +# # This section downloads and saves feature-selected single-cell profiles from the CFReT plate `localhost230405150001`. -# +# # - Only processed single-cell profiles are downloaded (no raw data). # - Data is saved as a Parquet file for fast access. 
# - Used in published cardiac fibrosis research ([study link](https://doi.org/10.1161/CIRCULATIONAHA.124.071956)). -# In[8]: +# In[27]: # setting the source for the CFReT data @@ -210,3 +242,4 @@ # display print("shape: ", cfret_df.shape) cfret_df.head() + diff --git a/notebooks/0.download-data/nbconverted/2.preprocessing.py b/notebooks/0.download-data/nbconverted/2.preprocessing.py index 0ba4dee..803c71e 100644 --- a/notebooks/0.download-data/nbconverted/2.preprocessing.py +++ b/notebooks/0.download-data/nbconverted/2.preprocessing.py @@ -1,35 +1,37 @@ #!/usr/bin/env python +# coding: utf-8 # # 2. Preprocessing Data -# +# # This notebook demonstrates how to preprocess single-cell profile data for downstream analysis. It covers the following steps: -# +# # **Overview** -# +# # - **Data Exploration**: Examining the structure and contents of the downloaded datasets # - **Metadata Handling**: Loading experimental metadata to guide data selection and organization # - **Feature Selection**: Applying a shared feature space for consistency across datasets # - **Profile Concatenation**: Merging profiles from multiple experimental plates into a unified DataFrame # - **Format Conversion**: Converting raw CSV files to Parquet format for efficient storage and access # - **Metadata and Feature Documentation**: Saving metadata and feature information to ensure reproducibility -# +# # These preprocessing steps ensure that all datasets are standardized, well-documented, and ready for comparative and integrative analyses. # In[1]: +import sys import json import pathlib -import sys import polars as pl sys.path.append("../../") -from utils.data_utils import add_cell_id_hash, split_meta_and_features +from utils.data_utils import split_meta_and_features, add_cell_id_hash from utils.io_utils import load_and_concat_profiles -# ## Helper functions -# + +# ## Helper functions +# # Contains helper function that pertains to this notebook. 
# In[2]: @@ -97,12 +99,20 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Defining the input and output directories used throughout the notebook. -# +# # > **Note:** The shared profiles utilized here are sourced from the [JUMP-single-cell](https://github.com/WayScience/JUMP-single-cell) repository. All preprocessing and profile generation steps are performed in that repository, and this notebook focuses on downstream analysis using the generated profiles. # In[3]: +# Define the type of perturbation for the dataset +# options are: "compound" or "crispr" +pert_type = "crispr" + + +# In[4]: + + # Setting data directory data_dir = pathlib.Path("./data").resolve(strict=True) @@ -112,13 +122,16 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Experimental metadata exp_metadata_path = ( - profiles_dir / "cpjump1" / "cpjump1_compound_experimental-metadata.csv" + profiles_dir / "cpjump1" / f"cpjump1_{pert_type}_experimental-metadata.csv" ).resolve(strict=True) # cpjump1 compound metadata -cmp_metadata_path = ( - profiles_dir / "cpjump1" / "cpjump1_compound_compound-metadata.tsv" -).resolve(strict=True) +if pert_type == "compound": + cmp_metadata_path = ( + profiles_dir / "cpjump1" / "cpjump1_compound_compound-metadata.tsv" + ).resolve(strict=True) +else: + cmp_metadata_path = None # Setting CFReT profiles directory cfret_profiles_dir = (profiles_dir / "cfret").resolve(strict=True) @@ -146,9 +159,9 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr results_dir.mkdir(exist_ok=True) -# Create a list of paths that only points compound treated plates and load the shared features config file that can be found in this [repo](https://github.com/WayScience/JUMP-single-cell) +# Create a list of paths pointing to the selected CPJUMP1 plates and load the shared features configuration file from the [JUMP-single-cell](https://github.com/WayScience/JUMP-single-cell) repository. 
-# In[4]: +# In[5]: # Load experimental metadata @@ -157,7 +170,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr compound_plate_names = ( exp_metadata.select("Assay_Plate_Barcode").unique().to_series().to_list() ) -compound_plate_paths = [ +cpjump1_plate_paths = [ (profiles_dir / "cpjump1" / f"{plate}_feature_selected_sc_qc.parquet").resolve( strict=True ) @@ -170,30 +183,30 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr shared_features = loaded_shared_features["shared-features"] -# ## Preprocessing CPJUMP1 Compound data -# -# Using the filtered compound plate file paths and shared features configuration, we load all individual profile files and concatenate them into a single comprehensive DataFrame. This step combines data from multiple experimental plates while maintaining the consistent feature space defined by the shared features list. -# +# ## Preprocessing CPJUMP1 Data +# +# Using the filtered plate file paths and shared features configuration, we load all individual profile files and concatenate them into a single comprehensive DataFrame. This step combines data from multiple experimental plates, for either compound or CRISPR perturbation types, while maintaining a consistent feature space defined by the shared features list. +# # The concatenation process ensures: # - All profiles use the same feature set for downstream compatibility # - Metadata columns are preserved across all plates # - Data integrity is maintained during the merge operation -# - Adding a unique cell id has column `Metadata_cell_id` +# - A unique cell identifier is added via the `Metadata_cell_id` column -# We are loading per-plate parquet profiles for compound-treated plates, selecting the shared feature set, concatenating them into a single Polars DataFrame while preserving metadata, and adding a unique Metadata_cell_id for each cell. The resulting cpjump1_profiles table is ready for downstream analysis. 
+# We load per-plate Parquet profiles for the selected perturbation type (compound or CRISPR), apply the shared feature set, and concatenate them into a single Polars DataFrame while preserving metadata. A unique `Metadata_cell_id` is added for each cell. The resulting `cpjump1_profiles` table is ready for downstream analysis. -# In[5]: +# In[6]: # Loading compound profiles with shared features and concat into a single DataFrame concat_output_path = ( - cpjump1_output_dir / "cpjump1_compound_concat_profiles.parquet" + cpjump1_output_dir / f"cpjump1_{pert_type}_concat_profiles.parquet" ).resolve() # loaded and concatenated profiles cpjump1_profiles = load_and_concat_profiles( profile_dir=profiles_dir, - specific_plates=compound_plate_paths, + specific_plates=cpjump1_plate_paths, shared_features=shared_features, ) @@ -201,31 +214,37 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr cpjump1_profiles = add_cell_id_hash(cpjump1_profiles) -# Next we annotate the compound treatments in the CPJUMP1 dataset, we annotate each row with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP) and cell type. This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status. -# +# For compound-treated plates, we annotate each profile with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP), which provides drug and tool compound annotations including target information and clinical development status. Cell type metadata is also merged in from the experimental metadata. This step is skipped for CRISPR-treated plates. 
-# In[6]: +# In[7]: # load drug repurposing moa file and add prefix to metadata columns -rep_moa_df = pl.read_csv( - cmp_metadata_path, - separator="\t", - columns=["Metadata_pert_iname", "Metadata_target", "Metadata_moa"], -).unique(subset=["Metadata_pert_iname"]) - -# merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname -cpjump1_profiles = cpjump1_profiles.join( - rep_moa_df, on="Metadata_pert_iname", how="left" -) +if pert_type == "compound": + rep_moa_df = pl.read_csv( + cmp_metadata_path, + separator="\t", + columns=["Metadata_pert_iname", "Metadata_target", "Metadata_moa"], + ).unique(subset=["Metadata_pert_iname"]) + + # merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname + cpjump1_profiles = cpjump1_profiles.join( + rep_moa_df, on="Metadata_pert_iname", how="left" + ) -# merge cell type metadata with cpjump1_profiles on Metadata_Plate -cell_type_metadata = exp_metadata.select(["Assay_Plate_Barcode", "Cell_type"]).rename( - {"Assay_Plate_Barcode": "Metadata_Plate", "Cell_type": "Metadata_cell_type"} -) -cpjump1_profiles = cpjump1_profiles.join( - cell_type_metadata, on="Metadata_Plate", how="left" -) + # merge cell type metadata with cpjump1_profiles on Metadata_Plate + cell_type_metadata = exp_metadata.select( + ["Assay_Plate_Barcode", "Cell_type"] + ).rename( + {"Assay_Plate_Barcode": "Metadata_Plate", "Cell_type": "Metadata_cell_type"} + ) + cpjump1_profiles = cpjump1_profiles.join( + cell_type_metadata, on="Metadata_Plate", how="left" + ) +else: + print( + f"Skipping this step since the dataset is CPJUMP1 {pert_type} and not compound" + ) # split meta and feature meta_cols, features_cols = split_meta_and_features(cpjump1_profiles) @@ -233,39 +252,39 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # save the feature space information into a json file meta_features_dict = { "concat-profiles": { + "data-type": f"{pert_type}_plates", "meta-features": meta_cols, 
"shared-features": features_cols, } } -with open(cpjump1_output_dir / "concat_profiles_meta_features.json", "w") as f: +with open( + cpjump1_output_dir / f"{pert_type}_concat_profiles_meta_features.json", "w" +) as f: json.dump(meta_features_dict, f, indent=4) # save concatenated profiles # Loading compound profiles with shared features and concat into a single DataFrame -concat_output_path = ( - cpjump1_output_dir / "cpjump1_compound_concat_profiles.parquet" -).resolve() cpjump1_profiles.select(meta_cols + features_cols).write_parquet(concat_output_path) # ## Preprocessing MitoCheck Dataset -# +# # This section processes the MitoCheck dataset by loading training data, positive controls, and negative controls from compressed CSV files. The data is standardized and converted to Parquet format for consistency with other datasets and improved performance. -# +# # **Key preprocessing steps:** -# +# # - **Loading datasets**: Reading training data, positive controls, and negative controls from compressed CSV files # - **Control labeling**: Adding phenotypic class labels ("poscon" and "negcon") to distinguish control types -# - **Feature filtering**: Extracting only Cell Profiler (CP) features to match the CPJUMP1 dataset structure +# - **Feature filtering**: Extracting only Cell Profiler (CP) features to match the CPJUMP1 dataset structure # - **Column standardization**: Removing "CP__" prefixes and ensuring consistent naming conventions # - **Feature alignment**: Identifying shared features across all three datasets (training, positive controls, negative controls) # - **Metadata preservation**: Maintaining consistent metadata structure across all profile types # - **Format conversion**: Saving processed data in optimized Parquet format for efficient downstream analysis # - **adding cell id**: adding a cell id column `Metadata_cell_id` -# +# # The preprocessing ensures that all MitoCheck datasets share a common feature space and are ready for comparative analysis with 
CPJUMP1 profiles. -# In[7]: +# In[8]: # load in mitocheck profiles and save as parquet @@ -309,7 +328,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Filter Cell Profiler (CP) features and preprocess columns by removing the "CP__" prefix to standardize feature names for downstream analysis. -# In[8]: +# In[9]: # Split profiles to only retain cell profiler features @@ -332,7 +351,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # Splitting the metadata and feature columns for each dataset to enable targeted downstream analysis and ensure consistent data structure across all profiles. -# In[9]: +# In[10]: # manually selecting metadata features that are present across all 3 profiles @@ -381,7 +400,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr ) -# In[10]: +# In[11]: # create concatenated mitocheck profiles @@ -413,13 +432,13 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # ## Preprocessing CFReT Dataset -# +# # This section preprocesses the CFReT dataset to ensure compatibility with downstream analysis workflows. 
-# +# # - **Unique cell identification**: Adding `Metadata_cell_id` column with unique hash values based on all profile features to enable precise cell tracking and deduplication -# +# -# In[11]: +# In[12]: # load in cfret profiles and add a unique cell ID @@ -448,3 +467,4 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFr # overwrite dataset with cell cfret_profiles.select(meta_cols + features_cols).write_parquet(cfret_profiles_path) + diff --git a/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py b/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py index 31283ad..aa44eff 100644 --- a/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py +++ b/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py @@ -1,23 +1,24 @@ #!/usr/bin/env python +# coding: utf-8 -# # 3. Subsetting CPJUMP1 controls -# +# # 3. Subsetting CPJUMP1 controls +# # In this notebook, we subset control samples from the CPJUMP1 CRISPR dataset using stratified sampling. We generate 10 different random seeds to create multiple subsets, each containing 15% of the original control data stratified by plate and well metadata. This approach ensures reproducible sampling while maintaining the distribution of controls across experimental conditions. -# +# # The subsampled datasets are saved as individual parquet files for downstream analysis and model training purposes. 
-# +# # In[1]: -import pathlib import sys - +import pathlib import polars as pl sys.path.append("../../") from utils.io_utils import load_profiles + # Load helper functions # In[2]: @@ -169,7 +170,7 @@ def load_group_stratified_data( ) -# Selecting only positive controls and saving it +# Selecting only positive controls and saving it # In[6]: @@ -180,3 +181,4 @@ def load_group_stratified_data( & (pl.col("Metadata_control_type") == "poscon_cp") ) poscon_cp_df.write_parquet(poscon_data_dir / "poscon_cp_df.parquet") + From c16fe9cb0b09d2848e3c1001fb0782e7a4e17beb Mon Sep 17 00:00:00 2001 From: Erik Serrano Date: Sun, 22 Mar 2026 15:34:37 -0600 Subject: [PATCH 16/16] removed subsetting notebook --- .../3.subset-jump-controls.ipynb | 266 ------------------ .../nbconverted/3.subset-jump-controls.py | 184 ------------ 2 files changed, 450 deletions(-) delete mode 100644 notebooks/0.download-data/3.subset-jump-controls.ipynb delete mode 100644 notebooks/0.download-data/nbconverted/3.subset-jump-controls.py diff --git a/notebooks/0.download-data/3.subset-jump-controls.ipynb b/notebooks/0.download-data/3.subset-jump-controls.ipynb deleted file mode 100644 index 39e1f46..0000000 --- a/notebooks/0.download-data/3.subset-jump-controls.ipynb +++ /dev/null @@ -1,266 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "d6c178c2", - "metadata": {}, - "source": [ - "# 3. Subsetting CPJUMP1 controls \n", - "\n", - "In this notebook, we subset control samples from the CPJUMP1 CRISPR dataset using stratified sampling. We generate 10 different random seeds to create multiple subsets, each containing 15% of the original control data stratified by plate and well metadata. 
This approach ensures reproducible sampling while maintaining the distribution of controls across experimental conditions.\n", - "\n", - "The subsampled datasets are saved as individual parquet files for downstream analysis and model training purposes.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "c9632493", - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "import pathlib\n", - "import polars as pl\n", - "\n", - "sys.path.append(\"../../\")\n", - "from utils.io_utils import load_profiles" - ] - }, - { - "cell_type": "markdown", - "id": "b42648b9", - "metadata": {}, - "source": [ - "Load helper functions" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "64311431", - "metadata": {}, - "outputs": [], - "source": [ - "def load_group_stratified_data(\n", - " profiles: str | pathlib.Path | pl.DataFrame,\n", - " group_columns: list[str] = [\"Metadata_Plate\", \"Metadata_Well\"],\n", - " sample_percentage: float = 0.2,\n", - " seed: int = 0,\n", - ") -> pl.DataFrame:\n", - " \"\"\"Memory-efficiently sample a percentage of rows from each group in a dataset.\n", - "\n", - " This function performs stratified sampling by loading only the grouping columns first\n", - " to dtermine group memberships and sizes, then samples indices from each group, and\n", - " finally loads the full dataset filtered to only the sampled rows. This approach\n", - " minimizes memory usage compared to loading the entire dataset upfront.\n", - "\n", - " Parameters\n", - " ----------\n", - " dataset_path : str or pathlib.Path\n", - " Path to the parquet dataset file to sample from\n", - " group_columns : list[str], default [\"Metadata_Plate\", \"Metadata_Well\"]\n", - " Column names to use for grouping. 
Sampling will be performed independently\n", - " within each unique combination of these columns\n", - " sample_percentage : float, default 0.2\n", - " Fraction of rows to sample from each group (must be between 0.0 and 1.0)\n", - "\n", - " Returns\n", - " -------\n", - " pl.DataFrame\n", - " Subsampled dataframe containing the sampled rows from each group,\n", - " preserving all original columns\n", - "\n", - " Raises\n", - " ------\n", - " ValueError\n", - " If sample_percentage is not between 0 and 1\n", - " FileNotFoundError\n", - " If dataset_path does not exist\n", - " \"\"\"\n", - " # validate inputs\n", - " if not 0 <= sample_percentage <= 1:\n", - " raise ValueError(\"sample_percentage must be between 0 and 1\")\n", - "\n", - " # convert str types to pathlib types\n", - " if isinstance(profiles, str):\n", - " profiles = pathlib.Path(profiles).resolve(strict=True)\n", - "\n", - " # load only the grouping columns to determine groups\n", - " if isinstance(profiles, pl.DataFrame):\n", - " # if a polars DataFrame is provided, use it directly\n", - " metadata_df = profiles.select(group_columns).with_row_index(\"original_idx\")\n", - " else:\n", - " metadata_df = pl.read_parquet(profiles, columns=group_columns).with_row_index(\n", - " \"original_idx\"\n", - " )\n", - "\n", - " # sample indices for each group based on the group_columns\n", - " sampled_indices = (\n", - " metadata_df\n", - " # group rows by the specified columns (e.g., Plate and Well combinations)\n", - " .group_by(group_columns)\n", - " # for each group, randomly sample a fraction of the original row indices\n", - " .agg(\n", - " pl.col(\"original_idx\")\n", - " .sample(\n", - " fraction=sample_percentage, seed=seed\n", - " ) # sample specified percentage from each group\n", - " .alias(\"sampled_idx\") # rename the sampled indices column\n", - " )\n", - " # extract only the sampled indices column, discarding group identifiers\n", - " .select(\"sampled_idx\")\n", - " # convert list of indices per 
group into individual rows (flatten the structure)\n", - " .explode(\"sampled_idx\")\n", - " # extract the sampled indices as a single column series\n", - " .get_column(\"sampled_idx\")\n", - " .sort()\n", - " )\n", - "\n", - " # load the entire dataset and filter to sampled indices\n", - " sampled_df = (\n", - " profiles.with_row_index(\"idx\")\n", - " .filter(pl.col(\"idx\").is_in(sampled_indices.implode()))\n", - " .drop(\"idx\")\n", - " )\n", - "\n", - " return sampled_df" - ] - }, - { - "cell_type": "markdown", - "id": "8c9670ac", - "metadata": {}, - "source": [ - "Setting input and output paths" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "6a8dd258", - "metadata": {}, - "outputs": [], - "source": [ - "# setting directory where all the single-cell profiles are stored\n", - "data_dir = pathlib.Path.cwd() / \"data\"\n", - "profiles_dir = (data_dir / \"sc-profiles\").resolve(strict=True)\n", - "\n", - "cpjump1_data_path = (\n", - " profiles_dir / \"cpjump1\" / \"cpjump1_compound_concat_profiles.parquet\"\n", - ").resolve(strict=True)\n", - "\n", - "\n", - "# Setting feature selection path\n", - "shared_features_config_path = (\n", - " profiles_dir / \"cpjump1\" / \"feature_selected_sc_qc_features.json\"\n", - ").resolve(strict=True)\n", - "\n", - "# setting negative control\n", - "negcon_data_dir = (profiles_dir / \"cpjump1\" / \"negcon\").resolve()\n", - "negcon_data_dir.mkdir(exist_ok=True)\n", - "poscon_data_dir = (profiles_dir / \"cpjump1\" / \"poscon\").resolve()\n", - "poscon_data_dir.mkdir(exist_ok=True)" - ] - }, - { - "cell_type": "markdown", - "id": "c858d985", - "metadata": {}, - "source": [ - "Loading data" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "71aea9c7", - "metadata": {}, - "outputs": [], - "source": [ - "# load all profiles\n", - "profiles_df = load_profiles(cpjump1_data_path)\n", - "\n", - "# create a negative control subset\n", - "negcon_df = profiles_df.filter(\n", - " 
(pl.col(\"Metadata_pert_type\") == \"control\")\n", - " & (pl.col(\"Metadata_control_type\") == \"negcon\")\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "cc45a182", - "metadata": {}, - "source": [ - "generating 10 seeds of randomly sampled negative controls" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "705e5ef0", - "metadata": {}, - "outputs": [], - "source": [ - "for seed_val in range(10):\n", - " # load the dataset with group stratified sub sampling\n", - " subsampled_df = load_group_stratified_data(\n", - " profiles=negcon_df,\n", - " group_columns=[\"Metadata_Plate\", \"Metadata_Well\"],\n", - " sample_percentage=0.15,\n", - " seed=seed_val,\n", - " )\n", - "\n", - " # save the file\n", - " subsampled_df.write_parquet(\n", - " negcon_data_dir / f\"cpjump1_compound_negcon_seed{seed_val}.parquet\"\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "e3946e99", - "metadata": {}, - "source": [ - "Selecting only positive controls and saving it " - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "23bbf122", - "metadata": {}, - "outputs": [], - "source": [ - "# write as parquet file\n", - "poscon_cp_df = profiles_df.filter(\n", - " (pl.col(\"Metadata_pert_type\") == \"control\")\n", - " & (pl.col(\"Metadata_control_type\") == \"poscon_cp\")\n", - ")\n", - "poscon_cp_df.write_parquet(poscon_data_dir / \"poscon_cp_df.parquet\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "buscar", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py b/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py deleted file mode 
100644 index aa44eff..0000000 --- a/notebooks/0.download-data/nbconverted/3.subset-jump-controls.py +++ /dev/null @@ -1,184 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# # 3. Subsetting CPJUMP1 controls -# -# In this notebook, we subset control samples from the CPJUMP1 CRISPR dataset using stratified sampling. We generate 10 different random seeds to create multiple subsets, each containing 15% of the original control data stratified by plate and well metadata. This approach ensures reproducible sampling while maintaining the distribution of controls across experimental conditions. -# -# The subsampled datasets are saved as individual parquet files for downstream analysis and model training purposes. -# - -# In[1]: - - -import sys -import pathlib -import polars as pl - -sys.path.append("../../") -from utils.io_utils import load_profiles - - -# Load helper functions - -# In[2]: - - -def load_group_stratified_data( - profiles: str | pathlib.Path | pl.DataFrame, - group_columns: list[str] = ["Metadata_Plate", "Metadata_Well"], - sample_percentage: float = 0.2, - seed: int = 0, -) -> pl.DataFrame: - """Memory-efficiently sample a percentage of rows from each group in a dataset. - - This function performs stratified sampling by loading only the grouping columns first - to dtermine group memberships and sizes, then samples indices from each group, and - finally loads the full dataset filtered to only the sampled rows. This approach - minimizes memory usage compared to loading the entire dataset upfront. - - Parameters - ---------- - dataset_path : str or pathlib.Path - Path to the parquet dataset file to sample from - group_columns : list[str], default ["Metadata_Plate", "Metadata_Well"] - Column names to use for grouping. 
Sampling will be performed independently - within each unique combination of these columns - sample_percentage : float, default 0.2 - Fraction of rows to sample from each group (must be between 0.0 and 1.0) - - Returns - ------- - pl.DataFrame - Subsampled dataframe containing the sampled rows from each group, - preserving all original columns - - Raises - ------ - ValueError - If sample_percentage is not between 0 and 1 - FileNotFoundError - If dataset_path does not exist - """ - # validate inputs - if not 0 <= sample_percentage <= 1: - raise ValueError("sample_percentage must be between 0 and 1") - - # convert str types to pathlib types - if isinstance(profiles, str): - profiles = pathlib.Path(profiles).resolve(strict=True) - - # load only the grouping columns to determine groups - if isinstance(profiles, pl.DataFrame): - # if a polars DataFrame is provided, use it directly - metadata_df = profiles.select(group_columns).with_row_index("original_idx") - else: - metadata_df = pl.read_parquet(profiles, columns=group_columns).with_row_index( - "original_idx" - ) - - # sample indices for each group based on the group_columns - sampled_indices = ( - metadata_df - # group rows by the specified columns (e.g., Plate and Well combinations) - .group_by(group_columns) - # for each group, randomly sample a fraction of the original row indices - .agg( - pl.col("original_idx") - .sample( - fraction=sample_percentage, seed=seed - ) # sample specified percentage from each group - .alias("sampled_idx") # rename the sampled indices column - ) - # extract only the sampled indices column, discarding group identifiers - .select("sampled_idx") - # convert list of indices per group into individual rows (flatten the structure) - .explode("sampled_idx") - # extract the sampled indices as a single column series - .get_column("sampled_idx") - .sort() - ) - - # load the entire dataset and filter to sampled indices - sampled_df = ( - profiles.with_row_index("idx") - 
.filter(pl.col("idx").is_in(sampled_indices.implode())) - .drop("idx") - ) - - return sampled_df - - -# Setting input and output paths - -# In[3]: - - -# setting directory where all the single-cell profiles are stored -data_dir = pathlib.Path.cwd() / "data" -profiles_dir = (data_dir / "sc-profiles").resolve(strict=True) - -cpjump1_data_path = ( - profiles_dir / "cpjump1" / "cpjump1_compound_concat_profiles.parquet" -).resolve(strict=True) - - -# Setting feature selection path -shared_features_config_path = ( - profiles_dir / "cpjump1" / "feature_selected_sc_qc_features.json" -).resolve(strict=True) - -# setting negative control -negcon_data_dir = (profiles_dir / "cpjump1" / "negcon").resolve() -negcon_data_dir.mkdir(exist_ok=True) -poscon_data_dir = (profiles_dir / "cpjump1" / "poscon").resolve() -poscon_data_dir.mkdir(exist_ok=True) - - -# Loading data - -# In[4]: - - -# load all profiles -profiles_df = load_profiles(cpjump1_data_path) - -# create a negative control subset -negcon_df = profiles_df.filter( - (pl.col("Metadata_pert_type") == "control") - & (pl.col("Metadata_control_type") == "negcon") -) - - -# generating 10 seeds of randomly sampled negative controls - -# In[5]: - - -for seed_val in range(10): - # load the dataset with group stratified sub sampling - subsampled_df = load_group_stratified_data( - profiles=negcon_df, - group_columns=["Metadata_Plate", "Metadata_Well"], - sample_percentage=0.15, - seed=seed_val, - ) - - # save the file - subsampled_df.write_parquet( - negcon_data_dir / f"cpjump1_compound_negcon_seed{seed_val}.parquet" - ) - - -# Selecting only positive controls and saving it - -# In[6]: - - -# write as parquet file -poscon_cp_df = profiles_df.filter( - (pl.col("Metadata_pert_type") == "control") - & (pl.col("Metadata_control_type") == "poscon_cp") -) -poscon_cp_df.write_parquet(poscon_data_dir / "poscon_cp_df.parquet") -