Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ repos:

# Format TOML files early (before other tools might read config)
- repo: https://github.com/pappasam/toml-sort
rev: v0.24.3
rev: v0.24.4
hooks:
- id: toml-sort
args: ["--in-place"]
Expand All @@ -35,7 +35,7 @@ repos:

# Ruff for linting and formatting Python files
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.15.7
rev: v0.15.8
hooks:
- id: ruff-check
args: ["--fix"]
Expand Down
570 changes: 570 additions & 0 deletions notebooks/4.cpjump1-analysis/1.generate-on-off-signatures.ipynb

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,352 @@
#!/usr/bin/env python

# # 1. generating on and off signatures
#
# In this notebook, we generate on and off signatures for the compound CPJUMP1 [dataset](https://github.com/carpenter-singh-lab/2024_Chandrasekaran_NatureMethods). These signatures are generated separately for each positive control, meaning that every signature derived from the JUMP dataset is specific to an individual positive control.
#
# The dataset is split by cell type, specifically U2OS and A549 cells, and signatures are generated independently for each cell type.
#

# In[2]:


import itertools
import json
import pathlib
import sys
from collections import defaultdict

import polars as pl
import tqdm

# add buscar function
sys.path.append("../../")
from buscar.signatures import get_signatures
from utils.data_utils import generate_consensus_signatures
from utils.io_utils import load_configs, load_profiles

# Parameters
#
# The following parameters are used throughout this notebook to ensure consistency and reproducibility:
#
# - **`seed`** (int): Random seed value used for reproducibility in statistical tests and data sampling.
# - **`method`** (str): The statistical test method (e.g., `ks_test`) employed to identify significant "on" and "off" morphological signatures.
#

# In[3]:


seed = 0
method = "ks_test"


# Setting input and output paths

# In[ ]:


# set data data
data_dir = pathlib.Path("../0.download-data/data/sc-profiles/").resolve(strict=True)

# create a results dir
results_dir = pathlib.Path("./results")
results_dir.mkdir(exist_ok=True)

# set an output directory for signatures
signature_results_dir = (results_dir / "signatures").resolve()
signature_results_dir.mkdir(exist_ok=True)

# set path to cpjump1 experimental metadata
cpjump1_experimental_data_path = (
data_dir / "cpjump1/cpjump1_compound_experimental-metadata.csv"
).resolve(strict=True)

# set paths to cpjump1 poscon and negcon profiles
cpjump1_negcon_profile_path = list(
(data_dir / "cpjump1/negcon").resolve(strict=True).glob("*.parquet")
)
cpjump1_poscon_profile_path = (
data_dir / "cpjump1/poscon" / "poscon_cp_df.parquet"
).resolve(strict=True)

# set path to config file that has the shared cpjump1 features
cpjump1_shared_features_path = (
data_dir / "cpjump1/feature_selected_sc_qc_features.json"
).resolve(strict=True)

# set cpjump path
crispr_cpjump1_path = (
data_dir / "cpjump1/cpjump1_compound_concat_profiles.parquet"
).resolve(strict=True)


# setting mappings folder
mappings_dir = (results_dir / "mappings").resolve()
mappings_dir.mkdir(exist_ok=True)


# In[7]:


cpjump1_experimental_data = pl.read_csv(cpjump1_experimental_data_path)
cpjump1_experimental_data


# Identify the plates containing U2OS and A549 cells treated for 144 hours.

# In[10]:


# Split the dataset by cell type and treatment duration
# Filter U2OS cells (all records)
cpjump1_u2os_exp_metadata = cpjump1_experimental_data.filter(
pl.col("Cell_type") == "U2OS"
)

# Filter A549 cells with density of 100 for consistency
cpjump1_a549_exp_metadata = cpjump1_experimental_data.filter(
(pl.col("Cell_type") == "A549") & (pl.col("Density") == 100)
)

# Extract plate identifiers for each cell type
u2os_plates = cpjump1_u2os_exp_metadata["Assay_Plate_Barcode"].unique().to_list()
a549_plates = cpjump1_a549_exp_metadata["Assay_Plate_Barcode"].unique().to_list()

# save the extracted plates to a JSON file for future reference
with open(mappings_dir / "extracted_plates.json", "w") as f:
json.dump({"U2OS": u2os_plates, "A549": a549_plates}, f, indent=4)

# Display the extracted plates for verification
print(f"U2OS plates: {u2os_plates}")
print(f"A549 plates: {a549_plates}")


# loading in dataset

# In[11]:


# load shared features
shared_features = load_configs(cpjump1_shared_features_path)["shared-features"]

# Loading the 10 randomly subsampled negative control profiles
cpjump1_negcon_df = []
for negcon_df_path in cpjump1_negcon_profile_path:
# Extract seed ID from file path
seed_id = negcon_df_path.stem.split("_")[-1]

# Load profile data
# filter to the selected plates
loaded_negcon_df = load_profiles(negcon_df_path).filter(
pl.col("Metadata_Plate").is_in(u2os_plates + a549_plates)
)

# Insert seed ID at the beginning of the dataframe
loaded_negcon_df = loaded_negcon_df.with_columns(
pl.lit(seed_id).alias("Metadata_seed_id")
).select(["Metadata_seed_id"] + loaded_negcon_df.columns)

# Append to list
cpjump1_negcon_df.append(loaded_negcon_df)

# Concatenate all negative control dataframes but set to the select plates
cpjump1_negcon_df = pl.concat(cpjump1_negcon_df)

# load positive controls and filter to selected plates
cpjump1_poscon_df = load_profiles(cpjump1_poscon_profile_path).filter(
pl.col("Metadata_Plate").is_in(u2os_plates + a549_plates)
)

# Display the unique positive control types
all_poscon_trts = cpjump1_poscon_df["Metadata_pert_iname"].unique().to_list()
print("Unique positive control types:")
print(f"Number of unique positive control treatments: {len(all_poscon_trts)}")
print(all_poscon_trts)


# For each positive control compound, we generate "on" and "off" morphological signatures by comparing profiles from all 10 randomly selected DMSO (negative control) seeds to the corresponding positive control profiles.

# On and off signatures for the u2os cells

# In[7]:


# Setting save path
cpjump1_u2os_sig_save_path = (
signature_results_dir / f"{method}_cpjump1_u2os_negcon_poscon_signatures.json"
).resolve()

# set negative controls for U2OS plates
cpjump1_u2os_negcon_df = cpjump1_negcon_df.filter(
pl.col("Metadata_Plate").is_in(u2os_plates)
)
cpjump1_u2os_poscon_df = cpjump1_poscon_df.filter(
pl.col("Metadata_Plate").is_in(u2os_plates)
& (pl.col("Metadata_pert_iname").is_in(all_poscon_trts))
)

# If the file does not exist, iterate all combinations and identify signatures
# If the file exists, skip the whole process and just load the saved file
if not cpjump1_u2os_sig_save_path.exists():
# Creating all possible combinations between the randomly selected negative
# control profiles and positive controls
negcon_and_poscon_combinations = list(
itertools.product(
cpjump1_u2os_negcon_df["Metadata_seed_id"].unique().to_list(),
cpjump1_u2os_poscon_df["Metadata_pert_iname"].unique().to_list(),
)
)

# Iterate through each randomly sampled negative control and positive control combination
cpjump1_u2os_signature_results = defaultdict(
lambda: None
) # Used for storing comparisons and signatures
for negcon_seed, poscon_trt in tqdm.tqdm(
negcon_and_poscon_combinations, desc="Processing combinations"
):
# Select negative control profile for current seed
selected_negcon_df = cpjump1_u2os_negcon_df.filter(
pl.col("Metadata_seed_id") == negcon_seed
)

# Select positive control profile for current gene
selected_poscon_df = cpjump1_u2os_poscon_df.filter(
pl.col("Metadata_pert_iname") == poscon_trt
)

# Find the morphological signatures
on_sig, off_sig, _ = get_signatures(
ref_profiles=selected_negcon_df,
exp_profiles=selected_poscon_df,
morph_feats=shared_features,
test_method=method,
seed=seed,
)

# Process signatures and store in dictionary
cpjump1_u2os_signature_results[f"{negcon_seed}_negcon_{poscon_trt}_poscon"] = {
"controls": {"negative": negcon_seed, "positive": poscon_trt},
"signatures": {"on": on_sig, "off": off_sig},
"meta": {
"total-on-signatures": len(on_sig),
"total-off-signatures": len(off_sig),
},
}

# Save results to file
with open(cpjump1_u2os_sig_save_path, "w") as f:
json.dump(dict(cpjump1_u2os_signature_results), f, indent=4)

else:
with open(cpjump1_u2os_sig_save_path) as f:
cpjump1_u2os_signature_results = json.load(f)


# On and Off signatures for the a549 cells

# In[8]:


# Setting save path
cpjump1_a549_sig_save_path = (
signature_results_dir / f"{method}_cpjump1_a549_negcon_poscon_signatures.json"
).resolve()

# set negative controls for A549 plates
cpjump1_a549_negcon_df = cpjump1_negcon_df.filter(
pl.col("Metadata_Plate").is_in(a549_plates)
)
cpjump1_a549_poscon_df = cpjump1_poscon_df.filter(
pl.col("Metadata_Plate").is_in(a549_plates)
& (pl.col("Metadata_pert_iname").is_in(all_poscon_trts))
)
# If the file does not exist, iterate all combinations and identify signatures
# If the file exists, skip the whole process and just load the saved file
if not cpjump1_a549_sig_save_path.exists():
# Creating all possible combinations between the randomly selected negative
# control profiles and positive controls
negcon_and_poscon_combinations = list(
itertools.product(
cpjump1_a549_negcon_df["Metadata_seed_id"].unique().to_list(),
cpjump1_a549_poscon_df["Metadata_pert_iname"].unique().to_list(),
)
)

# Iterate through each randomly sampled negative control and positive control combination
cpjump1_a549_signature_results = defaultdict(
lambda: None
) # Used for storing comparisons and signatures
for negcon_seed, poscon_trt in tqdm.tqdm(
negcon_and_poscon_combinations, desc="Processing combinations"
):
# Select negative control profile for current seed
selected_negcon_df = cpjump1_a549_negcon_df.filter(
pl.col("Metadata_seed_id") == negcon_seed
)

# Select positive control profile for current gene
selected_poscon_df = cpjump1_a549_poscon_df.filter(
pl.col("Metadata_pert_iname") == poscon_trt
)

# Find the morphological signatures
on_sig, off_sig, _ = get_signatures(
ref_profiles=selected_negcon_df,
exp_profiles=selected_poscon_df,
morph_feats=shared_features,
test_method=method,
seed=seed,
)

# Process signatures and store in dictionary
cpjump1_a549_signature_results[f"{negcon_seed}_negcon_{poscon_trt}_poscon"] = {
"controls": {"negative": negcon_seed, "positive": poscon_trt},
"signatures": {"on": on_sig, "off": off_sig},
"meta": {
"total-on-signatures": len(on_sig),
"total-off-signatures": len(off_sig),
},
}

# Save results to file
with open(cpjump1_a549_sig_save_path, "w") as f:
json.dump(dict(cpjump1_a549_signature_results), f, indent=4)

else:
with open(cpjump1_a549_sig_save_path) as f:
cpjump1_a549_signature_results = json.load(f)


# Next we generate consensus on and off signatures for each positive control (poscon) gene by aggregating results from 10 independent negative control (DMSO) subsamples. Features included in the consensus signature are those identified as significant in at least 80% of comparisons (≥8 out of 10), highlighting robust and reproducible morphological differences.
#

# In[9]:


# Generate consensus on and off signatures of U2OS cpjump1 results
u2os_cpjump1_consensus_signatures = generate_consensus_signatures(
cpjump1_u2os_signature_results,
features=shared_features,
min_consensus_threshold=0.80,
)
u2os_save_path = (
signature_results_dir
/ f"{method}_u2os_cpjump1_consensus_signatures_per_poscon.json"
)

with open(u2os_save_path, mode="w") as f:
json.dump(u2os_cpjump1_consensus_signatures, f, indent=4)

# Generate consensus on and off signatures of A549 cpjump1 results
a549_cpjump1_consensus_signatures = generate_consensus_signatures(
cpjump1_a549_signature_results,
features=shared_features,
min_consensus_threshold=0.80,
)
save_path = (
signature_results_dir
/ f"{method}_a549_cpjump1_consensus_signatures_per_poscon.json"
)

with open(save_path, mode="w") as f:
json.dump(a549_cpjump1_consensus_signatures, f, indent=4)
Loading