Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
276 changes: 186 additions & 90 deletions notebooks/0.download-data/1.download-data.ipynb

Large diffs are not rendered by default.

186 changes: 80 additions & 106 deletions notebooks/0.download-data/2.preprocessing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,12 @@
"import sys\n",
"import json\n",
"import pathlib\n",
"from typing import Optional\n",
"\n",
"import polars as pl\n",
"\n",
"sys.path.append(\"../../\")\n",
"from utils.data_utils import (\n",
" split_meta_and_features,\n",
" add_cell_id_hash,\n",
" transform_ensg_to_gene_symbol,\n",
")\n",
"from utils.io_utils import load_profiles"
"from utils.data_utils import split_meta_and_features, add_cell_id_hash\n",
"from utils.io_utils import load_and_concat_profiles"
]
},
{
Expand All @@ -61,64 +56,6 @@
"metadata": {},
"outputs": [],
"source": [
"def load_and_concat_profiles(\n",
" profile_dir: str | pathlib.Path,\n",
" shared_features: Optional[list[str]] = None,\n",
" specific_plates: Optional[list[pathlib.Path]] = None,\n",
") -> pl.DataFrame:\n",
" \"\"\"\n",
" Load all profile files from a directory and concatenate them into a single Polars DataFrame.\n",
"\n",
" Parameters\n",
" ----------\n",
" profile_dir : str or pathlib.Path\n",
" Directory containing the profile files (.parquet).\n",
" shared_features : Optional[list[str]], optional\n",
" List of shared feature names to filter the profiles. If None, all features are loaded.\n",
" specific_plates : Optional[list[pathlib.Path]], optional\n",
" List of specific plate file paths to load. If None, all profiles in the directory are loaded.\n",
"\n",
" Returns\n",
" -------\n",
" pl.DataFrame\n",
" Concatenated Polars DataFrame containing all loaded profiles.\n",
" \"\"\"\n",
" # Ensure profile_dir is a pathlib.Path\n",
" if isinstance(profile_dir, str):\n",
" profile_dir = pathlib.Path(profile_dir)\n",
" elif not isinstance(profile_dir, pathlib.Path):\n",
" raise TypeError(\"profile_dir must be a string or a pathlib.Path object\")\n",
"\n",
" # Validate specific_plates\n",
" if specific_plates is not None:\n",
" if not isinstance(specific_plates, list):\n",
" raise TypeError(\"specific_plates must be a list of pathlib.Path objects\")\n",
" if not all(isinstance(path, pathlib.Path) for path in specific_plates):\n",
" raise TypeError(\n",
" \"All elements in specific_plates must be pathlib.Path objects\"\n",
" )\n",
"\n",
" # Use specific_plates if provided, otherwise gather all .parquet files\n",
" if specific_plates is not None:\n",
" # Validate that all specific plate files exist\n",
" for plate_path in specific_plates:\n",
" if not plate_path.exists():\n",
" raise FileNotFoundError(f\"Profile file not found: {plate_path}\")\n",
" files_to_load = specific_plates\n",
" else:\n",
" files_to_load = list(profile_dir.glob(\"*.parquet\"))\n",
" if not files_to_load:\n",
" raise FileNotFoundError(f\"No profile files found in {profile_dir}\")\n",
"\n",
" # Load and concatenate profiles\n",
" loaded_profiles = [\n",
" load_profiles(f, shared_features=shared_features) for f in files_to_load\n",
" ]\n",
"\n",
" # Concatenate all loaded profiles\n",
" return pl.concat(loaded_profiles, rechunk=True)\n",
"\n",
"\n",
"def split_data(\n",
" pycytominer_output: pl.DataFrame, dataset: str = \"CP_and_DP\"\n",
") -> pl.DataFrame:\n",
Expand Down Expand Up @@ -193,6 +130,18 @@
{
"cell_type": "code",
"execution_count": 3,
"id": "3dfe8d86",
"metadata": {},
"outputs": [],
"source": [
"# Define the type of perturbation for the dataset\n",
"# options are: \"compound\" or \"crispr\"\n",
"pert_type = \"crispr\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3ea207e4",
"metadata": {},
"outputs": [],
Expand All @@ -203,16 +152,20 @@
"# Setting profiles directory\n",
"profiles_dir = (data_dir / \"sc-profiles\").resolve(strict=True)\n",
"\n",
"# setting connectivity map drug repurposing config\n",
"drug_repurposing_config_path = (data_dir / \"repurposing_drugs_20180907.txt\").resolve(\n",
" strict=True\n",
")\n",
"\n",
"# Experimental metadata\n",
"exp_metadata_path = (\n",
" profiles_dir / \"cpjump1\" / \"cpjump1_compound_experimental-metadata.csv\"\n",
" profiles_dir / \"cpjump1\" / f\"cpjump1_{pert_type}_experimental-metadata.csv\"\n",
").resolve(strict=True)\n",
"\n",
"# cpjump1 compound metadata\n",
"if pert_type == \"compound\":\n",
" cmp_metadata_path = (\n",
" profiles_dir / \"cpjump1\" / \"cpjump1_compound_compound-metadata.tsv\"\n",
" ).resolve(strict=True)\n",
"else:\n",
" cmp_metadata_path = None\n",
"\n",
"# Setting CFReT profiles directory\n",
"cfret_profiles_dir = (profiles_dir / \"cfret\").resolve(strict=True)\n",
"cfret_profiles_path = (\n",
Expand Down Expand Up @@ -244,12 +197,12 @@
"id": "7168a71a",
"metadata": {},
"source": [
"Create a list of paths that only points compound treated plates and load the shared features config file that can be found in this [repo](https://github.com/WayScience/JUMP-single-cell)"
"Create a list of paths pointing to the selected CPJUMP1 plates and load the shared features configuration file from the [JUMP-single-cell](https://github.com/WayScience/JUMP-single-cell) repository."
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"id": "c7944fc2",
"metadata": {},
"outputs": [],
Expand All @@ -260,7 +213,7 @@
"compound_plate_names = (\n",
" exp_metadata.select(\"Assay_Plate_Barcode\").unique().to_series().to_list()\n",
")\n",
"compound_plate_paths = [\n",
"cpjump1_plate_paths = [\n",
" (profiles_dir / \"cpjump1\" / f\"{plate}_feature_selected_sc_qc.parquet\").resolve(\n",
" strict=True\n",
" )\n",
Expand All @@ -278,41 +231,41 @@
"id": "c6bfd5c7",
"metadata": {},
"source": [
"## Preprocessing CPJUMP1 Compound data\n",
"## Preprocessing CPJUMP1 Data\n",
"\n",
"Using the filtered compound plate file paths and shared features configuration, we load all individual profile files and concatenate them into a single comprehensive DataFrame. This step combines data from multiple experimental plates while maintaining the consistent feature space defined by the shared features list.\n",
"Using the filtered plate file paths and shared features configuration, we load all individual profile files and concatenate them into a single comprehensive DataFrame. This step combines data from multiple experimental plates, for either compound or CRISPR perturbation types, while maintaining a consistent feature space defined by the shared features list.\n",
"\n",
"The concatenation process ensures:\n",
"- All profiles use the same feature set for downstream compatibility\n",
"- Metadata columns are preserved across all plates\n",
"- Data integrity is maintained during the merge operation\n",
"- Adding a unique cell id has column `Metadata_cell_id`"
"- A unique cell identifier is added via the `Metadata_cell_id` column"
]
},
{
"cell_type": "markdown",
"id": "9ec882fa",
"metadata": {},
"source": [
"We are loading per-plate parquet profiles for compound-treated plates, selecting the shared feature set, concatenating them into a single Polars DataFrame while preserving metadata, and adding a unique Metadata_cell_id for each cell. The resulting cpjump1_profiles table is ready for downstream analysis."
"We load per-plate Parquet profiles for the selected perturbation type (compound or CRISPR), apply the shared feature set, and concatenate them into a single Polars DataFrame while preserving metadata. A unique `Metadata_cell_id` is added for each cell. The resulting `cpjump1_profiles` table is ready for downstream analysis."
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"id": "f6f7e08d",
"metadata": {},
"outputs": [],
"source": [
"# Loading compound profiles with shared features and concat into a single DataFrame\n",
"concat_output_path = (\n",
" cpjump1_output_dir / \"cpjump1_compound_concat_profiles.parquet\"\n",
" cpjump1_output_dir / f\"cpjump1_{pert_type}_concat_profiles.parquet\"\n",
").resolve()\n",
"\n",
"# loaded and concatenated profiles\n",
"cpjump1_profiles = load_and_concat_profiles(\n",
" profile_dir=profiles_dir,\n",
" specific_plates=compound_plate_paths,\n",
" specific_plates=cpjump1_plate_paths,\n",
" shared_features=shared_features,\n",
")\n",
"\n",
Expand All @@ -325,50 +278,75 @@
"id": "3df9bbf5",
"metadata": {},
"source": [
"Next we annotate the compound treatments in the CPJUMP1 dataset, we annotate each cell with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status.\n"
"For compound-treated plates, we annotate each profile with Mechanism of Action (MoA) information using the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP), which provides drug and tool compound annotations including target information and clinical development status. Cell type metadata is also merged in from the experimental metadata. This step is skipped for CRISPR-treated plates."
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"id": "adfb9148",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Skipping this step since the dataset is CPJUMP1 crispr and not compound\n"
]
}
],
"source": [
"# load drug repurposing moa file and add prefix to metadata columns\n",
"rep_moa_df = pl.read_csv(\n",
" drug_repurposing_config_path, separator=\"\\t\", skip_rows=9, encoding=\"utf8-lossy\"\n",
").rename(lambda x: f\"Metadata_{x}\" if not x.startswith(\"Metadata_\") else x)\n",
"if pert_type == \"compound\":\n",
" rep_moa_df = pl.read_csv(\n",
" cmp_metadata_path,\n",
" separator=\"\\t\",\n",
" columns=[\"Metadata_pert_iname\", \"Metadata_target\", \"Metadata_moa\"],\n",
" ).unique(subset=[\"Metadata_pert_iname\"])\n",
"\n",
" # merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname\n",
" cpjump1_profiles = cpjump1_profiles.join(\n",
" rep_moa_df, on=\"Metadata_pert_iname\", how=\"left\"\n",
" )\n",
"\n",
"# merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname\n",
"cpjump1_profiles = cpjump1_profiles.join(\n",
" rep_moa_df, on=\"Metadata_pert_iname\", how=\"left\"\n",
")\n",
" # merge cell type metadata with cpjump1_profiles on Metadata_Plate\n",
" cell_type_metadata = exp_metadata.select(\n",
" [\"Assay_Plate_Barcode\", \"Cell_type\"]\n",
" ).rename(\n",
" {\"Assay_Plate_Barcode\": \"Metadata_Plate\", \"Cell_type\": \"Metadata_cell_type\"}\n",
" )\n",
" cpjump1_profiles = cpjump1_profiles.join(\n",
" cell_type_metadata, on=\"Metadata_Plate\", how=\"left\"\n",
" )\n",
"else:\n",
" print(\n",
" f\"Skipping this step since the dataset is CPJUMP1 {pert_type} and not compound\"\n",
" )\n",
"\n",
"# split meta and feature\n",
"meta_cols, features_cols = split_meta_and_features(cpjump1_profiles)\n",
"\n",
"# save the feature space information into a json file\n",
"meta_features_dict = {\n",
" \"concat-profiles\": {\n",
" \"data-type\": f\"{pert_type}_plates\",\n",
" \"meta-features\": meta_cols,\n",
" \"shared-features\": features_cols,\n",
" }\n",
"}\n",
"with open(cpjump1_output_dir / \"concat_profiles_meta_features.json\", \"w\") as f:\n",
"with open(\n",
" cpjump1_output_dir / f\"{pert_type}_concat_profiles_meta_features.json\", \"w\"\n",
") as f:\n",
" json.dump(meta_features_dict, f, indent=4)\n",
"\n",
"# save concatenated profiles\n",
"# Loading compound profiles with shared features and concat into a single DataFrame\n",
"concat_output_path = (\n",
" cpjump1_output_dir / \"cpjump1_compound_concat_profiles.parquet\"\n",
").resolve()\n",
"cpjump1_profiles.select(meta_cols + features_cols).write_parquet(concat_output_path)"
]
},
{
"cell_type": "markdown",
"id": "4a0ba6ad",
"id": "92bacbc9",
"metadata": {},
"source": [
"## Preprocessing MitoCheck Dataset\n",
Expand All @@ -391,7 +369,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 8,
"id": "c5471d3e",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -445,7 +423,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 9,
"id": "c57da947",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -478,7 +456,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 10,
"id": "1d7ced04",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -532,7 +510,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"id": "42108980",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -568,11 +546,6 @@
" .to_list()\n",
" if gene.startswith(\"ENSG\")\n",
"]\n",
"decoded_genes = transform_ensg_to_gene_symbol(ensg_ids)\n",
"\n",
"# save the mapping of ENSG IDs to gene symbols in a json file\n",
"with open(mitocheck_dir / \"mitocheck_ensg_to_gene_symbol_mapping.json\", \"w\") as f:\n",
" json.dump(decoded_genes, f, indent=4)\n",
"\n",
"# save concatenated mitocheck profiles\n",
"concat_mitocheck_profiles.write_parquet(\n",
Expand All @@ -594,14 +567,15 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 12,
"id": "1763d383",
"metadata": {},
"outputs": [],
"source": [
"# load in cfret profiles and add a unique cell ID\n",
"cfret_profiles = pl.read_parquet(cfret_profiles_path)\n",
"\n",
"\n",
"# adding a unique cell ID based on all features\n",
"cfret_profiles = add_cell_id_hash(cfret_profiles, force=True)\n",
"\n",
Expand Down Expand Up @@ -643,7 +617,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
"version": "3.12.9"
}
},
"nbformat": 4,
Expand Down
Loading