From f7e8fded82c0f6e1cb0af3056da6620686cebc3d Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 31 May 2026 21:56:12 +0000
Subject: [PATCH 1/2] docs: add tutorial for running SACCELERATOR with
 LSF/Mamba and new Xenium ovarian cancer datasets

---
 docs/tutorial_lsf_mamba.md | 726 +++++++++++++++++++++++++++++++++++++
 1 file changed, 726 insertions(+)
 create mode 100644 docs/tutorial_lsf_mamba.md

diff --git a/docs/tutorial_lsf_mamba.md b/docs/tutorial_lsf_mamba.md
new file mode 100644
index 00000000..f7a2b615
--- /dev/null
+++ b/docs/tutorial_lsf_mamba.md
@@ -0,0 +1,726 @@
+# SACCELERATOR Tutorial: Running the Pipeline with LSF (bsub) and Mamba
+
+This tutorial covers:
+1. Running the pipeline on two example datasets (`libd_dlpfc` and `xenium-breast-cancer`)
+2. Managing Snakemake conda environments with proper names (avoiding hashed names)
+3. Running the pipeline on new datasets (Xenium Prime FFPE Human Ovarian Cancer & Xenium Fresh Frozen Human Ovarian Cancer)
+
+---
+
+## Part 1: Running the Pipeline on Example Datasets
+
+### 1.1 Prerequisites
+
+- Access to an HPC cluster with LSF job scheduler (`bsub`)
+- [Mamba](https://mamba.readthedocs.io/) installed (or Micromamba)
+- Snakemake ≥ 7.x installed in a base environment
+- Git
+
+### 1.2 Clone the Repository
+
+```bash
+git clone https://github.com/SpatialHackathon/SACCELERATOR.git
+cd SACCELERATOR
+```
+
+### 1.3 Set Up a Snakemake Environment
+
+```bash
+mamba create -n snakemake -c conda-forge -c bioconda snakemake mamba
+mamba activate snakemake
+```
+
+### 1.4 Configure the Pipeline
+
+You need to edit two config files in the `workflows/` directory.
+
+#### `workflows/excute_config.yaml`
+
+```yaml
+###### Universal parameters #######
+GIT_DIR: /path/to/your/SACCELERATOR    # Absolute path to your cloned repo
+DATASET_DIR: /path/to/your/data         # Where downloaded data will be stored
+SEED: 2023
+
+###### Dataset selected for execution #######
+datasets_selected:
+  - "libd_dlpfc"
+  - "xenium-breast-cancer"
+
+###### Methods selected for execution #######
+methods_selected:
+  - "spaGCN"
+  - "scanpy"
+  - "seurat"
+
+# Cluster numbers for datasets
+n_clusters:
+  libd_dlpfc: [7, 9, 11]
+  xenium-breast-cancer: [5, 7, 9]
+
+###### Metrics selected for execution #######
+metrics_selected:
+  - "ARI"
+  - "V_measure"
+
+###### Base clustering selection parameters #######
+selection_criteria:
+  - "Cross_method_ARI"
+n_neighbors: 6
+
+###### Consensus Clustering parameters #######
+bc_numbers: [8]
+consensus_algorithms:
+  - "lca"
+n_clust_consensus: {}
+
+# For weighted clustering
+lambda: null
+
+# For cross-method entropy
+cross_method_entropy: true
+```
+
+#### `workflows/path_config.yaml`
+
+Use the provided `path_config.yaml` as-is, or regenerate it:
+
+```bash
+cd workflows
+bash generate_path_config.sh
+# Then rename the output:
+mv generated_path_config.yml path_config.yaml
+```
+
+Make sure paths in `path_config.yaml` reference the correct relative locations for your selected datasets/methods/metrics.
+
+### 1.5 Create a Snakemake Profile for LSF
+
+Create a directory for your LSF profile:
+
+```bash
+mkdir -p ~/.config/snakemake/lsf
+```
+
+Create `~/.config/snakemake/lsf/config.yaml`:
+
+```yaml
+executor: cluster-generic
+cluster-generic-submit-cmd: >
+  bsub
+  -J {rule}_{wildcards}
+  -q normal
+  -n {threads}
+  -R "rusage[mem={resources.mem_mb}]"
+  -M {resources.mem_mb}
+  -W {resources.time_min}
+  -o logs/{rule}_{wildcards}.out
+  -e logs/{rule}_{wildcards}.err
+cluster-generic-status-cmd: "bjobs -noheader -o 'stat' {}"
+cluster-generic-cancel-cmd: "bkill {}"
+jobs: 50
+latency-wait: 60
+use-conda: true
+conda-frontend: mamba
+rerun-incomplete: true
+default-resources:
+  mem_mb: 8000
+  time_min: 120
+  threads: 1
+```
+
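+> **Note**: The profile above uses the `cluster-generic` executor, which requires Snakemake 8+ and the `snakemake-executor-plugin-cluster-generic` plugin (`pip install snakemake-executor-plugin-cluster-generic`). For Snakemake <8, use the older profile format (`cluster:`, `cluster-status:`, `cluster-cancel:` keys) instead. For native LSF support on Snakemake 8+, see the [snakemake-executor-plugin-lsf](https://github.com/snakemake/snakemake-executor-plugin-lsf) plugin.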
+
+Alternatively, for **Snakemake 8+** with the LSF executor plugin:
+
+```bash
+pip install snakemake-executor-plugin-lsf
+```
+
+Then your profile (`~/.config/snakemake/lsf/config.yaml`):
+
+```yaml
+executor: lsf
+jobs: 50
+latency-wait: 60
+use-conda: true
+conda-frontend: mamba
+rerun-incomplete: true
+default-resources:
+  mem_mb: 8000
+  runtime: 120
+  lsf_queue: normal
+```
+
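+Create a `logs/` directory in the directory you will run Snakemake from (`workflows/`), because the `-o`/`-e` log paths in the `cluster-generic` profile are relative to the working directory: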
+
+```bash
+mkdir -p logs
+```
+
+### 1.6 Run the Pipeline Steps
+
+Navigate to the workflows directory:
+
+```bash
+cd /path/to/your/SACCELERATOR/workflows
+```
+
+Run each step sequentially. Use the `--profile lsf` flag to submit jobs via bsub:
+
+```bash
+# Step 1: Download data
+snakemake -s 01_download.smk --profile lsf
+
+# Step 2: Preprocessing (QC, normalization, etc.)
+snakemake -s 02_preprocessing.smk --profile lsf
+
+# Step 3: Run clustering methods
+snakemake -s 03_methods.smk --profile lsf
+
+# Step 4: Calculate metrics
+snakemake -s 04_metrics.smk --profile lsf
+
+# Step 5: Aggregate results
+snakemake -s 05_aggregation.smk --profile lsf
+
+# Step 6: Select base clusterings
+snakemake -s 06_select_base_clusterings.smk --profile lsf
+
+# Step 7: Consensus clustering
+snakemake -s 07_consensus.smk --profile lsf
+```
+
+#### Dry Run First
+
+Always do a dry run before submitting:
+
+```bash
+snakemake -s 01_download.smk --profile lsf -n
+```
+
+---
+
+## Part 2: Conda Environment Naming — Avoiding Hashed Environment Names
+
+### The Problem
+
+By default, Snakemake creates conda environments using a hash of the YAML file content. This results in environment directories like:
+
+```
+.snakemake/conda/a1b2c3d4e5f6/
+```
+
+These are hard to inspect, debug, or reuse across projects.
+
+### Solution: Use Named Environments with `--conda-prefix` and Pre-created Envs
+
+There are several approaches to get properly named, reusable environments:
+
+#### Approach A: Shared `--conda-prefix` Directory (Recommended for Reuse)
+
+Set a shared prefix so all Snakemake runs (across projects) share the same conda env cache:
+
+```bash
+# Set a shared location for conda envs
+export SNAKEMAKE_CONDA_PREFIX=/path/to/shared/conda_envs
+
+snakemake -s 03_methods.smk --profile lsf --conda-prefix $SNAKEMAKE_CONDA_PREFIX
+```
+
+This ensures environments are reused across runs. The hashed names persist, but they are cached and shared.
+
+You can also add this to your profile config:
+
+```yaml
+# In ~/.config/snakemake/lsf/config.yaml
+conda-prefix: /path/to/shared/conda_envs
+```
+
+#### Approach B: Pre-create Named Environments and Use `--conda-prefix` with Symlinks
+
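+Create a wrapper script that pre-creates environments with meaningful names. Note that the script below only creates the named environments (it does not symlink them into the `--conda-prefix` directory), and Snakemake will still build its own hashed environments for `conda:` directives that point to YAML files: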
+
+```bash
+#!/bin/bash
+# File: setup_named_envs.sh
+# Pre-create named conda environments from SACCELERATOR YAML files
+
+GIT_DIR="/path/to/your/SACCELERATOR"
+ENV_DIR="/path/to/shared/conda_envs"
+
+mkdir -p "$ENV_DIR"
+
+# Dataset environments
+for yml in "$GIT_DIR"/data/*//*.yml "$GIT_DIR"/data/*/*.yaml; do
+    [ -f "$yml" ] || continue
+    name=$(basename "$(dirname "$yml")")
+    echo "Creating env: data_${name}"
+    mamba env create -f "$yml" -n "saccelerator_data_${name}" --yes 2>/dev/null || \
+    mamba env update -f "$yml" -n "saccelerator_data_${name}" --prune
+done
+
+# Method environments
+for yml in "$GIT_DIR"/method/*/*.yml "$GIT_DIR"/method/*/*.yaml; do
+    [ -f "$yml" ] || continue
+    name=$(basename "$(dirname "$yml")")
+    echo "Creating env: method_${name}"
+    mamba env create -f "$yml" -n "saccelerator_method_${name}" --yes 2>/dev/null || \
+    mamba env update -f "$yml" -n "saccelerator_method_${name}" --prune
+done
+
+# Metric environments
+for yml in "$GIT_DIR"/metric/*/*.yml "$GIT_DIR"/metric/*/*.yaml; do
+    [ -f "$yml" ] || continue
+    name=$(basename "$(dirname "$yml")")
+    echo "Creating env: metric_${name}"
+    mamba env create -f "$yml" -n "saccelerator_metric_${name}" --yes 2>/dev/null || \
+    mamba env update -f "$yml" -n "saccelerator_metric_${name}" --prune
+done
+
+# Preprocessing environments
+for yml in "$GIT_DIR"/preprocessing/*/*.yml "$GIT_DIR"/preprocessing/*/*/*.yml; do
+    [ -f "$yml" ] || continue
+    name=$(basename "$yml" .yml)
+    echo "Creating env: preproc_${name}"
+    mamba env create -f "$yml" -n "saccelerator_preproc_${name}" --yes 2>/dev/null || \
+    mamba env update -f "$yml" -n "saccelerator_preproc_${name}" --prune
+done
+
+echo "All environments created. List them with: mamba env list"
+```
+
+#### Approach C: Use `envmodules` Directive (Alternative for HPC)
+
+If your cluster has environment modules, you can modify the Snakemake rules to use `envmodules:` instead of `conda:`. However, this requires modifying the `.smk` files, which is not recommended for this workflow.
+
+#### Approach D: Pre-create Environments with `--conda-create-envs-only` Before Running
+
+First, let Snakemake create the environments without running the pipeline:
+
+```bash
+snakemake -s 03_methods.smk --profile lsf --conda-create-envs-only
+```
+
+This pre-creates all required environments. Once done, subsequent runs will reuse them automatically. Combined with `--conda-prefix`, this gives you persistent, reusable environments.
+
+#### How to Map Hashed Names to Tools
+
+To find out which hash corresponds to which tool:
+
+```bash
+# List all conda envs created by Snakemake
+ls .snakemake/conda/
+
+# Check what's in each env
+for dir in .snakemake/conda/*/; do
+    if [ -f "${dir}environment.yaml" ]; then
+        echo "=== $dir ==="
+        cat "${dir}environment.yaml"
+        echo ""
+    fi
+done
+```
+
+Or use Snakemake's built-in listing:
+
+```bash
+snakemake -s 03_methods.smk --list-conda-envs
+```
+
+---
+
+## Part 3: Running on New Datasets — Xenium Ovarian Cancer
+
+This section shows how to add and run the pipeline on two new 10x Genomics datasets:
+- [Xenium Prime FFPE Human Ovarian Cancer](https://www.10xgenomics.com/datasets/xenium-prime-ffpe-human-ovarian-cancer)
+- [Xenium Comparison Fresh Frozen Human Ovarian Cancer](https://www.10xgenomics.com/datasets/xenium-comparison-fresh-frozen-human-ovarian-cancer)
+
+### 3.1 Create a Data Module for the New Datasets
+
+Each dataset in SACCELERATOR needs:
+1. A download/processing script (Python or R)
+2. A conda environment YAML
+3. An optargs JSON file
+4. An entry in `path_config.yaml`
+5. An entry in `excute_config.yaml`
+
+#### Directory Structure
+
+```
+data/
+  xenium-ovarian-cancer-ffpe/
+    xenium-ovarian-cancer-ffpe.py
+    xenium-ovarian-cancer-ffpe.yml
+    xenium-ovarian-cancer-ffpe_optargs.json
+  xenium-ovarian-cancer-ff/
+    xenium-ovarian-cancer-ff.py
+    xenium-ovarian-cancer-ff.yml
+    xenium-ovarian-cancer-ff_optargs.json
+```
+
+#### 3.2 Create the Conda Environment YAML
+
+`data/xenium-ovarian-cancer-ffpe/xenium-ovarian-cancer-ffpe.yml`:
+
+```yaml
+channels:
+  - conda-forge
+dependencies:
+  - python=3.10
+  - anndata>=0.10
+  - pandas>=2.0
+  - scipy>=1.10
+  - requests>=2.28
+  - pip
+  - pip:
+    - spatialdata-io>=0.1
+    - pypdl>=0.5
+```
+
+Use the same for the FF dataset (or adjust as needed):
+
+`data/xenium-ovarian-cancer-ff/xenium-ovarian-cancer-ff.yml`:
+
+```yaml
+channels:
+  - conda-forge
+dependencies:
+  - python=3.10
+  - anndata>=0.10
+  - pandas>=2.0
+  - scipy>=1.10
+  - requests>=2.28
+  - pip
+  - pip:
+    - spatialdata-io>=0.1
+    - pypdl>=0.5
+```
+
+#### 3.3 Create the Optargs JSON
+
+`data/xenium-ovarian-cancer-ffpe/xenium-ovarian-cancer-ffpe_optargs.json`:
+
+```json
+{
+    "min_cells": 1,
+    "min_genes": 1,
+    "min_counts": 1
+}
+```
+
+Use the same content for the FF dataset in `data/xenium-ovarian-cancer-ff/xenium-ovarian-cancer-ff_optargs.json`.
+
+#### 3.4 Create the Download/Processing Script
+
+The script must:
+- Accept `-o/--out_dir` as argument
+- Download raw data from 10x Genomics
+- Output the standardized format:
+  - `{out_dir}/experiment.json` — technology metadata
+  - `{out_dir}/samples.tsv` — sample table
+  - `{out_dir}/{sample_name}/counts.mtx` — cell×gene count matrix (Matrix Market format)
+  - `{out_dir}/{sample_name}/features.tsv` — gene metadata
+  - `{out_dir}/{sample_name}/observations.tsv` — cell metadata
+  - `{out_dir}/{sample_name}/coordinates.tsv` — spatial coordinates (columns: x, y)
+  - `{out_dir}/{sample_name}/labels.tsv` — ground truth labels (if available)
+
+**`data/xenium-ovarian-cancer-ffpe/xenium-ovarian-cancer-ffpe.py`**:
+
+```python
+#!/usr/bin/env python
+
+"""
+Download and process Xenium Prime FFPE Human Ovarian Cancer dataset.
+Source: https://www.10xgenomics.com/datasets/xenium-prime-ffpe-human-ovarian-cancer
+"""
+
+import argparse
+import json
+import os
+import shutil
+import tempfile
+
+import pandas as pd
+import scipy.io
+from pypdl import Downloader
+from spatialdata_io import xenium
+
+
+# Update these URLs from the 10x Genomics dataset page.
+# Go to the dataset page and find the direct download links for the output bundle.
+LINKS = {
+    # "https://cf.10xgenomics.com/samples/xenium/.../_outs.zip": "md5_checksum",
+    # Add the actual download URL once you retrieve it from the 10x dataset page
+}
+
+
+def download_links(links, temp_dir):
+    """Download all files from the links dict."""
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"
+    }
+    dl = Downloader(headers=headers)
+    for link, checksum in links.items():
+        print(f"Downloading {link}")
+        file = dl.start(
+            url=link,
+            file_path=temp_dir,
+            segments=10,
+            display=True,
+            multithread=True,
+            block=True,
+            retries=3,
+        )
+        if checksum and not file.validate_hash(checksum, "md5"):
+            raise ValueError(f"File {file} is corrupted")
+
+
+def process_xenium_output(xenium_path, out_path, sample_name):
+    """Convert Xenium output to SACCELERATOR format."""
+    print(f"Processing {xenium_path} -> {out_path}/{sample_name}")
+
+    # Read using spatialdata-io
+    sdata = xenium(
+        xenium_path,
+        cells_boundaries=False,
+        nucleus_boundaries=False,
+        cells_as_circles=False,
+        cells_labels=False,
+        nucleus_labels=False,
+        transcripts=False,
+        morphology_mip=False,
+        morphology_focus=False,
+    )
+    adata = sdata["table"]
+
+    complete_path = os.path.join(out_path, sample_name)
+    os.makedirs(complete_path, exist_ok=True)
+
+    # Observations
+    obs = adata.obs.copy()
+    obs["selected"] = "true"
+    obs.to_csv(f"{complete_path}/observations.tsv", sep="\t", index_label="")
+
+    # Features
+    features = adata.var.copy()
+    features["selected"] = "true"
+    features.to_csv(f"{complete_path}/features.tsv", sep="\t", index_label="")
+
+    # Coordinates
+    coords = pd.DataFrame(adata.obsm["spatial"], columns=["x", "y"])
+    coords.index = adata.obs.index
+    coords.to_csv(f"{complete_path}/coordinates.tsv", sep="\t", index_label="")
+
+    # Count matrix (cells x genes in MMwrite)
+    scipy.io.mmwrite(f"{complete_path}/counts.mtx", adata.X)
+
+    return adata.shape[0]  # return number of cells
+
+
+def write_experiment_json(out_path):
+    experiment = {
+        "technology": "Xenium",
+        "species": "human",
+        "tissue": "ovary",
+        "is_3D": False,
+    }
+    with open(os.path.join(out_path, "experiment.json"), "w") as f:
+        json.dump(experiment, f, indent=2)
+
+
+def write_samples_tsv(out_path, samples_info):
+    df = pd.DataFrame(samples_info)
+    df.to_csv(f"{out_path}/samples.tsv", sep="\t", index_label=False)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Download Xenium Prime FFPE Human Ovarian Cancer dataset."
+    )
+    parser.add_argument(
+        "-o", "--out_dir", help="Output directory to write files to.", required=True
+    )
+    args = parser.parse_args()
+
+    out_dir = args.out_dir
+    os.makedirs(out_dir, exist_ok=True)
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # Download
+        download_links(LINKS, temp_dir)
+
+        # Unzip and process
+        for file in os.listdir(temp_dir):
+            if file.endswith(".zip"):
+                sample_name = "FFPE_ovarian"
+                sample_path = os.path.join(temp_dir, file.replace(".zip", ""))
+                shutil.unpack_archive(os.path.join(temp_dir, file), sample_path)
+                process_xenium_output(sample_path, out_dir, sample_name)
+
+    # Write metadata
+    write_experiment_json(out_dir)
+    write_samples_tsv(out_dir, {
+        "patient": ["ovarian_ffpe"],
+        "sample": ["FFPE_ovarian"],
+        "position": [0],
+        "replicate": [0],
+        "directory": ["FFPE_ovarian"],
+        "n_clusters": [7],  # Adjust based on expected domains
+    })
+
+
+if __name__ == "__main__":
+    main()
+```
+
+> **Important**: You need to get the actual download URLs from the 10x Genomics dataset pages. Visit:
+> - https://www.10xgenomics.com/datasets/xenium-prime-ffpe-human-ovarian-cancer
+> - https://www.10xgenomics.com/datasets/xenium-comparison-fresh-frozen-human-ovarian-cancer
+>
+> Look for the "Output Files" section and copy the direct download links for the `*_outs.zip` bundle. Update the `LINKS` dictionary accordingly.
+
+Create a similar script for `xenium-ovarian-cancer-ff`, adjusting the download URLs, the sample name, and the matching `samples.tsv` entries (`patient`, `sample`, `directory`).
+
+#### 3.5 Make Scripts Executable
+
+```bash
+chmod +x data/xenium-ovarian-cancer-ffpe/xenium-ovarian-cancer-ffpe.py
+chmod +x data/xenium-ovarian-cancer-ff/xenium-ovarian-cancer-ff.py
+```
+
+#### 3.6 Register the Datasets in Config Files
+
+**Add to `workflows/path_config.yaml`** under the `datasets:` section:
+
+```yaml
+  xenium-ovarian-cancer-ffpe:
+    env: data/xenium-ovarian-cancer-ffpe/xenium-ovarian-cancer-ffpe.yml
+    script: data/xenium-ovarian-cancer-ffpe/xenium-ovarian-cancer-ffpe.py
+    optargs: data/xenium-ovarian-cancer-ffpe/xenium-ovarian-cancer-ffpe_optargs.json
+  xenium-ovarian-cancer-ff:
+    env: data/xenium-ovarian-cancer-ff/xenium-ovarian-cancer-ff.yml
+    script: data/xenium-ovarian-cancer-ff/xenium-ovarian-cancer-ff.py
+    optargs: data/xenium-ovarian-cancer-ff/xenium-ovarian-cancer-ff_optargs.json
+```
+
+**Update `workflows/excute_config.yaml`**:
+
+```yaml
+datasets_selected:
+  - "xenium-ovarian-cancer-ffpe"
+  - "xenium-ovarian-cancer-ff"
+
+methods_selected:
+  - "spaGCN"
+  - "scanpy"
+  - "CellCharter"
+  - "BANKSY"
+
+n_clusters:
+  xenium-ovarian-cancer-ffpe: [5, 7, 9, 11]
+  xenium-ovarian-cancer-ff: [5, 7, 9, 11]
+```
+
+### 3.7 Run the Pipeline
+
+```bash
+cd /path/to/your/SACCELERATOR/workflows
+
+# Step 1: Download and format data
+snakemake -s 01_download.smk --profile lsf
+
+# Step 2: Preprocessing
+snakemake -s 02_preprocessing.smk --profile lsf
+
+# Step 3: Run methods
+snakemake -s 03_methods.smk --profile lsf
+
+# Step 4: Metrics
+snakemake -s 04_metrics.smk --profile lsf
+
+# Step 5: Aggregate
+snakemake -s 05_aggregation.smk --profile lsf
+
+# Step 6: Select base clusterings
+snakemake -s 06_select_base_clusterings.smk --profile lsf
+
+# Step 7: Consensus
+snakemake -s 07_consensus.smk --profile lsf
+```
+
+### 3.8 Getting 10x Genomics Download URLs
+
+The 10x Genomics dataset pages require agreeing to their terms. To get the direct URLs:
+
+1. Go to https://www.10xgenomics.com/datasets/xenium-prime-ffpe-human-ovarian-cancer
+2. Click "Download" and accept terms
+3. Copy the link for "Feature / cell matrix (HDF5)" or the full "Output file" zip
+4. For Xenium data, the key files you need are in the `*_outs.zip`:
+   - `cells.csv.gz` or `cells.parquet` (cell positions and metadata)
+   - `cell_feature_matrix/` (count matrix in MEX format)
+   - `gene_panel.json` (panel info)
+
+Alternatively, download the bundle directly (e.g. with `curl -O <url>`) from its S3/CloudFront URL, which follows a pattern like:
+
+```bash
+# Example pattern (URLs change per dataset):
+# https://cf.10xgenomics.com/samples/xenium/3.0.0/Xenium_Prime_FFPE_Human_Ovarian_Cancer/Xenium_Prime_FFPE_Human_Ovarian_Cancer_outs.zip
+```
+
+---
+
+## Troubleshooting
+
+### Common Issues
+
+1. **"Directory does not exist" errors**: Make sure `DATASET_DIR` exists and the download step completed successfully.
+
+2. **Conda environment creation fails**: Use `--conda-create-envs-only` to create envs first without running rules.
+
+3. **LSF job memory issues**: Increase `mem_mb` in your profile or add per-rule resources in the Snakefiles.
+
+4. **Method incompatible with technology**: Some methods (e.g., BayesSpace, GraphST) only support certain technologies. Check the `optargs.json` → `"technology"` field for each method.
+
+5. **Snakemake locked**: If a previous run failed mid-execution:
+   ```bash
+   snakemake -s <step>.smk --unlock
+   ```
+
+### Useful Commands
+
+```bash
+# Check what would run (dry run)
+snakemake -s 03_methods.smk --profile lsf -n
+
+# Force re-run of incomplete jobs
+snakemake -s 03_methods.smk --profile lsf --rerun-incomplete
+
+# Run on a single specific rule/target
+snakemake -s 03_methods.smk --profile lsf /path/to/data/xenium-ovarian-cancer-ffpe/FFPE_ovarian/spaGCN/config_default/cluster_7/domains.tsv
+
+# Show the DAG (dependency graph)
+snakemake -s 03_methods.smk --dag | dot -Tpng > dag.png
+```
+
+---
+
+## Summary
+
+| Step | Command |
+|------|---------|
+| Download data | `snakemake -s 01_download.smk --profile lsf` |
+| Preprocess | `snakemake -s 02_preprocessing.smk --profile lsf` |
+| Run methods | `snakemake -s 03_methods.smk --profile lsf` |
+| Compute metrics | `snakemake -s 04_metrics.smk --profile lsf` |
+| Aggregate | `snakemake -s 05_aggregation.smk --profile lsf` |
+| Select base clusterings | `snakemake -s 06_select_base_clusterings.smk --profile lsf` |
+| Consensus | `snakemake -s 07_consensus.smk --profile lsf` |
+
+Key flags:
+- `--profile lsf` — use your LSF profile for job submission
+- `--conda-frontend mamba` — use Mamba for faster env creation
+- `--conda-prefix /shared/path` — share envs across projects
+- `--conda-create-envs-only` — pre-create all environments without running
+- `-n` — dry run
+- `--rerun-incomplete` or `--ri` — re-run interrupted jobs

From 2c7a61ce8a0df87014a26b297b176a423f9b4707 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 31 May 2026 21:57:08 +0000
Subject: [PATCH 2/2] docs: fix review comments in tutorial (glob pattern,
 index param, links note)

---
 docs/tutorial_lsf_mamba.md | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/docs/tutorial_lsf_mamba.md b/docs/tutorial_lsf_mamba.md
index f7a2b615..b60b509a 100644
--- a/docs/tutorial_lsf_mamba.md
+++ b/docs/tutorial_lsf_mamba.md
@@ -34,7 +34,7 @@ mamba activate snakemake
 
 You need to edit two config files in the `workflows/` directory.
 
-#### `workflows/excute_config.yaml`
+#### `workflows/excute_config.yaml` (note: the filename uses "excute" — this is intentional, matching the repo's naming)
 
 ```yaml
 ###### Universal parameters #######
@@ -252,7 +252,7 @@ ENV_DIR="/path/to/shared/conda_envs"
 mkdir -p "$ENV_DIR"
 
 # Dataset environments
-for yml in "$GIT_DIR"/data/*//*.yml "$GIT_DIR"/data/*/*.yaml; do
+for yml in "$GIT_DIR"/data/*/*.yml "$GIT_DIR"/data/*/*.yaml; do
     [ -f "$yml" ] || continue
     name=$(basename "$(dirname "$yml")")
     echo "Creating env: data_${name}"
@@ -447,11 +447,12 @@ from pypdl import Downloader
 from spatialdata_io import xenium
 
 
-# Update these URLs from the 10x Genomics dataset page.
-# Go to the dataset page and find the direct download links for the output bundle.
+# UPDATE THESE URLs: Visit the 10x Genomics dataset page and copy the direct download links.
+# You MUST populate this dict before running the script; with an empty dict nothing is downloaded and only the metadata files (no sample data) are written.
+# Example format:
+#   "https://cf.10xgenomics.com/samples/xenium/.../Xenium_..._outs.zip": "md5_checksum_or_empty_string",
 LINKS = {
-    # "https://cf.10xgenomics.com/samples/xenium/.../_outs.zip": "md5_checksum",
-    # Add the actual download URL once you retrieve it from the 10x dataset page
+    # TODO: Add actual download URLs from https://www.10xgenomics.com/datasets/xenium-prime-ffpe-human-ovarian-cancer
 }
 
 
@@ -531,7 +532,7 @@ def write_experiment_json(out_path):
 
 def write_samples_tsv(out_path, samples_info):
     df = pd.DataFrame(samples_info)
-    df.to_csv(f"{out_path}/samples.tsv", sep="\t", index_label=False)
+    df.to_csv(f"{out_path}/samples.tsv", sep="\t", index=False)
 
 
 def main():