Skip to content

Commit 3962cec

Browse files
Donglai Wei and claude
committed
Add large-volume parallel waterz decoding via file-backed orchestrator
- tutorials/waterz_decoding_large.yaml: config for chunked parallel decode - scripts/decode_large.py: launcher (serial, worker, wait/assemble modes) - scripts/decode_large_worker.sh: SLURM array job worker template Pipeline: split volume → decode chunks independently → stitch borders via IOU → assemble final volume. Workers coordinate via shared filesystem (no central scheduler). Supports serial and multi-node parallel modes. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent d83be3d commit 3962cec

3 files changed

Lines changed: 234 additions & 0 deletions

File tree

scripts/decode_large.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
"""Large-volume parallel waterz decoding using file-backed orchestrator.
2+
3+
Usage:
4+
# Serial (single process, all stages)
5+
python scripts/decode_large.py --config tutorials/waterz_decoding_large.yaml
6+
7+
# Initialize workflow only (for parallel launch)
8+
python scripts/decode_large.py --config tutorials/waterz_decoding_large.yaml --init-only
9+
10+
# Run as a worker (claims tasks from shared workflow dir)
11+
python scripts/decode_large.py --config tutorials/waterz_decoding_large.yaml --worker
12+
13+
# Wait for all workers to finish, then assemble output
14+
python scripts/decode_large.py --config tutorials/waterz_decoding_large.yaml --wait --assemble
15+
"""
16+
17+
import argparse
18+
import os
19+
import sys
20+
from pathlib import Path
21+
22+
import yaml
23+
24+
25+
def main():
26+
parser = argparse.ArgumentParser(description="Large-volume waterz decoding")
27+
parser.add_argument("--config", required=True, help="YAML config file")
28+
parser.add_argument("--init-only", action="store_true", help="Initialize workflow and exit")
29+
parser.add_argument("--worker", action="store_true", help="Run as a worker (claim tasks)")
30+
parser.add_argument("--wait", action="store_true", help="Wait for all tasks to complete")
31+
parser.add_argument("--assemble", action="store_true", help="Assemble final output volume")
32+
parser.add_argument("--max-tasks", type=int, default=None, help="Max tasks per worker")
33+
parser.add_argument("--idle-timeout", type=float, default=60.0, help="Worker idle timeout (seconds)")
34+
parser.add_argument("--worker-id", type=str, default=None, help="Worker identifier")
35+
parser.add_argument("--job-id", type=str, default=None, help="SLURM job ID")
36+
# Allow CLI overrides in key=value format
37+
parser.add_argument("overrides", nargs="*", help="Config overrides (key=value)")
38+
args = parser.parse_args()
39+
40+
# Load config
41+
with open(args.config) as f:
42+
cfg = yaml.safe_load(f)
43+
44+
large_cfg = cfg.get("large_decode", {})
45+
46+
# Apply CLI overrides
47+
for override in args.overrides:
48+
if "=" not in override:
49+
print(f"Warning: skipping invalid override '{override}' (expected key=value)")
50+
continue
51+
key, value = override.split("=", 1)
52+
# Try numeric conversion
53+
try:
54+
value = int(value)
55+
except ValueError:
56+
try:
57+
value = float(value)
58+
except ValueError:
59+
pass
60+
large_cfg[key] = value
61+
62+
if not large_cfg.get("affinity_path"):
63+
print("Error: large_decode.affinity_path is required")
64+
sys.exit(1)
65+
if not large_cfg.get("workflow_root"):
66+
print("Error: large_decode.workflow_root is required")
67+
sys.exit(1)
68+
69+
os.environ.setdefault("CCACHE_DISABLE", "1")
70+
71+
from waterz import LargeDecodeRunner
72+
73+
# Parse config
74+
chunk_shape = large_cfg.get("chunk_shape", [256, 512, 512])
75+
if isinstance(chunk_shape, list):
76+
chunk_shape = tuple(chunk_shape)
77+
78+
thresholds = large_cfg.get("thresholds", [0.5])
79+
if isinstance(thresholds, (int, float)):
80+
thresholds = [thresholds]
81+
82+
runner = LargeDecodeRunner.create(
83+
affinity_path=large_cfg["affinity_path"],
84+
workflow_root=large_cfg["workflow_root"],
85+
chunk_shape=chunk_shape,
86+
thresholds=thresholds,
87+
merge_function=large_cfg.get("merge_function", "aff85_his256"),
88+
aff_threshold_low=float(large_cfg.get("aff_threshold_low", 0.1)),
89+
aff_threshold_high=float(large_cfg.get("aff_threshold_high", 0.999)),
90+
channel_order=large_cfg.get("channel_order", "xyz"),
91+
write_output=bool(large_cfg.get("write_output", True)),
92+
output_path=large_cfg.get("output_path") or None,
93+
border_min_overlap=int(large_cfg.get("border_min_overlap", 1)),
94+
border_one_sided_threshold=float(large_cfg.get("border_one_sided_threshold", 0.9)),
95+
border_iou_threshold=float(large_cfg.get("border_iou_threshold", 0.0)),
96+
border_affinity_threshold=float(large_cfg.get("border_affinity_threshold", 0.0)),
97+
compression=large_cfg.get("compression", "gzip"),
98+
compression_level=int(large_cfg.get("compression_level", 4)),
99+
)
100+
101+
chunks = runner.chunks
102+
borders = runner.borders
103+
print(f"Volume shape: {runner.config.volume_shape}")
104+
print(f"Chunk shape: {runner.config.chunk_shape}")
105+
print(f"Chunks: {len(chunks)}")
106+
print(f"Borders: {len(borders)}")
107+
print(f"Workflow: {runner.config.workflow_root}")
108+
109+
if args.init_only:
110+
print("Workflow initialized. Launch workers to execute tasks.")
111+
return
112+
113+
if args.worker:
114+
worker_id = args.worker_id or os.environ.get("SLURM_JOB_ID", None)
115+
job_id = args.job_id or os.environ.get("SLURM_ARRAY_TASK_ID", None)
116+
print(f"Starting worker: {worker_id or 'auto'} (job={job_id or 'none'})")
117+
n = runner.run_worker(
118+
worker_id=worker_id,
119+
max_tasks=args.max_tasks,
120+
idle_timeout=args.idle_timeout,
121+
job_id=job_id,
122+
)
123+
print(f"Worker completed {n} tasks.")
124+
return
125+
126+
if args.wait:
127+
print("Waiting for all tasks to complete...")
128+
runner.wait(timeout=None)
129+
print("All tasks completed.")
130+
if args.assemble and runner.config.write_output:
131+
print("Assembling output...")
132+
runner.handle_assemble_output(None)
133+
print(f"Output: {runner.config.resolved_output_path}")
134+
return
135+
136+
# Default: run serial (all stages in one process)
137+
print("Running serial decode...")
138+
n = runner.run_serial()
139+
print(f"Completed {n} tasks.")
140+
141+
status = runner.orchestrator.stage_counts()
142+
for stage, counts in sorted(status.items()):
143+
print(f" {stage}: {counts}")
144+
145+
146+
if __name__ == "__main__":
147+
main()

scripts/decode_large_worker.sh

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/bin/bash
#SBATCH --job-name=waterz_worker
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --time=12:00:00
#SBATCH --output=slurm_outputs/waterz_worker_%A_%a.out
#SBATCH --error=slurm_outputs/waterz_worker_%A_%a.err

# Usage:
#   sbatch --array=0-7 scripts/decode_large_worker.sh tutorials/waterz_decoding_large.yaml
#
# Each array task is an independent worker that claims and executes
# tasks from the shared workflow directory. Workers coordinate via
# file locks — no central scheduler needed.
#
# NOTE: the slurm_outputs/ directory must exist BEFORE submission:
# SLURM opens the --output/--error files before this script runs, so a
# missing directory fails the job without any log.

# Abort the task if the environment activation or cd fails, rather than
# running the worker in the wrong environment/directory.
set -eo pipefail

CONFIG=${1:-tutorials/waterz_decoding_large.yaml}

source /projects/weilab/weidf/lib/miniconda3/bin/activate pytc
cd /projects/weilab/weidf/lib/pytorch_connectomics

export CCACHE_DISABLE=1

echo "Worker ${SLURM_ARRAY_TASK_ID} of ${SLURM_ARRAY_TASK_COUNT} on $(hostname)"
echo "Config: ${CONFIG}"
echo "Start: $(date)"

# Quote ${CONFIG} so paths containing spaces/globs survive word splitting.
python scripts/decode_large.py \
    --config "${CONFIG}" \
    --worker \
    --worker-id "$(hostname)-${SLURM_ARRAY_TASK_ID}" \
    --job-id "${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}" \
    --idle-timeout 120

echo "End: $(date)"
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
experiment_name: waterz_decode_large
description: >
  Large-volume parallel waterz decoding using the file-backed orchestrator.
  Splits the affinity volume into chunks, decodes each independently,
  then merges borders via IOU matching.

  Supports serial mode (single process) or parallel mode (multiple workers
  on shared filesystem, e.g. SLURM array jobs).

  Usage (serial):
    python scripts/decode_large.py --config tutorials/waterz_decoding_large.yaml

  Usage (parallel — launch N workers on SLURM):
    # Step 1: Initialize workflow (creates tasks on disk)
    python scripts/decode_large.py --config tutorials/waterz_decoding_large.yaml --init-only

    # Step 2: Launch workers (each claims and executes tasks)
    sbatch --array=0-7 scripts/decode_large_worker.sh tutorials/waterz_decoding_large.yaml

    # Step 3: Wait for completion + assemble
    python scripts/decode_large.py --config tutorials/waterz_decoding_large.yaml --wait --assemble

# ── Chunked decoding config ──────────────────────────────────────────────────
# Any key below can be overridden on the CLI as a positional key=value pair.
large_decode:
  # Input affinity file (HDF5, dataset "main", shape (C,Z,Y,X))
  affinity_path: "" # SET THIS

  # Workflow directory (shared filesystem for parallel workers)
  workflow_root: "" # SET THIS: e.g. /scratch/waterz_workflow/

  # Chunk size in voxels (Z, Y, X). Each chunk is decoded independently.
  # Smaller chunks = less memory per worker but more border stitching.
  # Rule of thumb: ~256^3 for 16 GB RAM, ~512^3 for 64 GB RAM.
  chunk_shape: [256, 512, 512]

  # Waterz agglomeration parameters (same as decode_waterz kwargs)
  thresholds: [0.5]
  merge_function: aff85_his256
  aff_threshold_low: 0.1
  aff_threshold_high: 0.999
  channel_order: xyz

  # Border stitching parameters
  border_min_overlap: 1 # min overlap pixels to consider a pair
  border_one_sided_threshold: 0.9 # IOU for small-into-large merge at borders
  border_iou_threshold: 0.0 # full Jaccard IOU threshold (0=disabled)
  border_affinity_threshold: 0.0 # min boundary affinity (0=disabled)

  # Output
  write_output: true # assemble final volume
  output_path: "" # default: workflow_root/assembled.h5
  compression: gzip
  compression_level: 4

0 commit comments

Comments
 (0)