Skip to content

Commit 3962cec

Browse files
Donglai Wei and claude
committed
Add large-volume parallel waterz decoding via file-backed orchestrator
- tutorials/waterz_decoding_large.yaml: config for chunked parallel decode - scripts/decode_large.py: launcher (serial, worker, wait/assemble modes) - scripts/decode_large_worker.sh: SLURM array job worker template Pipeline: split volume → decode chunks independently → stitch borders via IOU → assemble final volume. Workers coordinate via shared filesystem (no central scheduler). Supports serial and multi-node parallel modes. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent d83be3d commit 3962cec

3 files changed

Lines changed: 234 additions & 0 deletions

File tree

scripts/decode_large.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
"""Large-volume parallel waterz decoding using file-backed orchestrator.
2+
3+
Usage:
4+
# Serial (single process, all stages)
5+
python scripts/decode_large.py --config tutorials/waterz_decoding_large.yaml
6+
7+
# Initialize workflow only (for parallel launch)
8+
python scripts/decode_large.py --config tutorials/waterz_decoding_large.yaml --init-only
9+
10+
# Run as a worker (claims tasks from shared workflow dir)
11+
python scripts/decode_large.py --config tutorials/waterz_decoding_large.yaml --worker
12+
13+
# Wait for all workers to finish, then assemble output
14+
python scripts/decode_large.py --config tutorials/waterz_decoding_large.yaml --wait --assemble
15+
"""
16+
17+
import argparse
18+
import os
19+
import sys
20+
from pathlib import Path
21+
22+
import yaml
23+
24+
25+
def main():
26+
parser = argparse.ArgumentParser(description="Large-volume waterz decoding")
27+
parser.add_argument("--config", required=True, help="YAML config file")
28+
parser.add_argument("--init-only", action="store_true", help="Initialize workflow and exit")
29+
parser.add_argument("--worker", action="store_true", help="Run as a worker (claim tasks)")
30+
parser.add_argument("--wait", action="store_true", help="Wait for all tasks to complete")
31+
parser.add_argument("--assemble", action="store_true", help="Assemble final output volume")
32+
parser.add_argument("--max-tasks", type=int, default=None, help="Max tasks per worker")
33+
parser.add_argument("--idle-timeout", type=float, default=60.0, help="Worker idle timeout (seconds)")
34+
parser.add_argument("--worker-id", type=str, default=None, help="Worker identifier")
35+
parser.add_argument("--job-id", type=str, default=None, help="SLURM job ID")
36+
# Allow CLI overrides in key=value format
37+
parser.add_argument("overrides", nargs="*", help="Config overrides (key=value)")
38+
args = parser.parse_args()
39+
40+
# Load config
41+
with open(args.config) as f:
42+
cfg = yaml.safe_load(f)
43+
44+
large_cfg = cfg.get("large_decode", {})
45+
46+
# Apply CLI overrides
47+
for override in args.overrides:
48+
if "=" not in override:
49+
print(f"Warning: skipping invalid override '{override}' (expected key=value)")
50+
continue
51+
key, value = override.split("=", 1)
52+
# Try numeric conversion
53+
try:
54+
value = int(value)
55+
except ValueError:
56+
try:
57+
value = float(value)
58+
except ValueError:
59+
pass
60+
large_cfg[key] = value
61+
62+
if not large_cfg.get("affinity_path"):
63+
print("Error: large_decode.affinity_path is required")
64+
sys.exit(1)
65+
if not large_cfg.get("workflow_root"):
66+
print("Error: large_decode.workflow_root is required")
67+
sys.exit(1)
68+
69+
os.environ.setdefault("CCACHE_DISABLE", "1")
70+
71+
from waterz import LargeDecodeRunner
72+
73+
# Parse config
74+
chunk_shape = large_cfg.get("chunk_shape", [256, 512, 512])
75+
if isinstance(chunk_shape, list):
76+
chunk_shape = tuple(chunk_shape)
77+
78+
thresholds = large_cfg.get("thresholds", [0.5])
79+
if isinstance(thresholds, (int, float)):
80+
thresholds = [thresholds]
81+
82+
runner = LargeDecodeRunner.create(
83+
affinity_path=large_cfg["affinity_path"],
84+
workflow_root=large_cfg["workflow_root"],
85+
chunk_shape=chunk_shape,
86+
thresholds=thresholds,
87+
merge_function=large_cfg.get("merge_function", "aff85_his256"),
88+
aff_threshold_low=float(large_cfg.get("aff_threshold_low", 0.1)),
89+
aff_threshold_high=float(large_cfg.get("aff_threshold_high", 0.999)),
90+
channel_order=large_cfg.get("channel_order", "xyz"),
91+
write_output=bool(large_cfg.get("write_output", True)),
92+
output_path=large_cfg.get("output_path") or None,
93+
border_min_overlap=int(large_cfg.get("border_min_overlap", 1)),
94+
border_one_sided_threshold=float(large_cfg.get("border_one_sided_threshold", 0.9)),
95+
border_iou_threshold=float(large_cfg.get("border_iou_threshold", 0.0)),
96+
border_affinity_threshold=float(large_cfg.get("border_affinity_threshold", 0.0)),
97+
compression=large_cfg.get("compression", "gzip"),
98+
compression_level=int(large_cfg.get("compression_level", 4)),
99+
)
100+
101+
chunks = runner.chunks
102+
borders = runner.borders
103+
print(f"Volume shape: {runner.config.volume_shape}")
104+
print(f"Chunk shape: {runner.config.chunk_shape}")
105+
print(f"Chunks: {len(chunks)}")
106+
print(f"Borders: {len(borders)}")
107+
print(f"Workflow: {runner.config.workflow_root}")
108+
109+
if args.init_only:
110+
print("Workflow initialized. Launch workers to execute tasks.")
111+
return
112+
113+
if args.worker:
114+
worker_id = args.worker_id or os.environ.get("SLURM_JOB_ID", None)
115+
job_id = args.job_id or os.environ.get("SLURM_ARRAY_TASK_ID", None)
116+
print(f"Starting worker: {worker_id or 'auto'} (job={job_id or 'none'})")
117+
n = runner.run_worker(
118+
worker_id=worker_id,
119+
max_tasks=args.max_tasks,
120+
idle_timeout=args.idle_timeout,
121+
job_id=job_id,
122+
)
123+
print(f"Worker completed {n} tasks.")
124+
return
125+
126+
if args.wait:
127+
print("Waiting for all tasks to complete...")
128+
runner.wait(timeout=None)
129+
print("All tasks completed.")
130+
if args.assemble and runner.config.write_output:
131+
print("Assembling output...")
132+
runner.handle_assemble_output(None)
133+
print(f"Output: {runner.config.resolved_output_path}")
134+
return
135+
136+
# Default: run serial (all stages in one process)
137+
print("Running serial decode...")
138+
n = runner.run_serial()
139+
print(f"Completed {n} tasks.")
140+
141+
status = runner.orchestrator.stage_counts()
142+
for stage, counts in sorted(status.items()):
143+
print(f" {stage}: {counts}")
144+
145+
146+
if __name__ == "__main__":
147+
main()

scripts/decode_large_worker.sh

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/bin/bash
#SBATCH --job-name=waterz_worker
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --time=12:00:00
#SBATCH --output=slurm_outputs/waterz_worker_%A_%a.out
#SBATCH --error=slurm_outputs/waterz_worker_%A_%a.err

# Usage:
#   sbatch --array=0-7 scripts/decode_large_worker.sh tutorials/waterz_decoding_large.yaml
#
# Each array task is an independent worker that claims and executes
# tasks from the shared workflow directory. Workers coordinate via
# file locks — no central scheduler needed.
#
# NOTE: the slurm_outputs/ directory must exist BEFORE submission:
# SLURM opens the --output/--error files before this script runs, so a
# missing directory fails the job without any log.

# Abort the task if the environment activation or cd fails, rather than
# running the worker in the wrong environment/directory.
set -eo pipefail

CONFIG=${1:-tutorials/waterz_decoding_large.yaml}

source /projects/weilab/weidf/lib/miniconda3/bin/activate pytc
cd /projects/weilab/weidf/lib/pytorch_connectomics

export CCACHE_DISABLE=1

echo "Worker ${SLURM_ARRAY_TASK_ID} of ${SLURM_ARRAY_TASK_COUNT} on $(hostname)"
echo "Config: ${CONFIG}"
echo "Start: $(date)"

# Quote ${CONFIG} so paths containing spaces/globs survive word splitting.
python scripts/decode_large.py \
    --config "${CONFIG}" \
    --worker \
    --worker-id "$(hostname)-${SLURM_ARRAY_TASK_ID}" \
    --job-id "${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}" \
    --idle-timeout 120

echo "End: $(date)"
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
experiment_name: waterz_decode_large
description: >
  Large-volume parallel waterz decoding using the file-backed orchestrator.
  Splits the affinity volume into chunks, decodes each independently,
  then merges borders via IOU matching.

  Supports serial mode (single process) or parallel mode (multiple workers
  on shared filesystem, e.g. SLURM array jobs).

  Usage (serial):
    python scripts/decode_large.py --config tutorials/waterz_decoding_large.yaml

  Usage (parallel — launch N workers on SLURM):
    # Step 1: Initialize workflow (creates tasks on disk)
    python scripts/decode_large.py --config tutorials/waterz_decoding_large.yaml --init-only

    # Step 2: Launch workers (each claims and executes tasks)
    sbatch --array=0-7 scripts/decode_large_worker.sh tutorials/waterz_decoding_large.yaml

    # Step 3: Wait for completion + assemble
    python scripts/decode_large.py --config tutorials/waterz_decoding_large.yaml --wait --assemble

# ── Chunked decoding config ──────────────────────────────────────────────────
# Any key below can be overridden on the CLI as a positional key=value pair.
large_decode:
  # Input affinity file (HDF5, dataset "main", shape (C,Z,Y,X))
  affinity_path: "" # SET THIS

  # Workflow directory (shared filesystem for parallel workers)
  workflow_root: "" # SET THIS: e.g. /scratch/waterz_workflow/

  # Chunk size in voxels (Z, Y, X). Each chunk is decoded independently.
  # Smaller chunks = less memory per worker but more border stitching.
  # Rule of thumb: ~256^3 for 16 GB RAM, ~512^3 for 64 GB RAM.
  chunk_shape: [256, 512, 512]

  # Waterz agglomeration parameters (same as decode_waterz kwargs)
  thresholds: [0.5]
  merge_function: aff85_his256
  aff_threshold_low: 0.1
  aff_threshold_high: 0.999
  channel_order: xyz

  # Border stitching parameters
  border_min_overlap: 1 # min overlap pixels to consider a pair
  border_one_sided_threshold: 0.9 # IOU for small-into-large merge at borders
  border_iou_threshold: 0.0 # full Jaccard IOU threshold (0=disabled)
  border_affinity_threshold: 0.0 # min boundary affinity (0=disabled)

  # Output
  write_output: true # assemble final volume
  output_path: "" # default: workflow_root/assembled.h5
  compression: gzip
  compression_level: 4

0 commit comments

Comments
 (0)