Setup run hpc #119 (Merged)

Changes from 60 commits.
Commits:

- 42effdb add scripts to process raw dataset (ghar1821)
- 83ceda2 editing config to set apptainer cache dir (ghar1821)
- f3019a9 editing pre-run scripts and trying to fix R methods not running. (ghar1821)
- f664c48 add h5py to setup (ghar1821)
- 338aa23 reverting changes to setup (ghar1821)
- f41f6f4 separate submit scripts (ghar1821)
- 66c348e finally the first setting that works!!!! (ghar1821)
- fa1ade7 update config and settings for control methods (ghar1821)
- 5902468 adjusted resources for metrics and methods (ghar1821)
- 7497ea4 update cytovi to use A30 gpu (ghar1821)
- cf3d35b add numba cache dir export to allow jit caching (ghar1821)
- 35e3fdb update cytovi implementation (ghar1821)
- bad078d force recompute for all cytonorm (ghar1821)
- bdcbf46 add temp dir resolution for hpc (ghar1821)
- 7bada43 remove transpose from harmonypy (ghar1821)
- 6793f3d adding support for hpc (ghar1821)
- aa4b07a update temp dir again (ghar1821)
- 44def10 latest config file that works reasonably well with hpc (ghar1821)
- fc4df26 add some job submit scripts for SLURM (ghar1821)
- bcc7ddb update tmp_path for cytonorm (ghar1821)
- 9dae0b6 redirect numba cache dir away from /tmp and to its own folder. (ghar1821)
- 379b3dd update batch adjust non control samples naming (ghar1821)
- d062157 fix bug in perfect integration subsetting (ghar1821)
- 7627554 fix bug where we can't replace the batch column if it is not integer (ghar1821)
- 5ff088b fix bug where the donor loc are somewhat mismatched.. (ghar1821)
- 2864b9c update ratio inconsistent peak where corrected data return only zero (ghar1821)
- ddc57cf Update script.py (ghar1821)
- 2312cb2 update scripts (ghar1821)
- 8acaafc Merge branch 'main' into setup_run_hpc (ghar1821)
- 4e807f1 remove average batch r2 global (ghar1821)
- 03cc959 add seed setting for cytovi (ghar1821)
- ca9ac0a remove env for viash temp files (ghar1821)
- fa20205 update lisi to allow anndata write (ghar1821)
- 04a4280 update cycombine (ghar1821)
- f337c99 more updates to cycombine (ghar1821)
- 236fec8 minor change of script type (ghar1821)
- 0706945 update cytonorm (ghar1821)
- f75d01c fixed gaussnorm (ghar1821)
- 11cab3e fixed limma (LuLeom)
- b1592c4 Fixed harmonypy and combat (LuLeom)
- f4bff8d Fixed rPCA (LuLeom)
- facc520 update batchadjust and add copy to subset (ghar1821)
- f27dad7 remove cytovi and some obsolete metrics (ghar1821)
- 4eaeaac renamed shuffle control methods (ghar1821)
- 0e11692 missed label change (ghar1821)
- 1e2d508 reorganising scripts for hpc (ghar1821)
- 06a9069 update changelog (ghar1821)
- f444132 update changelog again (ghar1821)
- ebb18a0 update changelog (ghar1821)
- e3d8951 update changelog (ghar1821)
- f2d073e update description. (ghar1821)
- 3e3af60 manually adding some dependencies for flowCore and flowStats (ghar1821)
- 10c2c60 update ratio inconsistent peaks (ghar1821)
- 999ce87 update inconsistent peaks (ghar1821)
- 558cd5c add print statements to subset functions (ghar1821)
- de15e28 add print statements when writing files out (ghar1821)
- 6210259 add utility scripts for pulling intermediate files (ghar1821)
- 9680aa3 update methods and metrics labels (ghar1821)
- e8e1339 fix bug where subsetting was not done on ilisi and fsom mapping metrics (ghar1821)
- 83fe60b Update CHANGELOG.md (ghar1821)
New file (22 lines):

```sh
#!/bin/bash

# script to launch the process raw dataset workflow on slurm via seqera tower.
# leave the input_states to s3 bucket as the datasets raw files are stored there.

cat > /tmp/params.yaml << 'HERE'
input_states: s3://openproblems-data/resources/task_cyto_batch_integration/datasets_raw/**/state.yaml
rename_keys: 'input:output_dataset'
output_state: '$id/state.yaml'
settings: '{"output_unintegrated": "$id/unintegrated.h5ad", "output_censored_split1": "$id/censored_split1.h5ad", "output_censored_split2": "$id/censored_split2.h5ad"}'
publish_dir: /vast/scratch/users/putri.g/cytobenchmark/benchmark_out_hpc/datasets/
HERE

tw launch https://github.com/openproblems-bio/task_cyto_batch_integration.git \
  --revision build/main \
  --pull-latest \
  --main-script target/nextflow/workflows/process_datasets/main.nf \
  --workspace 80689470953249 \
  --params-file /tmp/params.yaml \
  --entry-name auto \
  --config scripts/labels_tw_wehi.config \
  --labels task_cyto_batch_integration,process_datasets
```
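One detail worth noting in the launch script above: the heredoc delimiter is quoted (`<< 'HERE'`), so the shell performs no parameter expansion and placeholders such as `$id` reach the YAML file literally for Nextflow to resolve per dataset. A minimal sketch of the difference (demo file paths, not part of the PR):

```shell
#!/bin/sh
# Illustrative only: quoted vs unquoted heredoc delimiters.
id="expanded_by_the_shell"

# Quoted delimiter: no expansion, $id stays literal in the file.
cat > /tmp/demo_quoted.yaml << 'HERE'
output_state: '$id/state.yaml'
HERE

# Unquoted delimiter: the shell expands $id before writing the file.
cat > /tmp/demo_unquoted.yaml << HERE
output_state: '$id/state.yaml'
HERE

cat /tmp/demo_quoted.yaml    # output_state: '$id/state.yaml'
cat /tmp/demo_unquoted.yaml  # output_state: 'expanded_by_the_shell/state.yaml'
```

The later run scripts in this PR deliberately use the unquoted form, because they do want `$publish_dir` expanded into the params file.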
New file (168 lines):

```groovy
def exitStrat(task, max_attempts = 3) {
    println "Determining exit strategy for task (attempt '${task.attempt}', exit status '${task.exitStatus}')"

    // if the component failed max_attempts times, ignore the error so the
    // workflow can continue. it's important 'ignore' is returned on the final
    // attempt even though maxRetries is set, otherwise the workflow will stop
    if (task.attempt >= max_attempts) {
        return 'ignore'
    }

    return 'retry'
}

// Let the Nextflow head job manage the Apptainer containers
apptainer {
    enabled = true
    pullTimeout = '48h'
    ociAutoPull = false
    cacheDir = '/vast/scratch/users/putri.g/nextflow/apptainer_cache'
    envWhitelist = 'APPTAINER_CACHEDIR,APPTAINER_TMPDIR,SINGULARITY_CACHEDIR,SINGULARITY_TMPDIR,TMPDIR,NXF_HOME,NXF_TEMP,NXF_APPTAINER_CACHEDIR,PYTHONPATH,NUMBA_CACHE_DIR,NUMBA_DISABLE_JIT,HPC_VIASH_META_TEMP_DIR'
}

env {
    NXF_APPTAINER_CACHEDIR = '/vast/scratch/users/putri.g/nextflow/apptainer_cache'
    APPTAINER_CACHEDIR = '/vast/scratch/users/putri.g/nextflow/apptainer_cache'
    APPTAINER_TMPDIR = '/vast/scratch/users/putri.g/nextflow/apptainer_tmp'
    SINGULARITY_CACHEDIR = '/vast/scratch/users/putri.g/nextflow/apptainer_cache'
    SINGULARITY_TMPDIR = '/vast/scratch/users/putri.g/nextflow/apptainer_tmp'
    NXF_HOME = '/vast/scratch/users/putri.g/nextflow/nxf_home'
    PYTHONPATH = '/root/.local/lib/python3.12/site-packages'
    // Numba environment variable to fix caching issues in containers
    NUMBA_DISABLE_JIT = '0'
}

process {
    beforeScript = '''
        # Create base directories (shared across tasks)
        mkdir -p "$APPTAINER_CACHEDIR" "$NXF_HOME" "$HOME"

        # Create task-specific temp directories
        export TMPDIR="/vast/scratch/users/putri.g/nextflow/apptainer_tmp/${NXF_TASK_INDEX:-$$}"
        export APPTAINER_TMPDIR="${TMPDIR}"
        export SINGULARITY_TMPDIR="${TMPDIR}"
        export NXF_TEMP="/vast/scratch/users/putri.g/nextflow/nxf_tmp/${NXF_TASK_INDEX:-$$}"
        export HPC_VIASH_META_TEMP_DIR="${NXF_TEMP}"
        export NUMBA_CACHE_DIR="/vast/scratch/users/putri.g/nextflow/numba_cache/${NXF_TASK_INDEX:-$$}"

        mkdir -p "$TMPDIR" "$NXF_TEMP" "$NUMBA_CACHE_DIR"

        echo "============================="
        echo "Task-specific directories:"
        echo "============================="
        echo "  TMPDIR: $TMPDIR"
        echo "  APPTAINER_TMPDIR: $APPTAINER_TMPDIR"
        echo "  SINGULARITY_TMPDIR: $SINGULARITY_TMPDIR"
        echo "  NXF_TEMP: $NXF_TEMP"
        echo "  HPC_VIASH_META_TEMP_DIR: $HPC_VIASH_META_TEMP_DIR"
        echo "  NUMBA_CACHE_DIR: $NUMBA_CACHE_DIR"
        echo "============================="
        echo "Shared directories:"
        echo "============================="
        echo "  APPTAINER_CACHEDIR: $APPTAINER_CACHEDIR"
        echo "  NXF_APPTAINER_CACHEDIR: $NXF_APPTAINER_CACHEDIR"
        echo "  NXF_HOME: $NXF_HOME"
    '''.stripIndent()
}

process {
    executor = 'slurm'

    // Default resources for all processes
    cpus = 4
    memory = { get_memory( 10.GB * task.attempt ) }
    time = '48.h'
    disk = 50.GB
    queue = 'regular'

    // Retry on failure (e.g. exit codes caused by running out of memory);
    // after the final attempt, exitStrat returns 'ignore' so the rest of
    // the workflow can continue
    errorStrategy = { exitStrat(task) }
    maxRetries = 3
    maxMemory = null

    // Resource labels
    withLabel: lowcpu { cpus = 5 }
    withLabel: midcpu { cpus = 15 }
    withLabel: highcpu { cpus = 30 }
    withLabel: lowmem { memory = { get_memory( 10.GB * task.attempt ) } }
    withLabel: midmem { memory = { get_memory( 30.GB * task.attempt ) } }
    withLabel: highmem { memory = { get_memory( 80.GB * task.attempt ) } }
    withLabel: veryhighmem { memory = { get_memory( 150.GB * task.attempt ) } }
    withLabel: lowtime { time = 2.h }
    withLabel: midtime { time = 8.h }
    withLabel: hightime { time = 12.h }
    withLabel: veryhightime { time = 24.h }
    withLabel: lowsharedmem {
        containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f", task.memory.mega * 0.05)}" : "" }
    }
    withLabel: midsharedmem {
        containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f", task.memory.mega * 0.1)}" : "" }
    }
    withLabel: highsharedmem {
        containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f", task.memory.mega * 0.25)}" : "" }
    }
    withLabel: gpu {
        cpus = 16
        clusterOptions = '--gres=gpu:A30:1'
        queue = 'gpuq'
        containerOptions = { workflow.containerEngine == 'singularity' ? '--nv' :
            (workflow.containerEngine == 'docker' ? '--gpus all' : null) }
    }
    withLabel: midgpu {
        cpus = 32
        clusterOptions = '--gres=gpu:A30:4'
        queue = 'gpuq'
        containerOptions = { workflow.containerEngine == 'singularity' ? '--nv' :
            (workflow.containerEngine == 'docker' ? '--gpus all' : null) }
    }
    withLabel: highgpu {
        cpus = 64
        clusterOptions = '--gres=gpu:A30:8'
        queue = 'gpuq'
        containerOptions = { workflow.containerEngine == 'singularity' ? '--nv' :
            (workflow.containerEngine == 'docker' ? '--gpus all' : null) }
    }
    withLabel: biggpu {
        cpus = 16
        clusterOptions = '--gres=gpu:A100:1'
        queue = 'gpuq'
        containerOptions = { workflow.containerEngine == 'singularity' ? '--nv' :
            (workflow.containerEngine == 'docker' ? '--gpus all' : null) }
    }

    // make sure publishStatesProc gets enough disk space and memory
    withName: '.*publishStatesProc' {
        memory = '16GB'
        disk = '100GB'
    }
}

def get_memory(to_compare) {
    if (!process.containsKey("maxMemory") || !process.maxMemory) {
        return to_compare
    }

    try {
        if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
            return process.maxMemory
        }
        else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
            return process.maxMemory as nextflow.util.MemoryUnit
        }
        else {
            return to_compare
        }
    } catch (all) {
        println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
        System.exit(1)
    }
}

// set tracing file
trace {
    enabled = true
    overwrite = true
    file = "${params.publish_dir}/trace.txt"
}
```
New file (19 lines):

```sh
# paste me as pre-run script in Tower if setting up workflow run in WEHI HPC.
# load module first so the variables don't get overwritten
module load nextflow/25.04.2

# Tower pre-run script
export SHARED_SCRATCH="/vast/scratch/users/putri.g/nextflow"

export NXF_APPTAINER_CACHEDIR="$SHARED_SCRATCH/apptainer_cache"
export APPTAINER_CACHEDIR="$SHARED_SCRATCH/apptainer_cache"
export APPTAINER_TMPDIR="$SHARED_SCRATCH/apptainer_tmp"
export APPTAINER_LIBRARYDIR="$SHARED_SCRATCH/apptainer_library"
export SINGULARITY_CACHEDIR="$SHARED_SCRATCH/apptainer_cache"
export SINGULARITY_TMPDIR="$SHARED_SCRATCH/apptainer_tmp"
export TMPDIR="$SHARED_SCRATCH/apptainer_tmp"
export NXF_HOME="$SHARED_SCRATCH/nxf_home"
export NXF_TEMP="$SHARED_SCRATCH/nxf_tmp"
export HOME="$SHARED_SCRATCH/home"

mkdir -p "$APPTAINER_CACHEDIR" "$APPTAINER_TMPDIR" "$APPTAINER_LIBRARYDIR" "$NXF_HOME" "$NXF_TEMP" "$HOME"
```
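The pre-run script funnels every cache and temp variable to a single scratch root and creates the directories eagerly. Because `mkdir -p` is idempotent, re-running the pre-run script on a later launch is harmless. A small sketch of the pattern, using a throwaway demo root rather than the real scratch path:

```shell
#!/bin/sh
# Illustrative only: directory setup mirroring the pre-run script,
# rooted in a temporary directory instead of /vast/scratch.
DEMO_SCRATCH="$(mktemp -d)/nextflow"
export APPTAINER_CACHEDIR="$DEMO_SCRATCH/apptainer_cache"
export NXF_HOME="$DEMO_SCRATCH/nxf_home"

mkdir -p "$APPTAINER_CACHEDIR" "$NXF_HOME"
mkdir -p "$APPTAINER_CACHEDIR" "$NXF_HOME"   # second call succeeds silently

echo "demo directories created under $DEMO_SCRATCH"
```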
New file (31 lines):

```sh
#!/bin/bash

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

set -e

# generate a unique id
RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
publish_dir="/vast/scratch/users/putri.g/cytobenchmark/benchmark_out_hpc/results/${RUN_ID}"

# write the parameters to file
cat > /tmp/params.yaml << HERE
input_states: /vast/scratch/users/putri.g/cytobenchmark/benchmark_out_hpc/datasets/**/state.yaml
rename_keys: 'input_censored_split1:output_censored_split1;input_censored_split2:output_censored_split2;input_unintegrated:output_unintegrated'
output_state: "state.yaml"
publish_dir: "$publish_dir"
HERE

tw launch https://github.com/openproblems-bio/task_cyto_batch_integration.git \
  --revision build/setup_run_hpc \
  --pull-latest \
  --main-script target/nextflow/workflows/run_benchmark/main.nf \
  --workspace 80689470953249 \
  --params-file /tmp/params.yaml \
  --entry-name auto \
  --config scripts/labels_tw_wehi.config \
  --labels task_cyto_batch_integration,full
```
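Because `publish_dir` embeds a timestamped `RUN_ID`, each invocation of the script above publishes into a fresh directory and never overwrites an earlier run. A sketch of the pattern (demo path, not the real scratch directory):

```shell
#!/bin/sh
# Illustrative only: timestamped run directories, as in the script above.
RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
publish_dir="$(mktemp -d)/results/${RUN_ID}"   # demo root, not /vast/scratch
mkdir -p "$publish_dir"
echo "publishing to $publish_dir"
```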
New file (34 lines):

```sh
#!/bin/bash

# run script to run only a subset of methods/metrics on HPC

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

set -e

# generate a unique id
RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
publish_dir="/vast/scratch/users/putri.g/cytobenchmark/benchmark_out_hpc/results/${RUN_ID}"

# write the parameters to file
cat > /tmp/params.yaml << HERE
input_states: /vast/scratch/users/putri.g/cytobenchmark/benchmark_out_hpc/datasets/**/state.yaml
rename_keys: 'input_censored_split1:output_censored_split1;input_censored_split2:output_censored_split2;input_unintegrated:output_unintegrated'
output_state: "state.yaml"
settings: '{"metrics_include": ["lisi"], "methods_include": ["combat"]}'
publish_dir: "$publish_dir"
HERE

tw launch https://github.com/openproblems-bio/task_cyto_batch_integration.git \
  --revision build/setup_run_hpc \
  --pull-latest \
  --main-script target/nextflow/workflows/run_benchmark/main.nf \
  --workspace 80689470953249 \
  --params-file /tmp/params.yaml \
  --entry-name auto \
  --config scripts/labels_tw_wehi.config \
  --labels task_cyto_batch_integration,combat,test
```
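The `settings` value embeds a JSON object inside a single-quoted YAML scalar, which is easy to break when hand-editing the include lists. A quick pre-flight check one could run before launching (assumes `python3` is on the PATH; not part of the PR):

```shell
#!/bin/sh
# Illustrative only: validate the JSON embedded in the settings field
# before launching, to fail fast on quoting mistakes.
settings='{"metrics_include": ["lisi"], "methods_include": ["combat"]}'

if echo "$settings" | python3 -m json.tool > /dev/null 2>&1; then
  echo "settings JSON ok"
else
  echo "settings JSON malformed" >&2
  exit 1
fi
```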
Review comment: computationally expensive maybe(?)