nf-core
diff --git a/‎bin/merge_sompy_features.py‎
Lines changed: 96 additions & 0 deletions b/‎bin/merge_sompy_features.py‎
Lines changed: 96 additions & 0 deletions
diff --git a/‎bin/split_sompy_features.py‎
Lines changed: 66 additions & 0 deletions b/‎bin/split_sompy_features.py‎
Lines changed: 66 additions & 0 deletions
diff --git a/‎conf/modules.config‎
Lines changed: 16 additions & 2 deletions b/‎conf/modules.config‎
Lines changed: 16 additions & 2 deletions
diff --git a/‎modules/local/custom/merge_sompy_features/environment.yml‎
Lines changed: 7 additions & 0 deletions b/‎modules/local/custom/merge_sompy_features/environment.yml‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎modules/local/custom/merge_sompy_features/main.nf‎
Lines changed: 43 additions & 0 deletions b/‎modules/local/custom/merge_sompy_features/main.nf‎
Lines changed: 43 additions & 0 deletions
diff --git a/‎modules/local/custom/split_sompy_features/environment.yml‎
Lines changed: 7 additions & 0 deletions b/‎modules/local/custom/split_sompy_features/environment.yml‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎modules/local/custom/split_sompy_features/main.nf‎
Lines changed: 46 additions & 0 deletions b/‎modules/local/custom/split_sompy_features/main.nf‎
Lines changed: 46 additions & 0 deletions
diff --git a/‎subworkflows/local/compare_benchmark_results/main.nf‎
Lines changed: 15 additions & 7 deletions b/‎subworkflows/local/compare_benchmark_results/main.nf‎
Lines changed: 15 additions & 7 deletions
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+
+# Copyright 2025 - GHGA
+# Author: Kuebra Narci - @kubranarci
+'''
+Merges per-sample sompy feature CSV files into one CSV keyed by CHROM, POS and tag
+Expected usage:
+    $ python merge_sompy_features.py <csv_file> [<csv_file> ...] --output <merged_csv>
+Use --help for more information.
+'''
+
+import csv
+import argparse
+from collections import defaultdict
+import os
+
+# Columns that together identify a record; rows sharing these values are merged into one.
+KEY_COLUMNS = ["CHROM", "POS", "tag"]
+# Input columns carried into the merged table; any other input column is ignored.
+FIELDS_TO_EXTRACT = ["CHROM", "POS", "tag", "REF", "REF.truth", "ALT", "ALT.truth", "QUAL", "FILTER"]
+# Columns stored once per input file, renamed to "<column>_<file suffix>".
+FIELDS_TO_SUFFIX = ["REF", "ALT"]
+
+def extract_sample_suffix(filename):
+    """Return the base name of *filename* with its last extension removed."""
+    base_name = os.path.basename(filename)
+    stem, _extension = os.path.splitext(base_name)
+    return stem
+
+def load_csv_by_key(filepath, suffix):
+    """Load one CSV file into a dict keyed by the KEY_COLUMNS values of each row.
+
+    Only the columns in FIELDS_TO_EXTRACT are kept. Columns in FIELDS_TO_SUFFIX
+    are stored as "<column>_<suffix>". When several rows share a key, the key
+    columns and the suffixed columns take the value of the latest row, while
+    the remaining columns keep the first value seen. Columns absent from the
+    file are stored as empty strings.
+    """
+    records = {}
+    with open(filepath, newline='') as handle:
+        for row in csv.DictReader(handle):
+            key = tuple(row[column] for column in KEY_COLUMNS)
+            record = records.setdefault(key, {})
+
+            for field in FIELDS_TO_EXTRACT:
+                value = row.get(field, "")
+                if field in FIELDS_TO_SUFFIX:
+                    record[f"{field}_{suffix}"] = value
+                elif field in KEY_COLUMNS or field not in record:
+                    # Shared (non-suffixed) columns are only filled in once per key
+                    record[field] = value
+
+    return records
+
+def merge_dicts_by_key(dicts):
+    """Combine several key -> row mappings into one; later rows overwrite shared columns."""
+    combined = defaultdict(dict)
+    for per_file in dicts:
+        for key in per_file:
+            combined[key].update(per_file[key])
+    return combined
+
+def write_merged_csv(merged_data, output_file):
+    """Write the merged rows to *output_file* as CSV.
+
+    Rows are ordered by the first key element (as text) and then by the second
+    key element converted to an integer. The header lists KEY_COLUMNS first,
+    followed by every other column name in alphabetical order.
+    """
+    def row_order(key):
+        return (key[0], int(key[1]))
+
+    ordered_keys = sorted(merged_data, key=row_order)
+
+    # Union of every column name that appears in any row
+    seen_columns = {name for record in merged_data.values() for name in record}
+    extra_columns = sorted(seen_columns.difference(KEY_COLUMNS))
+    header = [*KEY_COLUMNS, *extra_columns]
+
+    with open(output_file, 'w', newline='') as handle:
+        table = csv.DictWriter(handle, fieldnames=header)
+        table.writeheader()
+        table.writerows(merged_data[key] for key in ordered_keys)
+
+def main():
+    """Parse the command line, load every input CSV, merge them and write the result."""
+    cli = argparse.ArgumentParser(
+        description="Merge TP/FP/FN CSVs by CHROM,POS,tag, keep selected fields, and suffix REF/ALT from filename."
+    )
+    cli.add_argument("files", nargs='+', help="Input CSV files (e.g. *_TP.csv)")
+    cli.add_argument("--output", required=True, help="Output merged CSV file")
+    args = cli.parse_args()
+
+    per_sample = []
+    for csv_path in args.files:
+        # The file name (without extension) labels this file's REF/ALT columns
+        sample = extract_sample_suffix(csv_path)
+        print(f"Processing {csv_path} (sample: {sample})")
+        per_sample.append(load_csv_by_key(csv_path, sample))
+
+    write_merged_csv(merge_dicts_by_key(per_sample), args.output)
+    print(f"Merged CSV written to {args.output}")
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+
+# Copyright 2025 - GHGA
+# Author: Kuebra Narci - @kubranarci
+'''
+Splits a sompy feature CSV file into separate TP, FP and FN CSV files
+Expected usage:
+    $ python split_sompy_features.py <input_csv> <prefix>
+Use --help for more information.
+'''
+import csv
+import argparse
+import os
+
+def split_csv_by_tag(input_file, prefix):
+    """Split a CSV file into ``<prefix>_TP.csv``, ``<prefix>_FP.csv`` and ``<prefix>_FN.csv``.
+
+    Each output file starts with the input header row. A data row is copied to
+    the file matching the value of its ``tag`` column; rows carrying any other
+    tag, or too short to contain the tag column, are dropped.
+
+    Args:
+        input_file: Path to the CSV file to split.
+        prefix: Prefix (may include a directory) of the three output files.
+
+    Raises:
+        SystemExit: If a file cannot be found or the input has no header row,
+            so that the calling pipeline sees a non-zero exit status.
+    """
+    from contextlib import ExitStack
+
+    output_files = {tag: f'{prefix}_{tag}.csv' for tag in ('TP', 'FP', 'FN')}
+
+    try:
+        # ExitStack closes every file opened so far, even when a later step fails
+        with ExitStack() as stack:
+            infile = stack.enter_context(open(input_file, newline=''))
+            reader = csv.reader(infile)
+            header = next(reader, None)
+            if header is None:
+                raise SystemExit(f"Error: File '{input_file}' is empty (no header row).")
+
+            # Locate the tag column by name; fall back to the fourth column
+            # when the header does not contain a 'tag' entry
+            tag_index = header.index('tag') if 'tag' in header else 3
+
+            # Prepare output writers
+            writers = {}
+            for tag, filename in output_files.items():
+                outfile = stack.enter_context(open(filename, 'w', newline=''))
+                writers[tag] = csv.writer(outfile)
+                writers[tag].writerow(header)
+
+            # Write rows to correct files
+            for row in reader:
+                if len(row) > tag_index and row[tag_index] in writers:
+                    writers[row[tag_index]].writerow(row)
+    except FileNotFoundError as err:
+        # err.filename names the path that is actually missing (input or output)
+        raise SystemExit(f"Error: File '{err.filename}' not found.") from None
+
+    print("Done. Files created:")
+    for filename in output_files.values():
+        print(f"  - {filename}")
+
+def main():
+    """Parse the command line and split the given CSV into TP, FP and FN files."""
+    parser = argparse.ArgumentParser(description="Split a CSV file into TP, FP, and FN files based on the 'tag' column.")
+    parser.add_argument("input_csv", help="Path to the input CSV file")
+    # The help text previously repeated the input-file description
+    parser.add_argument("prefix", help="Prefix for the output files (<prefix>_TP.csv, <prefix>_FP.csv, <prefix>_FN.csv)")
+    args = parser.parse_args()
+
+    split_csv_by_tag(args.input_csv, args.prefix)
+
+if __name__ == "__main__":
+    main()
@@ -300,14 +300,20 @@ process {
 
     withName: "HAPPY_SOMPY" {
         ext.prefix = {"${meta.id}.${params.truth_id}.${meta.caller}"}
-        ext.args = { meta.caller.contains("strelka") || meta.caller.contains("varscan") || meta.caller.contains("pisces") ? "--feature-table hcc.${meta.caller}.${params.variant_type} --bin-afs" : "--feature-table generic" }
+        // strelka/varscan/pisces (substring match) and mutect (exact match) get the
+        // "hcc.<caller>.<variant_type>" feature table with --bin-afs; other callers use "generic".
+        ext.args = { meta.caller.contains("strelka") || meta.caller.contains("varscan") || meta.caller.contains("pisces") ||  meta.caller == "mutect" ? "--feature-table hcc.${meta.caller}.${params.variant_type} --bin-afs" : "--feature-table generic" }
         publishDir = [
             path: {"${params.outdir}/${params.variant_type}/${meta.id}/benchmarks/sompy"},
             pattern: "*{.csv.gz,csv,json.gz,json,vcf.gz,vcf.gz.tbi,csv}",
             mode: params.publish_dir_mode
         ]
     }
 
+    withName: "SPLIT_SOMPY_FEATURES" {
+        // Publishing is switched off for this process.
+        publishDir = [
+            enabled: false
+        ]
+    }
+
     withName: "HAPPY_PREPY" {
         ext.prefix = {"${meta.id}.${params.truth_id}.${meta.caller}.prepy"}
         ext.args   = {"--fixchr --filter-nonref --bcftools-norm"}
@@ -455,12 +461,20 @@ process {
     withName: VCF_TO_CSV {
         ext.prefix = {"${meta.id}.${meta.tag}"}
         publishDir = [
-            path: {"${params.outdir}/${params.variant_type}/summary/comparisons/"},
+            // Results now go into one sub-directory per meta.id under summary/comparisons.
+            path: {"${params.outdir}/${params.variant_type}/summary/comparisons/${meta.id}"},
             pattern: "*{.csv}",
             mode: params.publish_dir_mode
         ]
     }
 
+    withName: MERGE_SOMPY_FEATURES {
+        // Publish files whose names end in "csv" into the per-meta.id comparisons directory.
+        publishDir = [
+            path: {"${params.outdir}/${params.variant_type}/summary/comparisons/${meta.id}"},
+            pattern: "*{csv}",
+            mode: params.publish_dir_mode
+        ]
+    }
+
     // VCF2BED tools
 
     withName: "SVTK_VCF2BED" {
 
@@ -0,0 +1,7 @@
+# Conda environment for this module: pandas 2.2.3 is installed through pip.
+# NOTE(review): confirm that the module's script really needs pandas — TODO verify.
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - pip
+  - pip:
+      - pandas==2.2.3
@@ -0,0 +1,43 @@
+process MERGE_SOMPY_FEATURES {
+    // Runs merge_sompy_features.py on the CSV files received with one meta map
+    // and writes a single "<prefix>.<meta.tag>.summary.csv".
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/ab/ab3b0054e3111812d8f2deb12345d5b7ca7ea7b18a2dbcbf174d46274c28deba/data':
+        'community.wave.seqera.io/library/pip_pandas:40d2e76c16c136f0' }"
+
+    input:
+    // csvs: the CSV file(s) to merge; all of them are passed to the script as positional arguments
+    tuple val(meta), path(csvs)
+
+    output:
+    // NOTE(review): the channel is named "TP" although the file is the merged summary —
+    // confirm whether a more descriptive emit name is intended.
+    tuple val(meta), path("*.summary.csv")   , emit: TP
+    path "versions.yml"                      , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // NOTE: args is read from task.ext.args but is not passed to the command below.
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    """
+    merge_sompy_features.py $csvs --output ${prefix}.${meta.tag}.summary.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+    END_VERSIONS
+    """
+    // Stub run: create an empty output file and the versions file without running the script.
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}.${meta.tag}.summary.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+    END_VERSIONS
+    """
+
+}
@@ -0,0 +1,7 @@
+# Conda environment for this module: pandas 2.2.3 is installed through pip.
+# NOTE(review): confirm that the module's script really needs pandas — TODO verify.
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - pip
+  - pip:
+      - pandas==2.2.3
@@ -0,0 +1,46 @@
+process SPLIT_SOMPY_FEATURES {
+    // Runs split_sompy_features.py on one input file and collects the
+    // "<prefix>_TP.csv", "<prefix>_FP.csv" and "<prefix>_FN.csv" files it writes.
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/ab/ab3b0054e3111812d8f2deb12345d5b7ca7ea7b18a2dbcbf174d46274c28deba/data':
+        'community.wave.seqera.io/library/pip_pandas:40d2e76c16c136f0' }"
+
+    input:
+    // input: the CSV file handed to the script as its first argument
+    tuple val(meta), path(input)
+
+    output:
+    // One channel per tag; each glob matches the file name ending the script produces for that tag.
+    tuple val(meta), path("*TP.csv")   , emit: TP
+    tuple val(meta), path("*FP.csv")   , emit: FP
+    tuple val(meta), path("*FN.csv")   , emit: FN
+    path "versions.yml"                , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // NOTE: args is read from task.ext.args but is not passed to the command below.
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    """
+    split_sompy_features.py $input $prefix
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+    END_VERSIONS
+    """
+    // Stub run: create empty per-tag files and the versions file without running the script.
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}_TP.csv
+    touch ${prefix}_FP.csv
+    touch ${prefix}_FN.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+    END_VERSIONS
+    """
+
+}
@@ -3,17 +3,19 @@
 // COMPARE_BENCHMARK_RESULTS: SUBWORKFLOW to merge TP/FP/FN results from different tools.
 //
 
-include { SURVIVOR_MERGE    } from '../../../modules/nf-core/survivor/merge'
-include { BCFTOOLS_MERGE    } from '../../../modules/nf-core/bcftools/merge'
-include { VCF_TO_CSV        } from '../../../modules/local/custom/vcf_to_csv'
-include { REFORMAT_HEADER   } from '../../../modules/local/custom/reformat_header'
+include { SURVIVOR_MERGE       } from '../../../modules/nf-core/survivor/merge'
+include { BCFTOOLS_MERGE       } from '../../../modules/nf-core/bcftools/merge'
+include { VCF_TO_CSV           } from '../../../modules/local/custom/vcf_to_csv'
+include { REFORMAT_HEADER      } from '../../../modules/local/custom/reformat_header'
+include { MERGE_SOMPY_FEATURES } from '../../../modules/local/custom/merge_sompy_features'
 include { TABIX_BGZIP as TABIX_BGZIP_UNZIP } from '../../../modules/nf-core/tabix/bgzip'
 
 workflow COMPARE_BENCHMARK_RESULTS {
     take:
-    evaluations // channel: [val(meta), vcf.gz, index]
-    fasta       // reference channel [val(meta), ref.fa]
-    fai         // reference channel [val(meta), ref.fa.fai]
+    evaluations     // channel: [val(meta), vcf.gz, index]
+    evaluations_csv // channel: [val(meta), csv]
+    fasta           // reference channel [val(meta), ref.fa]
+    fai             // reference channel [val(meta), ref.fa.fai]
 
     main:
     versions    = Channel.empty()
@@ -69,6 +71,12 @@ workflow COMPARE_BENCHMARK_RESULTS {
     )
     versions = versions.mix(VCF_TO_CSV.out.versions.first())
 
+
+    MERGE_SOMPY_FEATURES(
+        evaluations_csv.groupTuple()
+    )
+    versions = versions.mix(MERGE_SOMPY_FEATURES.out.versions.first())
+
     emit:
     merged_vcfs  // channel: [val(meta), vcf]
     versions     // channel: [versions.yml]