Skip to content

Commit cb13c6f

Browse files
authored
Merge branch 'dev' into add-rtgtools-bndeval
2 parents 43937ff + b932094 commit cb13c6f

20 files changed

Lines changed: 425 additions & 85 deletions

bin/merge_sompy_features.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
#!/usr/bin/env python
2+
3+
# Copyright 2025 - GHGA
4+
# Author: Kuebra Narci - @kubranarci
5+
'''
6+
Generates a CSV file from a VCF
7+
Expected usage:
8+
$ python split_sompy_features.py <vcf_file> <prefix>
9+
Use --help for more information.
10+
'''
11+
12+
import csv
13+
import argparse
14+
from collections import defaultdict
15+
import os
16+
17+
KEY_COLUMNS = ["CHROM", "POS", "tag"]
18+
FIELDS_TO_EXTRACT = ["CHROM", "POS", "tag", "REF", "REF.truth", "ALT", "ALT.truth", "QUAL", "FILTER"]
19+
FIELDS_TO_SUFFIX = ["REF", "ALT"]
20+
21+
def extract_sample_suffix(filename):
    """Return the sample suffix for *filename*: its base name without the final extension."""
    base_name = os.path.basename(filename)
    stem, _ext = os.path.splitext(base_name)
    return stem
24+
25+
def load_csv_by_key(filepath, suffix):
    """Read one CSV file and index its rows by the (CHROM, POS, tag) key.

    REF/ALT columns are renamed with the per-sample *suffix* so they survive
    the later cross-sample merge; key columns are kept verbatim; every other
    extracted field keeps its first-seen value for a given key.
    """
    records = {}
    with open(filepath, newline='') as handle:
        for row in csv.DictReader(handle):
            key = tuple(row[col] for col in KEY_COLUMNS)
            already_stored = records.get(key, {})
            extracted = {}

            for field in FIELDS_TO_EXTRACT:
                if field in FIELDS_TO_SUFFIX:
                    # Sample-specific column, e.g. REF_sampleA
                    extracted[f"{field}_{suffix}"] = row.get(field, "")
                elif field in KEY_COLUMNS:
                    extracted[field] = row.get(field, "")
                elif field not in already_stored:
                    # Shared column: first occurrence for this key wins.
                    extracted[field] = row.get(field, "")

            if key in records:
                records[key].update(extracted)
            else:
                records[key] = extracted

    return records
49+
50+
def merge_dicts_by_key(dicts):
    """Fold a list of per-sample dicts into one dict keyed on the shared key.

    Later dicts overwrite overlapping columns of earlier ones for the same key.
    """
    combined = defaultdict(dict)
    for sample_data in dicts:
        for key, fields in sample_data.items():
            combined[key].update(fields)
    return combined
57+
58+
def write_merged_csv(merged_data, output_file):
    """Write the merged records to *output_file* as a CSV.

    Rows are ordered by chromosome (lexicographic) then numeric position;
    key columns lead the header, remaining columns follow alphabetically.
    Missing fields for a row are written as empty cells by DictWriter.
    """
    # Union of every column seen across all merged rows.
    all_fields = set()
    for fields in merged_data.values():
        all_fields |= set(fields)

    header = KEY_COLUMNS + sorted(all_fields - set(KEY_COLUMNS))

    # POS is stored as a string; compare it numerically for a sensible order.
    ordered_keys = sorted(merged_data, key=lambda k: (k[0], int(k[1])))

    with open(output_file, 'w', newline='') as handle:
        writer = csv.DictWriter(handle, fieldnames=header)
        writer.writeheader()
        writer.writerows(merged_data[k] for k in ordered_keys)
75+
76+
def main():
    """CLI entry point: merge per-sample som.py feature CSVs into one summary CSV."""
    parser = argparse.ArgumentParser(
        description="Merge TP/FP/FN CSVs by CHROM,POS,tag, keep selected fields, and suffix REF/ALT from filename."
    )
    parser.add_argument("files", nargs='+', help="Input CSV files (e.g. *_TP.csv)")
    parser.add_argument("--output", required=True, help="Output merged CSV file")
    args = parser.parse_args()

    per_sample = []
    for path in args.files:
        sample = extract_sample_suffix(path)
        print(f"Processing {path} (sample: {sample})")
        per_sample.append(load_csv_by_key(path, sample))

    merged = merge_dicts_by_key(per_sample)
    write_merged_csv(merged, args.output)
    print(f"Merged CSV written to {args.output}")

if __name__ == "__main__":
    main()

bin/split_sompy_features.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
#!/usr/bin/env python
2+
3+
# Copyright 2025 - GHGA
4+
# Author: Kuebra Narci - @kubranarci
5+
'''
6+
Generates a CSV file from a VCF
7+
Expected usage:
8+
$ python split_sompy_features.py <vcf_file> <prefix>
9+
Use --help for more information.
10+
'''
11+
import csv
12+
import argparse
13+
import os
14+
15+
def split_csv_by_tag(input_file, prefix):
    """Split *input_file* into <prefix>_TP.csv, <prefix>_FP.csv and <prefix>_FN.csv.

    Each output file gets a copy of the input header; data rows are routed by
    the value in column 4 (index 3), which holds the som.py 'tag'. Rows with
    fewer than 4 columns or an unknown tag are silently dropped (unchanged
    behaviour). Errors are reported on stdout rather than raised, matching
    the original script's best-effort contract.
    """
    output_files = {
        'TP': f'{prefix}_TP.csv',
        'FP': f'{prefix}_FP.csv',
        'FN': f'{prefix}_FN.csv'
    }

    try:
        with open(input_file, newline='') as infile:
            reader = csv.reader(infile)
            header = next(reader)

            # Prepare output writers; close them even if writing fails
            # part-way (the original leaked the handles on error).
            writers = {}
            files = {}
            try:
                for tag, filename in output_files.items():
                    f = open(filename, 'w', newline='')
                    files[tag] = f
                    writer = csv.writer(f)
                    writer.writerow(header)
                    writers[tag] = writer

                # Route each row to the file matching its tag column.
                for row in reader:
                    if len(row) > 3:
                        tag = row[3]
                        if tag in writers:
                            writers[tag].writerow(row)
            finally:
                for f in files.values():
                    f.close()

        print("Done. Files created:")
        for filename in output_files.values():
            # Fixed: the original printed a placeholder instead of the filename.
            print(f" - {filename}")

    except FileNotFoundError:
        print(f"Error: File '{input_file}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
56+
57+
def main():
    """CLI entry point: split a som.py feature CSV into TP/FP/FN files."""
    parser = argparse.ArgumentParser(description="Split a CSV file into TP, FP, and FN files based on the 'tag' column.")
    parser.add_argument("input_csv", help="Path to the input CSV file")
    # Fixed copy-pasted help text: this argument is the output prefix, not an input path.
    parser.add_argument("prefix", help="Prefix for the output CSV files (<prefix>_TP.csv, <prefix>_FP.csv, <prefix>_FN.csv)")
    args = parser.parse_args()

    split_csv_by_tag(args.input_csv, args.prefix)

if __name__ == "__main__":
    main()

conf/modules.config

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -316,14 +316,20 @@ process {
316316

317317
withName: "HAPPY_SOMPY" {
318318
ext.prefix = {"${meta.id}.${params.truth_id}.${meta.caller}"}
319-
ext.args = { meta.caller.contains("strelka") || meta.caller.contains("varscan") || meta.caller.contains("pisces") ? "--feature-table hcc.${meta.caller}.${params.variant_type} --bin-afs" : "--feature-table generic" }
319+
ext.args = { meta.caller.contains("strelka") || meta.caller.contains("varscan") || meta.caller.contains("pisces") || meta.caller == "mutect" ? "--feature-table hcc.${meta.caller}.${params.variant_type} --bin-afs" : "--feature-table generic" }
320320
publishDir = [
321321
path: {"${params.outdir}/${params.variant_type}/${meta.id}/benchmarks/sompy"},
322322
pattern: "*{.csv.gz,csv,json.gz,json,vcf.gz,vcf.gz.tbi,csv}",
323323
mode: params.publish_dir_mode
324324
]
325325
}
326326

327+
withName: "SPLIT_SOMPY_FEATURES" {
328+
publishDir = [
329+
enabled: false
330+
]
331+
}
332+
327333
withName: "HAPPY_PREPY" {
328334
ext.prefix = {"${meta.id}.${params.truth_id}.${meta.caller}.prepy"}
329335
ext.args = {"--fixchr --filter-nonref --bcftools-norm"}
@@ -471,12 +477,20 @@ process {
471477
withName: VCF_TO_CSV {
472478
ext.prefix = {"${meta.id}.${meta.tag}"}
473479
publishDir = [
474-
path: {"${params.outdir}/${params.variant_type}/summary/comparisons/"},
480+
path: {"${params.outdir}/${params.variant_type}/summary/comparisons/${meta.id}"},
475481
pattern: "*{.csv}",
476482
mode: params.publish_dir_mode
477483
]
478484
}
479485

486+
withName: MERGE_SOMPY_FEATURES {
487+
publishDir = [
488+
path: {"${params.outdir}/${params.variant_type}/summary/comparisons/${meta.id}"},
489+
pattern: "*{csv}",
490+
mode: params.publish_dir_mode
491+
]
492+
}
493+
480494
// VCF2BED tools
481495

482496
withName: "SVTK_VCF2BED" {
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
channels:
2+
- conda-forge
3+
- bioconda
4+
dependencies:
5+
- pip
6+
- pip:
7+
- pandas==2.2.3
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
// Merge per-sample som.py feature CSVs belonging to one meta group into a
// single summary CSV via bin/merge_sompy_features.py.
process MERGE_SOMPY_FEATURES {
    tag "$meta.id"
    label 'process_single'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/ab/ab3b0054e3111812d8f2deb12345d5b7ca7ea7b18a2dbcbf174d46274c28deba/data':
        'community.wave.seqera.io/library/pip_pandas:40d2e76c16c136f0' }"

    input:
    // csvs: one or more feature CSVs to merge for this meta group
    tuple val(meta), path(csvs)

    output:
    // NOTE(review): emit name 'TP' looks misleading for a merged summary file —
    // confirm downstream references before renaming.
    tuple val(meta), path("*.summary.csv") , emit: TP
    path "versions.yml"                    , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''  // currently not passed to the script call
    def prefix = task.ext.prefix ?: "${meta.id}"

    """
    merge_sompy_features.py $csvs --output ${prefix}.${meta.tag}.summary.csv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python --version | sed 's/Python //g')
    END_VERSIONS
    """

    stub:
    def prefix = task.ext.prefix ?: "${meta.id}"
    """
    touch ${prefix}.${meta.tag}.summary.csv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python --version | sed 's/Python //g')
    END_VERSIONS
    """

}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
channels:
2+
- conda-forge
3+
- bioconda
4+
dependencies:
5+
- pip
6+
- pip:
7+
- pandas==2.2.3
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
// Split one som.py feature CSV into separate TP/FP/FN CSVs via
// bin/split_sompy_features.py (routes rows on the 'tag' column).
process SPLIT_SOMPY_FEATURES {
    tag "$meta.id"
    label 'process_single'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/ab/ab3b0054e3111812d8f2deb12345d5b7ca7ea7b18a2dbcbf174d46274c28deba/data':
        'community.wave.seqera.io/library/pip_pandas:40d2e76c16c136f0' }"

    input:
    // input: a single som.py feature CSV for this meta group
    tuple val(meta), path(input)

    output:
    tuple val(meta), path("*TP.csv") , emit: TP
    tuple val(meta), path("*FP.csv") , emit: FP
    tuple val(meta), path("*FN.csv") , emit: FN
    path "versions.yml"              , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''  // currently not passed to the script call
    def prefix = task.ext.prefix ?: "${meta.id}"

    """
    split_sompy_features.py $input $prefix
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python --version | sed 's/Python //g')
    END_VERSIONS
    """

    stub:
    def prefix = task.ext.prefix ?: "${meta.id}"
    """
    touch ${prefix}_TP.csv
    touch ${prefix}_FP.csv
    touch ${prefix}_FN.csv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python --version | sed 's/Python //g')
    END_VERSIONS
    """

}

subworkflows/local/compare_benchmark_results/main.nf

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,19 @@
33
// COMPARE_BENCHMARK_RESULTS: SUBWORKFLOW to merge TP/FP/FN results from different tools.
44
//
55

6-
include { SURVIVOR_MERGE } from '../../../modules/nf-core/survivor/merge'
7-
include { BCFTOOLS_MERGE } from '../../../modules/nf-core/bcftools/merge'
8-
include { VCF_TO_CSV } from '../../../modules/local/custom/vcf_to_csv'
9-
include { REFORMAT_HEADER } from '../../../modules/local/custom/reformat_header'
6+
include { SURVIVOR_MERGE } from '../../../modules/nf-core/survivor/merge'
7+
include { BCFTOOLS_MERGE } from '../../../modules/nf-core/bcftools/merge'
8+
include { VCF_TO_CSV } from '../../../modules/local/custom/vcf_to_csv'
9+
include { REFORMAT_HEADER } from '../../../modules/local/custom/reformat_header'
10+
include { MERGE_SOMPY_FEATURES } from '../../../modules/local/custom/merge_sompy_features'
1011
include { TABIX_BGZIP as TABIX_BGZIP_UNZIP } from '../../../modules/nf-core/tabix/bgzip'
1112

1213
workflow COMPARE_BENCHMARK_RESULTS {
1314
take:
14-
evaluations // channel: [val(meta), vcf.gz, index]
15-
fasta // reference channel [val(meta), ref.fa]
16-
fai // reference channel [val(meta), ref.fa.fai]
15+
evaluations // channel: [val(meta), vcf.gz, index]
16+
evaluations_csv // channel: [val(meta), csv]
17+
fasta // reference channel [val(meta), ref.fa]
18+
fai // reference channel [val(meta), ref.fa.fai]
1719

1820
main:
1921
versions = Channel.empty()
@@ -69,6 +71,12 @@ workflow COMPARE_BENCHMARK_RESULTS {
6971
)
7072
versions = versions.mix(VCF_TO_CSV.out.versions.first())
7173

74+
75+
MERGE_SOMPY_FEATURES(
76+
evaluations_csv.groupTuple()
77+
)
78+
versions = versions.mix(MERGE_SOMPY_FEATURES.out.versions.first())
79+
7280
emit:
7381
merged_vcfs // channel: [val(meta), vcf]
7482
versions // channel: [versions.yml]

0 commit comments

Comments
 (0)