abanalysis/batch_merge.py at master · rmukh/abanalysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/python
# filename: batch_merge.py

###########################################################################
#
# Copyright (c) 2013 Bryan Briney.  All rights reserved.
# @version: 1.1.0
# @author: Bryan Briney
# @props: IgBLAST team (http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi)
# @license: MIT (http://opensource.org/licenses/MIT)
#
###########################################################################
###########################################################################
# Fork Development
# Copyright (c) 2021-2025 Rinat Mukhometzianov.
# @version: 1.1.1
# @author: Rinat Mukhometzianov
#
###########################################################################


import os
import glob
import shutil
import argparse
from collections import defaultdict

import pandaseq

# Command-line interface.
# NOTE: arguments are parsed at module level, so the resulting ``args``
# namespace is a global that the functions below read directly.
#
# The summary text is passed as ``description``; the first positional
# parameter of ArgumentParser is ``prog`` (the program name shown in the usage
# line), which is not what this sentence is meant to be.
parser = argparse.ArgumentParser(
    description="Batch merging of paired-end reads with PANDAseq"
)
parser.add_argument(
    "-i",
    "--in",
    dest="input",
    type=str,
    required=True,
    help="The input directory, containing paired FASTQ files"
    " (uncompressed or gzip compressed). Required.",
)
parser.add_argument(
    "-o",
    "--out",
    dest="output",
    type=str,
    required=True,
    help="The output directory, will contain merged FASTA files. Required.",
)
parser.add_argument(
    "-n",
    "--nextseq",
    dest="nextseq",
    default=False,
    action="store_true",
    help="Use flag if run was performed on a NextSeq sequencer.",
)
args = parser.parse_args()


def make_dir(d):
    """Ensure that directory *d* exists, creating it if necessary.

    Missing parent directories are created as well, and an already existing
    directory is left untouched.

    Args:
        d: Path of the directory to create.

    Raises:
        FileExistsError: If *d* exists but is not a directory.
    """
    # A single makedirs call avoids the check-then-create race and also
    # handles nested output paths whose parents do not exist yet.
    os.makedirs(d, exist_ok=True)


def list_files(d):
    """Return the sorted paths of the regular files directly inside *d*.

    Sub-directories are skipped, and so are names starting with a dot
    (the ``*`` glob pattern does not match them).

    Args:
        d: Path of the directory to list.

    Returns:
        A sorted list of file paths, each prefixed with the directory path.
    """
    # Escape the directory name so glob metacharacters in it ("[", "]", "*",
    # "?") are matched literally instead of being treated as a pattern.
    pattern = os.path.join(glob.escape(d), "*")
    return sorted(f for f in glob.glob(pattern) if os.path.isfile(f))


def bin_files(files):
    """Group file paths by the part of their base name before the last "_".

    A base name without any underscore is used as its own group key.

    Args:
        files: Iterable of file paths.

    Returns:
        A ``defaultdict(list)`` mapping each prefix to the paths sharing it,
        with paths kept in the order they were given.
    """
    grouped = defaultdict(list)
    for path in files:
        # rsplit with maxsplit=1 drops only the final "_"-separated field and
        # leaves an underscore-free name unchanged.
        key = os.path.basename(path).rsplit("_", 1)[0]
        grouped[key].append(path)
    return grouped


def concat(d):
    """Concatenate the files in *d* into one FASTA file per name prefix.

    Files are grouped with ``bin_files`` and each group is written, in sorted
    path order, to ``<prefix>.fasta`` inside the ``args.output`` directory.

    Args:
        d: Directory holding the files to concatenate.
    """
    for prefix, grouped_files in bin_files(list_files(d)).items():
        # args.output is a plain string (argparse type=str), so the path must
        # be built with os.path.join; the "/" operator only works on Path
        # objects and would raise TypeError here.
        outfile = os.path.join(args.output, f"{prefix}.fasta")

        with open(outfile, "w") as out_fh:
            for fpath in grouped_files:
                with open(fpath, "r") as in_fh:
                    # Stream line by line rather than loading a whole file.
                    out_fh.writelines(in_fh)


def main():
    """Run the batch merge using the parsed command-line arguments.

    Without ``--nextseq`` the merge output goes straight into the output
    directory. With ``--nextseq`` it goes into a ``temp`` sub-directory first;
    those files are then concatenated into the output directory and the
    ``temp`` sub-directory is removed.
    """
    make_dir(args.output)

    if not args.nextseq:
        pandaseq.run(args.input, args.output, args.nextseq)
        return

    # NextSeq run: merge into a scratch directory, then combine the results.
    lane_dir = os.path.join(args.output, "temp")
    make_dir(lane_dir)
    pandaseq.run(args.input, lane_dir, args.nextseq)

    print("\nConcatenating NextSeq lane files for each sample...")
    concat(lane_dir)
    shutil.rmtree(lane_dir)
    print("Done.\n")


# Invoke main() only when this file is executed directly as a script.
if __name__ == "__main__":
    main()