-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbatch_merge.py
More file actions
110 lines (90 loc) · 2.67 KB
/
batch_merge.py
File metadata and controls
110 lines (90 loc) · 2.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/python
# filename: batch_merge.py
###########################################################################
#
# Copyright (c) 2013 Bryan Briney. All rights reserved.
# @version: 1.1.0
# @author: Bryan Briney
# @props: IgBLAST team (http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi)
# @license: MIT (http://opensource.org/licenses/MIT)
#
###########################################################################
###########################################################################
# Fork Development
# Copyright (c) 2021-2025 Rinat Mukhometzianov.
# @version: 1.1.1
# @author: Rinat Mukhometzianov
#
###########################################################################
import os
import glob
import shutil
import argparse
from collections import defaultdict
import pandaseq
# Command-line interface.
# NOTE: the arguments are parsed at import time and the resulting ``args``
# namespace is read as a module-level global by concat() and main().
# The text is passed as ``description=``: the first positional parameter of
# ArgumentParser is ``prog``, which would make the sentence show up as the
# program name in the usage line instead of as the help description.
parser = argparse.ArgumentParser(
    description="Batch merging of paired-end reads with PANDAseq"
)
parser.add_argument(
    "-i",
    "--in",
    dest="input",
    type=str,
    required=True,
    help="The input directory, containing paired FASTQ files"
    " (uncompressed or gzip compressed). Required.",
)
parser.add_argument(
    "-o",
    "--out",
    dest="output",
    type=str,
    required=True,
    help="The output directory, will contain merged FASTA files. Required.",
)
parser.add_argument(
    "-n",
    "--nextseq",
    dest="nextseq",
    default=False,
    action="store_true",
    help="Use flag if run was performed on a NextSeq sequencer.",
)
args = parser.parse_args()
def make_dir(d):
    """Create directory *d* if it does not already exist.

    Missing parent directories are created as well, and an already
    existing directory is left untouched.

    Args:
        d: Path of the directory to create.

    Raises:
        FileExistsError: If *d* exists but is not a directory.
    """
    # A single makedirs call avoids the check-then-create race of
    # ``os.path.exists`` followed by ``os.mkdir``.
    os.makedirs(d, exist_ok=True)
def list_files(d):
    """Return the sorted paths of the regular files directly inside *d*.

    Subdirectories are skipped, and so are names starting with a dot
    (``glob`` does not match them with ``*``).

    Args:
        d: Path of the directory to list.

    Returns:
        A sorted list of path strings, each of the form ``<d>/<name>``.
    """
    # Escape the directory part so that glob metacharacters in its name
    # ("[", "*", "?") are matched literally rather than as a pattern.
    pattern = glob.escape(d) + "/*"
    return sorted(f for f in glob.glob(pattern) if os.path.isfile(f))
def bin_files(files):
    """Group file paths by the prefix of their base name.

    The prefix is the base name with its last underscore-delimited field
    removed; a base name without any underscore is used whole.

    Args:
        files: Iterable of file paths.

    Returns:
        A ``defaultdict(list)`` mapping each prefix to the paths sharing it,
        in the order they were encountered.
    """
    groups = defaultdict(list)
    for path in files:
        # rsplit with maxsplit=1 drops only the final "_"-separated field and
        # returns the whole name unchanged when there is no underscore.
        key = os.path.basename(path).rsplit("_", 1)[0]
        groups[key].append(path)
    return groups
def concat(d):
    """Concatenate the files in *d* into one FASTA file per prefix.

    The regular files in *d* are grouped with ``bin_files`` and every group
    is written, in sorted order, to ``<prefix>.fasta`` inside the output
    directory given on the command line (``args.output``).

    Args:
        d: Directory holding the files to concatenate.
    """
    files = list_files(d)
    file_bins = bin_files(files)
    for prefix, grouped_files in file_bins.items():
        # args.output is a plain string (argparse ``type=str``), so the path
        # must be built with os.path.join; the "/" operator and ``.open()``
        # only exist on pathlib.Path objects.
        outfile = os.path.join(args.output, f"{prefix}.fasta")
        with open(outfile, "w") as out_fh:
            for fpath in grouped_files:
                with open(fpath, "r") as in_fh:
                    # Stream line by line; the input is never fully loaded.
                    out_fh.writelines(in_fh)
def main():
    """Run the merge over the input directory given on the command line.

    Without ``--nextseq`` the results of ``pandaseq.run`` go straight to the
    output directory. With ``--nextseq`` they go to a ``temp`` subdirectory
    first, are then concatenated per prefix into the output directory by
    ``concat``, and the temporary subdirectory is removed afterwards.
    """
    make_dir(args.output)

    # Pick where pandaseq writes: a scratch subdirectory for NextSeq runs,
    # the final output directory otherwise.
    if args.nextseq:
        work_dir = os.path.join(args.output, "temp")
        make_dir(work_dir)
    else:
        work_dir = args.output

    pandaseq.run(args.input, work_dir, args.nextseq)

    if not args.nextseq:
        return

    print("\nConcatenating NextSeq lane files for each sample...")
    concat(work_dir)
    shutil.rmtree(work_dir)
    print("Done.\n")


# Only run when executed as a script, not when imported.
if __name__ == "__main__":
    main()