-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathget_assembly_stats.py
More file actions
106 lines (91 loc) · 3.33 KB
/
get_assembly_stats.py
File metadata and controls
106 lines (91 loc) · 3.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import numpy as np
from itertools import groupby
import json
import sys
def fasta_iter(fasta_file):
    """Yield (header, sequence) pairs from a FASTA file.

    Streams the file record by record, so the entire file is never
    held in memory at once.

    Parameters
    ----------
    fasta_file : str
        The file location of the FASTA file.

    Yields
    ------
    header : str
        The header portion of the record (everything after the '>').
    seq : str
        The upper-cased sequence of the record with line breaks removed.
    """
    # 'with' guarantees the handle is closed when the generator is
    # exhausted or garbage-collected; the original left it open.
    with open(fasta_file) as fh:
        # groupby alternates between runs of header lines and runs of
        # sequence lines, keyed on whether the line starts with '>'.
        fa_iter = (group for _, group in groupby(fh, lambda line: line[0] == ">"))
        for header_group in fa_iter:
            # Drop the leading ">" and surrounding whitespace.
            header = next(header_group)[1:].strip()
            # Join all sequence lines of this record into one string.
            seq = "".join(line.strip().upper() for line in next(fa_iter))
            yield header, seq
def read_genome(fasta_file):
    """Collect contig/scaffold lengths and GC content for a genome.

    Parameters
    ----------
    fasta_file : str
        The file location of the FASTA file.

    Returns
    -------
    contig_lens : list
        Lengths of all (non-empty) contigs in the genome.
    scaffold_lens : list
        Lengths of all scaffolds in the genome.
    gc_cont : float
        Percentage of contig basepairs that are G or C.  0.0 for an
        empty genome (instead of raising ZeroDivisionError).
    """
    gc = 0
    total_len = 0
    contig_lens = []
    scaffold_lens = []
    for _, seq in fasta_iter(fasta_file):
        scaffold_lens.append(len(seq))
        # Scaffolds are split into contigs at runs of two Ns.
        # NOTE(review): a plain "NN" split can leave a stray leading N
        # on the following contig when the gap has odd length (e.g.
        # "NNN") -- confirm this matches the intended contig definition.
        if "NN" in seq:
            contig_list = seq.split("NN")
        else:
            contig_list = [seq]
        for contig in contig_list:
            if len(contig):
                # Sequences come from fasta_iter already upper-cased,
                # so counting 'G'/'C' covers all G/C bases.
                gc += contig.count('G') + contig.count('C')
                total_len += len(contig)
                contig_lens.append(len(contig))
    # Guard against an empty genome (no non-empty contigs at all).
    gc_cont = (gc / total_len) * 100 if total_len else 0.0
    return contig_lens, scaffold_lens, gc_cont
def calculate_stats(seq_lens, gc_cont):
    """Compute assembly statistics from a list of sequence lengths.

    Parameters
    ----------
    seq_lens : list
        Lengths (ints) of all sequences (contigs or scaffolds).
    gc_cont : float
        Pre-computed GC percentage; copied into the result unchanged.

    Returns
    -------
    dict
        Keys: ``sequence_count``, ``gc_content``, ``longest``,
        ``shortest``, ``median``, ``mean``, ``total_bps``, plus
        ``N<level>``/``L<level>`` for each level in 10..95.
        ``L<level>`` is the zero-based index into the
        descending-sorted lengths (one less than the conventional
        one-based L-statistic).
    """
    stats = {}
    seq_array = np.array(seq_lens)
    stats['sequence_count'] = seq_array.size
    stats['gc_content'] = gc_cont
    # Sort lengths in descending order for the N/L statistics.
    sorted_lens = np.sort(seq_array)[::-1]
    stats['longest'] = int(sorted_lens[0])
    stats['shortest'] = int(sorted_lens[-1])
    stats['median'] = np.median(sorted_lens)
    stats['mean'] = np.mean(sorted_lens)
    stats['total_bps'] = int(np.sum(sorted_lens))
    csum = np.cumsum(sorted_lens)
    for level in [10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 95]:
        nx = int(stats['total_bps'] * (level / 100))
        # Index of the first cumulative sum >= nx.  searchsorted is
        # equivalent to the original min/where lookup when the
        # cumulative sums are unique, and remains correct when they
        # are not (zero-length sequences), where the original
        # ``int(np.where(...)[0])`` raised a TypeError converting a
        # multi-element array to a scalar.
        l_level = int(np.searchsorted(csum, nx, side='left'))
        stats['L' + str(level)] = l_level
        stats['N' + str(level)] = int(sorted_lens[l_level])
    return stats
if __name__ == "__main__":
    # The sole CLI argument is the path to the input FASTA file.
    fasta_path = sys.argv[1]
    contigs, scaffolds, gc = read_genome(fasta_path)
    # Report contig- and scaffold-level statistics as pretty JSON.
    report = {
        'Contig Stats': calculate_stats(contigs, gc),
        'Scaffold Stats': calculate_stats(scaffolds, gc),
    }
    print(json.dumps(report, indent=2, sort_keys=True))