-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfasta_parser.py
More file actions
176 lines (152 loc) · 4.78 KB
/
fasta_parser.py
File metadata and controls
176 lines (152 loc) · 4.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#!/usr/bin/env python
######################################################################################
## fasta_parser.py
#
# Author: Michele Berselli
# University of Padova
# berselli.michele@gmail.com
#
######################################################################################
import sys, argparse, os
from bitarray import bitarray
########################################
#### CLASS FastaHandler ####
########################################
class FastaHandler(object):
    ''' a class to implement an object to handle multiple fasta sequences

        Sequences are read from fasta files with parse() (plain text) or
        parse_binary() (2 bits per base, stored in a bitarray) and are
        accumulated in the handler across calls; parse_generator() streams
        (header, sequence) tuples without storing anything. '''

    ########################################
    ####      CLASS FastaSequence       ####
    ########################################
    class FastaSequence(object):
        ''' a class to implement an object representing a fasta sequence '''

        ## FUNCTIONS ##
        def __init__(self, header, sequence):
            ''' header is the header line without the leading '>',
                sequence is either a plain string or a bitarray holding
                2 bits per base (A=00, C=01, G=10, T=11) '''
            self.__header = header
            self.__sequence = sequence
        #end def __init__

        def __is_binary(self):
            ''' return True if the sequence is stored as a bitarray '''
            # str is tested first, so plain-text sequences never need to
            # consult the bitarray type at all
            return (not isinstance(self.__sequence, str)
                    and isinstance(self.__sequence, bitarray))
        #end def __is_binary

        def get_header(self):
            ''' return the header, without the leading '>' '''
            return self.__header
        #end def get_header

        def get_sequence(self):
            ''' return the sequence as text, decoding it first
                if it is stored as a bitarray '''
            if self.__is_binary():
                return self.__bitarray_to_seq()
            return self.__sequence
        #end def get_sequence

        def get_bitarray(self):
            ''' return the sequence as the stored bitarray,
                raise ValueError if the sequence is not binary encoded '''
            if not self.__is_binary():
                raise ValueError('Sequence is not encoded as a bitarray, use get_sequence() instead!')
            return self.__sequence
        #end def get_bitarray

        def __bitarray_to_seq(self):
            ''' decode the stored bits, two at a time, into bases '''
            bits = self.__sequence
            bases = 'ACGT'  # position is the 2-bit code: 00, 01, 10, 11
            # a trailing unpaired bit, if any, is ignored;
            # join() avoids rebuilding the string at every base
            return ''.join(
                bases[2 * int(bits[i]) + int(bits[i + 1])]
                for i in range(0, len(bits) - 1, 2)
            )
        #end def __bitarray_to_seq

        def write_sequence(self, fo, max_char_per_line=0):
            ''' write the sequence in fasta format to the open file fo,

                max_char_per_line is the maximum number of bases per line,
                zero (default) or a negative value writes the whole
                sequence on a single line '''
            fo.write('>' + self.__header + '\n')
            sequence = self.get_sequence()
            # a negative width can never consume the sequence,
            # so it is treated as "do not wrap" like zero
            if not max_char_per_line or max_char_per_line < 0:
                fo.write(sequence + '\n')
                return
            for start in range(0, len(sequence), max_char_per_line):
                fo.write(sequence[start:start + max_char_per_line] + '\n')
        #end def write_sequence
    #end class FastaSequence

    ## FUNCTIONS ##
    def __init__(self):
        ''' create a handler with no stored sequences '''
        self.__sequences = []
    #end def __init__

    def parse(self, inputfile):
        ''' adds fasta sequences as FastaSequence objects,
            raise ValueError if inputfile is missing '''
        for header, sequence in self.parse_generator(inputfile):
            self.__sequences.append(self.FastaSequence(header, sequence))
    #end def parse

    def parse_binary(self, inputfile):
        ''' adds fasta sequences binary encoded as FastaSequence objects,
            works only for upper or lower canonical bases A, C, T, G

            raise ValueError if inputfile is missing or if a non-canonical
            base is found; in the latter case the sequences stored in the
            handler are discarded '''
        encode = {
            'A': '00', 'a': '00',
            'C': '01', 'c': '01',
            'G': '10', 'g': '10',
            'T': '11', 't': '11'
        }
        header, sequence = None, bitarray()
        # the with block closes the file also when an error is raised
        with self.__open(inputfile) as fi:
            for line in fi:
                if line.startswith('>'):
                    # "is not None" keeps records whose header is empty
                    if header is not None:
                        self.__sequences.append(self.FastaSequence(header, sequence))
                    header, sequence = line.rstrip()[1:], bitarray()
                else:
                    try:
                        for c in line.rstrip():
                            sequence.extend(encode[c])
                    except KeyError:
                        self.__sequences = []
                        # header is still None if the offending line
                        # comes before the first '>' line
                        name = header if header is not None else '<no header>'
                        raise ValueError('Non-canonical base found in sequence ' + name + '!')
        if header is not None:
            self.__sequences.append(self.FastaSequence(header, sequence))
    #end def parse_binary

    def parse_generator(self, inputfile):
        ''' create a generator of the sequences,
            each iteration return a (header, sequence) tuple

            lines found before the first header are ignored,
            raise ValueError at the first iteration if inputfile is missing '''
        header, chunks = None, []
        # the with block closes the file also when the generator
        # is closed before reaching the end of the file
        with self.__open(inputfile) as fi:
            for line in fi:
                if line.startswith('>'):
                    # "is not None" keeps records whose header is empty
                    if header is not None:
                        yield (header, ''.join(chunks))
                    header, chunks = line.rstrip()[1:], []
                else:
                    chunks.append(line.rstrip())
        if header is not None:
            yield (header, ''.join(chunks))
    #end def parse_generator

    def __open(self, inputfile):
        ''' check the existence of inputfile and opens it if exists,
            raise ValueError if inputfile is not an existing regular file '''
        if not os.path.isfile(inputfile):
            raise ValueError('Input file is missing!')
        return open(inputfile, 'r')
    #end def __open

    def iter_sequences(self):
        ''' create an iterator over the stored sequences,
            each iteration return a FastaSequence object '''
        return iter(self.__sequences)
    #end def iter_sequences

    def get_sequences(self):
        ''' return the list of the stored FastaSequence objects
            (the internal list itself, not a copy) '''
        return self.__sequences
    #end def get_sequences

    def write_sequences(self, fo, max_char_per_line=0):
        ''' write all the stored sequences in fasta format to the open
            file fo, see FastaSequence.write_sequence for max_char_per_line '''
        for s in self.__sequences:
            s.write_sequence(fo, max_char_per_line)
    #end def write_sequences
#end class FastaHandler