-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathconsense_dup_taxa.py
More file actions
executable file
·105 lines (78 loc) · 3.21 KB
/
consense_dup_taxa.py
File metadata and controls
executable file
·105 lines (78 loc) · 3.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env python
import sys
import os
import time
#import MySQLdb
from Bio import SeqIO
from Bio import AlignIO
from Bio import Alphabet #import generic_dna
from Bio.Align import AlignInfo
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
# user libraries, available from bitbucket python module repo
import dnaalignment
#print "\n\nEnter the path to the file:"
#filename = raw_input("> ")
infname = sys.argv[1]
cwd = os.getcwd() + os.sep
try:
infpath = cwd + infname
myfile = file(infpath, 'rU')
mydata = SeqIO.parse(myfile,"fasta")
print "\nReading from: %s" % infpath
except IOError:
exit("Error: there was a problem opening the input file: " + infpath)
try:
fnamebits = os.path.splitext(infname)
outfname = """%s_dupsconsensed.fasta""" % (fnamebits[0])
outfpath = cwd + outfname
outfile = file(cwd + outfname,"w")
print "Writing to: %s \n" % outfpath
except IOError:
exit("Error: there was a problem opening the output file: " + outfpath)
# Vars to accumulate metadata for seqs we've processed, to be used during/after dup checking
checkedspp = list()
tempfnames = list()
odata = list()
cdata = list(mydata)
# Variables to hold temporary values during dup checking
appending = False
dups = list()
# for every record (iterate by record index)
for curindex in range(len(cdata)-1):
compdata = list(cdata) # make a list copy for iteration
currecord = compdata.pop(curindex)
## if the species of the current record hasn't yet been seen, then look for all other records
## of the same species, and append them to the duplicates list
if currecord.id not in checkedspp:
for comprecord in compdata:
if currecord.id == comprecord.id:
dups.append(comprecord)
appending = True
# if we found duplicates
if appending:
dups.append(currecord) # add the record we just checked against to the list of dups
print "%s: \n\t%i sequences found" % (currecord.id,len(dups))
tempfname = "%s%s%f_TEMP.fasta" % (cwd, currecord.id, time.time())
temphandle = file(tempfname, "w")
tempfnames.append(tempfname)
# save the records to a temp file so AlignIO can access them
SeqIO.write(dups, temphandle, "fasta")
temphandle.close()
# read all records from tempfile
duphandle = file(tempfname, "rU")
alignment = AlignIO.read(duphandle,"fasta")
cons = ConAlign.consensus(alignment)
# print "the consensus is %s" % cons.consensus_seq
print "\t%i polymorphic columns consensed\n" % cons.ncols_polymorphic
os.remove(tempfname)
conrecord = SeqRecord(Seq(cons.consensus_seq,Alphabet.Gapped(Alphabet.generic_dna)),id=currecord.id, \
name=currecord.id+" consensus sequence",description=currecord.id+" consensus sequence")
odata.append(conrecord)
elif currecord.id not in checkedspp:
odata.append(currecord)
# reset temp vars
dups = list()
appending = False
checkedspp.append(currecord.id) # add the species we checked to our ignore list for future checks
SeqIO.write(odata, outfile, "fasta")