-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathtranspose_fastas.py
More file actions
executable file
·110 lines (85 loc) · 4.12 KB
/
transpose_fastas.py
File metadata and controls
executable file
·110 lines (85 loc) · 4.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python
"""transpose_fastas.py
version 0.1
cody hinchliff
2012.7.21
This script accesses a directory, and traverses all FASTA files in it, recording them into a dict object. It then
writes new fasta files to a directory named 'inverted' within the provided dir. One inverted fasta file is created
for each sequence id found in the original set of fastas; each of these files contains the sequences associated
with this sequence id, labeled with ids representing the original fasta files whence they came.
For example, if the script is passed a directory containing fasta files corresponding to loci, containing sequences
labeled with taxon names, the inverted directory will contain fasta files corresponding to taxon names, containing
sequences labeled with locus names (drawn from the filenames of the original fastas)."""
import os
import sys


def _resolve_target_dir(argv):
    """Return the directory to read, always ending in a path separator.

    argv[1], when present and non-empty, names the target directory; a
    relative name is taken relative to the current working directory.
    Otherwise the current working directory itself is used.
    """
    if len(argv) > 1 and argv[1]:
        # os.path.join drops the cwd prefix when argv[1] is already absolute
        target = os.path.join(os.getcwd(), argv[1])
    else:
        print("No target directory specified, using current working directory.")
        target = os.getcwd()
    # joining with "" appends a trailing separator only when one is missing
    return os.path.join(target, "")


def _read_fasta(path):
    """Parse the FASTA file at *path* into a dict of sequence id -> sequence.

    Blank lines are ignored and the lines of a multi-line record are
    concatenated. If an id occurs more than once in the file, the last
    record replaces the earlier ones. Lines found before the first '>'
    header belong to no record; they are skipped and reported once on stderr.
    """
    # Python 2 needs the 'U' flag for universal newlines; Python 3 translates
    # newlines in text mode by default and rejects 'U' from 3.11 onwards.
    mode = "r" if sys.version_info[0] >= 3 else "rU"
    chunks_by_id = {}
    current = None
    warned = False
    with open(path, mode) as infile:
        for line in infile:
            text = line.strip()
            if not text:
                continue
            if text.startswith(">"):
                # remove only the leading marker(s); a '>' at the end of the
                # header is part of the id and must be kept
                seqid = text.lstrip(">")
                current = chunks_by_id[seqid] = []
            elif current is not None:
                current.append(text)
            elif not warned:
                sys.stderr.write(
                    "Warning: ignoring sequence data before the first '>' "
                    "header in %s\n" % path)
                warned = True
    return dict((seqid, "".join(chunks))
                for seqid, chunks in chunks_by_id.items())


def _safe_filename(seqid):
    """Return *seqid* with path separators replaced so it names a single file.

    NOTE: two ids that differ only by a separator vs. '_' map to the same
    output file; the one written later overwrites the earlier.
    """
    for sep in (os.sep, os.altsep):
        if sep:
            seqid = seqid.replace(sep, "_")
    return seqid


def main(argv=None):
    """Transpose the FASTA files of a directory into per-sequence-id files.

    Every non-hidden regular file directly inside the target directory is
    read as FASTA (no check is made that it really is one). For each distinct
    sequence id found, a file '<id>.fasta' is written to the 'inverted'
    subdirectory of the target; it holds that id's sequence from every input
    file, each labeled with the input file's name cut at the first '.fasta'
    or '.fst'. Records are written in sorted order of those labels.

    argv -- argument list in sys.argv layout (argv[1] is the optional target
            directory); defaults to sys.argv.

    Returns 0 on success. Exits via sys.exit with an explanatory message when
    the target directory cannot be listed or the output directory cannot be
    created.
    """
    if argv is None:
        argv = sys.argv

    dirpath = _resolve_target_dir(argv)
    print("Reading files from: " + dirpath)

    # attempt to get a list of files
    try:
        infnames = sorted(os.listdir(dirpath))
    except OSError:
        sys.exit("There was a problem opening the specified directory. "
                 "Are you sure it exists?")

    # data maps input-file id -> {sequence id -> sequence}
    data = {}

    # walk the list of input files (assumes all non-system files are fastas!)
    for infname in infnames:
        inpath = dirpath + infname
        # the directory test must use the full path: a bare name would be
        # resolved against the cwd, letting subdirectories of the target
        # (including 'inverted' from an earlier run) slip through
        if not infname or infname.startswith(".") or os.path.isdir(inpath):
            continue

        # derive a label for this input file from its name
        infid = infname.split(".fasta")[0].split(".fst")[0]
        print(inpath)
        if infid in data:
            sys.stderr.write(
                "Warning: more than one input file maps to the id '%s'; "
                "keeping only %s\n" % (infid, inpath))
        data[infid] = _read_fasta(inpath)

    # every distinct sequence id becomes one output file
    seqids = sorted(set(seqid for seqs in data.values() for seqid in seqs))

    outdir = dirpath + "inverted"
    if not os.path.isdir(outdir):
        try:
            os.mkdir(outdir)
        except OSError as err:
            sys.exit("Could not create output directory %s: %s" % (outdir, err))

    for seqid in seqids:
        outpath = os.path.join(outdir, _safe_filename(seqid) + ".fasta")
        with open(outpath, "w") as outfile:
            # sorted so the record order does not depend on dict ordering
            for infid in sorted(data):
                seq = data[infid].get(seqid)
                # input files lacking this id simply contribute nothing
                if seq is not None:
                    outfile.write(">" + infid + "\n" + seq + "\n")

    # reported on stdout with a zero status: this is the success path
    print("\nTransposed fasta files have been written to:\n" + outdir)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))