33
44import argparse
55from glob import glob
6+ import re
67
7- def concatenate (multiple , pairwise , indir , outfile ):
8- # Parses CodeML output and prints to file
9- if multiple == True and pairwise == True :
10- print ("\n \t Please specify only one alignment type.\n " )
11- quit ()
12- elif multiple == True :
13- # Read in codeml output for multiple alignments
14- readMultiple (indir , outfile )
15- elif pairwise == True :
16- # Read in codeml output for pairwise alignments
17- readPairwise (indir , outfile )
8+ def inputList (indir ):
9+ # Creates list of mlc files and determines which analysis was run
10+ infiles = glob (indir + "*" )
11+ # Remove tmp dir
12+ for i in infiles :
13+ if i [i .rfind ("/" )+ 1 :] == "tmp" :
14+ infiles .remove (i )
15+ with open (infiles [0 ], "r" ) as ali :
16+ # Identify analysis
17+ for line in ali :
18+ if "dN/dS (w) for site classes" in line :
19+ typ = "brsite"
20+ break
21+ elif "dN & dS for each branch" in line :
22+ typ = "brspec"
23+ break
24+ elif "pairwise comparison, codon frequencies:" in line :
25+ typ = "pw"
26+ break
27+ return infiles , typ
28+
29+ def branchSite (infiles , outfile ):
30+ # Saves data from branch site analysis to csv
31+ save = False
32+ with open (outfile , "w" ) as output :
33+ # Open output file and write header
34+ output .write ("TranscriptID,Foreground_dN/dS,Background_dN/dS,\
35+ TreeLength,lnL,Species,SitesUnderSelection (Amino Acid; PosteriorProbability)\n " )
36+ for infile in infiles :
37+ try :
38+ # Get gene id and number of species remaining for each file
39+ filename = infile .split ("/" )[- 1 ]
40+ transcript = filename .split ("." )[0 ]
41+ species = filename .split ("." )[1 ]
42+ if int (species ) > 2 :
43+ with open (infile , "r" ) as mlc :
44+ sites = ""
45+ for line in mlc :
46+ # Get substitution rates, tree lengths, etc
47+ if save == True :
48+ if "The grid" in line :
49+ save = False
50+ elif line .strip () and "Positive" not in line :
51+ # Record sites and probability
52+ splt = line .split ()
53+ sites += ("{} ({}; {})," ).format (splt [0 ],
54+ splt [1 ], splt [2 ])
55+ elif "tree length =" in line :
56+ length = line .split ("=" )[1 ].strip ()
57+ elif "lnL" in line :
58+ lnl = line .split ("):" )[1 ]
59+ lnl = lnl .split ()[0 ].strip ()
60+ elif "background w" in line :
61+ # Convert multiple spaces to single space
62+ line = re .sub (" +" , " " , line )
63+ bw = line .split ()[4 ]
64+ elif "foreground w" in line :
65+ # Convert multiple spaces to single space
66+ line = re .sub (" +" , " " , line )
67+ fw = line .split ()[4 ]
68+ elif "Bayes Empirical Bayes" in line :
69+ save = True
70+ # Append data to list and convert into string
71+ data = [fw , bw , length , lnl , species , sites [:- 1 ]]
72+ for i in data :
73+ transcript += "," + i
74+ output .write (transcript + "\n " )
75+ else :
76+ # Skip files with only two sequences
77+ pass
78+ except IsADirectoryError :
79+ pass
1880
19- def readMultiple (indir , outfile ):
81+ def branchSpecific (infiles , outfile ):
82+ # Saves data from branch specific analysis to csv
2083 with open (outfile , "w" ) as output :
2184 # Open output file and write header
2285 output .write ("TranscriptID,dN,dS,dN/dS,TreeLength,lnL,Species\n " )
23- infiles = glob (indir + "*" )
2486 for infile in infiles :
2587 try :
2688 # Get gene id and number of species remaining for each file
2789 filename = infile .split ("/" )[- 1 ]
2890 transcript = filename .split ("." )[0 ]
2991 species = filename .split ("." )[1 ]
3092 if int (species ) > 2 :
93+ with open (infile , "r" ) as mlc :
94+ for line in mlc :
95+ # Get substitution rates, tree lengths, etc
96+ if "tree length =" in line :
97+ length = line .split ("=" )[1 ].strip ()
98+ elif "tree length for dN" in line :
99+ dn = line .split (":" )[1 ].strip ()
100+ elif "tree length for dS" in line :
101+ ds = line .split (":" )[1 ].strip ()
102+ elif "lnL" in line :
103+ lnl = line .split ("):" )[1 ]
104+ lnl = lnl .split ()[0 ].strip ()
105+ # Calculate dN/dS and save as a string
31106 try :
32- with open (infile , "r" ) as mlc :
33- for line in mlc :
34- # Get substitution rates, tree lengths, etc
35- if "tree length =" in line :
36- length = line .split ("=" )[1 ].strip ()
37- elif "tree length for dN" in line :
38- dn = line .split (":" )[1 ].strip ()
39- elif "tree length for dS" in line :
40- ds = line .split (":" )[1 ].strip ()
41- elif "lnL" in line :
42- lnl = line .split ("):" )[1 ]
43- lnl = lnl .split ()[0 ].strip ()
44- # Calculate dN/dS and save as a string
45- try :
46- dnds = str (float (dn )/ float (ds ))
47- except ZeroDivisionError :
48- dnds = "NA"
49- # Append data to list and convert into string
50- data = [dn , ds , dnds , length , lnl , species ]
51- transcript += ","
52- for i in data :
53- transcript += str (i ) + ","
54- transcript = transcript [:- 1 ]
55- output .write (transcript + "\n " )
107+ dnds = str (float (dn )/ float (ds ))
108+ except ZeroDivisionError :
109+ dnds = "NA"
110+ # Append data to list and convert into string
111+ data = [dn , ds , dnds , length , lnl , species ]
112+ for i in data :
113+ transcript += "," + str (i )
114+ output .write (transcript + "\n " )
56115 else :
57116 # Skip files with only two sequences
58117 pass
59- except IsADirectoryError , UnboundLocalError :
118+ except IsADirectoryError :
60119 pass
61120
62- def readPairwise (indir , outfile ):
121+ def pairwise (infiles , outfile ):
122+ # Saves data from pairwise analysis to csv
63123 with open (outfile , "w" ) as output :
64124 # Open output file and write header
65125 output .write ("TranscriptID,dN,dS,dN/dS,lnL\n " )
66- infiles = glob (indir + "*" )
67126 for infile in infiles :
68127 try :
69128 # Get gene id and number of species remaining for each file
@@ -94,10 +153,6 @@ def readPairwise(indir, outfile):
94153def main ():
95154 parser = argparse .ArgumentParser (description = "This script will \
96155 concatenate CodeML output and print them in a single file." )
97- parser .add_argument ("--multiple" , action = "store_true" ,
98- help = "Indicates that a multiple alignment was used" )
99- parser .add_argument ("--pairwise" , action = "store_true" ,
100- help = "Indicates that a pairwise alignment was used." )
101156 parser .add_argument ("-i" , help = "Path to CodeML output directory" )
102157 parser .add_argument ("-o" , help = "Name of output file." )
103158 # Parse command
@@ -106,9 +161,13 @@ def main():
106161 if indir [- 1 ] != "/" :
107162 indir += "/"
108163 outfile = args .o
109- multiple = args .multiple
110- pairwise = args .pairwise
111- concatenate (multiple , pairwise , indir , outfile )
164+ infiles , typ = inputList (indir )
165+ if typ == "pw" :
166+ pairwise (infiles , outfile )
167+ elif typ == "brsite" :
168+ branchSite (infiles , outfile )
169+ elif typ == "brspec" :
170+ branchSpecific (infiles , outfile )
112171
113172if __name__ == "__main__" :
114173 main ()
0 commit comments