-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathtest_monophyly_against_tree.py
More file actions
executable file
·100 lines (70 loc) · 3.48 KB
/
test_monophyly_against_tree.py
File metadata and controls
executable file
·100 lines (70 loc) · 3.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python
if __name__ == "__main__":
import sys, sqlite3, newick3, phylo3
if len(sys.argv) < 5:
# print "usage: testmonophylyagainsttree <phlawddb> rank=<rank> [include=\"<tx1>,<tx2>,...\"] " \
# "[includefile=<file>] [exclude=\"<tx3>,<tx4>,...\"]"
print "usage: testmonophylyagainsttree <phlawddb> <treefile> rank=<rank> [include=\"<tx1>,<tx2>,...\"] [includefile=<file>]"
sys.exit(0)
# process command line args
dbname = sys.argv[1]
treefname = sys.argv[2]
# initializing parameters
target_rank = ""
cladenames = []
# exclude_names = []
for arg in sys.argv[3:]:
argname, argval = arg.split("=")
if argname == "rank":
target_rank = argval.strip()
elif argname == "include":
cladenames = [n.strip() for n in argval.split(",")]
elif argname == "includefile":
includenamesfile = open(argval,"r")
cladenames = [n.strip() for n in includenamesfile.readlines()]
includenamesfile.close()
# elif argname == "exclude":
# exclude_names = [n.strip() for n in argval.split(",")]
assert(len(cladenames) > 0)
assert(target_rank != "")
test_tree = newick3.parse(open(treefname,"r"))
print "will assess monophyly for taxa of rank '" + target_rank + "'"
con = sqlite3.connect(dbname)
cur = con.cursor()
included_taxa = {}
for name in cladenames:
cur.execute("SELECT name, left_value, right_value FROM taxonomy WHERE name_class == 'scientific name' AND name LIKE ?",(name,))
for curname, leftval, rightval in cur.fetchall():
print "including " + name
included_taxa[name] = (leftval, rightval)
out_prefix = "_".join(included_taxa)
if len(out_prefix) > 40:
out_prefix = out_prefix[0:40] + "_etc"
out_prefix = out_prefix + "_" + target_rank
outfile = open(out_prefix + ".monophyly.csv","w")
outfile.write("taxon,monophyletic,offending_names\n")
for incl_name, rl_values in included_taxa.iteritems():
print "now searching " + incl_name
cur.execute("SELECT name, id, left_value, right_value FROM taxonomy WHERE name_class == 'scientific name' " \
"AND node_rank == ? AND left_value > ? AND right_value < ?",(target_rank, rl_values[0], rl_values[1]))
results = cur.fetchall()
for tax_name, tax_id, tax_leftval, tax_rightval in results:
cur.execute("SELECT name FROM taxonomy WHERE name_class == 'scientific name' AND left_value > ? AND right_value < ?", \
(tax_leftval,tax_rightval))
taxo_mrca_names = [n[0] for n in cur.fetchall()]
if len(taxo_mrca_names) < 1:
print tax_name + " has no children, will be skipped"
continue
print tax_name
tree_mrca = phylo3.getMRCA(test_tree, taxo_mrca_names)
if tree_mrca == None:
print " could not find any descendants in tree, will be skipped"
continue
tree_mrca_names = [n.label for n in tree_mrca.descendants() if n.label != None]
monophyly = False
offending_names = set(tree_mrca_names) - set(taxo_mrca_names)
if len(offending_names) == 0:
monophyly = True
outfile.write(",".join([tax_name, str(monophyly), " | ".join(offending_names)]) + "\n")
outfile.flush()
outfile.close()