-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcreate_abinitio_gff.py
More file actions
165 lines (139 loc) · 5.49 KB
/
create_abinitio_gff.py
File metadata and controls
165 lines (139 loc) · 5.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
'''
Script for creating .gff files from ab-initio results
'''
#!/usr/bin/env python
import subprocess,os,sys
'''
Function reads in the CRISPR results
'''
def pilercr_merger(input_directory_path,pilercr_file):
#Creating input file path
input_file=input_directory_path + pilercr_file
#Creating output file path
mod_pilercr_file_name=pilercr_file.replace("_crispr","_crispr.gff")
output_file="./tmp/"+mod_pilercr_file_name
dict1={} #Dictionary stores the crispr array details for each contig tested.
#Parsing the CRISPR input file
with open(input_file,"r") as inp:
in_file=inp.readlines()
line_count=0 #Counter for the lines in input file.
count=0 #Counter for number of crispr arrays predicted.
while(line_count<len(in_file)):
if in_file[line_count].startswith(">")==True:
in_file[line_count]=in_file[line_count].rstrip()
header=in_file[line_count].replace(">","")
count=count+1
if count not in dict1.keys():
dict1[count]=header
if in_file[line_count].startswith("SUMMARY BY SIMILARITY")==True:
line_count=line_count+6
count1=0 #Counter to match the number of arrays found to the ones reported in the "Summary by similarity"
while in_file[line_count].startswith("SUMMARY BY POSITION")!=True:
if count1<count:
count1=count1+1
crisp_array=in_file[line_count].split()
arr_num=int(crisp_array[0])
start_pos=crisp_array[2]
end_pos=int(start_pos)+int(crisp_array[3])+1
head=dict1[arr_num]
dict1[arr_num]=head+"\t"+start_pos+"\t"+str(end_pos)+"\t"+"Copies:"+crisp_array[4]+";Repeat_length:"+crisp_array[5]+";Spacer_Length:"+crisp_array[6]+";Repeat_Consensus:"+crisp_array[8]+"\n"
line_count=line_count+2
if in_file[line_count].startswith("SUMMARY BY POSITION")==True:
break
line_count=line_count+1
#Writing to the .gff output files
with open(output_file,"a+") as op:
for keys in dict1.keys():
line_split=dict1[keys].split("\t")
op.write(line_split[0]+"\t"+"pilercr"+"\t"+"CRISPR array"+"\t"+line_split[1]+"\t"+line_split[2]+"\t"+"."+"\t"+"."+"\t"+"."+"\t"+line_split[3])
'''
This function merges the predicted Transmembrane proteins to the .faa and .gff files produced by the Gene Prediction group.
'''
def tmhmm_merger(input_directory_path,tmhmm_file):
#Creating input file path
input_file=input_directory_path + tmhmm_file
#Creating output file path for .gff file
mod_tmhmm_file_name=tmhmm_file.replace("tmhmm","tmhmm.gff")
output_file="./tmp/"+mod_tmhmm_file_name
dict_faa={}
#Parsing the TM protein input file
with open(input_file,"r") as inp:
for line in inp:
line=line.rstrip()
col=line.split("\t")
header=col[0]
pred_hel_split=col[4].split("=")
pred_hel=pred_hel_split[1]
top=col[5]
if int(pred_hel)!=0: #Rejecting the proteins with zero predicted alpha-helices
dict_faa[header]= "Transmembrane Protein: Predicted Helices="+pred_hel+", Topology:"+top
#Writing to the .gff output files
with open(output_file,"a+") as op:
for keys in dict_faa.keys():
name=keys.split(":")
node=name[0]
number=name[1].split("-")
start=int(number[0])-1
if start <0:
start=0
stop=number[1]
op.write(node+"\t"+"."+"\t"+"."+"\t"+str(start)+"\t"+stop+"\t"+"."+"\t"+"."+"\t"+"."+"\t"+dict_faa[keys]+"\n")
'''
This function merges the predicted Transmembrane proteins to the .faa and .gff files produced by the Gene Prediction group.
'''
def signalp_merger(input_directory_path,signalp_file):
#Creating input file path
input_file=input_directory_path + signalp_file
#Creating output file path for .gff file
mod_signalp_file_name=signalp_file.replace(".gff3","_signalp.gff")
output_file="./tmp/"+mod_signalp_file_name
signalp_dict={}
#Parsing the SignalP input file
with open(input_file,"r") as inp:
first=inp.readline() #Removing the first line
for line in inp:
col=line.split("\t")
name=col[0]
funct=col[2]
signalp_dict[name]=funct
#Writing to the .gff output files
with open(output_file,"a+") as op:
for keys in signalp_dict.keys():
name=keys.split(":")
node=name[0]
number=name[1].split("-")
start=int(number[0])-1
if start <0:
start=0
stop=number[1]
op.write(node+"\t"+"."+"\t"+"."+"\t"+str(start)+"\t"+stop+"\t"+"."+"\t"+"."+"\t"+"."+"\t"+signalp_dict[keys]+"\n")
def main():
inputpath_pilercr=sys.argv[1]
inputpath_tmhmm=sys.argv[2]
inputpath_signalp=sys.argv[3]
files_pilercr=os.listdir(inputpath_pilercr)
file_tmhmm=os.listdir(inputpath_tmhmm)
file_signalp=os.listdir(inputpath_signalp)
#Checking if pilercr input files exist in the directory
if len(files_pilercr) == 0:
print("No files present in the directory.")
for name in files_pilercr:
print("Writing file for "+name+"\n")
#Writing gff of PilerCr results.
pilercr=pilercr_merger(inputpath_pilercr,name)
#Checking if tmhmm input files exist in the directory
if len(files_tmhmm) == 0:
print("No files present in the directory.")
for name in files_tmhmm:
print("Writing file for "+name+"\n")
#Writing gff of tmhmm results
tmhmm=tmhmm_merger(inputpath_tmhmm,name)
#Checking if signalp input files exist in the directory
if len(files_signalp) == 0:
print("No files present in the directory.")
for name in files_signalp:
print("Writing file for "+name+"\n")
#Writing gff of signalp results
signalp=signalp_merger(inputpath_signalp,name)
if __name__ == "__main__":
main()