-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathgetJunctionsFromGTF.py
More file actions
159 lines (124 loc) · 7.84 KB
/
getJunctionsFromGTF.py
File metadata and controls
159 lines (124 loc) · 7.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# Determines equivalent junction sequences at exon-exon boundaries in a transcriptome.
# Sequence of genes on - strand are what would be observed in the transcript.
import sys
import argparse
from Bio import SeqIO
from BCBio import GFF
# usage: python getJunctionsFromGTF.py -f /scratch/PI/horence/linda/EquivJuncPaper/GRCh38.primary_assembly.genome.fa
# -a /scratch/PI/horence/linda/EquivJuncPaper/gencode.v25.annotation.gtf
# -o /scratch/PI/horence/linda/EquivJuncPaper/GRCh38_gencodev25.txt
# -v
# output: tab delimited file with equivJuncSeq, gene name, chr, donor, acceptor
# TO GENERATE REPORT FROM THIS OUTPUT: sort -k1 GRCh38_gencodev25.txt | uniq | cut -f1 | sort | uniq -c | sort -k1nr
# this removes double-counting cases where the same exon boundaries are used in 2 different transcripts
CHECK_EQUIV_LEN = 30 # number of nt to check on either side of exon boundary for equivalent junction sequences
output_Strand = ["-", "+"] #for the output text file for equivalent junctions we want to also add strand information as + or -
# tries getting primary gene name, if that doesn't exist tries getting secondary gene name.
# if neither primary or secondary gene name fields exist, an exception will be thrown
# which is caught and handled in the calling function
def getGeneName(use_feature):
if args.name1 in use_feature.qualifiers:
use_name = args.name1
else:
use_name = args.name2
return use_feature.qualifiers[use_name][0]
# Single-exon genes will be skipped since they do not have sub_features
# for + strand genes location.start is 1 less than is in the gtf file, location.end matches gtf
# param chrSeqRecord: a SeqRecord for 1 chromosome
def parseChrFeatures(chrSeqRecord):
for ftr in chrSeqRecord.features: # this is 1 entire gene
curDonors = []
curAcceptors = []
try:
sftr = None # genes with a single exon are at the top level ftr
for sftr in ftr.sub_features:
curStrand = sftr.strand
if curStrand == 1: # + strand
curDonors.append(int(sftr.location.end)) # will have 1 extra for last exon that will just get ignored
curAcceptors.append(int(sftr.location.start)) # will have 1 extra at the beginning for start exon that will just get ignored
else: # -strand
curDonors.append(int(sftr.location.start))
curAcceptors.append(int(sftr.location.end))
if args.verbose:
if not sftr:
print ("single exon gene:", getGeneName(ftr))
print ("exon donors:", curDonors)
print ("exon acceptors:", curAcceptors)
for b_ind in range(len(curAcceptors)-1):
if args.verbose:
print (curDonors[b_ind], curAcceptors[b_ind+1])
if curStrand == 1: # + strand
dseq_prev = str(f_dict[chrSeqRecord.id][curDonors[b_ind]-CHECK_EQUIV_LEN:curDonors[b_ind]].seq).upper()
dseq_next = str(f_dict[chrSeqRecord.id][curDonors[b_ind]:curDonors[b_ind]+CHECK_EQUIV_LEN].seq).upper()
aseq_prev = str(f_dict[chrSeqRecord.id][curAcceptors[b_ind+1]-CHECK_EQUIV_LEN:curAcceptors[b_ind+1]].seq).upper()
aseq_next = str(f_dict[chrSeqRecord.id][curAcceptors[b_ind+1]:curAcceptors[b_ind+1]+CHECK_EQUIV_LEN].seq).upper()
else:
# need reverse complement of sequence for - strand genes, and prev/next are swapped
dseq = str(f_dict[chrSeqRecord.id][curDonors[b_ind]-CHECK_EQUIV_LEN:curDonors[b_ind]+CHECK_EQUIV_LEN].seq.reverse_complement()).upper()
dseq_prev = dseq[:CHECK_EQUIV_LEN]
dseq_next = dseq[CHECK_EQUIV_LEN:]
aseq = str(f_dict[chrSeqRecord.id][curAcceptors[b_ind+1]-CHECK_EQUIV_LEN:curAcceptors[b_ind+1]+CHECK_EQUIV_LEN].seq.reverse_complement()).upper()
aseq_prev = aseq[:CHECK_EQUIV_LEN]
aseq_next = aseq[CHECK_EQUIV_LEN:]
if args.verbose:
print ("dseq_prev:", dseq_prev)
print ("dseq_next:", dseq_next)
print ("aseq_prev:", aseq_prev)
print ("aseq_next:", aseq_next)
# generate equiv junc seq
equivJuncSeq = ""
# check moving breakpoint more upstream in gene
e_ind = CHECK_EQUIV_LEN - 1
while e_ind != -1:
if dseq_prev[e_ind] != aseq_prev[e_ind]:
break
equivJuncSeq = equivJuncSeq + dseq_prev[e_ind]
e_ind = e_ind - 1
equivJuncSeq = equivJuncSeq[::-1] # since it was added from end 1 by 1, need to reverse to get coorrect order
if args.verbose:
print ("equiv from moving upstream:", equivJuncSeq)
# check moving breakpoint more downstream in gene
e_ind = 0
while e_ind < CHECK_EQUIV_LEN:
if dseq_next[e_ind] != aseq_next[e_ind]:
break
equivJuncSeq = equivJuncSeq + dseq_next[e_ind]
e_ind = e_ind + 1
if args.verbose:
print ("equiv from checking both directions:", equivJuncSeq)
o_handle.write("\t".join([equivJuncSeq, getGeneName(sftr), chrSeqRecord.id, str(output_Strand[curStrand]), str(curDonors[b_ind]), str(curAcceptors[b_ind+1])]))
o_handle.write("\n")
except Exception as e:
print ("Exception")
print (e)
print ("error:", sys.exc_info()[0])
print ("parsing features for", ftr)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--fastaFile', required=True, help='path to fasta file with chromosome sequences')
parser.add_argument('-a', '--annotationFile', required=True, help='path to gff file with exon annotations')
parser.add_argument('-o', '--outFile', required=True, help='output file name (including full path)')
parser.add_argument('-n1', '--name1', help='name of field in gtf to use for gene names', default='gene_name')
parser.add_argument('-n2', '--name2', help='name of field in gtf to use for gene names if n1 does not exist', default='gene_id')
parser.add_argument('-v', '--verbose', help='print info about data obtained', action='store_true')
args = parser.parse_args()
# read in the sequences
f_handle = open(args.fastaFile, "rU")
f_dict = SeqIO.to_dict(SeqIO.parse(f_handle, "fasta"))
f_handle.close()
# only want exons for now, and only chromosomes that we have sequences for
limit_info = dict(
gff_id = f_dict.keys(),
gff_type = ["exon"])
########### read in the annotations, adding annotations to the sequences
a_handle = open(args.annotationFile, "rU")
if args.verbose:
print ("opening annotation file", args.annotationFile)
o_handle = open(args.outFile, "w")
for rec in GFF.parse(a_handle, base_dict=f_dict, limit_info=limit_info): # each rec is a SeqRecord (for 1 chromosome)
if args.verbose:
print ("####### starting new chromosome: " + str(rec.id))
# populate the data structures with genes and exons from this chromosome and pickle the objects
parseChrFeatures(rec)
o_handle.close()
a_handle.close()