-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcheck-ref-no-start.pl
More file actions
executable file
·72 lines (61 loc) · 2.28 KB
/
check-ref-no-start.pl
File metadata and controls
executable file
·72 lines (61 loc) · 2.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/perl
use strict;
use EssexParser;
use GffTranscriptReader;
use FastaReader;
use TempFilename;
use Translation;
my $THOUSAND="/home/bmajoros/1000G";
my $GENOME_DIR="$THOUSAND/assembly/combined/HG00096";
my $ICE_FILE="$GENOME_DIR/out.ice";
my $TWO_BIT="/data/reddylab/Reference_Data/hg19.2bit";
my $ENSEMBL="/home/bmajoros/ensembl/coding-and-noncoding.gff";
my $tempFile=TempFilename::generate();
my %startCodons; $startCodons{"ATG"}=1;
# Load the original ensembl GFF
my $gffReader=new GffTranscriptReader();
my $byTranscriptID=$gffReader->loadTranscriptIdHash($ENSEMBL);
# Process the ICE output file
my ($notFound,$sampleSize);
my $parser=new EssexParser($ICE_FILE);
while(1) {
my $report=$parser->nextElem(); last unless $report;
my $status=$report->findChild("status"); next unless $status;
my $code=$status->getIthElem(0); next unless $code;
# Find a transcript with no-stop-codon
next unless $code eq "bad-annotation";
my $array=$status->findDescendents("bad-start");
next unless @$array>0;
my $geneID=$report->getAttribute("substrate");
my $transcriptID=$report->getAttribute("transcript-ID");
++$sampleSize;
# Get ensembl GFF for this transcript
my $transcript=$byTranscriptID->{$transcriptID};
die "$transcriptID not found in ensembl GFF" unless $transcript;
my $startCoord=getStartCoord($transcript);
my $strand=$transcript->getStrand();
# Extract sequence from 2bit file
my $substrate=$transcript->getSubstrate();
if($strand eq "+") { $startCoord-=3 }
my $begin=$startCoord;
my $end=$begin+6;
system("twoBitToFa -seq=$substrate -start=$begin -end=$end $TWO_BIT $tempFile");
my $seq=FastaReader::firstSequence($tempFile); die unless length($seq)==6;
if($strand eq "-") { $seq=Translation::reverseComplement(\$seq) }
my $firstCodon=substr($seq,0,3); my $secondCodon=substr($seq,3,3);
if($startCodons{$firstCodon} || $startCodons{$secondCodon}) {
print "found\t$firstCodon\t$secondCodon\t$transcriptID\n";
next
}
++$notFound;
}
unlink($tempFile) if -e $tempFile;
print "$notFound of $sampleSize had no start codon\n";
sub getStartCoord
{
my ($transcript)=@_;
my $strand=$transcript->getStrand();
my $firstExon=$transcript->getIthExon(0);
if($strand eq "+") { return $firstExon->getBegin() }
else { return $firstExon->getEnd()-3 }
}