-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrypskip-fpkm.pl
More file actions
executable file
·113 lines (100 loc) · 3.32 KB
/
crypskip-fpkm.pl
File metadata and controls
executable file
·113 lines (100 loc) · 3.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/perl
use strict;
use ProgramName;
my $name=ProgramName::get();
die "$name <FPKM|READS>\n" unless @ARGV==1;
my ($type)=@ARGV;
#my $READS=1; # 1 = use spliced read counts (TopHat), 0 = use FPKM (StringTie)
my $READS=$type eq "FPKM" ? 0 : 1;
my $MIN_FPKM=1;
my $MIN_SAMPLE_SIZE=30;
my $MAX_COPIES=1000000;
my $BASE="/home/bmajoros/1000G/assembly";
my $CRYPSKIP="$BASE/crypskip.txt";
my $RNA=$READS ? "$BASE/crypskip-counts.txt" : "$BASE/rna.txt";
my (%rna,%rnaByTranscript,%sampleSize,%seen);
open(IN,$RNA) || die $RNA;
while(<IN>) {
chomp; my @fields=split; next unless @fields>=7;
my ($indiv,$allele,$gene,$transcript,$cov,$FPKM,$TPM,$nmd)=@fields;
next if $indiv eq "indiv"; # header
#next unless $nmd eq "OK";
my $key="$indiv $allele";
next if $transcript eq ".";
if($transcript=~/(ALT\d+_\S+)_\d+/) { $transcript=$1 }
# else { next }
my $baseID=$transcript;
if($transcript=~/ALT\d+_(\S+)/) { $baseID=$1 }
if($transcript=~/(ALT\d+_\S+)_\d+/) { $seen{$baseID}++ }
$rna{$key}->{$transcript}=$FPKM;
if($READS) {
$rnaByTranscript{$baseID}+=$FPKM;
++$sampleSize{$baseID};
}
}
close(IN);
if(!$READS) {
open(IN,"$BASE/rna.txt") || die;
while(<IN>) {
chomp; my @fields=split; next unless @fields>=7;
my ($indiv,$allele,$gene,$transcript,$cov,$FPKM,$TPM,$nmd)=@fields;
next if $indiv eq "indiv"; # header
my $key="$indiv $allele";
if($transcript=~/ALT\d+_(\S+)/) { $transcript=$1 }
$rnaByTranscript{$transcript}+=$FPKM;
++$sampleSize{$transcript};
}
close(IN);
}
my (%crypticCounts,%exonSkipping,%baseIDs);
open(IN,$CRYPSKIP) || die $CRYPSKIP;
while(<IN>) {
chomp; my @fields=split; next unless @fields>=6;
my ($indiv,$hap,$gene,$transcript,$event,$dist)=@fields;
$transcript=~/(\S+)_\d+$/ || die $transcript;
$transcript=$1;
my $baseID=$transcript;
if($transcript=~/ALT\d+_(\S+)/) { $baseID=$1 }
$baseIDs{$baseID}=1;
my $key="$indiv $hap";
#print "$key $transcript\n";
#if($event eq "exon-skipping") {next unless $rna{$key}->{$transcript}>0} ###
next unless $rna{$key}->{$transcript}>0; ###
if($event eq "exon-skipping") { $exonSkipping{$key}->{$baseID}=$transcript }
else { ++$crypticCounts{$key}->{$baseID} }
}
close(IN);
my %meanFPKM;
my @baseIDs=keys %rnaByTranscript;
my $numBaseIDs=@baseIDs;
for(my $i=0 ; $i<$numBaseIDs ; ++$i) {
my $baseID=$baseIDs[$i];
my $sum=$rnaByTranscript{$baseID};
my $N=$sampleSize{$baseID};
next unless $sum>$MIN_FPKM && $N>$MIN_SAMPLE_SIZE;
$meanFPKM{$baseID}=$sum/$N;
}
my @keys=keys %exonSkipping;
my $numKeys=@keys;
for(my $i=0 ; $i<$numKeys ; ++$i) {
my $key=$keys[$i];
my $hash=$exonSkipping{$key};
die unless $hash;
my @baseIDs=keys %$hash;
foreach my $baseID (@baseIDs) {
my $numCryptic=defined($crypticCounts{$key}) ?
0+$crypticCounts{$key}->{$baseID} : 0;
my $skippingID=$hash->{$baseID};
my $fpkm=0+$rna{$key}->{$skippingID};
my $mean=$meanFPKM{$baseID};
#print "$baseID $fpkm / $mean $skippingID\n";
next unless $mean>$MIN_FPKM;
next if $baseID=~/ENST00000421308.2/; # outlier: reference has minor allele
next if $baseID=~/ENST00000458398.2/; # outlier: high FPKM for exon-skip
#my $rep=$seen{$baseID}; print "$rep vs $MAX_COPIES\n";
next if $seen{$baseID}>$MAX_COPIES;
#print "dividing $fpkm by $mean\n";
$fpkm/=$mean;
print "$baseID\t$numCryptic\t$fpkm\n";
}
}