-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalyze-NMD.pl
More file actions
executable file
·125 lines (113 loc) · 3.28 KB
/
analyze-NMD.pl
File metadata and controls
executable file
·125 lines (113 loc) · 3.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/perl
use strict;
use SummaryStats;
my $THOUSAND="/home/bmajoros/1000G";
my $ASSEMBLY="$THOUSAND/assembly";
my $COMBINED="$ASSEMBLY/combined-hg19";
my %chr;
open(IN,"$ASSEMBLY/local-CDS-and-UTR.gff") || die;
while(<IN>) {
if(/^(chr\S+)\s.*transcript_id=([^;]+);/) { $chr{$2}=$1 }
}
close(IN);
my (%hash,%byIndiv,%indivNMD,%indivHomoNMD);
my @dirs=`ls $COMBINED`;
foreach my $subdir (@dirs) {
chomp $subdir;
next unless $subdir=~/^HG\d+$/ || $subdir=~/^NA\d+$/;
my $dir="$COMBINED/$subdir";
my $infile="$dir/nmd.txt";
next if -z $infile;
process($infile);
}
my @indivs=keys %byIndiv;
foreach my $indiv (@indivs) {
my $hash=$byIndiv{$indiv};
my @array=values(%$hash);
my $n=@array;
for(my $i=0 ; $i<$n ; ++$i) {
my $rec=$array[$i];
my $functionalCopies=0;
if($rec->{status}->[0] eq "functional") { ++$functionalCopies }
if($rec->{status}->[1] eq "functional") { ++$functionalCopies }
if($functionalCopies<2) { ++$indivNMD{$indiv} }
if($functionalCopies==0) { ++$indivHomoNMD{$indiv} }
}
}
my @array=values %indivNMD;
my ($mean,$stddev,$min,$max)=SummaryStats::summaryStats(\@array);
print "# transcripts NMD per individual:\t$mean +/- $stddev ($min\-$max)\n";
my @array=values %indivHomoNMD;
my ($mean,$stddev,$min,$max)=SummaryStats::summaryStats(\@array);
print "# transcripts homozygous NMD per individual:\t$mean +/- $stddev ($min\-$max)\n";
my @transcriptIDs=keys %hash;
my $n=@transcriptIDs;
for(my $i=0 ; $i<$n ; ++$i) {
my $transcriptID=$transcriptIDs[$i];
my $chr=$chr{$transcriptID};
next if $chr eq "chrX" || $chr eq "chrY";
my @array=values(%{$hash{$transcriptID}});
my $n=@array;
my (@points,$n0,$n1,$n2,$expressed);
for(my $i=0 ; $i<$n ;++$i) {
my $rec=$array[$i];
my $functionalCopies=0;
if($rec->{status}->[0] eq "functional") { ++$functionalCopies }
if($rec->{status}->[1] eq "functional") { ++$functionalCopies }
my $fpkm;
foreach my $x (@{$rec->{FPKM}}) {$fpkm+=$x}
#if($fpkm>0) { $expressed=1 }
if($fpkm>=5) { $expressed=1 }
push @points,[$functionalCopies,$fpkm];
if($functionalCopies==0) { ++$n0 }
elsif($functionalCopies==1) { ++$n1 }
elsif($functionalCopies==2) { ++$n2 }
}
next unless $expressed;
#next unless $n0+$n1>=5 && $n2>=5;
next unless $n0>=5 && $n1>=5 && $n2>=5;
my $mean=meanFPKM(\@points);
next unless $mean>0;
foreach my $point (@points) {
my ($copies,$fpkm)=@$point;
my $score=log($fpkm/$mean+1);
print "$chr\t$transcriptID\t$copies\t$score\n";
}
}
sub meanFPKM
{
my ($array)=@_;
my $n=@$array;
my $sum=0;
for(my $i=0 ; $i<$n ; ++$i) {
next unless $array->[$i]->[0]==2;
$sum+=$array->[$i]->[1];
}
return $n>0 ? $sum/$n : 0;
}
sub addFPKMs
{
my ($from,$to)=@_;
my $n=@$from;
for(my $i=0 ; $i<$n ; ++$i) {
my $rec=$from->[$i];
push @$to,$rec->{FPKM}->[0]+$rec->{FPKM}->[1];
}
}
sub process
{
my ($infile)=@_;
open(IN,$infile) || die "can't open $infile\n";
while(<IN>) {
chomp; my @fields=split/\t/; next unless @fields>=4;
my ($transcript,$indiv,$status,$fpkm)=@fields;
my $rec=$hash{$transcript}->{$indiv};
if(!$rec) {
$byIndiv{$indiv}->{$transcript}=
$hash{$transcript}->{$indiv}=
$rec={status=>[],FPKM=>[]} }
push @{$rec->{status}},$status;
push @{$rec->{FPKM}},$fpkm;
}
close(IN);
}