-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsnps2maf.pl
More file actions
executable file
·140 lines (119 loc) · 3.46 KB
/
snps2maf.pl
File metadata and controls
executable file
·140 lines (119 loc) · 3.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/perl -w
#
# snps2maf.pl
#
# Get the minor allele frequency at every snp site
#
# September 25, 2013
# Liz Cooper
use strict;
# Get the name of the input and output files from the command line
# Get a list of population names from the command line
my ($USAGE) = "\n$0 <input.snps> <output.maf> <Pop_Names>
\tinput.snps = The input file in .snps format create by RAD data pipeline
\toutput.maf = A text file with the following columns:
\t\tCHROM.POS = The physical location of the SNP
\t\tP = The most frequent allele
\t\tQ = The minor allele
\t\tPop1_MAF = One column per population with the minor allele frequency in that pop
\t\tPop1_Alleles = One column per pop. with the total (non-missing) allele count
\t\tTotal_MAF = The overall minor allele frequency for the data set
\tPop_Names: Must be a comma delimited list of names; used in column headers\n\n";
unless (@ARGV) {
print $USAGE;
exit;
}
my ($input, $output, $popnames) = @ARGV;
open (IN, $input) || die "\nUnable to open the file $input!\n";
open (OUT, ">$output") || die "\nUnable to open the file $output!\n";
my @pops = split(/,/, $popnames);
# Print a HEADER line to the output file
print OUT "CHROM.POS\tP\tQ\t";
foreach my $pop (@pops) {
print OUT $pop, "_MAF", "\t";
}
foreach my $pop (@pops) {
print OUT $pop, "_Alleles", "\t";
}
print OUT "Total_MAF\n";
while (<IN>) {
chomp $_;
my @info = split(/\t/, $_);
# Calculate the minor allele counts for each population
# Count the total number of alleles in the populations
my %mac = ();
my %all = ();
for (my $i = 0; $i < scalar @pops; $i++) {
$mac{$pops[$i]} = 0;
$all{$pops[$i]} = 0;
my $genstring = $info[$i+3];
for (my $p = 0; $p < length $genstring; $p++) {
my $base = substr($genstring, $p, 1);
if ($base =~ /$info[1]/) {
$all{$pops[$i]} += 2;
} elsif ($base =~ /0/) {
$all{$pops[$i]} += 2;
} elsif ($base =~ /$info[2]/) {
$mac{$pops[$i]} += 2;
$all{$pops[$i]} += 2;
} elsif ($base =~ /1/) {
$mac{$pops[$i]} += 2;
$all{$pops[$i]} += 2;
} elsif ($base =~ /[MRWSYK]/) {
$mac{$pops[$i]} += 1;
$all{$pops[$i]} += 2;
}
}
}
# Calculate the allele frequency for each population, and the overall allele frequency
my %maf = ();
for (my $i = 0; $i < scalar @pops; $i++) {
$maf{$pops[$i]} = 0;
unless ($all{$pops[$i]} == 0) {
$maf{$pops[$i]} = $mac{$pops[$i]}/$all{$pops[$i]};
}
}
# Check if the alternative allele is actually the minor allele
my $total_maf = 0;
my $num = 0;
my $denom = 0;
for (my $i = 0; $i < scalar @pops; $i++) {
$num += $mac{$pops[$i]};
$denom += $all{$pops[$i]};
}
unless ($denom == 0) {
$total_maf = $num/$denom;
}
# Print out the correct reference and minor allele for each population
my $line = $info[0] . "\t";
if ($total_maf > 0.5) {
$line .= $info[2] . "\t" . $info[1] . "\t";
} else {
$line .= $info[1] . "\t" . $info[2] . "\t";
}
foreach my $population (@pops) {
my $maf = $maf{$population};
if ($total_maf > 0.5) {
if ($all{$population} > 0) {
my $new = 1 - $maf;
$line .= $new . "\t";
} else {
$line .= "0" . "\t";
}
} else {
$line .= $maf . "\t";
}
}
# Print out the total allele counts for each population
foreach my $population (@pops) {
$line .= $all{$population} . "\t";
}
if ($total_maf > 0.5) {
print OUT $line, 1-$total_maf, "\n";
} else {
print OUT $line, $total_maf, "\n";
}
}
close(IN);
close(OUT);
exit;