-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathunique_rs_ids.rb
More file actions
118 lines (105 loc) · 3.46 KB
/
unique_rs_ids.rb
File metadata and controls
118 lines (105 loc) · 3.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/ruby
# USAGE: ruby unique_rs_ids.rb
# Filter all hapmap genotype files to get unique rs ids by merging alleles from duplicate rs ids.
# Between the duplicated rs entries, compare the alleles like this:
# known vs unknown => known
# known == known => known
# known != known => unknown
# ignore the location, origin and other, keep from the one with most retrieved alleles
# Output is a hapmap genotype file tab delimited with merged duplicates.
require 'pathname'
# compare the alleles to merge the duplicated rs ids
def resolve_duplicates(rows_as_strings)
rows_as_arrays = rows_as_strings.map{|row| row.split(' ')}
merged_row = rows_as_arrays.first[0..10] # first 11 elements are selected from the first row
(11..rows_as_arrays.first.length-1).each do |col_idx|
alleles = []
rows_as_arrays.each do |row|
alleles << row[col_idx]
end
merged_allele = resolve_alleles(alleles)
merged_row << merged_allele
end
return merged_row.join(" ")
end
# compare and choose the allele
def resolve_alleles(alleles)
merged_allele = alleles.pop
alleles.each do |allele|
if allele == "NN" || allele == merged_allele
# no need to do anything
else
if merged_allele == "NN"
merged_allele = allele
else
return "NN"
end
end
end
return merged_allele
end
# # retouch the row to keep the first column = rs id and the alleles (samples columns) only, also replace the NN alleles with space
# def to_ldat_format(row)
# row_as_array = row.split(' ')
# rs_id = row_as_array[0].to_s
# alleles_string = row_as_array[11..row.length-1].join("\t")
# alleles_string.gsub!("NN", " ")
# return rs_id + "\t" + alleles_string
# end
# io files
# data_path = "data/raw/Hapmap/test_data/"
data_path = "data/raw/Hapmap/hapmap_genotypes_2010-08_phaseII+III_forward/"
results_path = "results/Hapmap/hapmap_genotypes_2010-08_phaseII+III_forward_with_merged_duplicates/"
# find interesting files
input_file_list = []
Dir[data_path + "*.txt"].each do |input_file|
input_pathname = Pathname.new(input_file)
group = input_pathname.basename.to_s.split("_")[2]
if group == "CEU" || group == "JPT" || group == "YRI" || group == "CHB"
input_file_list << input_pathname
end
end
# find duplicate rs ids (by rows) and merge the alleles of the duplicate entries (by col)
input_file_list.each do |input_pathname|
input_handle = input_pathname.open
# scan the rs ids and find the duplicates
rs_ids = {}
duplicate_rs_ids = {}
input_handle.each_line do |row|
rs_id = row.split(" ")[0]
if rs_ids.include?(rs_id)
duplicate_rs_ids[rs_id] = []
else
rs_ids[rs_id] = nil
end
end
input_handle.rewind
# get the entries for the duplicate rs ids
input_handle.each_line do |row|
rs_id = row.split(" ")[0]
if duplicate_rs_ids.include?(rs_id)
duplicate_rs_ids[rs_id] << row
end
end
input_handle.rewind
# merge the rows for the duplicates and store the result in a hash
merged_row = {}
duplicate_rs_ids.each do |rs_id, rows|
merged_row[rs_id] = resolve_duplicates(rows)
end
# print all lines together with the merged lines for the duplicates (print only once)
output_handle = File.open(results_path + '/' + input_pathname.basename, "w")
input_handle.each_line do |row|
rs_id = row.split(" ")[0]
if duplicate_rs_ids.include?(rs_id)
if !merged_row[rs_id].nil?
output_handle.puts merged_row[rs_id]
# output_handle.puts to_ldat_format(merged_row[rs_id])
merged_row[rs_id] = nil
end
else
output_handle.puts row
# output_handle.puts to_ldat_format(row)
end
end
end