-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsketching_2.py
More file actions
89 lines (81 loc) · 2.75 KB
/
sketching_2.py
File metadata and controls
89 lines (81 loc) · 2.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import my_hashing
import python_hashing
import random
import itertools
f = open('genome.txt', 'r')
genome = f.read()
guides = {'A': '0', 'C': '1', 'T': '2', 'G': '3', '0': 'A', '1': 'C', '2': 'T', '3': 'G'}
klen = 20
size = len(genome)
cap = size - klen
N = 10000000
w = 2**20
error = 2*N/w
print(error)
prefixes = []
x = itertools.product('ACTG', repeat = 10)
for i in x:
prefixes.append(''.join(i))
print('Hi')
# These bins are for my hasing
bin1 = [0]*w
bin2 = [0]*w
bin3 = [0]*w
bin4 = [0]*w
# These bins are for the Python Hash function DSA
dbin1 = [0]*(2**28)
dbin2 = [0]*(2**28)
dbin3 = [0]*(2**28)
dbin4 = [0]*(2**28)
for _ in range(0, N):
choice = random.randrange(0, cap + 1)
kmer = genome[choice : choice + klen]
bin1[my_hashing.my_second_hash(kmer, 0)] += 1
bin2[my_hashing.my_second_hash(kmer, 1)] += 1
bin3[my_hashing.my_second_hash(kmer, 2)] += 1
bin4[my_hashing.my_second_hash(kmer, 3)] += 1
dbin1[python_hashing.DSA_hash(kmer, 0)] += 1
dbin2[python_hashing.DSA_hash(kmer, 1)] += 1
dbin3[python_hashing.DSA_hash(kmer, 2)] += 1
dbin4[python_hashing.DSA_hash(kmer, 3)] += 1
print('Hi')
# This naturally produces alot of intersections
# so we set all boxes with frequency less than 2N/w to 0.
for j in range(0, w):
if bin1[j] < error:
bin1[j] = 0
if bin2[j] < error:
bin2[j] = 0
if bin3[j] < error:
bin3[j] = 0
if bin4[j] < error:
bin4[j] = 0
# print(bin1)
# print(sum(bin1), sum(bin2), sum(bin3), sum(bin4))
# print(sum(x > 0 for x in bin1), sum(x > 0 for x in bin2), sum(x > 0 for x in bin3), sum(x > 0 for x in bin4))
# Now we want to recombine our 4 bins to get frequency estimates for each 20-mer
# bin1 gives the frequency of the first 0-10
# bin2 gives the frequency of the first 5-15
# bin3 gives the frequency of the first 10-20
# bin4 gives the frequency of the first 0-5 and 15-20
bin1_reduced = [(prefixes[x], bin1[x]) for x in range(0, w) if bin1[x] != 0]
bin3_reduced = [(prefixes[x], bin3[x]) for x in range(0, w) if bin3[x] != 0]
print(len(bin1_reduced), len(bin3_reduced))
g = open('data3.txt', 'w')
acc = 0
# k_mer_frequency = []
for k in bin1_reduced:
for l in bin3_reduced:
k_mer = k[0] + l[0]
frequency = min(k[1],
bin2[my_hashing.my_second_hash(k_mer, 1)], l[1],
bin4[my_hashing.my_second_hash(k_mer, 3)])
if frequency != 0:
second_estimate = min(dbin1[python_hashing.DSA_hash(kmer, 0)],
dbin2[python_hashing.DSA_hash(kmer, 1)], dbin3[python_hashing.DSA_hash(kmer, 2)],
dbin4[python_hashing.DSA_hash(kmer, 3)])
# k_mer_frequency.append((k_mer, frequency))
g.write('k_mer: ' + k_mer + ', ' )
g.write('Frequency: ' + str(frequency) + ', ')
g.write('Second Estimate: ' + str(second_estimate) + '\n')
# print(k_mer_frequency)