-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdetermining_dna_health.py
More file actions
executable file
·158 lines (131 loc) · 4.62 KB
/
determining_dna_health.py
File metadata and controls
executable file
·158 lines (131 loc) · 4.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env python3
import bisect
import os
import sys
from collections import defaultdict, deque
from operator import itemgetter
from pathlib import Path
from typing import IO
class TrieNode:
def __init__(self) -> None:
# Initialize TrieNode attributes
self.children: dict[str, TrieNode] = defaultdict(TrieNode)
self.output: list[str] = []
self.fail: TrieNode | None = None
# cf. https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
def build_automaton(keywords: set[str]) -> TrieNode:
# Initialize root node of the trie
root = TrieNode()
# Build trie
for keyword in keywords:
node = root
# Traverse the trie and create nodes for each character
for char in keyword:
node = node.children[char]
# Add keyword to the output list of the final node
node.output.append(keyword)
# Build failure links using BFS
queue = deque()
# Start from root's children
for node in root.children.values():
queue.append(node)
node.fail = root
# Breadth-First traversal of the trie
while queue:
current_node = queue.popleft()
# Traverse each child node
for key, next_node in current_node.children.items():
queue.append(next_node)
fail_node = current_node.fail
# Find the longest proper suffix that is also a prefix
while fail_node and (key not in fail_node.children):
fail_node = fail_node.fail
# Set failure link of the current node
next_node.fail = fail_node.children[key] if fail_node else root
# Add output patterns of failure node to current node's output
next_node.output += next_node.fail.output
return root
def computeHealth(
root: TrieNode,
catalogue: dict[str, list[tuple[int, int]]],
first: int,
last: int,
dna: str,
) -> int:
h = 0
current_node = root
# Traverse the text
for char in dna:
# Follow failure links until a match is found
while current_node and (char not in current_node.children):
current_node = current_node.fail
if not current_node:
current_node = root
continue
# Move to the next node based on current character
current_node = current_node.children[char]
# Update sum with gene found at this position
for gene in current_node.output:
h += sumInRange(catalogue[gene], first, last)
return h
def sumInRange(values: list[tuple[int, int]], first: int, last: int) -> int:
start = bisect.bisect_left(values, first, key=itemgetter(0))
end = bisect.bisect_right(values, last, key=itemgetter(0))
return (
values[end - 1][1] - (values[start - 1][1] if start > 0 else 0)
if end > start
else 0
)
def main(fptr: IO) -> None:
n = int(input().strip())
genes = input().rstrip().split()
assert len(genes) == n
health = list(map(int, input().rstrip().split()))
assert len(health) == n
s = int(input().strip())
len_max = 0
len_min = sys.maxsize
alphabet = set()
genes_dict = defaultdict(list[tuple[int, int]])
genes_dict_csum = defaultdict(int)
for i in range(n):
g = genes[i]
genes_dict_csum[g] += health[i]
genes_dict[g].append((i, genes_dict_csum[g]))
len_max = max(len_max, len(g))
len_min = min(len_min, len(g))
alphabet.update(list(g))
dnas = []
for _ in range(s):
first_multiple_input = input().rstrip().split()
first = int(first_multiple_input[0])
last = int(first_multiple_input[1])
d = first_multiple_input[2]
dnas.append((first, last, d))
max_h = 0
min_h = sys.maxsize
if len_min == len_max:
for first, last, d in dnas:
h = 0
for i in range(len(d) - len_min + 1):
keyword = d[i : i + len_min]
if keyword in genes_dict:
h += sumInRange(genes_dict[keyword], first, last)
max_h = max(max_h, h)
min_h = min(min_h, h)
else:
# Build the Aho-Corasick automaton
genes_unique = set(genes)
trie = build_automaton(genes_unique)
for first, last, d in dnas:
h = computeHealth(trie, genes_dict, first, last, d)
max_h = max(max_h, h)
min_h = min(min_h, h)
fptr.write(f"{min_h} {max_h}\n")
if __name__ == "__main__":
if path := os.getenv("OUTPUT_PATH"):
with Path(path).open("wt", encoding="utf-8") as fptr:
main(fptr)
fptr.close()
else:
main(sys.stdout)