Skip to content

Commit 2e2dfd5

Browse files
committed
POC
1 parent c5cfcdf commit 2e2dfd5

File tree

1 file changed

+167
-47
lines changed

1 file changed

+167
-47
lines changed

Lib/difflib.py

Lines changed: 167 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,110 @@
3232

3333
from _colorize import can_colorize, get_theme
3434
from heapq import nlargest as _nlargest
35-
from collections import namedtuple as _namedtuple
35+
from collections import Counter as _Counter, namedtuple as _namedtuple
3636
from types import GenericAlias
3737

3838
Match = _namedtuple('Match', 'a b size')
3939

40+
_LENGTH = 0
41+
_LINK = 1
42+
_NEXT = 2
43+
_POS = 3
44+
45+
46+
class _LCSUBAutomaton:
47+
"""Suffix Automaton for finding longest common substring."""
48+
49+
def __init__(self, s2, start2=0, stop2=None, *, junk=()):
50+
if stop2 is None:
51+
stop2 = len(s2)
52+
53+
self.start2 = start2
54+
self.stop2 = stop2
55+
self.junk = frozenset(junk)
56+
self.root = root = [0, None, {}, -1] # [length, link, next, end_pos]
57+
58+
last_len = 0
59+
last = root
60+
for j in range(start2, stop2):
61+
c = s2[j]
62+
if c in junk:
63+
last_len = 0
64+
last = root
65+
else:
66+
last_len += 1
67+
curr = [last_len, None, {}, j]
68+
69+
p = last
70+
p_next = p[_NEXT]
71+
while c not in p_next:
72+
p_next[c] = curr
73+
if p is root:
74+
curr[_LINK] = root
75+
break
76+
p = p[_LINK]
77+
p_next = p[_NEXT]
78+
else:
79+
q = p_next[c]
80+
p_length_p1 = p[_LENGTH] + 1
81+
if p_length_p1 == q[_LENGTH]:
82+
curr[_LINK] = q
83+
else:
84+
clone = [p_length_p1, q[_LINK], q[_NEXT].copy(), q[_POS]]
85+
while (p_next := p[_NEXT]).get(c) is q:
86+
p_next[c] = clone
87+
if p is root:
88+
break
89+
p = p[_LINK]
90+
91+
q[_LINK] = curr[_LINK] = clone
92+
93+
last = curr
94+
95+
def find(self, s1, start1=0, stop1=None):
96+
if stop1 is None:
97+
stop1 = len(s1)
98+
root = self.root
99+
junk = self.junk
100+
v = root
101+
l = 0
102+
best_len = 0
103+
best_state = None
104+
best_pos = 0
105+
106+
for i in range(start1, stop1):
107+
c = s1[i]
108+
if c in junk:
109+
v = root
110+
l = 0
111+
else:
112+
while v is not root and c not in v[_NEXT]:
113+
v = v[_LINK]
114+
l = v[_LENGTH]
115+
116+
v_next = v[_NEXT]
117+
if c in v_next:
118+
v = v_next[c]
119+
l += 1
120+
if l > best_len:
121+
best_len = l
122+
best_state = v
123+
best_pos = i
124+
125+
if not best_len:
126+
return (start1, self.start2, 0)
127+
128+
start_in_s1 = best_pos + 1 - best_len
129+
end_in_s2 = best_state[_POS]
130+
start_in_s2 = end_in_s2 + 1 - best_len
131+
return (start_in_s1, start_in_s2, best_len)
132+
133+
134+
def longest_common_substring(s1, s2, start1=0, stop1=None, start2=0, stop2=None,
                             *, junk=()):
    """Return the longest block shared by s1[start1:stop1] and s2[start2:stop2].

    A _LCSUBAutomaton is built over the s2 window (with *junk* passed
    through to it) and then queried with the s1 window.  The result is
    whatever _LCSUBAutomaton.find() returns: a tuple
    (start_in_s1, start_in_s2, length).
    """
    automaton = _LCSUBAutomaton(s2, start2, stop2, junk=junk)
    return automaton.find(s1, start1, stop1)
137+
138+
40139
def _calculate_ratio(matches, length):
41140
if length:
42141
return 2.0 * matches / length
@@ -276,32 +375,42 @@ def __chain_b(self):
276375
# out the junk later is much cheaper than building b2j "right"
277376
# from the start.
278377
b = self.b
279-
self.b2j = b2j = {}
280-
281-
for i, elt in enumerate(b):
282-
indices = b2j.setdefault(elt, [])
283-
indices.append(i)
284-
285-
# Purge junk elements
286-
self.bjunk = junk = set()
287378
isjunk = self.isjunk
288-
if isjunk:
289-
for elt in b2j.keys():
290-
if isjunk(elt):
291-
junk.add(elt)
292-
for elt in junk: # separate loop avoids separate list of keys
293-
del b2j[elt]
294-
295-
# Purge popular elements that are not junk
379+
self.bjunk = junk = set()
380+
autojunk = self.autojunk
296381
self.bpopular = popular = set()
297-
n = len(b)
298-
if self.autojunk and n >= 200:
299-
ntest = n // 100 + 1
300-
for elt, idxs in b2j.items():
301-
if len(idxs) > ntest:
302-
popular.add(elt)
303-
for elt in popular: # ditto; as fast for 1% deletion
304-
del b2j[elt]
382+
self.b2j = b2j = {}
383+
if autojunk:
384+
for i, elt in enumerate(b):
385+
indices = b2j.setdefault(elt, [])
386+
indices.append(i)
387+
388+
# Purge junk elements
389+
if isjunk:
390+
for elt in b2j.keys():
391+
if isjunk(elt):
392+
junk.add(elt)
393+
for elt in junk: # separate loop avoids separate list of keys
394+
del b2j[elt]
395+
396+
# Purge popular elements that are not junk
397+
n = len(b)
398+
if autojunk and n >= 200:
399+
ntest = n // 100 + 1
400+
for elt, idxs in b2j.items():
401+
if len(idxs) > ntest:
402+
popular.add(elt)
403+
for elt in popular: # ditto; as fast for 1% deletion
404+
del b2j[elt]
405+
else:
406+
# Prepare LCSUB Automaton
407+
if isjunk:
408+
bcounts = _Counter(b)
409+
junk.update(filter(isjunk, bcounts))
410+
for elt in junk:
411+
del bcounts[elt]
412+
self.aut_cache = (None, None) # Cache last automaton
413+
self.all_junk = junk | popular
305414

306415
def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
307416
"""Find longest matching block in a[alo:ahi] and b[blo:bhi].
@@ -361,32 +470,43 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
361470
# Windiff ends up at the same place as diff, but by pairing up
362471
# the unique 'b's and then matching the first two 'a's.
363472

364-
a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.bjunk.__contains__
473+
a, b, isbjunk = self.a, self.b, self.bjunk.__contains__
365474
if ahi is None:
366475
ahi = len(a)
367476
if bhi is None:
368477
bhi = len(b)
369-
besti, bestj, bestsize = alo, blo, 0
370-
# find longest junk-free match
371-
# during an iteration of the loop, j2len[j] = length of longest
372-
# junk-free match ending with a[i-1] and b[j]
373-
j2len = {}
374-
nothing = []
375-
for i in range(alo, ahi):
376-
# look at all instances of a[i] in b; note that because
377-
# b2j has no junk keys, the loop is skipped if a[i] is junk
378-
j2lenget = j2len.get
379-
newj2len = {}
380-
for j in b2j.get(a[i], nothing):
381-
# a[i] matches b[j]
382-
if j < blo:
383-
continue
384-
if j >= bhi:
385-
break
386-
k = newj2len[j] = j2lenget(j-1, 0) + 1
387-
if k > bestsize:
388-
besti, bestj, bestsize = i-k+1, j-k+1, k
389-
j2len = newj2len
478+
if alo >= ahi:
479+
besti, bestj, bestsize = alo, blo, 0
480+
elif self.autojunk:
481+
b2j = self.b2j
482+
besti, bestj, bestsize = alo, blo, 0
483+
# find longest junk-free match
484+
# during an iteration of the loop, j2len[j] = length of longest
485+
# junk-free match ending with a[i-1] and b[j]
486+
j2len = {}
487+
nothing = []
488+
for i in range(alo, ahi):
489+
# look at all instances of a[i] in b; note that because
490+
# b2j has no junk keys, the loop is skipped if a[i] is junk
491+
j2lenget = j2len.get
492+
newj2len = {}
493+
for j in b2j.get(a[i], nothing):
494+
# a[i] matches b[j]
495+
if j < blo:
496+
continue
497+
if j >= bhi:
498+
break
499+
k = newj2len[j] = j2lenget(j-1, 0) + 1
500+
if k > bestsize:
501+
besti, bestj, bestsize = i-k+1, j-k+1, k
502+
j2len = newj2len
503+
else:
504+
# Without autojunk, run LCSUB Automaton
505+
blo_bhi, aut = self.aut_cache
506+
if aut is None or blo_bhi != (blo, bhi):
507+
aut = _LCSUBAutomaton(b, blo, bhi, junk=self.all_junk)
508+
self.aut_cache = ((blo, bhi), aut)
509+
besti, bestj, bestsize = aut.find(a, alo, ahi)
390510

391511
# Extend the best by non-junk elements on each end. In particular,
392512
# "popular" non-junk elements aren't in b2j, which greatly speeds

0 commit comments

Comments
 (0)