|
32 | 32 |
|
33 | 33 | from _colorize import can_colorize, get_theme |
34 | 34 | from heapq import nlargest as _nlargest |
35 | | -from collections import namedtuple as _namedtuple |
| 35 | +from collections import Counter as _Counter, namedtuple as _namedtuple |
36 | 36 | from types import GenericAlias |
37 | 37 |
|
38 | 38 | Match = _namedtuple('Match', 'a b size') |
39 | 39 |
|
# An automaton state is stored as a plain list ``[length, link, next, pos]``.
# The constants below name the slots of that list.
_LENGTH = 0  # length of the longest substring the state stands for
_LINK = 1    # suffix link: the state to fall back to when a step fails
_NEXT = 2    # dict mapping an element to the state reached by appending it
_POS = 3     # index in s2 of the last element of the state's substring


class _LCSUBAutomaton:
    """Suffix Automaton for finding longest common substring.

    The automaton is built once over ``s2[start2:stop2]`` and can then be
    queried with any number of ``s1`` sequences through find().

    Elements contained in *junk* are never inserted into the automaton and
    restart construction at the root, so no substring it represents spans
    a junk element.

    Elements of *s2* (and of *junk*) must be hashable: they are used as
    dict keys and frozenset members.
    """

    def __init__(self, s2, start2=0, stop2=None, *, junk=()):
        """Build the automaton for ``s2[start2:stop2]``.

        *stop2* defaults to ``len(s2)``.  *junk* may be any iterable of
        hashable elements; it is converted to a frozenset exactly once.
        """
        if stop2 is None:
            stop2 = len(s2)

        self.start2 = start2
        self.stop2 = stop2
        # Use the very same frozenset for building and for find().  Testing
        # membership on the raw argument would be a linear scan for lists
        # and tuples, and would silently see nothing for a one-shot
        # iterable that frozenset() has already consumed.
        self.junk = junk = frozenset(junk)
        self.root = root = [0, None, {}, -1]  # [length, link, next, end_pos]

        last_len = 0
        last = root
        for j in range(start2, stop2):
            c = s2[j]
            if c in junk:
                # A junk element ends the current run: start over at root.
                last_len = 0
                last = root
                continue

            last_len += 1
            curr = [last_len, None, {}, j]

            # Walk the suffix links from `last`, adding the missing
            # transitions on `c` until a state that already has one.
            p = last
            p_next = p[_NEXT]
            while c not in p_next:
                p_next[c] = curr
                if p is root:
                    curr[_LINK] = root
                    break
                p = p[_LINK]
                p_next = p[_NEXT]
            else:
                # `p` already has a transition on `c` (loop ended w/o break).
                q = p_next[c]
                p_length_p1 = p[_LENGTH] + 1
                if p_length_p1 == q[_LENGTH]:
                    curr[_LINK] = q
                else:
                    # `q` also stands for longer substrings: split off a
                    # clone of length len(p) + 1 and redirect to it every
                    # transition on `c` that still points at `q`.
                    clone = [p_length_p1, q[_LINK], q[_NEXT].copy(), q[_POS]]
                    while (p_next := p[_NEXT]).get(c) is q:
                        p_next[c] = clone
                        if p is root:
                            break
                        p = p[_LINK]

                    q[_LINK] = curr[_LINK] = clone

            last = curr

    def find(self, s1, start1=0, stop1=None):
        """Find the longest junk-free block shared with ``s1[start1:stop1]``.

        Return a tuple ``(i, j, k)`` where ``k`` is the length of the
        match, ``i`` its start in *s1* and ``j`` its start in *s2*.  Among
        matches of maximal length the one ending first in *s1* is reported.
        When nothing matches, ``(start1, self.start2, 0)`` is returned.

        NOTE(review): ``j`` is derived from the position stored in the
        matched state; presumably that is the earliest occurrence in s2 --
        confirm before relying on it.
        """
        if stop1 is None:
            stop1 = len(s1)
        root = self.root
        junk = self.junk
        state = root
        cur_len = 0
        best_len = 0
        best_state = None
        best_pos = 0

        for i in range(start1, stop1):
            c = s1[i]
            if c in junk:
                # Matches may not contain junk: restart from scratch.
                state = root
                cur_len = 0
                continue

            # Shorten the current match along the suffix links until it
            # can be extended by `c` (or until nothing is left of it).
            while state is not root and c not in state[_NEXT]:
                state = state[_LINK]
                cur_len = state[_LENGTH]

            transitions = state[_NEXT]
            if c in transitions:
                state = transitions[c]
                cur_len += 1
                # Strict comparison keeps the first maximal match in s1.
                if cur_len > best_len:
                    best_len = cur_len
                    best_state = state
                    best_pos = i

        if not best_len:
            return (start1, self.start2, 0)

        # best_pos and the stored position are indices of the match's
        # last element; convert both to start indices.
        start_in_s1 = best_pos + 1 - best_len
        end_in_s2 = best_state[_POS]
        start_in_s2 = end_in_s2 + 1 - best_len
        return (start_in_s1, start_in_s2, best_len)


def longest_common_substring(s1, s2, start1=0, stop1=None, start2=0, stop2=None,
                             *, junk=()):
    """Return the longest common block of s1[start1:stop1] and s2[start2:stop2].

    The result is a tuple ``(i, j, k)`` such that
    ``s1[i:i+k] == s2[j:j+k]``; ``k`` is 0 when there is no common element,
    in which case ``(start1, start2, 0)`` is returned.  Elements found in
    *junk* are never part of the reported block.

    A fresh automaton is built over *s2* on every call.
    """
    automaton = _LCSUBAutomaton(s2, start2, stop2, junk=junk)
    return automaton.find(s1, start1, stop1)
| 137 | + |
| 138 | + |
40 | 139 | def _calculate_ratio(matches, length): |
41 | 140 | if length: |
42 | 141 | return 2.0 * matches / length |
@@ -276,32 +375,42 @@ def __chain_b(self): |
276 | 375 | # out the junk later is much cheaper than building b2j "right" |
277 | 376 | # from the start. |
278 | 377 | b = self.b |
279 | | - self.b2j = b2j = {} |
280 | | - |
281 | | - for i, elt in enumerate(b): |
282 | | - indices = b2j.setdefault(elt, []) |
283 | | - indices.append(i) |
284 | | - |
285 | | - # Purge junk elements |
286 | | - self.bjunk = junk = set() |
287 | 378 | isjunk = self.isjunk |
288 | | - if isjunk: |
289 | | - for elt in b2j.keys(): |
290 | | - if isjunk(elt): |
291 | | - junk.add(elt) |
292 | | - for elt in junk: # separate loop avoids separate list of keys |
293 | | - del b2j[elt] |
294 | | - |
295 | | - # Purge popular elements that are not junk |
| 379 | + self.bjunk = junk = set() |
| 380 | + autojunk = self.autojunk |
296 | 381 | self.bpopular = popular = set() |
297 | | - n = len(b) |
298 | | - if self.autojunk and n >= 200: |
299 | | - ntest = n // 100 + 1 |
300 | | - for elt, idxs in b2j.items(): |
301 | | - if len(idxs) > ntest: |
302 | | - popular.add(elt) |
303 | | - for elt in popular: # ditto; as fast for 1% deletion |
304 | | - del b2j[elt] |
| 382 | + self.b2j = b2j = {} |
| 383 | + if autojunk: |
| 384 | + for i, elt in enumerate(b): |
| 385 | + indices = b2j.setdefault(elt, []) |
| 386 | + indices.append(i) |
| 387 | + |
| 388 | + # Purge junk elements |
| 389 | + if isjunk: |
| 390 | + for elt in b2j.keys(): |
| 391 | + if isjunk(elt): |
| 392 | + junk.add(elt) |
| 393 | + for elt in junk: # separate loop avoids separate list of keys |
| 394 | + del b2j[elt] |
| 395 | + |
| 396 | + # Purge popular elements that are not junk |
| 397 | + n = len(b) |
| 398 | + if autojunk and n >= 200: |
| 399 | + ntest = n // 100 + 1 |
| 400 | + for elt, idxs in b2j.items(): |
| 401 | + if len(idxs) > ntest: |
| 402 | + popular.add(elt) |
| 403 | + for elt in popular: # ditto; as fast for 1% deletion |
| 404 | + del b2j[elt] |
| 405 | + else: |
| 406 | + # Prepare LCSUB Automaton |
| 407 | + if isjunk: |
| 408 | + bcounts = _Counter(b) |
| 409 | + junk.update(filter(isjunk, bcounts)) |
| 410 | + for elt in junk: |
| 411 | + del bcounts[elt] |
| 412 | + self.aut_cache = (None, None) # Cache last automaton |
| 413 | + self.all_junk = junk | popular |
305 | 414 |
|
306 | 415 | def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): |
307 | 416 | """Find longest matching block in a[alo:ahi] and b[blo:bhi]. |
@@ -361,32 +470,43 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): |
361 | 470 | # Windiff ends up at the same place as diff, but by pairing up |
362 | 471 | # the unique 'b's and then matching the first two 'a's. |
363 | 472 |
|
364 | | - a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.bjunk.__contains__ |
| 473 | + a, b, isbjunk = self.a, self.b, self.bjunk.__contains__ |
365 | 474 | if ahi is None: |
366 | 475 | ahi = len(a) |
367 | 476 | if bhi is None: |
368 | 477 | bhi = len(b) |
369 | | - besti, bestj, bestsize = alo, blo, 0 |
370 | | - # find longest junk-free match |
371 | | - # during an iteration of the loop, j2len[j] = length of longest |
372 | | - # junk-free match ending with a[i-1] and b[j] |
373 | | - j2len = {} |
374 | | - nothing = [] |
375 | | - for i in range(alo, ahi): |
376 | | - # look at all instances of a[i] in b; note that because |
377 | | - # b2j has no junk keys, the loop is skipped if a[i] is junk |
378 | | - j2lenget = j2len.get |
379 | | - newj2len = {} |
380 | | - for j in b2j.get(a[i], nothing): |
381 | | - # a[i] matches b[j] |
382 | | - if j < blo: |
383 | | - continue |
384 | | - if j >= bhi: |
385 | | - break |
386 | | - k = newj2len[j] = j2lenget(j-1, 0) + 1 |
387 | | - if k > bestsize: |
388 | | - besti, bestj, bestsize = i-k+1, j-k+1, k |
389 | | - j2len = newj2len |
| 478 | + if alo >= ahi: |
| 479 | + besti, bestj, bestsize = alo, blo, 0 |
| 480 | + elif self.autojunk: |
| 481 | + b2j = self.b2j |
| 482 | + besti, bestj, bestsize = alo, blo, 0 |
| 483 | + # find longest junk-free match |
| 484 | + # during an iteration of the loop, j2len[j] = length of longest |
| 485 | + # junk-free match ending with a[i-1] and b[j] |
| 486 | + j2len = {} |
| 487 | + nothing = [] |
| 488 | + for i in range(alo, ahi): |
| 489 | + # look at all instances of a[i] in b; note that because |
| 490 | + # b2j has no junk keys, the loop is skipped if a[i] is junk |
| 491 | + j2lenget = j2len.get |
| 492 | + newj2len = {} |
| 493 | + for j in b2j.get(a[i], nothing): |
| 494 | + # a[i] matches b[j] |
| 495 | + if j < blo: |
| 496 | + continue |
| 497 | + if j >= bhi: |
| 498 | + break |
| 499 | + k = newj2len[j] = j2lenget(j-1, 0) + 1 |
| 500 | + if k > bestsize: |
| 501 | + besti, bestj, bestsize = i-k+1, j-k+1, k |
| 502 | + j2len = newj2len |
| 503 | + else: |
| 504 | + # Without autojunk, run LCSUB Automaton |
| 505 | + blo_bhi, aut = self.aut_cache |
| 506 | + if aut is None or blo_bhi != (blo, bhi): |
| 507 | + aut = _LCSUBAutomaton(b, blo, bhi, junk=self.all_junk) |
| 508 | + self.aut_cache = ((blo, bhi), aut) |
| 509 | + besti, bestj, bestsize = aut.find(a, alo, ahi) |
390 | 510 |
|
391 | 511 | # Extend the best by non-junk elements on each end. In particular, |
392 | 512 | # "popular" non-junk elements aren't in b2j, which greatly speeds |
|
0 commit comments