Skip to content

Commit 61d8eea

Browse files
unamedkr and claude
committed
feat(wasm): add Phi-3.5-mini to browser demo + v0.13.0 Reddit post draft
- WASM demo: Phi-3.5-mini (Q4_K_M, 2.2 GB) added as recommended model with Phi-3 chat template. SmolLM2-135M kept as fast-download option. - WASM binary rebuilt (320 KB) with latest quant.h (Phi-3 support). - Reddit post draft in docs/pr/ — follows feedback principles: lead with "what you can build", measurement-backed claims only, recommend llama.cpp for GPU speed, no comparisons. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 4eef1f9 commit 61d8eea

File tree

8 files changed

+192
-138
lines changed

8 files changed

+192
-138
lines changed

bench/rlv/stages/_text.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
"""Shared text matching utilities for RLV stages.
2+
3+
Extracted from locator.py and verifier.py to eliminate code duplication
4+
(audit issues I2, I3). All fuzzy matching, normalization, and keyword
5+
extraction functions live here.
6+
"""
7+
import re
8+
from typing import List, Tuple
9+
10+
11+
# ============================================================
12+
# Text normalization
13+
# ============================================================
14+
# Pre-compiled at import time: normalize() runs once per term inside the
# keyword-scoring loops, so hoisting the pattern avoids a per-call lookup
# in the re module's pattern cache.
_NORMALIZE_RE = re.compile(r"[^a-z0-9 ]+")


def normalize(s: str) -> str:
    """Lowercase *s* and replace each run of non-alphanumeric, non-space
    characters with a single space.

    Used for fuzzy matching against Q4 visual jitter. Note that double,
    leading, or trailing spaces may remain (callers tokenize via
    ``str.split()``, which ignores them).
    """
    return _NORMALIZE_RE.sub(" ", s.lower())
18+
19+
20+
# ============================================================
21+
# Fuzzy word matching (Q4 jitter tolerant)
22+
# ============================================================
23+
def word_in_text(word: str, text_norm: str) -> bool:
    """Word-boundary-aware fuzzy match, tolerant of Q4 KV jitter.

    The query ``word`` matches a word from the normalized text when:
      - they are exactly equal, or
      - they share a prefix of at least ``min_prefix`` characters
        (4 when the query is longer than 6 chars, else 3) AND the shared
        prefix covers all but at most 2 characters of the shorter word.

    Matching word-by-word (never by raw substring) avoids traps such as
    "event" hitting "revenue" via "even". Queries shorter than 3 chars
    never match.
    """
    if not word or len(word) < 3:
        return False
    query = word.lower()
    min_prefix = 4 if len(query) > 6 else 3

    def _matches(candidate: str) -> bool:
        # Exact hit, or a long-enough shared prefix per the docstring rules.
        if candidate == query:
            return True
        shorter = min(len(query), len(candidate))
        prefix_len = next(
            (i for i, (qc, cc) in enumerate(zip(query, candidate)) if qc != cc),
            shorter,
        )
        return prefix_len >= min_prefix and prefix_len >= shorter - 2

    return any(_matches(rw) for rw in text_norm.split() if rw)
51+
52+
53+
def term_in_text(term: str, text_norm: str) -> bool:
    """Match a (possibly multi-word) ``term`` against normalized text.

    Fast path: a multi-word term appearing verbatim as a substring of
    ``text_norm`` matches immediately. Otherwise at least half of the
    term's significant words (length >= 3, count rounded up) must
    fuzzy-match individually via :func:`word_in_text`.
    """
    normalized = normalize(term)
    if not normalized:
        return False
    # Whole-phrase substring shortcut applies to multi-word terms only.
    if " " in normalized and normalized in text_norm:
        return True
    significant = [tok for tok in normalized.split() if len(tok) >= 3]
    if not significant:
        return False
    hits = sum(word_in_text(tok, text_norm) for tok in significant)
    return hits >= max(1, (len(significant) + 1) // 2)
66+
67+
68+
def fuzzy_in_region(term: str, region_norm: str) -> bool:
    """Report whether ``term`` (single- or multi-word) occurs in the
    already-normalized region text, tolerating Q4 visual jitter on
    individual words. Delegates entirely to :func:`term_in_text`.
    """
    found = term_in_text(term, region_norm)
    return found
72+
73+
74+
# ============================================================
75+
# Stopwords & low-signal terms
76+
# ============================================================
77+
# Common English stopwords + interrogatives + low-information question
# fillers, excluded during keyword extraction so they never contribute
# to chunk scores.
STOPWORDS = set(
    """
    a an the is are was were be been being
    of in on at to for from by with as
    and or but if then than that this these
    those what which who whom whose where when
    why how do does did done doing have has
    had having i you he she it we they
    me him her us them my your his its our
    their about into through during before after
    above below between out off over under
    again further once here there all any both
    each few more most other some such no nor
    not only own same so too very can will
    just would should could may might must
    much many long ago later well thing
    something anything nothing everything people
    person anyone someone
    """.split()
)
95+
96+
# Business/document boilerplate words that add noise to keyword scores.
LOW_SIGNAL_TERMS = {"company", "year", "section", "report", "annual", "fiscal"}

bench/rlv/stages/gist.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,14 @@ class GistChunk:
5353
char_end: int
5454
head_text: str = "" # first ~200 chars (used by LLM-fallback outline)
5555
full_text: str = "" # complete chunk text (used by Day 3 non-LLM keyword scoring)
56+
full_text_norm: str = "" # D6/D13: pre-normalized text (avoids re-normalizing per score call)
5657
entities: List[str] = field(default_factory=list)
5758
summary: str = "" # optional LLM-generated summary
5859

5960
def to_dict(self):
60-
return asdict(self)
61+
d = asdict(self)
62+
d.pop("full_text_norm", None) # don't serialize internal cache
63+
return d
6164

6265

6366
@dataclass
@@ -240,12 +243,15 @@ def build_gist(
240243
else:
241244
summary = _parse_summary_response(s_result.text)
242245

246+
# D6/D13: pre-normalize text once during gist build
247+
from ._text import normalize as _norm
243248
gc = GistChunk(
244249
chunk_id=i,
245250
char_start=start,
246251
char_end=end,
247252
head_text=head_text,
248253
full_text=full_text,
254+
full_text_norm=_norm(full_text),
249255
entities=entities,
250256
summary=summary,
251257
)

bench/rlv/stages/locator.py

Lines changed: 8 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -26,32 +26,8 @@
2626

2727
from . import _llm
2828
from .gist import Gist
29-
30-
31-
# Common English stopwords + interrogatives + low-information question fillers.
32-
STOPWORDS = {
33-
"a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
34-
"of", "in", "on", "at", "to", "for", "from", "by", "with", "as",
35-
"and", "or", "but", "if", "then", "than", "that", "this", "these",
36-
"those", "what", "which", "who", "whom", "whose", "where", "when",
37-
"why", "how", "do", "does", "did", "done", "doing", "have", "has",
38-
"had", "having", "i", "you", "he", "she", "it", "we", "they",
39-
"me", "him", "her", "us", "them", "my", "your", "his", "its", "our",
40-
"their", "about", "into", "through", "during", "before", "after",
41-
"above", "below", "between", "out", "off", "over", "under",
42-
"again", "further", "once", "here", "there", "all", "any", "both",
43-
"each", "few", "more", "most", "other", "some", "such", "no", "nor",
44-
"not", "only", "own", "same", "so", "too", "very", "can", "will",
45-
"just", "would", "should", "could", "may", "might", "must",
46-
"much", "many", "long", "ago", "later", "well", "thing",
47-
"something", "anything", "nothing", "everything", "people",
48-
"person", "anyone", "someone",
49-
}
50-
51-
# Common business/document filler that adds noise to the score
52-
LOW_SIGNAL_TERMS = {
53-
"company", "year", "section", "report", "annual", "fiscal",
54-
}
29+
from ._text import (normalize as _normalize, word_in_text as _word_in_text,
30+
term_in_text as _term_in_text, STOPWORDS, LOW_SIGNAL_TERMS)
5531

5632
# Section title region weighting — words appearing in the first
5733
# SECTION_TITLE_CHARS of a chunk get this multiplier (matches in headers
@@ -97,11 +73,8 @@ def to_dict(self):
9773

9874
# ----------------------------------------------------------------------------
9975
# Non-LLM keyword overlap (primary signal)
76+
# _normalize, _word_in_text, _term_in_text imported from _text.py (I2/I3 dedup)
10077
# ----------------------------------------------------------------------------
101-
def _normalize(s: str) -> str:
102-
return re.sub(r"[^a-z0-9 ]+", " ", s.lower())
103-
104-
10578
def _question_keywords(question: str) -> List[Tuple[str, float]]:
10679
"""Extract weighted (term, weight) tuples from a question."""
10780
terms: List[Tuple[str, float]] = []
@@ -131,51 +104,6 @@ def add(term: str, weight: float) -> None:
131104
return terms
132105

133106

134-
def _word_in_text(word: str, text_norm: str) -> bool:
135-
"""Day 3 word-boundary-aware fuzzy match.
136-
137-
A `word` matches a region word `rw` if:
138-
- exact: rw == word
139-
- shared prefix: ≥4 chars (≥3 for short ≤6-char words), with the
140-
shared prefix at least min(len(w), len(rw)) - 2.
141-
Word-by-word matching avoids the substring trap (e.g., "event" in
142-
"revenue" via "even").
143-
"""
144-
if not word or len(word) < 3:
145-
return False
146-
w = word.lower()
147-
min_prefix = 4 if len(w) > 6 else 3
148-
for rw in text_norm.split():
149-
if not rw:
150-
continue
151-
if rw == w:
152-
return True
153-
shared = 0
154-
for a, b in zip(w, rw):
155-
if a == b:
156-
shared += 1
157-
else:
158-
break
159-
if shared >= min_prefix and shared >= min(len(w), len(rw)) - 2:
160-
return True
161-
return False
162-
163-
164-
def _term_in_text(term: str, text_norm: str) -> bool:
165-
"""Multi-word term match: ≥50% of the words must fuzzy-match.
166-
Whole-phrase substring is allowed as a fast path for multi-word terms."""
167-
t = _normalize(term)
168-
if not t:
169-
return False
170-
if " " in t and t in text_norm:
171-
return True
172-
words = [w for w in t.split() if len(w) >= 3]
173-
if not words:
174-
return False
175-
matched = sum(1 for w in words if _word_in_text(w, text_norm))
176-
return matched >= max(1, (len(words) + 1) // 2)
177-
178-
179107
_HEADING_RE = re.compile(r"^(?:section|chapter|part|appendix)\s*[ivxlcdm\d]+\s*[:.\-]", re.IGNORECASE)
180108

181109

@@ -213,7 +141,8 @@ def _score_chunk(weighted_terms: List[Tuple[str, float]], chunk) -> float:
213141
mid-paragraph, and an unconditional bonus boosts incidental words.
214142
"""
215143
text = chunk.full_text or chunk.head_text
216-
text_norm = _normalize(text)
144+
# D6/D13: use pre-computed normalized text if available
145+
text_norm = chunk.full_text_norm if chunk.full_text_norm else _normalize(text)
217146
has_heading = _looks_like_heading(text)
218147
title_norm = _normalize(text[:SECTION_TITLE_CHARS]) if (text and has_heading) else ""
219148
entities_norm = _normalize(" ".join(chunk.entities)) if chunk.entities else ""
@@ -270,7 +199,9 @@ def _bm25_score_chunks(question: str, gist: Gist, excluded: List[int],
270199
return []
271200

272201
# Document frequency for each term
273-
texts = [_normalize(c.full_text or c.head_text) for c in chunks]
202+
# D13: use pre-computed normalized text where available
203+
texts = [c.full_text_norm if c.full_text_norm else _normalize(c.full_text or c.head_text)
204+
for c in chunks]
274205
avg_dl = sum(len(t.split()) for t in texts) / max(N, 1)
275206

276207
df = {}

bench/rlv/stages/lookup.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,14 @@ def lookup(
118118
original text. The model never has to QUOTE — only SELECT.
119119
"""
120120
region_text = doc_text[region.char_start:region.char_end]
121+
122+
# A9: guard empty region (e.g., fallback pointer with char_start=char_end=0)
123+
if not region_text.strip():
124+
return LookupResult(
125+
answer="[no text in selected region]", region_text=region_text,
126+
chunk_id=region.chunk_id, raw_llm_output="", method="error",
127+
)
128+
121129
sentences = _split_into_sentences(region_text)
122130

123131
# Day 4 adaptive lookup: select-by-index for small chunks (≤8 sentences,

bench/rlv/stages/verifier.py

Lines changed: 5 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@
2626
from . import _llm
2727
from .gist import Gist
2828
from .lookup import LookupResult
29-
from .locator import _question_keywords, _term_in_text, _keyword_locate, _normalize as _loc_normalize
29+
from ._text import normalize as _normalize, word_in_text as _fuzzy_word_in_region, fuzzy_in_region as _fuzzy_in_region
30+
from .locator import _question_keywords, _term_in_text, _keyword_locate
3031

3132

3233
# Day 3: model-side preamble tokens that show up in answer text but
@@ -60,18 +61,11 @@ class VerifyResult:
6061

6162
# ----------------------------------------------------------------------------
6263
# Literal (regex-based) citation check
64+
# _normalize, _fuzzy_word_in_region, _fuzzy_in_region imported from _text.py
6365
# ----------------------------------------------------------------------------
64-
def _normalize(s: str) -> str:
65-
"""Lowercase and strip non-alphanum-or-space. Used for fuzzy matching
66-
against Q4 visual jitter."""
67-
return re.sub(r"[^a-z0-9 ]+", " ", s.lower())
68-
69-
7066
def _extract_answer_key_terms(answer: str) -> tuple[list[str], list[str]]:
71-
"""Day 3: returns (word_terms, number_terms) so the matcher can apply
72-
different rules — words use fuzzy matching for Q4 jitter, numbers
73-
use exact match (2002 must NOT fuzzy-match 2023). Filters
74-
ANSWER_NOISE_TOKENS so model preambles don't get extracted as facts."""
67+
"""Returns (word_terms, number_terms). Words use fuzzy matching for Q4
68+
jitter, numbers use exact match. Filters ANSWER_NOISE_TOKENS."""
7569
multi_cap = re.findall(r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b", answer)
7670
single_cap = re.findall(r"\b[A-Z][a-z]{3,}\b", answer)
7771
nums = re.findall(r"\b\d{2,5}\b", answer)
@@ -82,7 +76,6 @@ def _extract_answer_key_terms(answer: str) -> tuple[list[str], list[str]]:
8276
key = term.lower()
8377
if key in seen:
8478
continue
85-
# Exact word match (not substring) — "text" must not filter "context"
8679
if key in ANSWER_NOISE_TOKENS:
8780
continue
8881
seen.add(key)
@@ -99,50 +92,6 @@ def _extract_answer_key_terms(answer: str) -> tuple[list[str], list[str]]:
9992
return word_terms[:8], number_terms[:4]
10093

10194

102-
def _fuzzy_word_in_region(word: str, region_norm: str) -> bool:
103-
"""Day 3: word-boundary-aware fuzzy match. Iterates region words and
104-
checks shared-prefix similarity. Avoids cross-word substring traps
105-
like "event" matching "revenue" via "even"."""
106-
if not word or len(word) < 3:
107-
return False
108-
w = word.lower()
109-
min_prefix = 4 if len(w) > 6 else 3
110-
for rw in region_norm.split():
111-
if not rw:
112-
continue
113-
if rw == w:
114-
return True
115-
shared = 0
116-
for a, b in zip(w, rw):
117-
if a == b:
118-
shared += 1
119-
else:
120-
break
121-
if shared >= min_prefix and shared >= min(len(w), len(rw)) - 2:
122-
return True
123-
return False
124-
125-
126-
def _fuzzy_in_region(term: str, region_norm: str) -> bool:
127-
"""Return True if `term` (possibly multi-word) appears in the region,
128-
tolerant of Q4 visual jitter on individual words.
129-
130-
For multi-word terms (e.g., "John Williams"), require that ≥50% of the
131-
words match individually via _fuzzy_word_in_region. For single words,
132-
require that one word matches.
133-
"""
134-
term_norm = _normalize(term)
135-
if not term_norm:
136-
return False
137-
if term_norm in region_norm:
138-
return True
139-
words = [w for w in term_norm.split() if len(w) >= 3]
140-
if not words:
141-
return False
142-
matched = sum(1 for w in words if _fuzzy_word_in_region(w, region_norm))
143-
return matched >= max(1, len(words) // 2 + (len(words) % 2))
144-
145-
14695
def _question_grounded_via_locator(
14796
question: str,
14897
chunk_id: int | None,

0 commit comments

Comments
 (0)