Skip to content

Commit 61d8eea

Browse files
unamedkr and claude
committed
feat(wasm): add Phi-3.5-mini to browser demo + v0.13.0 Reddit post draft
- WASM demo: Phi-3.5-mini (Q4_K_M, 2.2 GB) added as recommended model with Phi-3 chat template. SmolLM2-135M kept as fast-download option. - WASM binary rebuilt (320 KB) with latest quant.h (Phi-3 support). - Reddit post draft in docs/pr/ — follows feedback principles: lead with "what you can build", measurement-backed claims only, recommend llama.cpp for GPU speed, no comparisons. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 4eef1f9 commit 61d8eea

File tree

8 files changed

+192
-138
lines changed

8 files changed

+192
-138
lines changed

bench/rlv/stages/_text.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
"""Shared text matching utilities for RLV stages.
2+
3+
Extracted from locator.py and verifier.py to eliminate code duplication
4+
(audit issues I2, I3). All fuzzy matching, normalization, and keyword
5+
extraction functions live here.
6+
"""
7+
import re
8+
from typing import List, Tuple
9+
10+
11+
# ============================================================
12+
# Text normalization
13+
# ============================================================
14+
# Pre-compiled at import time: normalize() runs once per term inside the
# keyword-scoring loops, so hoisting the pattern avoids a per-call lookup
# in the re module's pattern cache.
_NORMALIZE_RE = re.compile(r"[^a-z0-9 ]+")


def normalize(s: str) -> str:
    """Lowercase *s* and replace each run of non-alphanumeric, non-space
    characters with a single space.

    Used for fuzzy matching against Q4 visual jitter. Note that double,
    leading, or trailing spaces may remain (callers tokenize via
    ``str.split()``, which ignores them).
    """
    return _NORMALIZE_RE.sub(" ", s.lower())
18+
19+
20+
# ============================================================
21+
# Fuzzy word matching (Q4 jitter tolerant)
22+
# ============================================================
23+
def word_in_text(word: str, text_norm: str) -> bool:
    """Word-boundary-aware fuzzy match, tolerant of Q4 KV jitter.

    The query ``word`` matches a word from the normalized text when:
      - they are exactly equal, or
      - they share a prefix of at least ``min_prefix`` characters
        (4 when the query is longer than 6 chars, else 3) AND the shared
        prefix covers all but at most 2 characters of the shorter word.

    Matching word-by-word (never by raw substring) avoids traps such as
    "event" hitting "revenue" via "even". Queries shorter than 3 chars
    never match.
    """
    if not word or len(word) < 3:
        return False
    query = word.lower()
    min_prefix = 4 if len(query) > 6 else 3

    def _matches(candidate: str) -> bool:
        # Exact hit, or a long-enough shared prefix per the docstring rules.
        if candidate == query:
            return True
        shorter = min(len(query), len(candidate))
        prefix_len = next(
            (i for i, (qc, cc) in enumerate(zip(query, candidate)) if qc != cc),
            shorter,
        )
        return prefix_len >= min_prefix and prefix_len >= shorter - 2

    return any(_matches(rw) for rw in text_norm.split() if rw)
51+
52+
53+
def term_in_text(term: str, text_norm: str) -> bool:
    """Match a (possibly multi-word) ``term`` against normalized text.

    Fast path: a multi-word term appearing verbatim as a substring of
    ``text_norm`` matches immediately. Otherwise at least half of the
    term's significant words (length >= 3, count rounded up) must
    fuzzy-match individually via :func:`word_in_text`.
    """
    normalized = normalize(term)
    if not normalized:
        return False
    # Whole-phrase substring shortcut applies to multi-word terms only.
    if " " in normalized and normalized in text_norm:
        return True
    significant = [tok for tok in normalized.split() if len(tok) >= 3]
    if not significant:
        return False
    hits = sum(word_in_text(tok, text_norm) for tok in significant)
    return hits >= max(1, (len(significant) + 1) // 2)
66+
67+
68+
def fuzzy_in_region(term: str, region_norm: str) -> bool:
    """Report whether ``term`` (single- or multi-word) occurs in the
    already-normalized region text, tolerating Q4 visual jitter on
    individual words. Delegates entirely to :func:`term_in_text`.
    """
    found = term_in_text(term, region_norm)
    return found
72+
73+
74+
# ============================================================
75+
# Stopwords & low-signal terms
76+
# ============================================================
77+
# Common English stopwords + interrogatives + low-information question
# fillers, excluded during keyword extraction so they never contribute
# to chunk scores.
STOPWORDS = set(
    """
    a an the is are was were be been being
    of in on at to for from by with as
    and or but if then than that this these
    those what which who whom whose where when
    why how do does did done doing have has
    had having i you he she it we they
    me him her us them my your his its our
    their about into through during before after
    above below between out off over under
    again further once here there all any both
    each few more most other some such no nor
    not only own same so too very can will
    just would should could may might must
    much many long ago later well thing
    something anything nothing everything people
    person anyone someone
    """.split()
)
95+
96+
# Business/document boilerplate words that add noise to keyword scores.
LOW_SIGNAL_TERMS = {"company", "year", "section", "report", "annual", "fiscal"}

bench/rlv/stages/gist.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,14 @@ class GistChunk:
5353
char_end: int
5454
head_text: str = "" # first ~200 chars (used by LLM-fallback outline)
5555
full_text: str = "" # complete chunk text (used by Day 3 non-LLM keyword scoring)
56+
full_text_norm: str = "" # D6/D13: pre-normalized text (avoids re-normalizing per score call)
5657
entities: List[str] = field(default_factory=list)
5758
summary: str = "" # optional LLM-generated summary
5859

5960
def to_dict(self):
60-
return asdict(self)
61+
d = asdict(self)
62+
d.pop("full_text_norm", None) # don't serialize internal cache
63+
return d
6164

6265

6366
@dataclass
@@ -240,12 +243,15 @@ def build_gist(
240243
else:
241244
summary = _parse_summary_response(s_result.text)
242245

246+
# D6/D13: pre-normalize text once during gist build
247+
from ._text import normalize as _norm
243248
gc = GistChunk(
244249
chunk_id=i,
245250
char_start=start,
246251
char_end=end,
247252
head_text=head_text,
248253
full_text=full_text,
254+
full_text_norm=_norm(full_text),
249255
entities=entities,
250256
summary=summary,
251257
)

bench/rlv/stages/locator.py

Lines changed: 8 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -26,32 +26,8 @@
2626

2727
from . import _llm
2828
from .gist import Gist
29-
30-
31-
# Common English stopwords + interrogatives + low-information question fillers.
32-
STOPWORDS = {
33-
"a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
34-
"of", "in", "on", "at", "to", "for", "from", "by", "with", "as",
35-
"and", "or", "but", "if", "then", "than", "that", "this", "these",
36-
"those", "what", "which", "who", "whom", "whose", "where", "when",
37-
"why", "how", "do", "does", "did", "done", "doing", "have", "has",
38-
"had", "having", "i", "you", "he", "she", "it", "we", "they",
39-
"me", "him", "her", "us", "them", "my", "your", "his", "its", "our",
40-
"their", "about", "into", "through", "during", "before", "after",
41-
"above", "below", "between", "out", "off", "over", "under",
42-
"again", "further", "once", "here", "there", "all", "any", "both",
43-
"each", "few", "more", "most", "other", "some", "such", "no", "nor",
44-
"not", "only", "own", "same", "so", "too", "very", "can", "will",
45-
"just", "would", "should", "could", "may", "might", "must",
46-
"much", "many", "long", "ago", "later", "well", "thing",
47-
"something", "anything", "nothing", "everything", "people",
48-
"person", "anyone", "someone",
49-
}
50-
51-
# Common business/document filler that adds noise to the score
52-
LOW_SIGNAL_TERMS = {
53-
"company", "year", "section", "report", "annual", "fiscal",
54-
}
29+
from ._text import (normalize as _normalize, word_in_text as _word_in_text,
30+
term_in_text as _term_in_text, STOPWORDS, LOW_SIGNAL_TERMS)
5531

5632
# Section title region weighting — words appearing in the first
5733
# SECTION_TITLE_CHARS of a chunk get this multiplier (matches in headers
@@ -97,11 +73,8 @@ def to_dict(self):
9773

9874
# ----------------------------------------------------------------------------
9975
# Non-LLM keyword overlap (primary signal)
76+
# _normalize, _word_in_text, _term_in_text imported from _text.py (I2/I3 dedup)
10077
# ----------------------------------------------------------------------------
101-
def _normalize(s: str) -> str:
102-
return re.sub(r"[^a-z0-9 ]+", " ", s.lower())
103-
104-
10578
def _question_keywords(question: str) -> List[Tuple[str, float]]:
10679
"""Extract weighted (term, weight) tuples from a question."""
10780
terms: List[Tuple[str, float]] = []
@@ -131,51 +104,6 @@ def add(term: str, weight: float) -> None:
131104
return terms
132105

133106

134-
def _word_in_text(word: str, text_norm: str) -> bool:
135-
"""Day 3 word-boundary-aware fuzzy match.
136-
137-
A `word` matches a region word `rw` if:
138-
- exact: rw == word
139-
- shared prefix: ≥4 chars (≥3 for short ≤6-char words), with the
140-
shared prefix at least min(len(w), len(rw)) - 2.
141-
Word-by-word matching avoids the substring trap (e.g., "event" in
142-
"revenue" via "even").
143-
"""
144-
if not word or len(word) < 3:
145-
return False
146-
w = word.lower()
147-
min_prefix = 4 if len(w) > 6 else 3
148-
for rw in text_norm.split():
149-
if not rw:
150-
continue
151-
if rw == w:
152-
return True
153-
shared = 0
154-
for a, b in zip(w, rw):
155-
if a == b:
156-
shared += 1
157-
else:
158-
break
159-
if shared >= min_prefix and shared >= min(len(w), len(rw)) - 2:
160-
return True
161-
return False
162-
163-
164-
def _term_in_text(term: str, text_norm: str) -> bool:
165-
"""Multi-word term match: ≥50% of the words must fuzzy-match.
166-
Whole-phrase substring is allowed as a fast path for multi-word terms."""
167-
t = _normalize(term)
168-
if not t:
169-
return False
170-
if " " in t and t in text_norm:
171-
return True
172-
words = [w for w in t.split() if len(w) >= 3]
173-
if not words:
174-
return False
175-
matched = sum(1 for w in words if _word_in_text(w, text_norm))
176-
return matched >= max(1, (len(words) + 1) // 2)
177-
178-
179107
_HEADING_RE = re.compile(r"^(?:section|chapter|part|appendix)\s*[ivxlcdm\d]+\s*[:.\-]", re.IGNORECASE)
180108

181109

@@ -213,7 +141,8 @@ def _score_chunk(weighted_terms: List[Tuple[str, float]], chunk) -> float:
213141
mid-paragraph, and an unconditional bonus boosts incidental words.
214142
"""
215143
text = chunk.full_text or chunk.head_text
216-
text_norm = _normalize(text)
144+
# D6/D13: use pre-computed normalized text if available
145+
text_norm = chunk.full_text_norm if chunk.full_text_norm else _normalize(text)
217146
has_heading = _looks_like_heading(text)
218147
title_norm = _normalize(text[:SECTION_TITLE_CHARS]) if (text and has_heading) else ""
219148
entities_norm = _normalize(" ".join(chunk.entities)) if chunk.entities else ""
@@ -270,7 +199,9 @@ def _bm25_score_chunks(question: str, gist: Gist, excluded: List[int],
270199
return []
271200

272201
# Document frequency for each term
273-
texts = [_normalize(c.full_text or c.head_text) for c in chunks]
202+
# D13: use pre-computed normalized text where available
203+
texts = [c.full_text_norm if c.full_text_norm else _normalize(c.full_text or c.head_text)
204+
for c in chunks]
274205
avg_dl = sum(len(t.split()) for t in texts) / max(N, 1)
275206

276207
df = {}

bench/rlv/stages/lookup.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,14 @@ def lookup(
118118
original text. The model never has to QUOTE — only SELECT.
119119
"""
120120
region_text = doc_text[region.char_start:region.char_end]
121+
122+
# A9: guard empty region (e.g., fallback pointer with char_start=char_end=0)
123+
if not region_text.strip():
124+
return LookupResult(
125+
answer="[no text in selected region]", region_text=region_text,
126+
chunk_id=region.chunk_id, raw_llm_output="", method="error",
127+
)
128+
121129
sentences = _split_into_sentences(region_text)
122130

123131
# Day 4 adaptive lookup: select-by-index for small chunks (≤8 sentences,

bench/rlv/stages/verifier.py

Lines changed: 5 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@
2626
from . import _llm
2727
from .gist import Gist
2828
from .lookup import LookupResult
29-
from .locator import _question_keywords, _term_in_text, _keyword_locate, _normalize as _loc_normalize
29+
from ._text import normalize as _normalize, word_in_text as _fuzzy_word_in_region, fuzzy_in_region as _fuzzy_in_region
30+
from .locator import _question_keywords, _term_in_text, _keyword_locate
3031

3132

3233
# Day 3: model-side preamble tokens that show up in answer text but
@@ -60,18 +61,11 @@ class VerifyResult:
6061

6162
# ----------------------------------------------------------------------------
6263
# Literal (regex-based) citation check
64+
# _normalize, _fuzzy_word_in_region, _fuzzy_in_region imported from _text.py
6365
# ----------------------------------------------------------------------------
64-
def _normalize(s: str) -> str:
65-
"""Lowercase and strip non-alphanum-or-space. Used for fuzzy matching
66-
against Q4 visual jitter."""
67-
return re.sub(r"[^a-z0-9 ]+", " ", s.lower())
68-
69-
7066
def _extract_answer_key_terms(answer: str) -> tuple[list[str], list[str]]:
71-
"""Day 3: returns (word_terms, number_terms) so the matcher can apply
72-
different rules — words use fuzzy matching for Q4 jitter, numbers
73-
use exact match (2002 must NOT fuzzy-match 2023). Filters
74-
ANSWER_NOISE_TOKENS so model preambles don't get extracted as facts."""
67+
"""Returns (word_terms, number_terms). Words use fuzzy matching for Q4
68+
jitter, numbers use exact match. Filters ANSWER_NOISE_TOKENS."""
7569
multi_cap = re.findall(r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b", answer)
7670
single_cap = re.findall(r"\b[A-Z][a-z]{3,}\b", answer)
7771
nums = re.findall(r"\b\d{2,5}\b", answer)
@@ -82,7 +76,6 @@ def _extract_answer_key_terms(answer: str) -> tuple[list[str], list[str]]:
8276
key = term.lower()
8377
if key in seen:
8478
continue
85-
# Exact word match (not substring) — "text" must not filter "context"
8679
if key in ANSWER_NOISE_TOKENS:
8780
continue
8881
seen.add(key)
@@ -99,50 +92,6 @@ def _extract_answer_key_terms(answer: str) -> tuple[list[str], list[str]]:
9992
return word_terms[:8], number_terms[:4]
10093

10194

102-
def _fuzzy_word_in_region(word: str, region_norm: str) -> bool:
103-
"""Day 3: word-boundary-aware fuzzy match. Iterates region words and
104-
checks shared-prefix similarity. Avoids cross-word substring traps
105-
like "event" matching "revenue" via "even"."""
106-
if not word or len(word) < 3:
107-
return False
108-
w = word.lower()
109-
min_prefix = 4 if len(w) > 6 else 3
110-
for rw in region_norm.split():
111-
if not rw:
112-
continue
113-
if rw == w:
114-
return True
115-
shared = 0
116-
for a, b in zip(w, rw):
117-
if a == b:
118-
shared += 1
119-
else:
120-
break
121-
if shared >= min_prefix and shared >= min(len(w), len(rw)) - 2:
122-
return True
123-
return False
124-
125-
126-
def _fuzzy_in_region(term: str, region_norm: str) -> bool:
127-
"""Return True if `term` (possibly multi-word) appears in the region,
128-
tolerant of Q4 visual jitter on individual words.
129-
130-
For multi-word terms (e.g., "John Williams"), require that ≥50% of the
131-
words match individually via _fuzzy_word_in_region. For single words,
132-
require that one word matches.
133-
"""
134-
term_norm = _normalize(term)
135-
if not term_norm:
136-
return False
137-
if term_norm in region_norm:
138-
return True
139-
words = [w for w in term_norm.split() if len(w) >= 3]
140-
if not words:
141-
return False
142-
matched = sum(1 for w in words if _fuzzy_word_in_region(w, region_norm))
143-
return matched >= max(1, len(words) // 2 + (len(words) % 2))
144-
145-
14695
def _question_grounded_via_locator(
14796
question: str,
14897
chunk_id: int | None,

0 commit comments

Comments
 (0)