Commit 88873e5

Merge pull request #24 from brianmeyer/codex/rec-172-memory-benchmark-scoring
Make cross-modal benchmark scoring explicitly memory-level
2 parents f6f1fed + 0be196a commit 88873e5
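
This merge splits benchmark scoring into two granularities. Memory-level scoring, now the headline metric, credits a query as a hit when any asset derived from the ground-truth memory is retrieved: a video's transcript sidecar or an extracted frame counts as finding the parent video. Asset-level scoring is reported alongside it and still requires the exact asset path to match. A minimal sketch of the parent-memory mapping the diff introduces (hypothetical corpus-relative paths; the real `_memory_key_for_path` also normalizes against `CORPUS_DIR` first):

```python
# Sketch of the memory-key rule from _memory_key_for_path; the paths are hypothetical.
def memory_key(path: str) -> str:
    if path.endswith(".transcript.json"):
        # A transcript sidecar maps back to its parent video memory.
        return path[: -len(".transcript.json")] + ".mp4"
    # Sub-asset suffixes after "::" are dropped.
    return path.split("::", 1)[0]

assert memory_key("trips/kayak.transcript.json") == "trips/kayak.mp4"
assert memory_key("trips/kayak.mp4::frame_0042") == "trips/kayak.mp4"
assert memory_key("notes/packing.md") == "notes/packing.md"
```

A retrieved asset therefore scores as a memory-level hit whenever its key matches a relevant memory, while the new `asset_level` block records whether the raw asset itself matched.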

2 files changed: 242 additions & 59 deletions

benchmarks/cross_modal_ablation.py

Lines changed: 207 additions & 59 deletions
@@ -27,6 +27,7 @@
 import json
 import math
 import os
+import re
 import shutil
 import sys
 import tempfile
@@ -1365,6 +1366,13 @@ class StageResult:
     mrr_sum: float = 0.0
     precision_at_5_sum: float = 0.0
     precision_at_10_sum: float = 0.0
+    asset_hits_at_1: int = 0
+    asset_hits_at_5: int = 0
+    asset_hits_at_10: int = 0
+    asset_ndcg_sum: float = 0.0
+    asset_mrr_sum: float = 0.0
+    asset_precision_at_5_sum: float = 0.0
+    asset_precision_at_10_sum: float = 0.0
     latencies_ms: list = field(default_factory=list)
     per_query_results: list = field(default_factory=list)

@@ -1407,6 +1415,34 @@ def precision_at_5(self) -> float:
     def precision_at_10(self) -> float:
         return self.precision_at_10_sum / max(self.total_queries, 1)
 
+    @property
+    def asset_recall_at_1(self) -> float:
+        return self.asset_hits_at_1 / max(self.total_queries, 1)
+
+    @property
+    def asset_recall_at_5(self) -> float:
+        return self.asset_hits_at_5 / max(self.total_queries, 1)
+
+    @property
+    def asset_recall_at_10(self) -> float:
+        return self.asset_hits_at_10 / max(self.total_queries, 1)
+
+    @property
+    def asset_ndcg_at_10(self) -> float:
+        return self.asset_ndcg_sum / max(self.total_queries, 1)
+
+    @property
+    def asset_mrr(self) -> float:
+        return self.asset_mrr_sum / max(self.total_queries, 1)
+
+    @property
+    def asset_precision_at_5(self) -> float:
+        return self.asset_precision_at_5_sum / max(self.total_queries, 1)
+
+    @property
+    def asset_precision_at_10(self) -> float:
+        return self.asset_precision_at_10_sum / max(self.total_queries, 1)
+
     @property
     def p50_ms(self) -> float:
         if not self.latencies_ms:
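
The asset_* counters follow the same aggregation contract as the existing memory-level fields: per-query hits and scores accumulate into sums during the run, and each property divides by max(total_queries, 1) so a skipped or empty stage reports 0.0 rather than dividing by zero. A tiny illustration of that pattern (names here are illustrative, not the benchmark's API):

```python
# Illustrative only: mirrors StageResult's sum-then-average pattern.
hits_at_1 = [True, False, True, True]                              # per-query hit flags
recall_at_1 = sum(map(int, hits_at_1)) / max(len(hits_at_1), 1)    # 0.75
empty_stage = 0 / max(0, 1)                                        # 0.0, no ZeroDivisionError
```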
@@ -1462,6 +1498,117 @@ def _ndcg(relevances: List[float], k: int = 10) -> float:
     return dcg / ideal if ideal > 0 else 0.0
 
 
+@dataclass
+class EvaluationMetrics:
+    hit_at_1: bool
+    hit_at_5: bool
+    hit_at_10: bool
+    ndcg: float
+    rr: float
+    precision_at_5: float
+    precision_at_10: float
+
+
+def _normalize_benchmark_path(path: str, corpus_dir: Path) -> str:
+    """Normalize benchmark filepaths to corpus-relative paths when possible."""
+    raw = str(path or "").strip()
+    if not raw:
+        return ""
+
+    if raw.startswith("recallforge://"):
+        without_scheme = raw[len("recallforge://"):]
+        _, _, raw = without_scheme.partition("/")
+        raw = raw or without_scheme
+
+    raw = re.sub(r"\s+", " ", raw).strip()
+    candidate = Path(raw)
+    if candidate.is_absolute():
+        try:
+            return candidate.resolve().relative_to(corpus_dir.resolve()).as_posix()
+        except Exception:
+            return candidate.resolve().as_posix()
+    return raw.lstrip("./")
+
+
+def _memory_key_for_path(path: str, corpus_dir: Path) -> str:
+    """Map a result or ground-truth path to its canonical parent memory path."""
+    normalized = _normalize_benchmark_path(path, corpus_dir)
+    if not normalized:
+        return ""
+    if normalized.endswith(".transcript.json"):
+        return normalized[: -len(".transcript.json")] + ".mp4"
+    return normalized.split("::", 1)[0]
+
+
+def _score_relevances(relevances: List[float]) -> EvaluationMetrics:
+    """Compute benchmark metrics for a ranked relevance vector."""
+    first_hit_rank = next((i + 1 for i, rel in enumerate(relevances[:10]) if rel > 0), None)
+    hit_1 = any(rel > 0 for rel in relevances[:1])
+    hit_5 = any(rel > 0 for rel in relevances[:5])
+    hit_10 = any(rel > 0 for rel in relevances[:10])
+    ndcg = _ndcg(relevances, 10)
+    rr = 1.0 / first_hit_rank if first_hit_rank else 0.0
+
+    max_rel = 2.0
+    prec_5 = (
+        sum(relevances[:5]) / (5 * max_rel)
+        if len(relevances) >= 5
+        else (sum(relevances) / (len(relevances) * max_rel) if relevances else 0.0)
+    )
+    prec_10 = (
+        sum(relevances[:10]) / (10 * max_rel)
+        if len(relevances) >= 10
+        else (sum(relevances) / (len(relevances) * max_rel) if relevances else 0.0)
+    )
+    return EvaluationMetrics(
+        hit_at_1=hit_1,
+        hit_at_5=hit_5,
+        hit_at_10=hit_10,
+        ndcg=ndcg,
+        rr=rr,
+        precision_at_5=prec_5,
+        precision_at_10=prec_10,
+    )
+
+
+def evaluate_results_detailed(
+    results: List[Dict[str, Any]],
+    gt: GroundTruth,
+    corpus_dir: Path,
+) -> Dict[str, EvaluationMetrics]:
+    """Evaluate results at both parent-memory and raw asset granularity."""
+    relevant_asset_scores: Dict[str, int] = {}
+    relevant_memory_scores: Dict[str, int] = {}
+
+    for path in gt.relevant_paths:
+        normalized_path = _normalize_benchmark_path(path, corpus_dir)
+        if normalized_path:
+            relevant_asset_scores[normalized_path] = max(
+                relevant_asset_scores.get(normalized_path, 0),
+                gt.get_relevance_score(path),
+            )
+        memory_key = _memory_key_for_path(path, corpus_dir)
+        if memory_key:
+            relevant_memory_scores[memory_key] = max(
+                relevant_memory_scores.get(memory_key, 0),
+                gt.get_relevance_score(path),
+            )
+
+    memory_relevances: List[float] = []
+    asset_relevances: List[float] = []
+    for result in results[:10]:
+        filepath = result.get("filepath", "")
+        normalized_result = _normalize_benchmark_path(filepath, corpus_dir)
+        asset_relevances.append(float(relevant_asset_scores.get(normalized_result, 0)))
+        memory_key = _memory_key_for_path(filepath, corpus_dir)
+        memory_relevances.append(float(relevant_memory_scores.get(memory_key, 0)))
+
+    return {
+        "memory": _score_relevances(memory_relevances),
+        "asset": _score_relevances(asset_relevances),
+    }
+
+
 def evaluate_results(
     results: List[Dict[str, Any]],
     gt: GroundTruth,
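
`_score_relevances` takes the ranked vector of graded relevances (0, 1, or 2) for the top-10 results and derives every per-query metric in one place. A hand-worked sketch of the hit, reciprocal-rank, and precision arithmetic for a hypothetical ranking (NDCG is delegated to `_ndcg`, whose definition sits outside this diff):

```python
relevances = [0.0, 2.0, 0.0, 1.0, 0.0]   # hypothetical graded relevances of the top 5 results

hit_at_1 = any(r > 0 for r in relevances[:1])         # False
hit_at_5 = any(r > 0 for r in relevances[:5])         # True
rr = 1.0 / 2                                          # first relevant result at rank 2 -> 0.5
max_rel = 2.0
precision_at_5 = sum(relevances[:5]) / (5 * max_rel)  # 3.0 / 10.0 = 0.3
```

Because precision is normalized by max_rel, a top-5 made entirely of relevance-2 results scores exactly 1.0, which is what the existing tests assert.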
@@ -1471,42 +1618,16 @@ def evaluate_results(
 
     Returns: (hit@1, hit@5, hit@10, ndcg@10, reciprocal_rank, precision@5, precision@10)
     """
-    # Normalize GT paths to absolute
-    gt_paths_abs = set()
-    for p in gt.relevant_paths:
-        abs_path = str((corpus_dir / p).resolve())
-        gt_paths_abs.add(abs_path)
-        gt_paths_abs.add(Path(p).stem)
-
-    def get_relevance_score(result: Dict) -> int:
-        fp = result.get("filepath", "")
-        # Check absolute path match
-        for gp in gt.relevant_paths:
-            if gp in fp or Path(gp).stem in fp:
-                return gt.get_relevance_score(gp)
-        return 0
-
-    # Build relevance vector with graded scores
-    relevances = []
-    first_hit_rank = None
-    for i, r in enumerate(results[:10]):
-        rel = get_relevance_score(r)
-        relevances.append(float(rel))
-        if rel > 0 and first_hit_rank is None:
-            first_hit_rank = i + 1
-
-    hit_1 = any(r > 0 for r in relevances[:1])
-    hit_5 = any(r > 0 for r in relevances[:5])
-    hit_10 = any(r > 0 for r in relevances[:10])
-    ndcg = _ndcg(relevances, 10)
-    rr = 1.0 / first_hit_rank if first_hit_rank else 0.0
-
-    # Precision@K with graded relevance (normalize by max relevance)
-    max_rel = 2.0  # Maximum relevance score
-    prec_5 = sum(relevances[:5]) / (5 * max_rel) if len(relevances) >= 5 else sum(relevances) / (len(relevances) * max_rel) if relevances else 0.0
-    prec_10 = sum(relevances[:10]) / (10 * max_rel) if len(relevances) >= 10 else sum(relevances) / (len(relevances) * max_rel) if relevances else 0.0
-
-    return hit_1, hit_5, hit_10, ndcg, rr, prec_5, prec_10
+    memory_metrics = evaluate_results_detailed(results, gt, corpus_dir)["memory"]
+    return (
+        memory_metrics.hit_at_1,
+        memory_metrics.hit_at_5,
+        memory_metrics.hit_at_10,
+        memory_metrics.ndcg,
+        memory_metrics.rr,
+        memory_metrics.precision_at_5,
+        memory_metrics.precision_at_10,
+    )
 
 
 # ---------------------------------------------------------------------------
@@ -1736,6 +1857,15 @@ def _build_output_payload(
             "mrr": None if sr.skipped else round(sr.mrr, 4),
             "p50_ms": None if sr.skipped else round(sr.p50_ms, 1),
             "p95_ms": None if sr.skipped else round(sr.p95_ms, 1),
+            "asset_level": {
+                "recall_at_1": None if sr.skipped else round(sr.asset_recall_at_1, 4),
+                "recall_at_5": None if sr.skipped else round(sr.asset_recall_at_5, 4),
+                "recall_at_10": None if sr.skipped else round(sr.asset_recall_at_10, 4),
+                "precision_at_5": None if sr.skipped else round(sr.asset_precision_at_5, 4),
+                "precision_at_10": None if sr.skipped else round(sr.asset_precision_at_10, 4),
+                "ndcg_at_10": None if sr.skipped else round(sr.asset_ndcg_at_10, 4),
+                "mrr": None if sr.skipped else round(sr.asset_mrr, 4),
+            },
             "total_queries": sr.total_queries,
             "by_difficulty": {
                 "easy": {
@@ -2057,27 +2187,36 @@ def save_checkpoint(
                 backend, storage, gt,
                 collection, effective_mode,
             )
-            h1, h5, h10, ndcg, rr, prec_5, prec_10 = evaluate_results(results, gt, CORPUS_DIR)
-
-            sr.hits_at_1 += int(h1)
-            sr.hits_at_5 += int(h5)
-            sr.hits_at_10 += int(h10)
-            sr.ndcg_sum += ndcg
-            sr.mrr_sum += rr
-            sr.precision_at_5_sum += prec_5
-            sr.precision_at_10_sum += prec_10
+            eval_detail = evaluate_results_detailed(results, gt, CORPUS_DIR)
+            memory_metrics = eval_detail["memory"]
+            asset_metrics = eval_detail["asset"]
+
+            sr.hits_at_1 += int(memory_metrics.hit_at_1)
+            sr.hits_at_5 += int(memory_metrics.hit_at_5)
+            sr.hits_at_10 += int(memory_metrics.hit_at_10)
+            sr.ndcg_sum += memory_metrics.ndcg
+            sr.mrr_sum += memory_metrics.rr
+            sr.precision_at_5_sum += memory_metrics.precision_at_5
+            sr.precision_at_10_sum += memory_metrics.precision_at_10
+            sr.asset_hits_at_1 += int(asset_metrics.hit_at_1)
+            sr.asset_hits_at_5 += int(asset_metrics.hit_at_5)
+            sr.asset_hits_at_10 += int(asset_metrics.hit_at_10)
+            sr.asset_ndcg_sum += asset_metrics.ndcg
+            sr.asset_mrr_sum += asset_metrics.rr
+            sr.asset_precision_at_5_sum += asset_metrics.precision_at_5
+            sr.asset_precision_at_10_sum += asset_metrics.precision_at_10
             sr.latencies_ms.append(latency)
 
             # Track per-difficulty hits
             if gt.difficulty == "easy":
-                sr.easy_hits_at_1 += int(h1)
-                sr.easy_hits_at_5 += int(h5)
+                sr.easy_hits_at_1 += int(memory_metrics.hit_at_1)
+                sr.easy_hits_at_5 += int(memory_metrics.hit_at_5)
             elif gt.difficulty == "medium":
-                sr.medium_hits_at_1 += int(h1)
-                sr.medium_hits_at_5 += int(h5)
+                sr.medium_hits_at_1 += int(memory_metrics.hit_at_1)
+                sr.medium_hits_at_5 += int(memory_metrics.hit_at_5)
             elif gt.difficulty == "hard":
-                sr.hard_hits_at_1 += int(h1)
-                sr.hard_hits_at_5 += int(h5)
+                sr.hard_hits_at_1 += int(memory_metrics.hit_at_1)
+                sr.hard_hits_at_5 += int(memory_metrics.hit_at_5)
 
             # Store per-query result with audit trail for post-hoc analysis
             sr.per_query_results.append({
@@ -2087,13 +2226,22 @@ def save_checkpoint(
                 "relevant_paths": gt.relevant_paths,
                 "difficulty": gt.difficulty,
                 "is_negative_control": gt.is_negative_control,
-                "hit_at_1": h1,
-                "hit_at_5": h5,
-                "hit_at_10": h10,
-                "ndcg": ndcg,
-                "mrr": rr,
-                "precision_at_5": prec_5,
-                "precision_at_10": prec_10,
+                "hit_at_1": memory_metrics.hit_at_1,
+                "hit_at_5": memory_metrics.hit_at_5,
+                "hit_at_10": memory_metrics.hit_at_10,
+                "ndcg": memory_metrics.ndcg,
+                "mrr": memory_metrics.rr,
+                "precision_at_5": memory_metrics.precision_at_5,
+                "precision_at_10": memory_metrics.precision_at_10,
+                "asset_level": {
+                    "hit_at_1": asset_metrics.hit_at_1,
+                    "hit_at_5": asset_metrics.hit_at_5,
+                    "hit_at_10": asset_metrics.hit_at_10,
+                    "ndcg": asset_metrics.ndcg,
+                    "mrr": asset_metrics.rr,
+                    "precision_at_5": asset_metrics.precision_at_5,
+                    "precision_at_10": asset_metrics.precision_at_10,
+                },
                 "latency_ms": latency,
                 "results": results,
             })
@@ -2113,7 +2261,7 @@ def save_checkpoint(
         all_results[stage_name][cat_name] = sr
         print(f" {stage_name} for {cat_name} ({len(queries)}q)... "
               f"R@1={sr.recall_at_1:.1%} R@5={sr.recall_at_5:.1%} "
-              f"R@10={sr.recall_at_10:.1%} P@5={sr.precision_at_5:.3f} "
+              f"R@10={sr.recall_at_10:.1%} AssetR@1={sr.asset_recall_at_1:.1%} P@5={sr.precision_at_5:.3f} "
               f"NDCG@10={sr.ndcg_at_10:.3f} MRR={sr.mrr:.3f}")
         save_checkpoint(run_status="partial")

tests/test_cross_modal_benchmark_defs.py

Lines changed: 35 additions & 0 deletions
@@ -170,6 +170,10 @@ def test_video_frame_assets_count_as_hits_for_parent_video_ground_truth(self):
         self.assertEqual(prec_5, 1.0)
         self.assertEqual(prec_10, 1.0)
 
+        detailed = module.evaluate_results_detailed([result], gt, module.CORPUS_DIR)
+        self.assertTrue(detailed["memory"].hit_at_1)
+        self.assertFalse(detailed["asset"].hit_at_1)
+
     def test_video_transcript_assets_count_as_hits_for_parent_video_ground_truth(self):
         module = _load_cross_modal_ablation()

@@ -196,6 +200,10 @@ def test_video_transcript_assets_count_as_hits_for_parent_video_ground_truth(self):
         self.assertEqual(prec_5, 1.0)
         self.assertEqual(prec_10, 1.0)
 
+        detailed = module.evaluate_results_detailed([result], gt, module.CORPUS_DIR)
+        self.assertTrue(detailed["memory"].hit_at_1)
+        self.assertFalse(detailed["asset"].hit_at_1)
+
     def test_output_payload_tracks_partial_progress(self):
         module = _load_cross_modal_ablation()

@@ -215,6 +223,26 @@ def test_output_payload_tracks_partial_progress(self):
             precision_at_5_sum=1.0,
             precision_at_10_sum=1.0,
         )
+        stage_result.asset_hits_at_1 = 0
+        stage_result.asset_hits_at_5 = 0
+        stage_result.asset_hits_at_10 = 0
+        stage_result.asset_ndcg_sum = 0.0
+        stage_result.asset_mrr_sum = 0.0
+        stage_result.asset_precision_at_5_sum = 0.0
+        stage_result.asset_precision_at_10_sum = 0.0
+        stage_result.per_query_results.append(
+            {
+                "query": module.TEXT_TO_TEXT[0].query,
+                "hit_at_1": True,
+                "hit_at_5": True,
+                "hit_at_10": True,
+                "asset_level": {
+                    "hit_at_1": False,
+                    "hit_at_5": False,
+                    "hit_at_10": False,
+                },
+            }
+        )
 
         payload = module._build_output_payload(
             categories,
@@ -241,6 +269,13 @@ def test_output_payload_tracks_partial_progress(self):
             payload["stages"]["Vector-only"]["text_to_text"]["recall_at_1"],
             1.0,
         )
+        self.assertEqual(
+            payload["stages"]["Vector-only"]["text_to_text"]["asset_level"]["recall_at_1"],
+            0.0,
+        )
+        self.assertFalse(
+            payload["stages"]["Vector-only"]["text_to_text"]["per_query_results"][0]["asset_level"]["hit_at_1"]
+        )
 
 
 if __name__ == "__main__":
