2727import json
2828import math
2929import os
30+ import re
3031import shutil
3132import sys
3233import tempfile
@@ -1365,6 +1366,13 @@ class StageResult:
    # Memory-level (parent-asset) metric accumulators, summed per query.
    mrr_sum: float = 0.0
    precision_at_5_sum: float = 0.0
    precision_at_10_sum: float = 0.0
    # Asset-level counterparts: relevance judged against the raw normalized
    # asset path instead of the canonical parent memory path.
    asset_hits_at_1: int = 0
    asset_hits_at_5: int = 0
    asset_hits_at_10: int = 0
    asset_ndcg_sum: float = 0.0
    asset_mrr_sum: float = 0.0
    asset_precision_at_5_sum: float = 0.0
    asset_precision_at_10_sum: float = 0.0
    # Raw per-query data kept for latency percentiles and post-hoc audit.
    latencies_ms: list = field(default_factory=list)
    per_query_results: list = field(default_factory=list)
13701378
@@ -1407,6 +1415,34 @@ def precision_at_5(self) -> float:
14071415 def precision_at_10 (self ) -> float :
14081416 return self .precision_at_10_sum / max (self .total_queries , 1 )
14091417
1418+ @property
1419+ def asset_recall_at_1 (self ) -> float :
1420+ return self .asset_hits_at_1 / max (self .total_queries , 1 )
1421+
1422+ @property
1423+ def asset_recall_at_5 (self ) -> float :
1424+ return self .asset_hits_at_5 / max (self .total_queries , 1 )
1425+
1426+ @property
1427+ def asset_recall_at_10 (self ) -> float :
1428+ return self .asset_hits_at_10 / max (self .total_queries , 1 )
1429+
1430+ @property
1431+ def asset_ndcg_at_10 (self ) -> float :
1432+ return self .asset_ndcg_sum / max (self .total_queries , 1 )
1433+
1434+ @property
1435+ def asset_mrr (self ) -> float :
1436+ return self .asset_mrr_sum / max (self .total_queries , 1 )
1437+
1438+ @property
1439+ def asset_precision_at_5 (self ) -> float :
1440+ return self .asset_precision_at_5_sum / max (self .total_queries , 1 )
1441+
1442+ @property
1443+ def asset_precision_at_10 (self ) -> float :
1444+ return self .asset_precision_at_10_sum / max (self .total_queries , 1 )
1445+
14101446 @property
14111447 def p50_ms (self ) -> float :
14121448 if not self .latencies_ms :
@@ -1462,6 +1498,117 @@ def _ndcg(relevances: List[float], k: int = 10) -> float:
14621498 return dcg / ideal if ideal > 0 else 0.0
14631499
14641500
@dataclass
class EvaluationMetrics:
    """Per-query retrieval metrics computed from one ranked relevance vector."""

    hit_at_1: bool           # any relevant item at rank 1
    hit_at_5: bool           # any relevant item in the top 5
    hit_at_10: bool          # any relevant item in the top 10
    ndcg: float              # NDCG@10 over graded relevances
    rr: float                # reciprocal rank of first relevant item; 0.0 if none
    precision_at_5: float    # graded precision@5, normalized by the max grade
    precision_at_10: float   # graded precision@10, normalized by the max grade
1510+
1511+
1512+ def _normalize_benchmark_path (path : str , corpus_dir : Path ) -> str :
1513+ """Normalize benchmark filepaths to corpus-relative paths when possible."""
1514+ raw = str (path or "" ).strip ()
1515+ if not raw :
1516+ return ""
1517+
1518+ if raw .startswith ("recallforge://" ):
1519+ without_scheme = raw [len ("recallforge://" ):]
1520+ _ , _ , raw = without_scheme .partition ("/" )
1521+ raw = raw or without_scheme
1522+
1523+ raw = re .sub (r"\s+" , " " , raw ).strip ()
1524+ candidate = Path (raw )
1525+ if candidate .is_absolute ():
1526+ try :
1527+ return candidate .resolve ().relative_to (corpus_dir .resolve ()).as_posix ()
1528+ except Exception :
1529+ return candidate .resolve ().as_posix ()
1530+ return raw .lstrip ("./" )
1531+
1532+
def _memory_key_for_path(path: str, corpus_dir: Path) -> str:
    """Map a result or ground-truth path to its canonical parent memory path."""
    normalized = _normalize_benchmark_path(path, corpus_dir)
    if not normalized:
        return ""
    # A transcript sidecar maps back to the video it was extracted from.
    transcript_suffix = ".transcript.json"
    if normalized.endswith(transcript_suffix):
        return normalized[: -len(transcript_suffix)] + ".mp4"
    # Otherwise drop any "::fragment" qualifier to reach the parent asset.
    parent, _, _ = normalized.partition("::")
    return parent
1541+
1542+
def _score_relevances(relevances: List[float]) -> EvaluationMetrics:
    """Compute benchmark metrics for a ranked relevance vector."""
    # Rank (1-based) of the first relevant item within the top 10, if any.
    first_hit_rank = None
    for rank, rel in enumerate(relevances[:10], start=1):
        if rel > 0:
            first_hit_rank = rank
            break

    def graded_precision(k: int) -> float:
        # Precision@K with graded relevance, normalized by the max grade (2.0).
        max_rel = 2.0
        if not relevances:
            return 0.0
        if len(relevances) >= k:
            return sum(relevances[:k]) / (k * max_rel)
        return sum(relevances) / (len(relevances) * max_rel)

    return EvaluationMetrics(
        hit_at_1=first_hit_rank == 1,
        hit_at_5=first_hit_rank is not None and first_hit_rank <= 5,
        hit_at_10=first_hit_rank is not None,
        ndcg=_ndcg(relevances, 10),
        rr=1.0 / first_hit_rank if first_hit_rank else 0.0,
        precision_at_5=graded_precision(5),
        precision_at_10=graded_precision(10),
    )
1572+
1573+
def evaluate_results_detailed(
    results: List[Dict[str, Any]],
    gt: GroundTruth,
    corpus_dir: Path,
) -> Dict[str, EvaluationMetrics]:
    """Evaluate results at both parent-memory and raw asset granularity.

    Returns a dict with two EvaluationMetrics entries:
      * "memory": relevance judged against canonical parent memory paths.
      * "asset":  relevance judged against raw normalized asset paths.
    """
    relevant_asset_scores: Dict[str, int] = {}
    relevant_memory_scores: Dict[str, int] = {}

    for path in gt.relevant_paths:
        # Look the graded score up once per ground-truth path (the original
        # called gt.get_relevance_score twice for the same path).
        score = gt.get_relevance_score(path)
        normalized_path = _normalize_benchmark_path(path, corpus_dir)
        if normalized_path:
            # Keep the max grade when several GT entries collapse to one key.
            relevant_asset_scores[normalized_path] = max(
                relevant_asset_scores.get(normalized_path, 0), score
            )
        memory_key = _memory_key_for_path(path, corpus_dir)
        if memory_key:
            relevant_memory_scores[memory_key] = max(
                relevant_memory_scores.get(memory_key, 0), score
            )

    # Build graded relevance vectors for the top-10 results at each granularity.
    memory_relevances: List[float] = []
    asset_relevances: List[float] = []
    for result in results[:10]:
        filepath = result.get("filepath", "")
        normalized_result = _normalize_benchmark_path(filepath, corpus_dir)
        asset_relevances.append(float(relevant_asset_scores.get(normalized_result, 0)))
        memory_key = _memory_key_for_path(filepath, corpus_dir)
        memory_relevances.append(float(relevant_memory_scores.get(memory_key, 0)))

    return {
        "memory": _score_relevances(memory_relevances),
        "asset": _score_relevances(asset_relevances),
    }
1610+
1611+
14651612def evaluate_results (
14661613 results : List [Dict [str , Any ]],
14671614 gt : GroundTruth ,
@@ -1471,42 +1618,16 @@ def evaluate_results(
14711618
14721619 Returns: (hit@1, hit@5, hit@10, ndcg@10, reciprocal_rank, precision@5, precision@10)
14731620 """
1474- # Normalize GT paths to absolute
1475- gt_paths_abs = set ()
1476- for p in gt .relevant_paths :
1477- abs_path = str ((corpus_dir / p ).resolve ())
1478- gt_paths_abs .add (abs_path )
1479- gt_paths_abs .add (Path (p ).stem )
1480-
1481- def get_relevance_score (result : Dict ) -> int :
1482- fp = result .get ("filepath" , "" )
1483- # Check absolute path match
1484- for gp in gt .relevant_paths :
1485- if gp in fp or Path (gp ).stem in fp :
1486- return gt .get_relevance_score (gp )
1487- return 0
1488-
1489- # Build relevance vector with graded scores
1490- relevances = []
1491- first_hit_rank = None
1492- for i , r in enumerate (results [:10 ]):
1493- rel = get_relevance_score (r )
1494- relevances .append (float (rel ))
1495- if rel > 0 and first_hit_rank is None :
1496- first_hit_rank = i + 1
1497-
1498- hit_1 = any (r > 0 for r in relevances [:1 ])
1499- hit_5 = any (r > 0 for r in relevances [:5 ])
1500- hit_10 = any (r > 0 for r in relevances [:10 ])
1501- ndcg = _ndcg (relevances , 10 )
1502- rr = 1.0 / first_hit_rank if first_hit_rank else 0.0
1503-
1504- # Precision@K with graded relevance (normalize by max relevance)
1505- max_rel = 2.0 # Maximum relevance score
1506- prec_5 = sum (relevances [:5 ]) / (5 * max_rel ) if len (relevances ) >= 5 else sum (relevances ) / (len (relevances ) * max_rel ) if relevances else 0.0
1507- prec_10 = sum (relevances [:10 ]) / (10 * max_rel ) if len (relevances ) >= 10 else sum (relevances ) / (len (relevances ) * max_rel ) if relevances else 0.0
1508-
1509- return hit_1 , hit_5 , hit_10 , ndcg , rr , prec_5 , prec_10
1621+ memory_metrics = evaluate_results_detailed (results , gt , corpus_dir )["memory" ]
1622+ return (
1623+ memory_metrics .hit_at_1 ,
1624+ memory_metrics .hit_at_5 ,
1625+ memory_metrics .hit_at_10 ,
1626+ memory_metrics .ndcg ,
1627+ memory_metrics .rr ,
1628+ memory_metrics .precision_at_5 ,
1629+ memory_metrics .precision_at_10 ,
1630+ )
15101631
15111632
15121633# ---------------------------------------------------------------------------
@@ -1736,6 +1857,15 @@ def _build_output_payload(
17361857 "mrr" : None if sr .skipped else round (sr .mrr , 4 ),
17371858 "p50_ms" : None if sr .skipped else round (sr .p50_ms , 1 ),
17381859 "p95_ms" : None if sr .skipped else round (sr .p95_ms , 1 ),
1860+ "asset_level" : {
1861+ "recall_at_1" : None if sr .skipped else round (sr .asset_recall_at_1 , 4 ),
1862+ "recall_at_5" : None if sr .skipped else round (sr .asset_recall_at_5 , 4 ),
1863+ "recall_at_10" : None if sr .skipped else round (sr .asset_recall_at_10 , 4 ),
1864+ "precision_at_5" : None if sr .skipped else round (sr .asset_precision_at_5 , 4 ),
1865+ "precision_at_10" : None if sr .skipped else round (sr .asset_precision_at_10 , 4 ),
1866+ "ndcg_at_10" : None if sr .skipped else round (sr .asset_ndcg_at_10 , 4 ),
1867+ "mrr" : None if sr .skipped else round (sr .asset_mrr , 4 ),
1868+ },
17391869 "total_queries" : sr .total_queries ,
17401870 "by_difficulty" : {
17411871 "easy" : {
@@ -2057,27 +2187,36 @@ def save_checkpoint(
20572187 backend , storage , gt ,
20582188 collection , effective_mode ,
20592189 )
2060- h1 , h5 , h10 , ndcg , rr , prec_5 , prec_10 = evaluate_results (results , gt , CORPUS_DIR )
2061-
2062- sr .hits_at_1 += int (h1 )
2063- sr .hits_at_5 += int (h5 )
2064- sr .hits_at_10 += int (h10 )
2065- sr .ndcg_sum += ndcg
2066- sr .mrr_sum += rr
2067- sr .precision_at_5_sum += prec_5
2068- sr .precision_at_10_sum += prec_10
2190+ eval_detail = evaluate_results_detailed (results , gt , CORPUS_DIR )
2191+ memory_metrics = eval_detail ["memory" ]
2192+ asset_metrics = eval_detail ["asset" ]
2193+
2194+ sr .hits_at_1 += int (memory_metrics .hit_at_1 )
2195+ sr .hits_at_5 += int (memory_metrics .hit_at_5 )
2196+ sr .hits_at_10 += int (memory_metrics .hit_at_10 )
2197+ sr .ndcg_sum += memory_metrics .ndcg
2198+ sr .mrr_sum += memory_metrics .rr
2199+ sr .precision_at_5_sum += memory_metrics .precision_at_5
2200+ sr .precision_at_10_sum += memory_metrics .precision_at_10
2201+ sr .asset_hits_at_1 += int (asset_metrics .hit_at_1 )
2202+ sr .asset_hits_at_5 += int (asset_metrics .hit_at_5 )
2203+ sr .asset_hits_at_10 += int (asset_metrics .hit_at_10 )
2204+ sr .asset_ndcg_sum += asset_metrics .ndcg
2205+ sr .asset_mrr_sum += asset_metrics .rr
2206+ sr .asset_precision_at_5_sum += asset_metrics .precision_at_5
2207+ sr .asset_precision_at_10_sum += asset_metrics .precision_at_10
20692208 sr .latencies_ms .append (latency )
20702209
20712210 # Track per-difficulty hits
20722211 if gt .difficulty == "easy" :
2073- sr .easy_hits_at_1 += int (h1 )
2074- sr .easy_hits_at_5 += int (h5 )
2212+ sr .easy_hits_at_1 += int (memory_metrics . hit_at_1 )
2213+ sr .easy_hits_at_5 += int (memory_metrics . hit_at_5 )
20752214 elif gt .difficulty == "medium" :
2076- sr .medium_hits_at_1 += int (h1 )
2077- sr .medium_hits_at_5 += int (h5 )
2215+ sr .medium_hits_at_1 += int (memory_metrics . hit_at_1 )
2216+ sr .medium_hits_at_5 += int (memory_metrics . hit_at_5 )
20782217 elif gt .difficulty == "hard" :
2079- sr .hard_hits_at_1 += int (h1 )
2080- sr .hard_hits_at_5 += int (h5 )
2218+ sr .hard_hits_at_1 += int (memory_metrics . hit_at_1 )
2219+ sr .hard_hits_at_5 += int (memory_metrics . hit_at_5 )
20812220
20822221 # Store per-query result with audit trail for post-hoc analysis
20832222 sr .per_query_results .append ({
@@ -2087,13 +2226,22 @@ def save_checkpoint(
20872226 "relevant_paths" : gt .relevant_paths ,
20882227 "difficulty" : gt .difficulty ,
20892228 "is_negative_control" : gt .is_negative_control ,
2090- "hit_at_1" : h1 ,
2091- "hit_at_5" : h5 ,
2092- "hit_at_10" : h10 ,
2093- "ndcg" : ndcg ,
2094- "mrr" : rr ,
2095- "precision_at_5" : prec_5 ,
2096- "precision_at_10" : prec_10 ,
2229+ "hit_at_1" : memory_metrics .hit_at_1 ,
2230+ "hit_at_5" : memory_metrics .hit_at_5 ,
2231+ "hit_at_10" : memory_metrics .hit_at_10 ,
2232+ "ndcg" : memory_metrics .ndcg ,
2233+ "mrr" : memory_metrics .rr ,
2234+ "precision_at_5" : memory_metrics .precision_at_5 ,
2235+ "precision_at_10" : memory_metrics .precision_at_10 ,
2236+ "asset_level" : {
2237+ "hit_at_1" : asset_metrics .hit_at_1 ,
2238+ "hit_at_5" : asset_metrics .hit_at_5 ,
2239+ "hit_at_10" : asset_metrics .hit_at_10 ,
2240+ "ndcg" : asset_metrics .ndcg ,
2241+ "mrr" : asset_metrics .rr ,
2242+ "precision_at_5" : asset_metrics .precision_at_5 ,
2243+ "precision_at_10" : asset_metrics .precision_at_10 ,
2244+ },
20972245 "latency_ms" : latency ,
20982246 "results" : results ,
20992247 })
@@ -2113,7 +2261,7 @@ def save_checkpoint(
21132261 all_results [stage_name ][cat_name ] = sr
21142262 print (f" { stage_name } for { cat_name } ({ len (queries )} q)... "
21152263 f"R@1={ sr .recall_at_1 :.1%} R@5={ sr .recall_at_5 :.1%} "
2116- f"R@10={ sr .recall_at_10 :.1%} P@5={ sr .precision_at_5 :.3f} "
2264+ f"R@10={ sr .recall_at_10 :.1%} AssetR@1= { sr . asset_recall_at_1 :.1% } P@5={ sr .precision_at_5 :.3f} "
21172265 f"NDCG@10={ sr .ndcg_at_10 :.3f} MRR={ sr .mrr :.3f} " )
21182266 save_checkpoint (run_status = "partial" )
21192267
0 commit comments