@@ -5,7 +5,9 @@ use std::path::{Path, PathBuf};
55
66use crate :: config;
77
8- use super :: super :: metrics:: build_lifecycle_accuracy;
8+ use super :: super :: metrics:: {
9+ build_report_usefulness_signals, compute_usefulness_score, EvalUsefulnessSignals ,
10+ } ;
911use super :: super :: report:: evaluation_failure_message;
1012use super :: super :: { EvalReport , EvalRunOptions } ;
1113use super :: build_eval_run_metadata;
@@ -86,6 +88,33 @@ struct EvalReviewerLeaderboardEntry {
8688 provisional : bool ,
8789}
8890
91+ #[ derive( Debug , Serialize ) ]
92+ struct EvalIndependentAuditorStory {
93+ benchmark_label : String ,
94+ winning_reviewer : String ,
95+ winning_model : String ,
96+ winning_provider : Option < String > ,
97+ winning_review_mode : String ,
98+ winning_usefulness_score : f32 ,
99+ winning_weighted_score : Option < f32 > ,
100+ winning_micro_f1 : Option < f32 > ,
101+ winning_pass_rate : f32 ,
102+ winning_verification_health : Option < f32 > ,
103+ winning_lifecycle_accuracy : Option < f32 > ,
104+ winning_provisional : bool ,
105+ review_mode_comparison : Option < EvalIndependentAuditorStoryComparison > ,
106+ }
107+
108+ #[ derive( Debug , Serialize ) ]
109+ struct EvalIndependentAuditorStoryComparison {
110+ baseline_review_mode : String ,
111+ compare_review_mode : String ,
112+ micro_f1_delta : Option < f32 > ,
113+ weighted_score_delta : Option < f32 > ,
114+ pass_rate_delta : f32 ,
115+ usefulness_score_delta : f32 ,
116+ }
117+
89118#[ derive( Debug , Serialize ) ]
90119struct EvalBatchModelSummary {
91120 model : String ,
@@ -106,6 +135,7 @@ struct EvalBatchReport {
106135 models : Vec < String > ,
107136 review_modes : Vec < String > ,
108137 leaderboard : Vec < EvalReviewerLeaderboardEntry > ,
138+ independent_auditor_story : Option < EvalIndependentAuditorStory > ,
109139 by_model : Vec < EvalBatchModelSummary > ,
110140 runs : Vec < EvalReport > ,
111141}
@@ -138,6 +168,8 @@ pub(super) async fn run_eval_batch(
138168 run_options. compare_agent_loop = false ;
139169 run_options. matrix_models . clear ( ) ;
140170 run_options. repeat = 1 ;
171+ run_options. comparison_group =
172+ Some ( options. label . clone ( ) . unwrap_or_else ( || "eval" . to_string ( ) ) ) ;
141173 run_options. label = Some ( build_run_label (
142174 options. label . as_deref ( ) ,
143175 model,
@@ -343,6 +375,7 @@ fn build_batch_report(
343375 review_modes : Vec < String > ,
344376 runs : Vec < EvalReport > ,
345377) -> EvalBatchReport {
378+ let leaderboard = build_reviewer_leaderboard ( & runs) ;
346379 let by_model = models
347380 . iter ( )
348381 . map ( |model| {
@@ -404,7 +437,8 @@ fn build_batch_report(
404437 repeat : repeat_total,
405438 models,
406439 review_modes,
407- leaderboard : build_reviewer_leaderboard ( & runs) ,
440+ independent_auditor_story : build_independent_auditor_story ( options, & leaderboard) ,
441+ leaderboard,
408442 by_model,
409443 runs,
410444 }
@@ -426,27 +460,7 @@ fn build_reviewer_leaderboard(runs: &[EvalReport]) -> Vec<EvalReviewerLeaderboar
426460 let mut leaderboard = grouped
427461 . into_iter ( )
428462 . map ( |( ( model, provider, review_mode) , reports) | {
429- let average_micro_f1 = average_benchmark_metric ( & reports, |report| {
430- report
431- . benchmark_summary
432- . as_ref ( )
433- . map ( |metrics| metrics. micro_f1 )
434- } ) ;
435- let average_weighted_score = average_benchmark_metric ( & reports, |report| {
436- report
437- . benchmark_summary
438- . as_ref ( )
439- . map ( |metrics| metrics. weighted_score )
440- } ) ;
441- let average_verification_health = average_report_metric ( & reports, |report| {
442- report
443- . verification_health
444- . as_ref ( )
445- . map ( |health| health. verified_pct )
446- } ) ;
447- let average_lifecycle_accuracy = average_report_metric ( & reports, |report| {
448- build_lifecycle_accuracy ( & report. results ) . map ( |accuracy| accuracy. rate )
449- } ) ;
463+ let average_signals = average_usefulness_signals ( & reports) ;
450464 let passing_runs = reports
451465 . iter ( )
452466 . filter ( |report| evaluation_failure_message ( report) . is_none ( ) )
@@ -462,17 +476,14 @@ fn build_reviewer_leaderboard(runs: &[EvalReport]) -> Vec<EvalReviewerLeaderboar
462476 runs : reports. len ( ) ,
463477 passing_runs,
464478 pass_rate,
465- average_micro_f1,
466- average_weighted_score,
467- average_verification_health,
468- average_lifecycle_accuracy,
469- usefulness_score : usefulness_score (
470- average_micro_f1,
471- average_weighted_score,
479+ average_micro_f1 : average_signals. micro_f1 ,
480+ average_weighted_score : average_signals. weighted_score ,
481+ average_verification_health : average_signals. verification_health ,
482+ average_lifecycle_accuracy : average_signals. lifecycle_accuracy ,
483+ usefulness_score : compute_usefulness_score ( EvalUsefulnessSignals {
472484 pass_rate,
473- average_verification_health,
474- average_lifecycle_accuracy,
475- ) ,
485+ ..average_signals
486+ } ) ,
476487 provisional : reports. len ( ) < 2 ,
477488 }
478489 } )
@@ -506,58 +517,86 @@ fn reviewer_identity(model: &str, provider: Option<&str>, review_mode: &str) ->
506517 }
507518}
508519
509- fn usefulness_score (
510- average_micro_f1 : Option < f32 > ,
511- average_weighted_score : Option < f32 > ,
512- pass_rate : f32 ,
513- average_verification_health : Option < f32 > ,
514- average_lifecycle_accuracy : Option < f32 > ,
515- ) -> f32 {
516- const MICRO_F1_WEIGHT : f32 = 0.20 ;
517- const WEIGHTED_SCORE_WEIGHT : f32 = 0.35 ;
518- const PASS_RATE_WEIGHT : f32 = 0.25 ;
519- const VERIFICATION_HEALTH_WEIGHT : f32 = 0.10 ;
520- const LIFECYCLE_ACCURACY_WEIGHT : f32 = 0.10 ;
521-
522- let mut weighted_sum = pass_rate * PASS_RATE_WEIGHT ;
523- let mut total_weight = PASS_RATE_WEIGHT ;
524-
525- for ( value, weight) in [
526- ( average_micro_f1, MICRO_F1_WEIGHT ) ,
527- ( average_weighted_score, WEIGHTED_SCORE_WEIGHT ) ,
528- ( average_verification_health, VERIFICATION_HEALTH_WEIGHT ) ,
529- ( average_lifecycle_accuracy, LIFECYCLE_ACCURACY_WEIGHT ) ,
530- ] {
531- if let Some ( value) = value {
532- weighted_sum += value * weight;
533- total_weight += weight;
534- }
535- }
520+ fn build_independent_auditor_story (
521+ options : & EvalRunOptions ,
522+ leaderboard : & [ EvalReviewerLeaderboardEntry ] ,
523+ ) -> Option < EvalIndependentAuditorStory > {
524+ let winner = leaderboard. first ( ) ?;
525+ let single_pass = leaderboard. iter ( ) . find ( |entry| {
526+ entry. model == winner. model
527+ && entry. provider == winner. provider
528+ && entry. review_mode == review_mode_label ( false )
529+ } ) ;
530+ let agent_loop = leaderboard. iter ( ) . find ( |entry| {
531+ entry. model == winner. model
532+ && entry. provider == winner. provider
533+ && entry. review_mode == review_mode_label ( true )
534+ } ) ;
536535
537- if total_weight == 0.0 {
538- 0.0
539- } else {
540- weighted_sum / total_weight
541- }
536+ Some ( EvalIndependentAuditorStory {
537+ benchmark_label : options. label . clone ( ) . unwrap_or_else ( || "eval" . to_string ( ) ) ,
538+ winning_reviewer : winner. reviewer . clone ( ) ,
539+ winning_model : winner. model . clone ( ) ,
540+ winning_provider : winner. provider . clone ( ) ,
541+ winning_review_mode : winner. review_mode . clone ( ) ,
542+ winning_usefulness_score : winner. usefulness_score ,
543+ winning_weighted_score : winner. average_weighted_score ,
544+ winning_micro_f1 : winner. average_micro_f1 ,
545+ winning_pass_rate : winner. pass_rate ,
546+ winning_verification_health : winner. average_verification_health ,
547+ winning_lifecycle_accuracy : winner. average_lifecycle_accuracy ,
548+ winning_provisional : winner. provisional ,
549+ review_mode_comparison : single_pass
550+ . zip ( agent_loop)
551+ . map (
552+ |( single_pass, agent_loop) | EvalIndependentAuditorStoryComparison {
553+ baseline_review_mode : single_pass. review_mode . clone ( ) ,
554+ compare_review_mode : agent_loop. review_mode . clone ( ) ,
555+ micro_f1_delta : delta (
556+ agent_loop. average_micro_f1 ,
557+ single_pass. average_micro_f1 ,
558+ ) ,
559+ weighted_score_delta : delta (
560+ agent_loop. average_weighted_score ,
561+ single_pass. average_weighted_score ,
562+ ) ,
563+ pass_rate_delta : agent_loop. pass_rate - single_pass. pass_rate ,
564+ usefulness_score_delta : agent_loop. usefulness_score
565+ - single_pass. usefulness_score ,
566+ } ,
567+ ) ,
568+ } )
542569}
543570
544- fn average_benchmark_metric < F > ( reports : & [ & EvalReport ] , metric : F ) -> Option < f32 >
545- where
546- F : Fn ( & EvalReport ) -> Option < f32 > ,
547- {
548- average_report_metric ( reports, metric)
549- }
571+ fn average_usefulness_signals ( reports : & [ & EvalReport ] ) -> EvalUsefulnessSignals {
572+ let mut micro_f1 = Vec :: new ( ) ;
573+ let mut weighted_score = Vec :: new ( ) ;
574+ let mut verification_health = Vec :: new ( ) ;
575+ let mut lifecycle_accuracy = Vec :: new ( ) ;
550576
551- fn average_report_metric < F > ( reports : & [ & EvalReport ] , metric : F ) -> Option < f32 >
552- where
553- F : Fn ( & EvalReport ) -> Option < f32 > ,
554- {
555- average (
556- & reports
557- . iter ( )
558- . filter_map ( |report| metric ( report) )
559- . collect :: < Vec < _ > > ( ) ,
560- )
577+ for report in reports {
578+ let signals = build_report_usefulness_signals ( report) ;
579+ if let Some ( value) = signals. micro_f1 {
580+ micro_f1. push ( value) ;
581+ }
582+ if let Some ( value) = signals. weighted_score {
583+ weighted_score. push ( value) ;
584+ }
585+ if let Some ( value) = signals. verification_health {
586+ verification_health. push ( value) ;
587+ }
588+ if let Some ( value) = signals. lifecycle_accuracy {
589+ lifecycle_accuracy. push ( value) ;
590+ }
591+ }
592+
593+ EvalUsefulnessSignals {
594+ micro_f1 : average ( & micro_f1) ,
595+ weighted_score : average ( & weighted_score) ,
596+ verification_health : average ( & verification_health) ,
597+ lifecycle_accuracy : average ( & lifecycle_accuracy) ,
598+ ..Default :: default ( )
599+ }
561600}
562601
563602fn build_review_mode_summary (
@@ -761,6 +800,36 @@ fn print_eval_batch_report(report: &EvalBatchReport) {
761800 ) ;
762801 }
763802 }
803+
804+ if let Some ( story) = report. independent_auditor_story . as_ref ( ) {
805+ println ! ( "Independent auditor benchmark ({}):" , story. benchmark_label) ;
806+ println ! (
807+ " winner: {} | usefulness={} weighted={} micro F1={} pass={} verification={} lifecycle={}{}" ,
808+ story. winning_reviewer,
809+ percentage( story. winning_usefulness_score) ,
810+ percentage_or_na( story. winning_weighted_score) ,
811+ percentage_or_na( story. winning_micro_f1) ,
812+ percentage( story. winning_pass_rate) ,
813+ percentage_or_na( story. winning_verification_health) ,
814+ percentage_or_na( story. winning_lifecycle_accuracy) ,
815+ if story. winning_provisional {
816+ " provisional"
817+ } else {
818+ ""
819+ }
820+ ) ;
821+ if let Some ( comparison) = story. review_mode_comparison . as_ref ( ) {
822+ println ! (
823+ " {} vs {}: usefulness {} weighted {} micro F1 {} pass rate {:+.0}%" ,
824+ comparison. compare_review_mode,
825+ comparison. baseline_review_mode,
826+ format_args!( "{:+.0}%" , comparison. usefulness_score_delta * 100.0 ) ,
827+ signed_percentage_or_na( comparison. weighted_score_delta) ,
828+ signed_percentage_or_na( comparison. micro_f1_delta) ,
829+ comparison. pass_rate_delta * 100.0
830+ ) ;
831+ }
832+ }
764833}
765834
766835fn percentage_or_na ( value : Option < f32 > ) -> String {
@@ -823,6 +892,7 @@ mod tests {
823892 fixture_name_filters : vec ! [ ] ,
824893 max_fixtures : None ,
825894 label : Some ( "smoke" . to_string ( ) ) ,
895+ comparison_group : None ,
826896 trend_file : None ,
827897 artifact_dir : None ,
828898 allow_subfrontier_models : false ,
@@ -1050,6 +1120,21 @@ mod tests {
10501120 assert ! ( report. leaderboard[ 0 ] . usefulness_score > report. leaderboard[ 1 ] . usefulness_score) ;
10511121 assert_eq ! ( report. leaderboard[ 0 ] . average_verification_health, Some ( 0.9 ) ) ;
10521122 assert_eq ! ( report. leaderboard[ 0 ] . average_lifecycle_accuracy, Some ( 1.0 ) ) ;
1123+ assert_eq ! (
1124+ report
1125+ . independent_auditor_story
1126+ . as_ref( )
1127+ . map( |story| story. winning_review_mode. as_str( ) ) ,
1128+ Some ( review_mode_label( true ) )
1129+ ) ;
1130+ assert_eq ! (
1131+ report
1132+ . independent_auditor_story
1133+ . as_ref( )
1134+ . and_then( |story| story. review_mode_comparison. as_ref( ) )
1135+ . map( |comparison| comparison. usefulness_score_delta > 0.0 ) ,
1136+ Some ( true )
1137+ ) ;
10531138 assert ! (
10541139 ( report. by_model[ 0 ] . review_mode_comparisons[ 0 ]
10551140 . micro_f1_delta
0 commit comments