Skip to content

Commit b7ba04b

Browse files
committed
feat(eval): publish independent auditor benchmark story
1 parent ec57860 commit b7ba04b

File tree

14 files changed

+622
-84
lines changed

14 files changed

+622
-84
lines changed

TODO.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ This roadmap is derived from deep research into Greptile's public docs, blog, MC
148148
97. [x] Add leaderboard reporting for reviewer usefulness metrics, not just precision/recall.
149149
98. [x] Add regression gates for feedback coverage, verifier health, and lifecycle-state accuracy.
150150
99. [ ] Add model-routing policies that explicitly separate generation, verification, and auditing roles.
151-
100. [ ] Publish a repeatable "independent auditor" benchmark story in the UI and CLI so DiffScope's differentiation is measurable.
151+
100. [x] Publish a repeatable "independent auditor" benchmark story in the UI and CLI so DiffScope's differentiation is measurable.
152152

153153
## Current Execution Slice
154154

src/commands/eval/command.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ fn build_eval_run_metadata(
105105
fixtures_discovered: execution.discovered_count,
106106
fixtures_selected: execution.selected_count,
107107
label: options.label.clone(),
108+
comparison_group: options.comparison_group.clone(),
108109
model: config.model.clone(),
109110
review_mode: review_mode_label(config.agent.enabled).to_string(),
110111
adapter: resolved_adapter.or_else(|| config.adapter.clone()),

src/commands/eval/command/batch.rs

Lines changed: 166 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@ use std::path::{Path, PathBuf};
55

66
use crate::config;
77

8-
use super::super::metrics::build_lifecycle_accuracy;
8+
use super::super::metrics::{
9+
build_report_usefulness_signals, compute_usefulness_score, EvalUsefulnessSignals,
10+
};
911
use super::super::report::evaluation_failure_message;
1012
use super::super::{EvalReport, EvalRunOptions};
1113
use super::build_eval_run_metadata;
@@ -86,6 +88,33 @@ struct EvalReviewerLeaderboardEntry {
8688
provisional: bool,
8789
}
8890

91+
#[derive(Debug, Serialize)]
92+
struct EvalIndependentAuditorStory {
93+
benchmark_label: String,
94+
winning_reviewer: String,
95+
winning_model: String,
96+
winning_provider: Option<String>,
97+
winning_review_mode: String,
98+
winning_usefulness_score: f32,
99+
winning_weighted_score: Option<f32>,
100+
winning_micro_f1: Option<f32>,
101+
winning_pass_rate: f32,
102+
winning_verification_health: Option<f32>,
103+
winning_lifecycle_accuracy: Option<f32>,
104+
winning_provisional: bool,
105+
review_mode_comparison: Option<EvalIndependentAuditorStoryComparison>,
106+
}
107+
108+
#[derive(Debug, Serialize)]
109+
struct EvalIndependentAuditorStoryComparison {
110+
baseline_review_mode: String,
111+
compare_review_mode: String,
112+
micro_f1_delta: Option<f32>,
113+
weighted_score_delta: Option<f32>,
114+
pass_rate_delta: f32,
115+
usefulness_score_delta: f32,
116+
}
117+
89118
#[derive(Debug, Serialize)]
90119
struct EvalBatchModelSummary {
91120
model: String,
@@ -106,6 +135,7 @@ struct EvalBatchReport {
106135
models: Vec<String>,
107136
review_modes: Vec<String>,
108137
leaderboard: Vec<EvalReviewerLeaderboardEntry>,
138+
independent_auditor_story: Option<EvalIndependentAuditorStory>,
109139
by_model: Vec<EvalBatchModelSummary>,
110140
runs: Vec<EvalReport>,
111141
}
@@ -138,6 +168,8 @@ pub(super) async fn run_eval_batch(
138168
run_options.compare_agent_loop = false;
139169
run_options.matrix_models.clear();
140170
run_options.repeat = 1;
171+
run_options.comparison_group =
172+
Some(options.label.clone().unwrap_or_else(|| "eval".to_string()));
141173
run_options.label = Some(build_run_label(
142174
options.label.as_deref(),
143175
model,
@@ -343,6 +375,7 @@ fn build_batch_report(
343375
review_modes: Vec<String>,
344376
runs: Vec<EvalReport>,
345377
) -> EvalBatchReport {
378+
let leaderboard = build_reviewer_leaderboard(&runs);
346379
let by_model = models
347380
.iter()
348381
.map(|model| {
@@ -404,7 +437,8 @@ fn build_batch_report(
404437
repeat: repeat_total,
405438
models,
406439
review_modes,
407-
leaderboard: build_reviewer_leaderboard(&runs),
440+
independent_auditor_story: build_independent_auditor_story(options, &leaderboard),
441+
leaderboard,
408442
by_model,
409443
runs,
410444
}
@@ -426,27 +460,7 @@ fn build_reviewer_leaderboard(runs: &[EvalReport]) -> Vec<EvalReviewerLeaderboar
426460
let mut leaderboard = grouped
427461
.into_iter()
428462
.map(|((model, provider, review_mode), reports)| {
429-
let average_micro_f1 = average_benchmark_metric(&reports, |report| {
430-
report
431-
.benchmark_summary
432-
.as_ref()
433-
.map(|metrics| metrics.micro_f1)
434-
});
435-
let average_weighted_score = average_benchmark_metric(&reports, |report| {
436-
report
437-
.benchmark_summary
438-
.as_ref()
439-
.map(|metrics| metrics.weighted_score)
440-
});
441-
let average_verification_health = average_report_metric(&reports, |report| {
442-
report
443-
.verification_health
444-
.as_ref()
445-
.map(|health| health.verified_pct)
446-
});
447-
let average_lifecycle_accuracy = average_report_metric(&reports, |report| {
448-
build_lifecycle_accuracy(&report.results).map(|accuracy| accuracy.rate)
449-
});
463+
let average_signals = average_usefulness_signals(&reports);
450464
let passing_runs = reports
451465
.iter()
452466
.filter(|report| evaluation_failure_message(report).is_none())
@@ -462,17 +476,14 @@ fn build_reviewer_leaderboard(runs: &[EvalReport]) -> Vec<EvalReviewerLeaderboar
462476
runs: reports.len(),
463477
passing_runs,
464478
pass_rate,
465-
average_micro_f1,
466-
average_weighted_score,
467-
average_verification_health,
468-
average_lifecycle_accuracy,
469-
usefulness_score: usefulness_score(
470-
average_micro_f1,
471-
average_weighted_score,
479+
average_micro_f1: average_signals.micro_f1,
480+
average_weighted_score: average_signals.weighted_score,
481+
average_verification_health: average_signals.verification_health,
482+
average_lifecycle_accuracy: average_signals.lifecycle_accuracy,
483+
usefulness_score: compute_usefulness_score(EvalUsefulnessSignals {
472484
pass_rate,
473-
average_verification_health,
474-
average_lifecycle_accuracy,
475-
),
485+
..average_signals
486+
}),
476487
provisional: reports.len() < 2,
477488
}
478489
})
@@ -506,58 +517,86 @@ fn reviewer_identity(model: &str, provider: Option<&str>, review_mode: &str) ->
506517
}
507518
}
508519

509-
fn usefulness_score(
510-
average_micro_f1: Option<f32>,
511-
average_weighted_score: Option<f32>,
512-
pass_rate: f32,
513-
average_verification_health: Option<f32>,
514-
average_lifecycle_accuracy: Option<f32>,
515-
) -> f32 {
516-
const MICRO_F1_WEIGHT: f32 = 0.20;
517-
const WEIGHTED_SCORE_WEIGHT: f32 = 0.35;
518-
const PASS_RATE_WEIGHT: f32 = 0.25;
519-
const VERIFICATION_HEALTH_WEIGHT: f32 = 0.10;
520-
const LIFECYCLE_ACCURACY_WEIGHT: f32 = 0.10;
521-
522-
let mut weighted_sum = pass_rate * PASS_RATE_WEIGHT;
523-
let mut total_weight = PASS_RATE_WEIGHT;
524-
525-
for (value, weight) in [
526-
(average_micro_f1, MICRO_F1_WEIGHT),
527-
(average_weighted_score, WEIGHTED_SCORE_WEIGHT),
528-
(average_verification_health, VERIFICATION_HEALTH_WEIGHT),
529-
(average_lifecycle_accuracy, LIFECYCLE_ACCURACY_WEIGHT),
530-
] {
531-
if let Some(value) = value {
532-
weighted_sum += value * weight;
533-
total_weight += weight;
534-
}
535-
}
520+
fn build_independent_auditor_story(
521+
options: &EvalRunOptions,
522+
leaderboard: &[EvalReviewerLeaderboardEntry],
523+
) -> Option<EvalIndependentAuditorStory> {
524+
let winner = leaderboard.first()?;
525+
let single_pass = leaderboard.iter().find(|entry| {
526+
entry.model == winner.model
527+
&& entry.provider == winner.provider
528+
&& entry.review_mode == review_mode_label(false)
529+
});
530+
let agent_loop = leaderboard.iter().find(|entry| {
531+
entry.model == winner.model
532+
&& entry.provider == winner.provider
533+
&& entry.review_mode == review_mode_label(true)
534+
});
536535

537-
if total_weight == 0.0 {
538-
0.0
539-
} else {
540-
weighted_sum / total_weight
541-
}
536+
Some(EvalIndependentAuditorStory {
537+
benchmark_label: options.label.clone().unwrap_or_else(|| "eval".to_string()),
538+
winning_reviewer: winner.reviewer.clone(),
539+
winning_model: winner.model.clone(),
540+
winning_provider: winner.provider.clone(),
541+
winning_review_mode: winner.review_mode.clone(),
542+
winning_usefulness_score: winner.usefulness_score,
543+
winning_weighted_score: winner.average_weighted_score,
544+
winning_micro_f1: winner.average_micro_f1,
545+
winning_pass_rate: winner.pass_rate,
546+
winning_verification_health: winner.average_verification_health,
547+
winning_lifecycle_accuracy: winner.average_lifecycle_accuracy,
548+
winning_provisional: winner.provisional,
549+
review_mode_comparison: single_pass
550+
.zip(agent_loop)
551+
.map(
552+
|(single_pass, agent_loop)| EvalIndependentAuditorStoryComparison {
553+
baseline_review_mode: single_pass.review_mode.clone(),
554+
compare_review_mode: agent_loop.review_mode.clone(),
555+
micro_f1_delta: delta(
556+
agent_loop.average_micro_f1,
557+
single_pass.average_micro_f1,
558+
),
559+
weighted_score_delta: delta(
560+
agent_loop.average_weighted_score,
561+
single_pass.average_weighted_score,
562+
),
563+
pass_rate_delta: agent_loop.pass_rate - single_pass.pass_rate,
564+
usefulness_score_delta: agent_loop.usefulness_score
565+
- single_pass.usefulness_score,
566+
},
567+
),
568+
})
542569
}
543570

544-
fn average_benchmark_metric<F>(reports: &[&EvalReport], metric: F) -> Option<f32>
545-
where
546-
F: Fn(&EvalReport) -> Option<f32>,
547-
{
548-
average_report_metric(reports, metric)
549-
}
571+
fn average_usefulness_signals(reports: &[&EvalReport]) -> EvalUsefulnessSignals {
572+
let mut micro_f1 = Vec::new();
573+
let mut weighted_score = Vec::new();
574+
let mut verification_health = Vec::new();
575+
let mut lifecycle_accuracy = Vec::new();
550576

551-
fn average_report_metric<F>(reports: &[&EvalReport], metric: F) -> Option<f32>
552-
where
553-
F: Fn(&EvalReport) -> Option<f32>,
554-
{
555-
average(
556-
&reports
557-
.iter()
558-
.filter_map(|report| metric(report))
559-
.collect::<Vec<_>>(),
560-
)
577+
for report in reports {
578+
let signals = build_report_usefulness_signals(report);
579+
if let Some(value) = signals.micro_f1 {
580+
micro_f1.push(value);
581+
}
582+
if let Some(value) = signals.weighted_score {
583+
weighted_score.push(value);
584+
}
585+
if let Some(value) = signals.verification_health {
586+
verification_health.push(value);
587+
}
588+
if let Some(value) = signals.lifecycle_accuracy {
589+
lifecycle_accuracy.push(value);
590+
}
591+
}
592+
593+
EvalUsefulnessSignals {
594+
micro_f1: average(&micro_f1),
595+
weighted_score: average(&weighted_score),
596+
verification_health: average(&verification_health),
597+
lifecycle_accuracy: average(&lifecycle_accuracy),
598+
..Default::default()
599+
}
561600
}
562601

563602
fn build_review_mode_summary(
@@ -761,6 +800,36 @@ fn print_eval_batch_report(report: &EvalBatchReport) {
761800
);
762801
}
763802
}
803+
804+
if let Some(story) = report.independent_auditor_story.as_ref() {
805+
println!("Independent auditor benchmark ({}):", story.benchmark_label);
806+
println!(
807+
" winner: {} | usefulness={} weighted={} micro F1={} pass={} verification={} lifecycle={}{}",
808+
story.winning_reviewer,
809+
percentage(story.winning_usefulness_score),
810+
percentage_or_na(story.winning_weighted_score),
811+
percentage_or_na(story.winning_micro_f1),
812+
percentage(story.winning_pass_rate),
813+
percentage_or_na(story.winning_verification_health),
814+
percentage_or_na(story.winning_lifecycle_accuracy),
815+
if story.winning_provisional {
816+
" provisional"
817+
} else {
818+
""
819+
}
820+
);
821+
if let Some(comparison) = story.review_mode_comparison.as_ref() {
822+
println!(
823+
" {} vs {}: usefulness {} weighted {} micro F1 {} pass rate {:+.0}%",
824+
comparison.compare_review_mode,
825+
comparison.baseline_review_mode,
826+
format_args!("{:+.0}%", comparison.usefulness_score_delta * 100.0),
827+
signed_percentage_or_na(comparison.weighted_score_delta),
828+
signed_percentage_or_na(comparison.micro_f1_delta),
829+
comparison.pass_rate_delta * 100.0
830+
);
831+
}
832+
}
764833
}
765834

766835
fn percentage_or_na(value: Option<f32>) -> String {
@@ -823,6 +892,7 @@ mod tests {
823892
fixture_name_filters: vec![],
824893
max_fixtures: None,
825894
label: Some("smoke".to_string()),
895+
comparison_group: None,
826896
trend_file: None,
827897
artifact_dir: None,
828898
allow_subfrontier_models: false,
@@ -1050,6 +1120,21 @@ mod tests {
10501120
assert!(report.leaderboard[0].usefulness_score > report.leaderboard[1].usefulness_score);
10511121
assert_eq!(report.leaderboard[0].average_verification_health, Some(0.9));
10521122
assert_eq!(report.leaderboard[0].average_lifecycle_accuracy, Some(1.0));
1123+
assert_eq!(
1124+
report
1125+
.independent_auditor_story
1126+
.as_ref()
1127+
.map(|story| story.winning_review_mode.as_str()),
1128+
Some(review_mode_label(true))
1129+
);
1130+
assert_eq!(
1131+
report
1132+
.independent_auditor_story
1133+
.as_ref()
1134+
.and_then(|story| story.review_mode_comparison.as_ref())
1135+
.map(|comparison| comparison.usefulness_score_delta > 0.0),
1136+
Some(true)
1137+
);
10531138
assert!(
10541139
(report.by_model[0].review_mode_comparisons[0]
10551140
.micro_f1_delta

src/commands/eval/command/fixtures.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,7 @@ mod tests {
231231
fixture_name_filters: vec!["shell".to_string()],
232232
max_fixtures: None,
233233
label: None,
234+
comparison_group: None,
234235
trend_file: None,
235236
artifact_dir: None,
236237
allow_subfrontier_models: false,
@@ -276,6 +277,7 @@ mod tests {
276277
fixture_name_filters: vec![],
277278
max_fixtures: Some(1),
278279
label: None,
280+
comparison_group: None,
279281
trend_file: None,
280282
artifact_dir: None,
281283
allow_subfrontier_models: false,

0 commit comments

Comments (0)