Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions app/src/components/ModelLeaderboard.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import ProviderMark from "./ProviderMark";
import {
SENSITIVITY_VIEWS,
buildAllRows,
householdImpactScores,
modelScoresForView,
viewSupportsSelected,
type SensitivityViewId,
Expand Down Expand Up @@ -119,6 +120,7 @@ export default function ModelLeaderboard({
const isGlobal = selectedView === "global";
const [sensitivityView, setSensitivityView] =
useState<SensitivityViewId>("main");
const [showIntervals, setShowIntervals] = useState(false);

const allRows = useMemo(() => buildAllRows(dashboard), [dashboard]);

Expand All @@ -138,8 +140,11 @@ export default function ModelLeaderboard({
: sensitivityView;

const sensitivityScores = useMemo(() => {
if (effectiveView === "household_weighted") {
return householdImpactScores(dashboard, selectedView);
}
return modelScoresForView(allRows, effectiveView, selectedView);
}, [allRows, effectiveView, selectedView]);
}, [allRows, dashboard, effectiveView, selectedView]);

const sensitivityScoreByModel = useMemo(() => {
const out = new Map<string, number>();
Expand All @@ -160,14 +165,20 @@ export default function ModelLeaderboard({
.sort((a, b) => b.score - a.score);
}, [data, effectiveView, sensitivityScoreByModel]);

// Bootstrap intervals are off by default — they roughly triple the
// first-paint cost and are noise to most readers. Compute on-demand when
// the user opens the toggle. The household-weighted view doesn't have a
// bootstrap path yet; fall back to no intervals there.
const intervals = useMemo(() => {
if (!showIntervals) return new Map();
if (effectiveView === "household_weighted") return new Map();
return bootstrapIntervals(
allRows,
selectedView,
viewToFilter(effectiveView),
{ draws: DEFAULT_DRAWS, seed: 42 },
);
}, [allRows, selectedView, effectiveView]);
}, [allRows, selectedView, effectiveView, showIntervals]);

const pendingModels = useMemo<PendingModel[]>(() => {
const present = new Set(noTools.map((model) => model.model));
Expand Down Expand Up @@ -281,6 +292,16 @@ export default function ModelLeaderboard({
<span className="text-[11px] text-text-muted">
{activeView.description}
</span>
<label className="ml-auto inline-flex items-center gap-1.5 text-[11px] text-text-secondary">
<input
type="checkbox"
checked={showIntervals}
onChange={(event) => setShowIntervals(event.target.checked)}
className="h-3 w-3 rounded border-border accent-primary"
aria-label="Show 95% bootstrap intervals"
/>
<span>Show 95% intervals</span>
</label>
</div>
{sensitivityUnsupportedForView && (
<p
Expand Down
2 changes: 1 addition & 1 deletion app/src/data.json

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions app/src/lib/sensitivity.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import {

export type SensitivityViewId =
| "main"
| "household_weighted"
| "amount_only"
| "binary_only"
| "positive_only"
Expand All @@ -32,6 +33,12 @@ export const SENSITIVITY_VIEWS: SensitivityView[] = [
label: "Main",
description: "Equal-weight average across output groups; baseline ranking.",
},
{
id: "household_weighted",
label: "Household-weighted",
description:
"Each household contributes equally; within a household, outputs are weighted by absolute reference dollar share (with a 0.3 floor).",
},
{
id: "amount_only",
label: "Amount only",
Expand Down Expand Up @@ -193,6 +200,7 @@ export function viewSupportsSelected(
view: SensitivityViewId,
selectedView: ViewKey,
): boolean {
if (view === "household_weighted") return true;
if (selectedView === "global") return viewSupportsGlobal(rows, view);
const filtered = filterRows(rows, view);
for (const row of filtered) {
Expand All @@ -201,6 +209,30 @@ export function viewSupportsSelected(
return false;
}

/** Read pre-computed household-equal impact scores from the dashboard payload. */
export function householdImpactScores(
  dashboard: DashboardBundle,
  selectedView: ViewKey,
): ModelScore[] {
  // Pick the stats list for the requested view; an unknown country key
  // yields an empty list, which produces an empty result below.
  const stats =
    selectedView === "global"
      ? dashboard.global?.modelStats ?? []
      : dashboard.countries[selectedView]?.modelStats ?? [];

  const scores: ModelScore[] = [];
  for (const stat of stats) {
    // Only the no-tools condition carries the household-weighted score,
    // and rows without a numeric impactScore are skipped entirely.
    if (stat.condition !== "no_tools") continue;
    if (typeof stat.impactScore !== "number") continue;
    scores.push({ model: stat.model, score: stat.impactScore });
  }

  // Descending by score; ties keep payload order (Array.sort is stable).
  return scores.sort((a, b) => b.score - a.score);
}

export function modelScoresForView(
rows: ScoreRow[],
view: SensitivityViewId,
Expand Down
2 changes: 2 additions & 0 deletions app/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,8 @@ export type ModelStat = {
maeRunMean?: number;
maeRunStd?: number;
countryScores?: Partial<Record<CountryCode, number>>;
impactScore?: number;
impactCountryScores?: Partial<Record<CountryCode, number>>;
};

export type ProgramStat = {
Expand Down
2 changes: 1 addition & 1 deletion paper/snapshot/20260501/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
},
"dashboard_export": {
"path": "app/src/data.json",
"sha256": "d3ee592e809d341cc734eba6c4a5ce4a762660d6082e1eaf5a0ce2a6ad8e9d20",
"sha256": "916923fcf29cdc26877366ac618c3c2bcb959bbfaf940fff7ca4826f6e46fe45",
"description": "Committed dashboard export containing parsed model predictions, explanations, model summaries, program summaries, prompts, and PolicyEngine runtime bundle metadata."
},
"source_run_labels": {
Expand Down
21 changes: 21 additions & 0 deletions policybench/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -1180,6 +1180,16 @@ def build_dashboard_payload(
item["prompt"] = first_prompt
scenario_payload[row["scenario_id"]] = item

impact_summary = analysis.get("impact_summary")
impact_by_model: dict[str, float] = {}
if isinstance(impact_summary, pd.DataFrame) and not impact_summary.empty:
for _, impact_row in impact_summary.iterrows():
model_name = impact_row.get("model")
score = impact_row.get("mean_impact_score")
if model_name is None or pd.isna(score):
continue
impact_by_model[str(model_name)] = float(score) * 100

model_stats = []
for _, row in (
analysis["model_summary"].sort_values("mean_score", ascending=False).iterrows()
Expand Down Expand Up @@ -1285,6 +1295,9 @@ def build_dashboard_payload(
}
if not pd.isna(row["mean_accuracy"]):
item["accuracy"] = float(row["mean_accuracy"] * 100)
impact_score = impact_by_model.get(str(row["model"]))
if impact_score is not None:
item["impactScore"] = impact_score
model_stats.append({k: v for k, v in item.items() if v is not None})

program_rows = []
Expand Down Expand Up @@ -1425,6 +1438,14 @@ def _mean(values: list[float | int | None]) -> float | None:
accuracy = _mean([row.get("accuracy") for row in rows.values()])
if accuracy is not None:
item["accuracy"] = accuracy
impact_values = [row.get("impactScore") for row in rows.values()]
if all(value is not None for value in impact_values) and impact_values:
item["impactScore"] = _mean(impact_values)
item["impactCountryScores"] = {
country: float(row["impactScore"])
for country, row in rows.items()
if row.get("impactScore") is not None
}
model_stats.append(item)

model_stats.sort(key=lambda row: row["score"], reverse=True)
Expand Down