Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 59 additions & 21 deletions bench/compare.exs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ defmodule Bench.Compare do
defp compare(results, baseline) do
status = Map.get(baseline, "_status", "unknown")
tolerance = Map.get(baseline, "tolerance", %{})
baseline_scenarios = Map.get(baseline, "scenarios", %{})

IO.puts("# Phase D — Performance Regression Report")
IO.puts("")
Expand All @@ -59,15 +60,18 @@ defmodule Bench.Compare do
"> SCAFFOLD MODE — bench/baseline.json has not been populated yet " <>
"(Phase D-4 collects the real baseline). This run is informational " <>
"only; the gate is **non-blocking** until baseline.json `_status` " <>
"is flipped to `active`."
"is flipped to `active`. Schema drift (a scenario present in " <>
"results.json but absent from baseline.json, or vice versa) is " <>
"surfaced inline as `scaffold (would fail: ...)` so a rebaseline " <>
"PR previews the active-mode verdict before the gate is armed."
)

IO.puts("")
emit_table(results, nil, tolerance)
emit_table(results, baseline_scenarios, tolerance, enforce: false)
System.halt(0)

"active" ->
emit_table(results, baseline["scenarios"], tolerance)
emit_table(results, baseline_scenarios, tolerance, enforce: true)
|> case do
:ok -> System.halt(0)
:regressed -> System.halt(1)
Expand All @@ -80,34 +84,70 @@ defmodule Bench.Compare do
end

# ── Pretty-print + regression check ────────────────────────────────────────
#
# Iterates the UNION of scenario names from results and baseline so neither
# schema-drift direction is silent:
# • results-only scenario → "MISSING IN BASELINE" (new harness scenario
# landed without a rebaseline; the regression gate has no anchor for it).
# • baseline-only scenario → "MISSING IN RESULTS" (the harness dropped a
# scenario the baseline still claims; the gate must not silently pass).
# Both directions fail-closed when `enforce: true` (active mode) and surface
# as informational `scaffold (would fail: ...)` rows when `enforce: false`
# (scaffold-placeholder mode) — see docs/perf-contract.md § Schema drift.

defp emit_table(results, baseline_scenarios, tolerance, opts) do
enforce = Keyword.fetch!(opts, :enforce)

defp emit_table(results, baseline_scenarios, tolerance) do
# Benchee JSON shape: top-level "statistics" -> per-scenario map.
stats = Map.get(results, "statistics", %{})

result_names = stats |> Map.keys() |> MapSet.new()
baseline_names = baseline_scenarios |> Map.keys() |> MapSet.new()
all_names = result_names |> MapSet.union(baseline_names) |> Enum.sort()

IO.puts("| Scenario | p50 (µs) | p95 (µs) | p99 (µs) | Status |")
IO.puts("|----------|----------|----------|----------|--------|")

Enum.reduce(stats, :ok, fn {name, scenario_stats}, acc ->
p50 = percentile_us(scenario_stats, "50")
p95 = percentile_us(scenario_stats, "95")
p99 = percentile_us(scenario_stats, "99")
Enum.reduce(all_names, :ok, fn name, acc ->
in_results = MapSet.member?(result_names, name)
in_baseline = MapSet.member?(baseline_names, name)

{p50, p95, p99} =
if in_results do
scenario_stats = Map.fetch!(stats, name)

{percentile_us(scenario_stats, "50"), percentile_us(scenario_stats, "95"),
percentile_us(scenario_stats, "99")}
else
{nil, nil, nil}
end

{raw_status, drift?} =
cond do
in_results and not in_baseline ->
{"MISSING IN BASELINE", true}

in_baseline and not in_results ->
{"MISSING IN RESULTS", true}

status =
case baseline_scenarios do
nil ->
"scaffold"
true ->
base = Map.fetch!(baseline_scenarios, name)
regressed? = check_regression(base, p50, p95, p99, tolerance) == "REGRESSED"
{if(regressed?, do: "REGRESSED", else: "ok"), regressed?}
end

map ->
base = Map.get(map, name)
check_regression(base, p50, p95, p99, tolerance)
display_status =
cond do
enforce -> raw_status
drift? -> "scaffold (would fail: #{raw_status})"
true -> "scaffold"
end

IO.puts("| #{name} | #{fmt(p50)} | #{fmt(p95)} | #{fmt(p99)} | #{status} |")
IO.puts("| #{name} | #{fmt(p50)} | #{fmt(p95)} | #{fmt(p99)} | #{display_status} |")

cond do
acc == :regressed -> :regressed
status == "REGRESSED" -> :regressed
enforce and drift? -> :regressed
true -> acc
end
end)
Expand All @@ -126,8 +166,6 @@ defmodule Bench.Compare do
defp fmt(n) when is_float(n), do: :erlang.float_to_binary(n, decimals: 2)
defp fmt(n), do: to_string(n)

defp check_regression(nil, _, _, _, _), do: "no baseline"

defp check_regression(base, p50, p95, p99, tolerance) do
bp50 = num(base["p50_us"])
bp95 = num(base["p95_us"])
Expand All @@ -138,8 +176,8 @@ defmodule Bench.Compare do
t99 = Map.get(tolerance, "p99_max_ratio", 1.50)

breached =
(bp50 && p50 && p50 > bp50 * t50) or
(bp95 && p95 && p95 > bp95 * t95) or
(bp50 && p50 && p50 > bp50 * t50) ||
(bp95 && p95 && p95 > bp95 * t95) ||
(bp99 && p99 && p99 > bp99 * t99)

if breached, do: "REGRESSED", else: "ok"
Expand Down
26 changes: 26 additions & 0 deletions docs/perf-contract.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,32 @@ The CI gate (`.github/workflows/perf-regression.yml`) fails a PR when
Tolerances are looser as the percentile gets noisier — Phase D-4 will
tighten these once intra-run variance is characterised.

## Schema drift

The comparator iterates the **union** of scenario names from
`bench/results.json` and `bench/baseline.json`. Either direction of
schema drift fails the build in `active` mode:

- **`MISSING IN BASELINE`** — a scenario in `results.json` (the harness
just emitted it) has no entry in `baseline.json`. A new scenario
landed without a rebaseline; the gate has no anchor for it and
cannot meaningfully report regression. Rebaseline before merging.
- **`MISSING IN RESULTS`** — a scenario in `baseline.json` is absent
from `results.json` (the harness skipped or dropped it). Either the
harness regressed silently, or a scenario was removed without
rebaselining the file. The gate must not silently pass.

Both directions are surfaced inline in the markdown table (in the
`Status` column). In `scaffold-placeholder` mode they appear as
informational `scaffold (would fail: MISSING IN BASELINE)` /
`scaffold (would fail: MISSING IN RESULTS)` rows so a rebaseline PR
previews the eventual active-mode verdict before the gate is armed;
the build still passes. A tolerance breach on a scenario present in
both files is previewed the same way, as
`scaffold (would fail: REGRESSED)`. In `active` mode the drift rows
appear as bare `MISSING IN BASELINE` / `MISSING IN RESULTS` and exit
the comparator with status 1. Behaviour pivots on the single `_status`
flag in `bench/baseline.json` — no code change is needed to arm the
schema checks once the gate goes live.

## Baseline lifecycle

The baseline lives in `bench/baseline.json`. Its `_status` field gates
Expand Down
Loading