ix/Ix/Benchmark/Bench.lean at 1d1ddf51cab90bba105b99a2d2e026cbcd1a63e0 · argumentcomputer/ix · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
module
public import Ix.Address
public import Ix.Meta
public import Ix.Cronos
public import Ix.Address
public import Batteries

public import Ix.Benchmark.Serde
public import Ix.Benchmark.Tukey

public section

open Batteries (RBMap)

/-!
# Benchmarking library modeled after Criterion in Haskell and Rust

## Limitations
- Measures time in nanoseconds using `IO.monoNanosNow`, which is less precise than the picoseconds used in Criterion.rs
-/

/--
Computes the result and then wraps the return type in IO. This prevents Lean from optimizing away the function call due to an unused return value.
-/
@[never_extract, noinline]
def blackBoxIO (f : α → β) (a : α) : IO β := do
  pure $ f a

/--
A benchmark group defines a collection of related benchmarks that share a configuration, such as number of samples and noise threshold
-/
structure BenchGroup where
  name : String
  config : Config

inductive BenchFn ( α β : Type) where
| pure (fn : α → β)
| io (fn : α → IO β)

structure Benchmarkable (α β : Type) where
  name : String
  func : BenchFn α β
  arg : α

/--
Creates a `Benchmarkable` instance. Use with pure functions, which will later be called with `blackBoxIO` to prevent compiler optimizations.
-/
def bench (name : String) (fn : α → β) (arg : α) : Benchmarkable α β := { name, func := BenchFn.pure fn, arg }

/--
Creates a `Benchmarkable` instance. Use with functions that return IO, as they don't need to be black boxed.
-/
def benchIO (name : String) (fn : α → IO β) (arg : α) : Benchmarkable α β :=  { name, func := BenchFn.io fn, arg }

/-- If the function is pure, we need to wrap it in `blackBoxIO` so that Lean doesn't optimize away the result -/
def Benchmarkable.getFn ( bench: Benchmarkable α β) : α → IO β :=
  match bench.func with
  | .pure f => blackBoxIO f
  | .io f => f

-- TODO: According to Criterion.rs docs the warmup iterations should increase linearly until the warmup time is reached, rather than one iteration per time check
def BenchGroup.warmup (bg : BenchGroup) (bench : Benchmarkable α β) : IO Float := do
  IO.println s!"Warming up for {bg.config.warmupTime.floatPretty 2}s"
  let mut count := 0
  let warmupNanos := Cronos.secToNano bg.config.warmupTime
  let mut elapsed := 0
  let func := bench.getFn
  let startTime ← IO.monoNanosNow
  while elapsed < warmupNanos do
    let _res ← func bench.arg
    let now ← IO.monoNanosNow
    count := count + 1
    elapsed := now - startTime
  let mean := Float.ofNat elapsed / Float.ofNat count
  --IO.println s!"{bench.name} warmup avg: {mean}ns"
  return mean

def printRunning (config : Config) (expectedTime : Float) (numIters : Nat) (warningFactor : Nat) : IO Unit := do
  if warningFactor == 1 then
    IO.eprintln s!"Warning: Unable to complete {config.numSamples} samples in {config.sampleTime.floatPretty 2}s. You may wish to increase target time to {expectedTime.floatPretty 2}s"
  IO.println s!"Running {config.numSamples} samples in {expectedTime.floatPretty 2}s ({numIters.natPretty} iterations)\n"

-- Core sampling loop that runs the benchmark function and returns the array of measured timings
def runSample (sampleIters : Array Nat) (bench : Benchmarkable α β) : IO (Array Nat) := do
  let func := bench.getFn
  let mut timings : Array Nat := #[]
  for iters in sampleIters do
    let start ← IO.monoNanosNow
    for _i in Array.range iters do
      let _res ← func bench.arg
    let finish ← IO.monoNanosNow
    timings := timings.push (finish - start)
  return timings

-- TODO: Recommend sample count if expectedTime >>> bg.config.sampleTime (i.e. itersPerSample == 1)
/--
Runs the sample as a sequence of constant iterations per data point, where the iteration count attempts to fit into the configurable `sampleTime` but cannot be less than 1.

Returns the iteration counts and elapsed time per data point.
-/
def BenchGroup.sampleFlat (bg : BenchGroup) (bench : Benchmarkable α β)
(warmupMean : Float) : IO (Array Nat × Array Nat) := do
  let targetTime := bg.config.sampleTime.toNanos
  let timePerSample := targetTime / (Float.ofNat bg.config.numSamples)
  let itersPerSample := (timePerSample / warmupMean).ceil.toUInt64.toNat.max 1
  let totalIters := itersPerSample * bg.config.numSamples
  let expectedTime := warmupMean * Float.ofNat itersPerSample * bg.config.numSamples.toSeconds
  printRunning bg.config expectedTime totalIters itersPerSample
  --IO.println s!"Flat sample. Iters per sample: {itersPerSample}"
  let sampleIters := Array.replicate bg.config.numSamples itersPerSample
  let timings ← runSample sampleIters bench
  return (sampleIters, timings)

/--
Runs the sample as a sequence of linearly increasing iterations [d, 2d, 3d, ..., Nd] where `N` is the total number of samples and `d` is a factor derived from the warmup iteration time.

The sum of this series should be roughly equivalent to the total `sampleTime`.

Returns the iteration counts and elapsed time per sample.
-/
def BenchGroup.sampleLinear (bg : BenchGroup) (bench : Benchmarkable α β) (warmupMean : Float) : IO (Array Nat × Array Nat) := do
  let totalRuns := bg.config.numSamples * (bg.config.numSamples + 1) / 2
  let targetTime := bg.config.sampleTime.toNanos
  -- `d` has a minimum value of 1 if the `warmupMean` is sufficiently large
  -- If so, that likely means the benchmark takes too long and target time should be increased, as well as considering other config options like flat sampling mode
  let d := (targetTime / warmupMean / (Float.ofNat totalRuns)).ceil.toUInt64.toNat.max 1
  let expectedTime := (Float.ofNat totalRuns) * (Float.ofNat d) * warmupMean.toSeconds
  let sampleIters := (Array.range bg.config.numSamples).map (fun x => (x + 1) * d)
  printRunning bg.config expectedTime sampleIters.sum d
  --IO.println s!"Linear sample. Iters increase by a factor of {d} per sample"
  let timings ← runSample sampleIters bench
  return (sampleIters, timings)

def BenchGroup.sample (bg : BenchGroup) (bench : Benchmarkable α β) (warmupMean : Float) : IO (Array Nat × Array Nat) := do
  match bg.config.samplingMode with
  | .flat => bg.sampleFlat bench warmupMean
  | .linear => bg.sampleLinear bench warmupMean

-- TODO
inductive Throughput where
| Bytes (n : UInt64)
| BytesDecimal (n : UInt64)
| Elements (n : UInt64)
| Bits (n : UInt64)
  deriving Repr

structure MeasurementData where
  data : Data
  avgTimes : Distribution
  absoluteEstimates : Estimates
  distributions : Distributions
  comparison : Option ComparisonData
  throughput : Option Throughput
  deriving Repr

/-- Compares bench results against prior run, using T-test to determine statistical significance -/
def BenchGroup.compare (bg : BenchGroup) (baseSample : Data) (baseEstimates : Estimates) (avgTimes : Distribution) (gen : StdGen) : ComparisonData :=
  -- Gets `base` data for comparison
  let baseAvgTimes : Distribution := { d := baseSample.d.map (fun (x,y) => Float.ofNat y / Float.ofNat x) }
  -- Then performs statistical analysis
  let (tValue, tDistribution) := tTest avgTimes baseAvgTimes bg.config gen
  let (relativeEstimates, relativeDistributions) := changeEstimates avgTimes baseAvgTimes bg.config gen
  let pValue := tDistribution.pValue tValue
  {
    pValue,
    tDistribution,
    tValue,
    relativeEstimates,
    relativeDistributions,
    significanceThreshold := bg.config.significanceLevel,
    noiseThreshold := bg.config.noiseThreshold,
    baseSample,
    baseAvgTimes,
    baseEstimates
  }

-- TODO: Don't compare if different sampling modes were used
/-- Retrieves prior bench result from disk and runs the comparison -/
def BenchGroup.getComparison (bg : BenchGroup) (benchName : String) (avgTimes : Distribution) (config: Config) : IO (Option ComparisonData) := do
  let benchPath := System.mkFilePath [".", ".lake", "benches", bg.name, benchName]
  let fileExt := toString config.serde
  -- Base data is at `new` since we haven't written the latest result to disk yet, which moves the prior data to `base`
  let basePath := benchPath / "new"
  if (← System.FilePath.pathExists (basePath / s!"estimates.{fileExt}")) then do
    let baseSample : Data ← loadFile config.serde (basePath / s!"sample.{fileExt}")
    let baseEstimates : Estimates ← loadFile config.serde (basePath / s!"estimates.{fileExt}")
    let gen ← IO.stdGenRef.get
    return some $ bg.compare baseSample baseEstimates avgTimes gen
  else
    return none

inductive ComparisonResult where
| Improved
| Regressed
| NonSignificant

def compareToThreshold (estimate : Estimate) (noiseThreshold : Float) : ComparisonResult :=
  let ci := estimate.confidenceInterval
  let (lb, ub) := (ci.lowerBound, ci.upperBound)
  let noiseNeg := noiseThreshold.neg

  if lb < noiseNeg && ub < noiseNeg
  then
    ComparisonResult.Improved
  else if lb > noiseThreshold && ub > noiseThreshold
  then
    ComparisonResult.Regressed
  else
    ComparisonResult.NonSignificant


-- TODO: Print ~24 whitespace characters before time, change, regression
def BenchGroup.printResults (bg : BenchGroup) (benchName : String) (m : MeasurementData) : IO Unit := do
  let estimates := m.absoluteEstimates
  let typicalEstimate := estimates.slope.getD estimates.mean
  IO.println s!"{bg.name}/{benchName}"
  let lb := typicalEstimate.confidenceInterval.lowerBound.formatNanos
  let pointEst := typicalEstimate.pointEstimate.formatNanos
  let ub := typicalEstimate.confidenceInterval.upperBound.formatNanos
  IO.println s!"time:   [{lb} {pointEst} {ub}]"
  if let some comp := m.comparison
  then
    let diffMean := comp.pValue > comp.significanceThreshold
    let meanEst := comp.relativeEstimates.mean
    let pointEst := (meanEst.pointEstimate * 100).floatPretty 4
    let lb := (meanEst.confidenceInterval.lowerBound * 100).floatPretty 4
    let ub := (meanEst.confidenceInterval.upperBound * 100).floatPretty 4
    let symbol := if diffMean then ">" else "<"
    IO.println s!"change: [{lb}% {pointEst}% {ub}%] (p = {comp.pValue.floatPretty 2} {symbol} {comp.significanceThreshold.floatPretty 2})"
    let explanation := if diffMean
    then
      "No change in performance detected"
    else
      match compareToThreshold meanEst comp.noiseThreshold with
      | ComparisonResult.Improved => "Performance has improved"
      | ComparisonResult.Regressed => "Performance has regressed"
      | ComparisonResult.NonSignificant => "Change within noise threshold"
    IO.println explanation
  IO.println ""
  m.avgTimes.tukey

-- TODO: Write sampling mode and config in `sample.json` for comparison
def saveComparison (groupName : String) (benchName : String) (comparison : ComparisonData) (config : Config) : IO Unit := do
  let changePath := System.mkFilePath [".", ".lake", "benches", groupName, benchName, "change"]
  storeFile config.serde comparison.relativeEstimates (changePath / s!"estimates.{toString config.serde}")

def mkDirs (groupName : String) (benchName : String) : IO (System.FilePath × System.FilePath) := do
  let benchPath := System.mkFilePath [".", ".lake", "benches", groupName, benchName]
  let (basePath, changePath, newPath) := (benchPath / "base", benchPath / "change", benchPath / "new")
  let (baseDir, changeDir, newDir) := (basePath.toString, changePath.toString, newPath.toString)
  let _out ← IO.Process.run {cmd := "mkdir", args := #["-p", baseDir, changeDir, newDir] }
  return (basePath, newPath)

def moveBaseFile (file : System.FilePath) (baseDir : String) : IO Unit := do
  if (← System.FilePath.pathExists file) then do
    -- Move prior bench from `new` to `base`
    let _ ← IO.Process.run { cmd := "mv", args := #[file.toString, baseDir] }

-- Results are always saved to `.lake/benches/<benchName>/new`. If files of the same serde format already exist from a prior run, move them to `base`
def saveMeasurement (groupName : String) (benchName : String) (mData : MeasurementData) (config : Config) : IO Unit := do
  let (basePath, newPath) ← mkDirs groupName benchName
  let baseDir := basePath.toString
  let fileExt := toString config.serde
  if let some compData := mData.comparison then saveComparison groupName benchName compData config
  else
    println! "No compdata"
  let (newEstimatesFile, newSampleFile) := (newPath / s!"estimates.{fileExt}", newPath / s!"sample.{fileExt}")
  moveBaseFile newSampleFile baseDir
  moveBaseFile newEstimatesFile baseDir
  storeFile config.serde mData.data newSampleFile
  storeFile config.serde mData.absoluteEstimates newEstimatesFile

-- TODO: Use a typeclass instead?
inductive BenchResult where
| oneShot : OneShot → BenchResult
| sample : Estimates → BenchResult
  deriving Repr

def BenchResult.getTime (bench: BenchResult) : Float :=
  match bench with
  | oneShot o => o.benchTime.toFloat
  | sample s => s.mean.pointEstimate

-- TODO: Include input parameters if specified for post-processing
-- E.g. Blake3 bench wants to have `dataSize` and `numHashes`
-- Currently has to parse this from the `function` string
structure BenchReport where
  function: String
  newBench : BenchResult
  baseBench : Option BenchResult
  percentChange : Option Float

def percentChangeToString (pc : Float) : String :=
  let rounded := (100 * pc).floatPretty 2
  if pc < 0 then s!"{rounded.drop 1}% faster"
  else if pc > 0 then s!"{rounded}% slower"
  else "No change"

structure ColumnWidths where
  function : Nat
  newBench : Nat
  baseBench : Nat
  percentChange : Nat

def getColumnWidths' (maxWidths : ColumnWidths) (row: BenchReport) : ColumnWidths :=
  let fnLen := row.function.length
  let function := if fnLen > maxWidths.function then fnLen else maxWidths.function
  let newBenchLen := row.newBench.getTime.formatNanos.length
  let newBench := if newBenchLen > maxWidths.newBench then newBenchLen else maxWidths.newBench
  let baseBench := if let some baseBench := row.baseBench then
    let baseBenchLen := baseBench.getTime.formatNanos.length
    if baseBenchLen > maxWidths.baseBench then baseBenchLen
    else maxWidths.baseBench
  else maxWidths.baseBench
  let percentChange := if let some percentChange := row.percentChange then
    let percentChangeStr := percentChangeToString percentChange
    let percentLen := percentChangeStr.length
    if percentLen > maxWidths.percentChange then percentLen
    else maxWidths.percentChange
  else maxWidths.percentChange
  { function, newBench, baseBench, percentChange }

-- Gets the max lengths of each data type for pretty-printing columns
def getColumnWidths (report : Array BenchReport) : ColumnWidths :=
  let maxWidths : ColumnWidths := {
    function := "Function".length
    newBench := "New Benchmark".length
    baseBench := "Base Benchmark".length
    percentChange := "% change".length
  }
  report.foldl getColumnWidths' maxWidths

-- Centers a string with padded whitespace given the total width
def padWhitespace (input : String) (width : Nat) : String :=
  let padWidth := width - input.length
  let leftPad := padWidth / 2
  let rightPad := padWidth - leftPad
  String.ofList (List.replicate leftPad ' ') ++ input ++ (String.ofList (List.replicate rightPad ' '))

def padDashes (width : Nat) : String :=
  String.ofList (List.replicate width '-')

def mkReportPretty' (columnWidths : ColumnWidths) (reportPretty : String) (row : BenchReport) : String :=
  let functionStr := padWhitespace row.function columnWidths.function
  let newBenchStr := row.newBench.getTime.formatNanos
  let newBenchStr := padWhitespace newBenchStr columnWidths.newBench
  let baseBenchStr := if let some baseBench := row.baseBench then
    baseBench.getTime.formatNanos
  else "None"
  let baseBenchStr := padWhitespace baseBenchStr columnWidths.baseBench
  let percentChangeStr := if let some percentChange := row.percentChange then
    percentChangeToString percentChange
  else "N/A"
  let percentChangeStr := padWhitespace percentChangeStr columnWidths.percentChange
  let rowPretty :=
    s!"| {functionStr} | {newBenchStr} | {baseBenchStr} | {percentChangeStr} |\n"
  reportPretty ++ rowPretty

-- TODO: Bold the faster result and faster/slower % change
/--
Generates a Markdown table with comparative benchmark timings based on the results on disk.
Each table row contains the benchmakr function name, the new timing, the base timing, and the percent change between the two.
-/
def mkReportPretty (groupName : String) (report : Array BenchReport) : String :=
  let columnWidths := getColumnWidths report
  let title := s!"## {groupName}\n\n"
  let (fn, new, base, percent) := (
    padWhitespace "Function" columnWidths.function,
    padWhitespace "New Benchmark" columnWidths.newBench,
    padWhitespace "Base Benchmark" columnWidths.baseBench,
    padWhitespace "% change" columnWidths.percentChange
  )
  let header := s!"| {fn} | {new} | {base} | {percent} |\n"
  let (d1, d2, d3, d4) := (
    padDashes columnWidths.function,
    padDashes columnWidths.newBench,
    padDashes columnWidths.baseBench,
    padDashes columnWidths.percentChange
  )
  let dashes := s!"|-{d1}-|-{d2}-|-{d3}-|-{d4}-|\n"
  let reportPretty := title ++ header ++ dashes
  report.foldl (mkReportPretty' columnWidths) reportPretty

def oneShotBench {α β : Type} (groupName : String) (bench: Benchmarkable α β) (config : Config) : IO BenchReport := do
  let start ← IO.monoNanosNow
  let _res ← bench.getFn bench.arg
  let finish ← IO.monoNanosNow
  let benchTime := finish - start
  IO.println s!"time:   {benchTime.toFloat.formatNanos}"
  let (basePath, newPath) ← mkDirs groupName bench.name
  let fileExt := toString config.serde
  let newFile := newPath / s!"one-shot.{fileExt}"
  let (baseBench, percentChange) ← if (← System.FilePath.pathExists newFile) then do
    let baseBench : OneShot ← loadFile config.serde newFile
    let baseTime := baseBench.benchTime.toFloat
    let percentChange := (benchTime.toFloat - baseTime) / baseTime
    IO.println s!"change: {percentChangeToString percentChange}"
    let _ ← IO.Process.run { cmd := "mv", args := #[newFile.toString, basePath.toString] }
    pure (some baseBench, some percentChange)
  else
    let _ ← IO.Process.run { cmd := "mkdir", args := #["-p", newPath.toString] }
    pure (.none, .none)
  let newBench : OneShot := { benchTime }
  IO.println ""
  storeFile config.serde newBench newFile
  let benchReport : BenchReport := { function := bench.name, newBench := (.oneShot newBench), baseBench := baseBench.map .oneShot, percentChange }
  return benchReport

-- TODO: Make sure compiler isn't caching partial evaluation result for future runs of the same function (measure first vs subsequent runs)
/-- Runs each benchmark in a `BenchGroup` and analyzes the results -/
def bgroup {α β : Type} (name: String) (benches : List (Benchmarkable α β)) (config : Config := {}) : IO $ Array BenchReport := do
  let config ← getConfigEnv config
  let bg : BenchGroup := { name, config }
  IO.println s!"Running bench group {name}\n"
  let mut reports : Array BenchReport := #[]
  for b in benches do
    let report : BenchReport ← if config.oneShot then do
      IO.println s!"{bg.name}/{b.name}"
      oneShotBench name b config
    else
      let warmupMean ← bg.warmup b
      IO.println s!"Running {b.name}"
      let (iters, times) ← bg.sample b warmupMean
      let data := iters.zip times
      let avgTimes : Distribution := { d := data.map (fun (x,y) => Float.ofNat y / Float.ofNat x) }
      let gen ← IO.stdGenRef.get
      let mut (distributions, estimates) := avgTimes.estimates bg.config gen
      if bg.config.samplingMode == .linear
      then
        let data' : Data := { d := data }
        let (distribution, slope) := data'.regression bg.config gen
        estimates := { estimates with slope := .some slope }
        distributions := { distributions with slope := .some distribution }
      let comparisonData : Option ComparisonData ← bg.getComparison b.name avgTimes bg.config
      let measurement :=  {
        data := { d := data },
        avgTimes,
        absoluteEstimates := estimates,
        distributions,
        comparison := comparisonData
        throughput := none
      }
      bg.printResults b.name measurement
      IO.println ""
      saveMeasurement name b.name measurement config
      let benchReport : BenchReport := { function := b.name, newBench := (.sample estimates), baseBench := (comparisonData.map (.sample ·.baseEstimates)), percentChange := comparisonData.map (·.relativeEstimates.mean.pointEstimate) }
      pure benchReport
    reports := reports.push report
  if config.report then
    let table := mkReportPretty name reports
    IO.println table
    IO.FS.writeFile (System.mkFilePath [".", s!"benchmark-report-{name}.md"]) table
  return reports

end