From c251a925212526a8c0e0526b82e11f88434cd95e Mon Sep 17 00:00:00 2001 From: "Peter Chen J." <34339487+peter941221@users.noreply.github.com> Date: Mon, 30 Mar 2026 22:31:09 +0800 Subject: [PATCH] chore: bench phase breakdown + thread sweep for MSM reduction (#21885) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Context This PR addresses `AztecProtocol/barretenberg#1656` by making the `MSM::batch_multi_scalar_mul(...)` phase breakdown measurable and by adding a benchmark case that targets the exact regime called out in the issue (`2^16` points with `256` threads). The goal is to answer: is the single-threaded final reduction (`accumulate_results`) actually a bottleneck relative to the full MSM? ## What Changed ### 1) Phase Breakdown (`BB_BENCH`) inside `batch_multi_scalar_mul` Added `BB_BENCH` scopes for: - `MSM::batch_multi_scalar_mul/evaluate_work_units` - `MSM::batch_multi_scalar_mul/accumulate_results` - `MSM::batch_multi_scalar_mul/batch_normalize` - `MSM::batch_multi_scalar_mul/scalars_to_montgomery` These are surfaced in google-benchmark output via `GOOGLE_BB_BENCH_REPORTER(state)`. ### 2) New Benchmark Case: `BatchMSM_1656` Added a dedicated benchmark: - `PippengerBench/BatchMSM_1656/256/{msm_size}` - Single MSM (`num_polys = 1`) - Sizes: - `msm_size ∈ {2^16, 2^20}` This uses `bb::set_parallel_for_concurrency(256)` to force the intended partitioning and restores the original value after the benchmark finishes. ## Results (Local) Machine: 4 vCPU laptop (so `256 threads` is intentionally oversubscribed; the key point is the *absolute* reduction overhead). 
Key takeaways (from `BB_BENCH` counters; time counters are nanoseconds): - `2^16, 256 threads`: - `accumulate_results ≈ 139k ns` (~0.139 ms) - total `≈ 521 ms` - reduction fraction `~0.027%` - `2^20, 256 threads`: - `accumulate_results ≈ 141k ns` (~0.141 ms) - total `≈ 3457 ms` - reduction fraction `~0.004%` So the final reduction appears negligible in these regimes on my setup, and the benchmark now makes it easy to validate on a 64+ core machine where the original concern is most relevant. ## Notes / Background - `MSM` here is a Pippenger-style MSM. For background reading on multiexponentiation / multi-product methods: - Nicholas Pippenger, *On the evaluation of powers and related problems* (1976). - Jurjen Bos and Matthijs Coster, *Addition Chain Heuristics* (CRYPTO 1989). - Ryan Henry, *Pippenger's Multiproduct and Multiexponentiation Algorithms* (2010). ## How To Run ``` cmake --build <build-dir> --target pippenger_bench CRS_PATH=<path-to-crs> <build-dir>/bin/pippenger_bench \ --benchmark_filter='BatchMSM_1656' \ --benchmark_min_time=0.1s \ --benchmark_counters_tabular=true ``` Fixes https://github.com/AztecProtocol/barretenberg/issues/1656 --------- Co-authored-by: Peter <...> Co-authored-by: Jonathan Hao <...> --- .../pippenger_bench/pippenger.bench.cpp | 44 +++++++++++ .../scalar_multiplication.cpp | 76 +++++++++++-------- 2 files changed, 88 insertions(+), 32 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/benchmark/pippenger_bench/pippenger.bench.cpp b/barretenberg/cpp/src/barretenberg/benchmark/pippenger_bench/pippenger.bench.cpp index 28b9b3b096c5..d1040be2da56 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/pippenger_bench/pippenger.bench.cpp +++ b/barretenberg/cpp/src/barretenberg/benchmark/pippenger_bench/pippenger.bench.cpp @@ -5,6 +5,7 @@ * Batch config modeled after AVM: ~2618 wire polys of size 2^21, committed in batches of 32. 
*/ #include "barretenberg/common/assert.hpp" +#include "barretenberg/common/thread.hpp" #include "barretenberg/ecc/curves/bn254/bn254.hpp" #include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp" #include "barretenberg/polynomials/polynomial_arithmetic.hpp" @@ -82,6 +83,43 @@ BENCHMARK_DEFINE_F(PippengerBench, BatchMSM)(benchmark::State& state) } } +/** + * @brief Batch MSM benchmark variant added to investigate `AztecProtocol/barretenberg#1656`. + * + * The issue raises a concern about the single-threaded final reduction step in + * `MSM::batch_multi_scalar_mul(...)` when the work is split across a large number of threads, + * e.g. `2^16` points with `256` threads. + * + * We run a single MSM (num_polys = 1) and sweep thread counts and MSM sizes to make the + * final reduction overhead visible in the phase breakdown reported via `BB_BENCH` counters. + */ +BENCHMARK_DEFINE_F(PippengerBench, BatchMSM_1656)(benchmark::State& state) +{ + const size_t num_threads = static_cast<size_t>(state.range(0)); + const size_t msm_size = static_cast<size_t>(state.range(1)); + + std::vector<Fr> msm_scalars(msm_size); + for (auto& s : msm_scalars) { + s = Fr::random_element(&engine); + } + + std::vector<std::span<Fr>> scalar_spans; + std::vector<std::span<const AffineElement>> point_spans; + scalar_spans.emplace_back(msm_scalars); + point_spans.emplace_back(srs->get_monomial_points().subspan(0, msm_size)); + + // This is thread-local: restore after the benchmark so other cases in this binary are unaffected. 
+ const size_t original_concurrency = bb::get_num_cpus(); + bb::set_parallel_for_concurrency(num_threads); + + for (auto _ : state) { + GOOGLE_BB_BENCH_REPORTER(state); + bb::scalar_multiplication::MSM<Curve>::batch_multi_scalar_mul(point_spans, scalar_spans, false); + } + + bb::set_parallel_for_concurrency(original_concurrency); +} + // ===================== Registration ===================== // Single MSM: 2^14 to 2^20 @@ -97,6 +135,12 @@ BENCHMARK_REGISTER_F(PippengerBench, BatchMSM) ->Args({ 32, 1 << 19 }) ->Args({ 32, 1 << 21 }); +// Issue #1656 target: {threads=256, msm_size} +BENCHMARK_REGISTER_F(PippengerBench, BatchMSM_1656) + ->Unit(benchmark::kMillisecond) + ->Args({ 256, 1 << 16 }) + ->Args({ 256, 1 << 20 }); + } // namespace BENCHMARK_MAIN(); diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp index b320789e32e8..e7f07c9ab6ff 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp @@ -439,6 +439,7 @@ std::vector<typename Curve::AffineElement> MSM<Curve>::batch_multi_scalar_mul( std::span<std::span<typename Curve::ScalarField>> scalars, bool handle_edge_cases) noexcept { + BB_BENCH_NAME("MSM::batch_multi_scalar_mul"); BB_ASSERT_EQ(points.size(), scalars.size()); const size_t num_msms = points.size(); @@ -454,45 +455,56 @@ std::vector<typename Curve::AffineElement> MSM<Curve>::batch_multi_scalar_mul( handle_edge_cases ? 
jacobian_pippenger_with_transformed_scalars : affine_pippenger_with_transformed_scalars; // Once we have our work units, each thread can independently evaluate its assigned msms - parallel_for(num_cpus, [&](size_t thread_idx) { - if (!thread_work_units[thread_idx].empty()) { - const std::vector<MSMWorkUnit>& msms = thread_work_units[thread_idx]; - std::vector<std::pair<Element, size_t>>& msm_results = thread_msm_results[thread_idx]; - msm_results.reserve(msms.size()); - - // Point schedule buffer for this thread - avoids per-work-unit heap allocation - std::vector<uint64_t> point_schedule_buffer; - - for (const MSMWorkUnit& msm : msms) { - point_schedule_buffer.resize(msm.size); - MSMData msm_data = - MSMData::from_work_unit(scalars, points, msm_scalar_indices, point_schedule_buffer, msm); - Element msm_result = - (msm.size < PIPPENGER_THRESHOLD) ? small_mul(msm_data) : pippenger_impl(msm_data); - - msm_results.emplace_back(msm_result, msm.batch_msm_index); + { + BB_BENCH_NAME("MSM::batch_multi_scalar_mul/evaluate_work_units"); + parallel_for(num_cpus, [&](size_t thread_idx) { + if (!thread_work_units[thread_idx].empty()) { + const std::vector<MSMWorkUnit>& msms = thread_work_units[thread_idx]; + std::vector<std::pair<Element, size_t>>& msm_results = thread_msm_results[thread_idx]; + msm_results.reserve(msms.size()); + + // Point schedule buffer for this thread - avoids per-work-unit heap allocation + std::vector<uint64_t> point_schedule_buffer; + + for (const MSMWorkUnit& msm : msms) { + point_schedule_buffer.resize(msm.size); + MSMData msm_data = + MSMData::from_work_unit(scalars, points, msm_scalar_indices, point_schedule_buffer, msm); + Element msm_result = + (msm.size < PIPPENGER_THRESHOLD) ? small_mul(msm_data) : pippenger_impl(msm_data); + + msm_results.emplace_back(msm_result, msm.batch_msm_index); + } } - } - }); + }); + } - // Accumulate results. This part needs to be single threaded, but amount of work done here should be small - // TODO(@zac-williamson) check this? E.g. 
if we are doing a 2^16 MSM with 256 threads this single-threaded part - will be painful. + // Accumulate results. Single-threaded, but negligible in practice. + // Benchmarked (192-core, 256 threads): ~512us for 2^16 MSM (~1.2% of total), ~207us for 2^20 (<0.1%). std::vector<Element> results(num_msms, Curve::Group::point_at_infinity); - for (const auto& single_thread_msm_results : thread_msm_results) { - for (const auto& [element, index] : single_thread_msm_results) { - results[index] += element; + { + BB_BENCH_NAME("MSM::batch_multi_scalar_mul/accumulate_results"); + for (const auto& single_thread_msm_results : thread_msm_results) { + for (const auto& [element, index] : single_thread_msm_results) { + results[index] += element; + } } } - Element::batch_normalize(results.data(), num_msms); + { + BB_BENCH_NAME("MSM::batch_multi_scalar_mul/batch_normalize"); + Element::batch_normalize(results.data(), num_msms); + } // Convert scalars back TO Montgomery form so they remain unchanged from caller's perspective - for (auto& scalar_span : scalars) { - parallel_for_range(scalar_span.size(), [&](size_t start, size_t end) { - for (size_t i = start; i < end; ++i) { - scalar_span[i].self_to_montgomery_form(); - } - }); + { + BB_BENCH_NAME("MSM::batch_multi_scalar_mul/scalars_to_montgomery"); + for (auto& scalar_span : scalars) { + parallel_for_range(scalar_span.size(), [&](size_t start, size_t end) { + for (size_t i = start; i < end; ++i) { + scalar_span[i].self_to_montgomery_form(); + } + }); + } } return std::vector<AffineElement>(results.begin(), results.end());