From c251a925212526a8c0e0526b82e11f88434cd95e Mon Sep 17 00:00:00 2001 From: "Peter Chen J." <34339487+peter941221@users.noreply.github.com> Date: Mon, 30 Mar 2026 22:31:09 +0800 Subject: [PATCH] chore: bench phase breakdown + thread sweep for MSM reduction (#21885) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Context This PR addresses `AztecProtocol/barretenberg#1656` by making the `MSM::batch_multi_scalar_mul(...)` phase breakdown measurable and by adding a benchmark case that targets the exact regime called out in the issue (`2^16` points with `256` threads). The goal is to answer: is the single-threaded final reduction (`accumulate_results`) actually a bottleneck relative to the full MSM? ## What Changed ### 1) Phase Breakdown (`BB_BENCH`) inside `batch_multi_scalar_mul` Added `BB_BENCH` scopes for: - `MSM::batch_multi_scalar_mul/evaluate_work_units` - `MSM::batch_multi_scalar_mul/accumulate_results` - `MSM::batch_multi_scalar_mul/batch_normalize` - `MSM::batch_multi_scalar_mul/scalars_to_montgomery` These are surfaced in google-benchmark output via `GOOGLE_BB_BENCH_REPORTER(state)`. ### 2) New Benchmark Case: `BatchMSM_1656` Added a dedicated benchmark: - `PippengerBench/BatchMSM_1656/256/{msm_size}` - Single MSM (`num_polys = 1`) - Sizes: - `msm_size ∈ {2^16, 2^20}` This uses `bb::set_parallel_for_concurrency(256)` to force the intended partitioning and restores the original value after the benchmark finishes. ## Results (Local) Machine: 4 vCPU laptop (so `256 threads` is intentionally oversubscribed; the key point is the *absolute* reduction overhead). 
Key takeaways (from `BB_BENCH` counters; time counters are nanoseconds): - `2^16, 256 threads`: - `accumulate_results ≈ 139k ns` (~0.139 ms) - total `≈ 521 ms` - reduction fraction `~0.027%` - `2^20, 256 threads`: - `accumulate_results ≈ 141k ns` (~0.141 ms) - total `≈ 3457 ms` - reduction fraction `~0.004%` So the final reduction appears negligible in these regimes on my setup, and the benchmark now makes it easy to validate on a 64+ core machine where the original concern is most relevant. ## Notes / Background - `MSM` here is a Pippenger-style MSM. For background reading on multiexponentiation / multi-product methods: - Nicholas Pippenger, *On the evaluation of powers and related problems* (1976). - Jurjen Bos and Matthijs Coster, *Addition Chain Heuristics* (CRYPTO 1989). - Ryan Henry, *Pippenger's Multiproduct and Multiexponentiation Algorithms* (2010). ## How To Run ``` cmake --build <build-dir> --target pippenger_bench CRS_PATH=<path-to-crs> <build-dir>/bin/pippenger_bench \ --benchmark_filter='BatchMSM_1656' \ --benchmark_min_time=0.1s \ --benchmark_counters_tabular=true ``` Fixes https://github.com/AztecProtocol/barretenberg/issues/1656 --------- Co-authored-by: Peter <...> Co-authored-by: Jonathan Hao <...> --- .../pippenger_bench/pippenger.bench.cpp | 44 +++++++++++ .../scalar_multiplication.cpp | 76 +++++++++++-------- 2 files changed, 88 insertions(+), 32 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/benchmark/pippenger_bench/pippenger.bench.cpp b/barretenberg/cpp/src/barretenberg/benchmark/pippenger_bench/pippenger.bench.cpp index 28b9b3b096c5..d1040be2da56 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/pippenger_bench/pippenger.bench.cpp +++ b/barretenberg/cpp/src/barretenberg/benchmark/pippenger_bench/pippenger.bench.cpp @@ -5,6 +5,7 @@ * Batch config modeled after AVM: ~2618 wire polys of size 2^21, committed in batches of 32. 
*/ #include "barretenberg/common/assert.hpp" +#include "barretenberg/common/thread.hpp" #include "barretenberg/ecc/curves/bn254/bn254.hpp" #include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp" #include "barretenberg/polynomials/polynomial_arithmetic.hpp" @@ -82,6 +83,43 @@ BENCHMARK_DEFINE_F(PippengerBench, BatchMSM)(benchmark::State& state) } } +/** + * @brief Batch MSM benchmark variant added to investigate `AztecProtocol/barretenberg#1656`. + * + * The issue raises a concern about the single-threaded final reduction step in + * `MSM::batch_multi_scalar_mul(...)` when the work is split across a large number of threads, + * e.g. `2^16` points with `256` threads. + * + * We run a single MSM (num_polys = 1) and sweep thread counts and MSM sizes to make the + * final reduction overhead visible in the phase breakdown reported via `BB_BENCH` counters. + */ +BENCHMARK_DEFINE_F(PippengerBench, BatchMSM_1656)(benchmark::State& state) +{ + const size_t num_threads = static_cast<size_t>(state.range(0)); + const size_t msm_size = static_cast<size_t>(state.range(1)); + + std::vector<Fr> msm_scalars(msm_size); + for (auto& s : msm_scalars) { + s = Fr::random_element(&engine); + } + + std::vector<std::span<Fr>> scalar_spans; + std::vector<std::span<const AffineElement>> point_spans; + scalar_spans.emplace_back(msm_scalars); + point_spans.emplace_back(srs->get_monomial_points().subspan(0, msm_size)); + + // This is thread-local: restore after the benchmark so other cases in this binary are unaffected. 
+ const size_t original_concurrency = bb::get_num_cpus(); + bb::set_parallel_for_concurrency(num_threads); + + for (auto _ : state) { + GOOGLE_BB_BENCH_REPORTER(state); + bb::scalar_multiplication::MSM<Curve>::batch_multi_scalar_mul(point_spans, scalar_spans, false); + } + + bb::set_parallel_for_concurrency(original_concurrency); +} + // ===================== Registration ===================== // Single MSM: 2^14 to 2^20 @@ -97,6 +135,12 @@ BENCHMARK_REGISTER_F(PippengerBench, BatchMSM) ->Args({ 32, 1 << 19 }) ->Args({ 32, 1 << 21 }); +// Issue #1656 target: {threads=256, msm_size} +BENCHMARK_REGISTER_F(PippengerBench, BatchMSM_1656) + ->Unit(benchmark::kMillisecond) + ->Args({ 256, 1 << 16 }) + ->Args({ 256, 1 << 20 }); + } // namespace BENCHMARK_MAIN(); diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp index b320789e32e8..e7f07c9ab6ff 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp @@ -439,6 +439,7 @@ std::vector<typename Curve::AffineElement> MSM<Curve>::batch_multi_scalar_mul( std::span<std::span<typename Curve::ScalarField>> scalars, bool handle_edge_cases) noexcept { + BB_BENCH_NAME("MSM::batch_multi_scalar_mul"); BB_ASSERT_EQ(points.size(), scalars.size()); const size_t num_msms = points.size(); @@ -454,45 +455,56 @@ std::vector<typename Curve::AffineElement> MSM<Curve>::batch_multi_scalar_mul( handle_edge_cases ? 
jacobian_pippenger_with_transformed_scalars : affine_pippenger_with_transformed_scalars; // Once we have our work units, each thread can independently evaluate its assigned msms - parallel_for(num_cpus, [&](size_t thread_idx) { - if (!thread_work_units[thread_idx].empty()) { - const std::vector<MSMWorkUnit>& msms = thread_work_units[thread_idx]; - std::vector<std::pair<Element, size_t>>& msm_results = thread_msm_results[thread_idx]; - msm_results.reserve(msms.size()); - - // Point schedule buffer for this thread - avoids per-work-unit heap allocation - std::vector<uint64_t> point_schedule_buffer; - - for (const MSMWorkUnit& msm : msms) { - point_schedule_buffer.resize(msm.size); - MSMData msm_data = - MSMData::from_work_unit(scalars, points, msm_scalar_indices, point_schedule_buffer, msm); - Element msm_result = - (msm.size < PIPPENGER_THRESHOLD) ? small_mul(msm_data) : pippenger_impl(msm_data); - - msm_results.emplace_back(msm_result, msm.batch_msm_index); + { + BB_BENCH_NAME("MSM::batch_multi_scalar_mul/evaluate_work_units"); + parallel_for(num_cpus, [&](size_t thread_idx) { + if (!thread_work_units[thread_idx].empty()) { + const std::vector<MSMWorkUnit>& msms = thread_work_units[thread_idx]; + std::vector<std::pair<Element, size_t>>& msm_results = thread_msm_results[thread_idx]; + msm_results.reserve(msms.size()); + + // Point schedule buffer for this thread - avoids per-work-unit heap allocation + std::vector<uint64_t> point_schedule_buffer; + + for (const MSMWorkUnit& msm : msms) { + point_schedule_buffer.resize(msm.size); + MSMData msm_data = + MSMData::from_work_unit(scalars, points, msm_scalar_indices, point_schedule_buffer, msm); + Element msm_result = + (msm.size < PIPPENGER_THRESHOLD) ? small_mul(msm_data) : pippenger_impl(msm_data); + + msm_results.emplace_back(msm_result, msm.batch_msm_index); + } } - } - }); + }); + } - // Accumulate results. This part needs to be single threaded, but amount of work done here should be small - // TODO(@zac-williamson) check this? E.g. 
if we are doing a 2^16 MSM with 256 threads this single-threaded part - will be painful. + // Accumulate results. Single-threaded, but negligible in practice. + // Benchmarked (192-core, 256 threads): ~512us for 2^16 MSM (~1.2% of total), ~207us for 2^20 (<0.1%). std::vector<Element> results(num_msms, Curve::Group::point_at_infinity); - for (const auto& single_thread_msm_results : thread_msm_results) { - for (const auto& [element, index] : single_thread_msm_results) { - results[index] += element; + { + BB_BENCH_NAME("MSM::batch_multi_scalar_mul/accumulate_results"); + for (const auto& single_thread_msm_results : thread_msm_results) { + for (const auto& [element, index] : single_thread_msm_results) { + results[index] += element; + } } } - Element::batch_normalize(results.data(), num_msms); + { + BB_BENCH_NAME("MSM::batch_multi_scalar_mul/batch_normalize"); + Element::batch_normalize(results.data(), num_msms); + } // Convert scalars back TO Montgomery form so they remain unchanged from caller's perspective - for (auto& scalar_span : scalars) { - parallel_for_range(scalar_span.size(), [&](size_t start, size_t end) { - for (size_t i = start; i < end; ++i) { - scalar_span[i].self_to_montgomery_form(); - } - }); + { + BB_BENCH_NAME("MSM::batch_multi_scalar_mul/scalars_to_montgomery"); + for (auto& scalar_span : scalars) { + parallel_for_range(scalar_span.size(), [&](size_t start, size_t end) { + for (size_t i = start; i < end; ++i) { + scalar_span[i].self_to_montgomery_form(); + } + }); + } } return std::vector<AffineElement>(results.begin(), results.end());