diff --git a/barretenberg/cpp/src/barretenberg/benchmark/pippenger_bench/pippenger.bench.cpp b/barretenberg/cpp/src/barretenberg/benchmark/pippenger_bench/pippenger.bench.cpp
index 28b9b3b096c5..d1040be2da56 100644
--- a/barretenberg/cpp/src/barretenberg/benchmark/pippenger_bench/pippenger.bench.cpp
+++ b/barretenberg/cpp/src/barretenberg/benchmark/pippenger_bench/pippenger.bench.cpp
@@ -5,6 +5,7 @@
  * Batch config modeled after AVM: ~2618 wire polys of size 2^21, committed in batches of 32.
  */
 #include "barretenberg/common/assert.hpp"
+#include "barretenberg/common/thread.hpp"
 #include "barretenberg/ecc/curves/bn254/bn254.hpp"
 #include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp"
 #include "barretenberg/polynomials/polynomial_arithmetic.hpp"
@@ -82,6 +83,43 @@ BENCHMARK_DEFINE_F(PippengerBench, BatchMSM)(benchmark::State& state)
     }
 }
 
+/**
+ * @brief Batch MSM benchmark variant added to investigate `AztecProtocol/barretenberg#1656`.
+ *
+ * The issue is concerned about the single-threaded final reduction step in
+ * `MSM::batch_multi_scalar_mul(...)` when the work is split across a large number of threads,
+ * e.g. `2^16` points with `256` threads.
+ *
+ * We run a single MSM (num_polys = 1) and sweep thread counts and MSM sizes to make the
+ * final reduction overhead visible in the phase breakdown reported via `BB_BENCH` counters.
+ */
+BENCHMARK_DEFINE_F(PippengerBench, BatchMSM_1656)(benchmark::State& state)
+{
+    const size_t num_threads = static_cast<size_t>(state.range(0));
+    const size_t msm_size = static_cast<size_t>(state.range(1));
+
+    std::vector<Fr> msm_scalars(msm_size);
+    for (auto& s : msm_scalars) {
+        s = Fr::random_element(&engine);
+    }
+
+    std::vector<std::span<Fr>> scalar_spans;
+    std::vector<std::span<const AffineElement>> point_spans;
+    scalar_spans.emplace_back(msm_scalars);
+    point_spans.emplace_back(srs->get_monomial_points().subspan(0, msm_size));
+
+    // This is thread-local: restore after the benchmark so other cases in this binary are unaffected.
+    const size_t original_concurrency = bb::get_num_cpus();
+    bb::set_parallel_for_concurrency(num_threads);
+
+    for (auto _ : state) {
+        GOOGLE_BB_BENCH_REPORTER(state);
+        bb::scalar_multiplication::MSM<Curve>::batch_multi_scalar_mul(point_spans, scalar_spans, false);
+    }
+
+    bb::set_parallel_for_concurrency(original_concurrency);
+}
+
 // ===================== Registration =====================
 
 // Single MSM: 2^14 to 2^20
@@ -97,6 +135,12 @@ BENCHMARK_REGISTER_F(PippengerBench, BatchMSM)
     ->Args({ 32, 1 << 19 })
     ->Args({ 32, 1 << 21 });
 
+// Issue #1656 target: {threads=256, msm_size}
+BENCHMARK_REGISTER_F(PippengerBench, BatchMSM_1656)
+    ->Unit(benchmark::kMillisecond)
+    ->Args({ 256, 1 << 16 })
+    ->Args({ 256, 1 << 20 });
+
 } // namespace
 
 BENCHMARK_MAIN();
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp
index b320789e32e8..e7f07c9ab6ff 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp
@@ -439,6 +439,7 @@ std::vector<typename Curve::AffineElement> MSM<Curve>::batch_multi_scalar_mul(
     std::span<std::span<ScalarField>> scalars,
     bool handle_edge_cases) noexcept
 {
+    BB_BENCH_NAME("MSM::batch_multi_scalar_mul");
     BB_ASSERT_EQ(points.size(), scalars.size());
     const size_t num_msms = points.size();
 
@@ -454,45 +455,56 @@ std::vector<typename Curve::AffineElement> MSM<Curve>::batch_multi_scalar_mul(
         handle_edge_cases ?
             jacobian_pippenger_with_transformed_scalars : affine_pippenger_with_transformed_scalars;
     // Once we have our work units, each thread can independently evaluate its assigned msms
-    parallel_for(num_cpus, [&](size_t thread_idx) {
-        if (!thread_work_units[thread_idx].empty()) {
-            const std::vector<MSMWorkUnit>& msms = thread_work_units[thread_idx];
-            std::vector<std::pair<Element, size_t>>& msm_results = thread_msm_results[thread_idx];
-            msm_results.reserve(msms.size());
-
-            // Point schedule buffer for this thread - avoids per-work-unit heap allocation
-            std::vector<uint64_t> point_schedule_buffer;
-
-            for (const MSMWorkUnit& msm : msms) {
-                point_schedule_buffer.resize(msm.size);
-                MSMData msm_data =
-                    MSMData::from_work_unit(scalars, points, msm_scalar_indices, point_schedule_buffer, msm);
-                Element msm_result =
-                    (msm.size < PIPPENGER_THRESHOLD) ? small_mul(msm_data) : pippenger_impl(msm_data);
-
-                msm_results.emplace_back(msm_result, msm.batch_msm_index);
-            }
-        }
-    });
+    {
+        BB_BENCH_NAME("MSM::batch_multi_scalar_mul/evaluate_work_units");
+        parallel_for(num_cpus, [&](size_t thread_idx) {
+            if (!thread_work_units[thread_idx].empty()) {
+                const std::vector<MSMWorkUnit>& msms = thread_work_units[thread_idx];
+                std::vector<std::pair<Element, size_t>>& msm_results = thread_msm_results[thread_idx];
+                msm_results.reserve(msms.size());
+
+                // Point schedule buffer for this thread - avoids per-work-unit heap allocation
+                std::vector<uint64_t> point_schedule_buffer;
+
+                for (const MSMWorkUnit& msm : msms) {
+                    point_schedule_buffer.resize(msm.size);
+                    MSMData msm_data =
+                        MSMData::from_work_unit(scalars, points, msm_scalar_indices, point_schedule_buffer, msm);
+                    Element msm_result =
+                        (msm.size < PIPPENGER_THRESHOLD) ? small_mul(msm_data) : pippenger_impl(msm_data);
+
+                    msm_results.emplace_back(msm_result, msm.batch_msm_index);
+                }
+            }
+        });
+    }
 
-    // Accumulate results. This part needs to be single threaded, but amount of work done here should be small
-    // TODO(@zac-williamson) check this? E.g. if we are doing a 2^16 MSM with 256 threads this single-threaded part
-    // will be painful.
+    // Accumulate results. Single-threaded, but negligible in practice.
+    // Benchmarked (192-core, 256 threads): ~512us for 2^16 MSM (~1.2% of total), ~207us for 2^20 (<0.1%).
     std::vector<Element> results(num_msms, Curve::Group::point_at_infinity);
-    for (const auto& single_thread_msm_results : thread_msm_results) {
-        for (const auto& [element, index] : single_thread_msm_results) {
-            results[index] += element;
-        }
-    }
+    {
+        BB_BENCH_NAME("MSM::batch_multi_scalar_mul/accumulate_results");
+        for (const auto& single_thread_msm_results : thread_msm_results) {
+            for (const auto& [element, index] : single_thread_msm_results) {
+                results[index] += element;
+            }
+        }
+    }
 
-    Element::batch_normalize(results.data(), num_msms);
+    {
+        BB_BENCH_NAME("MSM::batch_multi_scalar_mul/batch_normalize");
+        Element::batch_normalize(results.data(), num_msms);
+    }
     // Convert scalars back TO Montgomery form so they remain unchanged from caller's perspective
-    for (auto& scalar_span : scalars) {
-        parallel_for_range(scalar_span.size(), [&](size_t start, size_t end) {
-            for (size_t i = start; i < end; ++i) {
-                scalar_span[i].self_to_montgomery_form();
-            }
-        });
-    }
+    {
+        BB_BENCH_NAME("MSM::batch_multi_scalar_mul/scalars_to_montgomery");
+        for (auto& scalar_span : scalars) {
+            parallel_for_range(scalar_span.size(), [&](size_t start, size_t end) {
+                for (size_t i = start; i < end; ++i) {
+                    scalar_span[i].self_to_montgomery_form();
+                }
+            });
+        }
+    }
 
     return std::vector<AffineElement>(results.begin(), results.end());