@@ -5,6 +5,7 @@
 * Batch config modeled after AVM: ~2618 wire polys of size 2^21, committed in batches of 32.
 */
#include "barretenberg/common/assert.hpp"
+#include "barretenberg/common/thread.hpp"
#include "barretenberg/ecc/curves/bn254/bn254.hpp"
#include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp"
#include "barretenberg/polynomials/polynomial_arithmetic.hpp"
@@ -82,6 +83,43 @@ BENCHMARK_DEFINE_F(PippengerBench, BatchMSM)(benchmark::State& state)
    }
}

+/**
+ * @brief Batch MSM benchmark variant added to investigate `AztecProtocol/barretenberg#1656`.
+ *
+ * The issue concerns the single-threaded final reduction step in
+ * `MSM::batch_multi_scalar_mul(...)` when the work is split across a large number of threads,
+ * e.g. `2^16` points with `256` threads.
+ *
+ * We run a single MSM (num_polys = 1) and sweep the thread count and MSM size to make the
+ * final reduction overhead visible in the phase breakdown reported via `BB_BENCH` counters.
+ */
+BENCHMARK_DEFINE_F(PippengerBench, BatchMSM_1656)(benchmark::State& state)
+{
+    const size_t num_threads = static_cast<size_t>(state.range(0));
+    const size_t msm_size = static_cast<size_t>(state.range(1));
+
+    std::vector<Fr> msm_scalars(msm_size);
+    for (auto& s : msm_scalars) {
+        s = Fr::random_element(&engine);
+    }
+
+    std::vector<std::span<Fr>> scalar_spans;
+    std::vector<std::span<const G1>> point_spans;
+    scalar_spans.emplace_back(msm_scalars);
+    point_spans.emplace_back(srs->get_monomial_points().subspan(0, msm_size));
+
+    // The concurrency override is thread-local: restore it after the benchmark so other cases in this binary are unaffected.
+    const size_t original_concurrency = bb::get_num_cpus();
+    bb::set_parallel_for_concurrency(num_threads);
+
+    for (auto _ : state) {
+        GOOGLE_BB_BENCH_REPORTER(state);
+        bb::scalar_multiplication::MSM<Curve>::batch_multi_scalar_mul(point_spans, scalar_spans, false);
+    }
+
+    bb::set_parallel_for_concurrency(original_concurrency);
+}
+
// ===================== Registration =====================

// Single MSM: 2^14 to 2^20
@@ -97,6 +135,12 @@ BENCHMARK_REGISTER_F(PippengerBench, BatchMSM)
    ->Args({ 32, 1 << 19 })
    ->Args({ 32, 1 << 21 });

+// Issue #1656 target: {threads=256, msm_size}
+BENCHMARK_REGISTER_F(PippengerBench, BatchMSM_1656)
+    ->Unit(benchmark::kMillisecond)
+    ->Args({ 256, 1 << 16 })
+    ->Args({ 256, 1 << 20 });
+
} // namespace

BENCHMARK_MAIN();
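A note on the save/restore pattern in `BatchMSM_1656` above: the override is only restored if the benchmark loop exits normally. A minimal RAII guard would make the restore automatic on any exit path; the sketch below is hypothetical (not part of this PR) and assumes only the `bb::get_num_cpus()` / `bb::set_parallel_for_concurrency()` calls already used above. Separately, a denser sweep could be registered via Google Benchmark's `->ArgsProduct(...)` / `benchmark::CreateRange(...)` instead of hand-listed `->Args(...)` pairs.

#include "barretenberg/common/thread.hpp"
#include <cstddef>

// Hypothetical helper (not in this PR): RAII wrapper for the concurrency override,
// restoring the saved value when the guard goes out of scope.
class ConcurrencyGuard {
  public:
    explicit ConcurrencyGuard(size_t num_threads)
        : original_(bb::get_num_cpus())
    {
        bb::set_parallel_for_concurrency(num_threads);
    }
    ~ConcurrencyGuard() { bb::set_parallel_for_concurrency(original_); }
    ConcurrencyGuard(const ConcurrencyGuard&) = delete;
    ConcurrencyGuard& operator=(const ConcurrencyGuard&) = delete;

  private:
    size_t original_; // value restored on scope exit
};

// Usage inside the benchmark body:
//     ConcurrencyGuard guard(num_threads);
//     for (auto _ : state) { ... }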
@@ -439,6 +439,7 @@ std::vector<typename Curve::AffineElement> MSM<Curve>::batch_multi_scalar_mul(
    std::span<std::span<ScalarField>> scalars,
    bool handle_edge_cases) noexcept
{
+    BB_BENCH_NAME("MSM::batch_multi_scalar_mul");
    BB_ASSERT_EQ(points.size(), scalars.size());
    const size_t num_msms = points.size();

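The `BB_BENCH_NAME` scopes added in this diff presumably expand to an RAII timer that attributes the elapsed time of the enclosing block to the named phase (an assumption; the macro's definition is not shown here). A generic sketch of that scoped-timer pattern, not barretenberg's implementation:

#include <chrono>
#include <cstdio>

// Prints the elapsed time of the enclosing scope on destruction. A real
// implementation would record into a counter registry rather than print.
struct ScopedPhaseTimer {
    explicit ScopedPhaseTimer(const char* name)
        : name_(name)
        , start_(std::chrono::steady_clock::now())
    {}
    ~ScopedPhaseTimer()
    {
        const auto us = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start_).count();
        std::printf("%s: %lld us\n", name_, static_cast<long long>(us));
    }
    const char* name_;
    std::chrono::steady_clock::time_point start_;
};

// Two-level concatenation so __LINE__ expands before pasting, giving each
// timer instance a unique variable name.
#define PHASE_CONCAT_INNER(a, b) a##b
#define PHASE_CONCAT(a, b) PHASE_CONCAT_INNER(a, b)
#define PHASE_TIMER(name) ScopedPhaseTimer PHASE_CONCAT(phase_timer_, __LINE__)(name)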
@@ -454,45 +455,56 @@ std::vector<typename Curve::AffineElement> MSM<Curve>::batch_multi_scalar_mul(
        handle_edge_cases ? jacobian_pippenger_with_transformed_scalars : affine_pippenger_with_transformed_scalars;

    // Once we have our work units, each thread can independently evaluate its assigned msms
-    parallel_for(num_cpus, [&](size_t thread_idx) {
-        if (!thread_work_units[thread_idx].empty()) {
-            const std::vector<MSMWorkUnit>& msms = thread_work_units[thread_idx];
-            std::vector<std::pair<Element, size_t>>& msm_results = thread_msm_results[thread_idx];
-            msm_results.reserve(msms.size());
-
-            // Point schedule buffer for this thread - avoids per-work-unit heap allocation
-            std::vector<uint64_t> point_schedule_buffer;
-
-            for (const MSMWorkUnit& msm : msms) {
-                point_schedule_buffer.resize(msm.size);
-                MSMData msm_data =
-                    MSMData::from_work_unit(scalars, points, msm_scalar_indices, point_schedule_buffer, msm);
-                Element msm_result =
-                    (msm.size < PIPPENGER_THRESHOLD) ? small_mul<Curve>(msm_data) : pippenger_impl(msm_data);
-
-                msm_results.emplace_back(msm_result, msm.batch_msm_index);
-            }
-        }
-    });
+    {
+        BB_BENCH_NAME("MSM::batch_multi_scalar_mul/evaluate_work_units");
+        parallel_for(num_cpus, [&](size_t thread_idx) {
+            if (!thread_work_units[thread_idx].empty()) {
+                const std::vector<MSMWorkUnit>& msms = thread_work_units[thread_idx];
+                std::vector<std::pair<Element, size_t>>& msm_results = thread_msm_results[thread_idx];
+                msm_results.reserve(msms.size());
+
+                // Point schedule buffer for this thread - avoids per-work-unit heap allocation
+                std::vector<uint64_t> point_schedule_buffer;
+
+                for (const MSMWorkUnit& msm : msms) {
+                    point_schedule_buffer.resize(msm.size);
+                    MSMData msm_data =
+                        MSMData::from_work_unit(scalars, points, msm_scalar_indices, point_schedule_buffer, msm);
+                    Element msm_result =
+                        (msm.size < PIPPENGER_THRESHOLD) ? small_mul<Curve>(msm_data) : pippenger_impl(msm_data);
+
+                    msm_results.emplace_back(msm_result, msm.batch_msm_index);
+                }
+            }
+        });
+    }

-    // Accumulate results. This part needs to be single threaded, but amount of work done here should be small
-    // TODO(@zac-williamson) check this? E.g. if we are doing a 2^16 MSM with 256 threads this single-threaded part
-    // will be painful.
+    // Accumulate results. Single-threaded, but negligible in practice.
+    // Benchmarked (192-core, 256 threads): ~512us for 2^16 MSM (~1.2% of total), ~207us for 2^20 (<0.1%).
    std::vector<Element> results(num_msms, Curve::Group::point_at_infinity);
-    for (const auto& single_thread_msm_results : thread_msm_results) {
-        for (const auto& [element, index] : single_thread_msm_results) {
-            results[index] += element;
-        }
-    }
+    {
+        BB_BENCH_NAME("MSM::batch_multi_scalar_mul/accumulate_results");
+        for (const auto& single_thread_msm_results : thread_msm_results) {
+            for (const auto& [element, index] : single_thread_msm_results) {
+                results[index] += element;
+            }
+        }
+    }
-    Element::batch_normalize(results.data(), num_msms);
+    {
+        BB_BENCH_NAME("MSM::batch_multi_scalar_mul/batch_normalize");
+        Element::batch_normalize(results.data(), num_msms);
+    }

    // Convert scalars back TO Montgomery form so they remain unchanged from caller's perspective
-    for (auto& scalar_span : scalars) {
-        parallel_for_range(scalar_span.size(), [&](size_t start, size_t end) {
-            for (size_t i = start; i < end; ++i) {
-                scalar_span[i].self_to_montgomery_form();
-            }
-        });
-    }
+    {
+        BB_BENCH_NAME("MSM::batch_multi_scalar_mul/scalars_to_montgomery");
+        for (auto& scalar_span : scalars) {
+            parallel_for_range(scalar_span.size(), [&](size_t start, size_t end) {
+                for (size_t i = start; i < end; ++i) {
+                    scalar_span[i].self_to_montgomery_form();
+                }
+            });
+        }
+    }

    return std::vector<AffineElement>(results.begin(), results.end());
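On the issue's original concern: if the single-threaded accumulation ever did become a bottleneck (far more threads or partial results than measured here), the per-thread partials could be combined with a pairwise tree reduction, taking O(log n) parallel rounds instead of one O(n) serial loop. A hypothetical sketch (not part of this PR), assuming `parallel_for` lives in namespace `bb` like the other `thread.hpp` utilities used above:

#include "barretenberg/common/thread.hpp"
#include <cstddef>
#include <vector>

// Hypothetical pairwise tree reduction (not part of this PR). Each round combines
// pairs `stride` apart in parallel, halving the number of live partials until
// partials[0] holds the total. `identity` is returned for empty input (for curve
// points, the point at infinity).
template <typename Element> Element tree_reduce(std::vector<Element>& partials, const Element& identity)
{
    const size_t n = partials.size();
    for (size_t stride = 1; stride < n; stride *= 2) {
        const size_t num_pairs = (n + 2 * stride - 1) / (2 * stride);
        bb::parallel_for(num_pairs, [&](size_t i) {
            const size_t lhs = 2 * stride * i;
            const size_t rhs = lhs + stride;
            if (rhs < n) {
                partials[lhs] += partials[rhs];
            }
        });
    }
    return n == 0 ? identity : partials[0];
}

Given the ~512us ceiling measured above, the synchronization cost of ~log2(256) = 8 parallel rounds would likely not pay for itself; the PR's measurement-first approach settles the question more cheaply.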