Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
* Batch config modeled after AVM: ~2618 wire polys of size 2^21, committed in batches of 32.
*/
#include "barretenberg/common/assert.hpp"
#include "barretenberg/common/thread.hpp"
#include "barretenberg/ecc/curves/bn254/bn254.hpp"
#include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp"
#include "barretenberg/polynomials/polynomial_arithmetic.hpp"
Expand Down Expand Up @@ -82,6 +83,43 @@ BENCHMARK_DEFINE_F(PippengerBench, BatchMSM)(benchmark::State& state)
}
}

/**
 * @brief Single-MSM benchmark variant targeting `AztecProtocol/barretenberg#1656`.
 *
 * That issue raises a concern about the single-threaded final reduction inside
 * `MSM::batch_multi_scalar_mul(...)` when work is fanned out across many threads,
 * e.g. a `2^16`-point MSM run on `256` threads.
 *
 * To expose that cost, we run exactly one MSM per batch and sweep the thread count and
 * MSM size; the per-phase breakdown is surfaced through the `BB_BENCH` counters.
 */
BENCHMARK_DEFINE_F(PippengerBench, BatchMSM_1656)(benchmark::State& state)
{
    const auto thread_count = static_cast<size_t>(state.range(0));
    const auto num_points = static_cast<size_t>(state.range(1));

    // One random scalar per point, drawn from the fixture's RNG.
    std::vector<Fr> scalars(num_points);
    for (auto& scalar : scalars) {
        scalar = Fr::random_element(&engine);
    }

    // A batch containing a single MSM: one scalar span paired with a prefix of the SRS points.
    std::vector<std::span<Fr>> scalar_batch;
    std::vector<std::span<const G1>> point_batch;
    scalar_batch.emplace_back(scalars);
    point_batch.emplace_back(srs->get_monomial_points().subspan(0, num_points));

    // The concurrency setting is thread-local; remember the default so it can be restored
    // afterwards, leaving the other benchmark cases in this binary unaffected.
    const size_t default_concurrency = bb::get_num_cpus();
    bb::set_parallel_for_concurrency(thread_count);

    for (auto _ : state) {
        GOOGLE_BB_BENCH_REPORTER(state);
        bb::scalar_multiplication::MSM<Curve>::batch_multi_scalar_mul(point_batch, scalar_batch, false);
    }

    bb::set_parallel_for_concurrency(default_concurrency);
}

// ===================== Registration =====================

// Single MSM: 2^14 to 2^20
Expand All @@ -97,6 +135,12 @@ BENCHMARK_REGISTER_F(PippengerBench, BatchMSM)
->Args({ 32, 1 << 19 })
->Args({ 32, 1 << 21 });

// Issue #1656 target: {threads=256, msm_size}
// 2^16 is the size called out in the issue (reduction overhead proportionally largest);
// 2^20 is included as a larger point of comparison.
BENCHMARK_REGISTER_F(PippengerBench, BatchMSM_1656)
->Unit(benchmark::kMillisecond)
->Args({ 256, 1 << 16 })
->Args({ 256, 1 << 20 });

} // namespace

BENCHMARK_MAIN();
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,7 @@ std::vector<typename Curve::AffineElement> MSM<Curve>::batch_multi_scalar_mul(
std::span<std::span<ScalarField>> scalars,
bool handle_edge_cases) noexcept
{
BB_BENCH_NAME("MSM::batch_multi_scalar_mul");
BB_ASSERT_EQ(points.size(), scalars.size());
const size_t num_msms = points.size();

Expand All @@ -454,45 +455,56 @@ std::vector<typename Curve::AffineElement> MSM<Curve>::batch_multi_scalar_mul(
handle_edge_cases ? jacobian_pippenger_with_transformed_scalars : affine_pippenger_with_transformed_scalars;

// Once we have our work units, each thread can independently evaluate its assigned msms
parallel_for(num_cpus, [&](size_t thread_idx) {
if (!thread_work_units[thread_idx].empty()) {
const std::vector<MSMWorkUnit>& msms = thread_work_units[thread_idx];
std::vector<std::pair<Element, size_t>>& msm_results = thread_msm_results[thread_idx];
msm_results.reserve(msms.size());

// Point schedule buffer for this thread - avoids per-work-unit heap allocation
std::vector<uint64_t> point_schedule_buffer;

for (const MSMWorkUnit& msm : msms) {
point_schedule_buffer.resize(msm.size);
MSMData msm_data =
MSMData::from_work_unit(scalars, points, msm_scalar_indices, point_schedule_buffer, msm);
Element msm_result =
(msm.size < PIPPENGER_THRESHOLD) ? small_mul<Curve>(msm_data) : pippenger_impl(msm_data);

msm_results.emplace_back(msm_result, msm.batch_msm_index);
{
BB_BENCH_NAME("MSM::batch_multi_scalar_mul/evaluate_work_units");
parallel_for(num_cpus, [&](size_t thread_idx) {
if (!thread_work_units[thread_idx].empty()) {
const std::vector<MSMWorkUnit>& msms = thread_work_units[thread_idx];
std::vector<std::pair<Element, size_t>>& msm_results = thread_msm_results[thread_idx];
msm_results.reserve(msms.size());

// Point schedule buffer for this thread - avoids per-work-unit heap allocation
std::vector<uint64_t> point_schedule_buffer;

for (const MSMWorkUnit& msm : msms) {
point_schedule_buffer.resize(msm.size);
MSMData msm_data =
MSMData::from_work_unit(scalars, points, msm_scalar_indices, point_schedule_buffer, msm);
Element msm_result =
(msm.size < PIPPENGER_THRESHOLD) ? small_mul<Curve>(msm_data) : pippenger_impl(msm_data);

msm_results.emplace_back(msm_result, msm.batch_msm_index);
}
}
}
});
});
}

// Accumulate results. This part needs to be single threaded, but amount of work done here should be small
// TODO(@zac-williamson) check this? E.g. if we are doing a 2^16 MSM with 256 threads this single-threaded part
// will be painful.
// Accumulate results. Single-threaded, but negligible in practice.
// Benchmarked (192-core, 256 threads): ~512us for 2^16 MSM (~1.2% of total), ~207us for 2^20 (<0.1%).
std::vector<Element> results(num_msms, Curve::Group::point_at_infinity);
for (const auto& single_thread_msm_results : thread_msm_results) {
for (const auto& [element, index] : single_thread_msm_results) {
results[index] += element;
{
BB_BENCH_NAME("MSM::batch_multi_scalar_mul/accumulate_results");
for (const auto& single_thread_msm_results : thread_msm_results) {
for (const auto& [element, index] : single_thread_msm_results) {
results[index] += element;
}
}
}
Element::batch_normalize(results.data(), num_msms);
{
BB_BENCH_NAME("MSM::batch_multi_scalar_mul/batch_normalize");
Element::batch_normalize(results.data(), num_msms);
}

// Convert scalars back TO Montgomery form so they remain unchanged from caller's perspective
for (auto& scalar_span : scalars) {
parallel_for_range(scalar_span.size(), [&](size_t start, size_t end) {
for (size_t i = start; i < end; ++i) {
scalar_span[i].self_to_montgomery_form();
}
});
{
BB_BENCH_NAME("MSM::batch_multi_scalar_mul/scalars_to_montgomery");
for (auto& scalar_span : scalars) {
parallel_for_range(scalar_span.size(), [&](size_t start, size_t end) {
for (size_t i = start; i < end; ++i) {
scalar_span[i].self_to_montgomery_form();
}
});
}
}

return std::vector<AffineElement>(results.begin(), results.end());
Expand Down
Loading