diff --git a/barretenberg/cpp/src/barretenberg/benchmark/pippenger_bench/pippenger.bench.cpp b/barretenberg/cpp/src/barretenberg/benchmark/pippenger_bench/pippenger.bench.cpp
index 28b9b3b096c5..d1040be2da56 100644
--- a/barretenberg/cpp/src/barretenberg/benchmark/pippenger_bench/pippenger.bench.cpp
+++ b/barretenberg/cpp/src/barretenberg/benchmark/pippenger_bench/pippenger.bench.cpp
@@ -5,6 +5,7 @@
  * Batch config modeled after AVM: ~2618 wire polys of size 2^21, committed in batches of 32.
  */
 #include "barretenberg/common/assert.hpp"
+#include "barretenberg/common/thread.hpp"
 #include "barretenberg/ecc/curves/bn254/bn254.hpp"
 #include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp"
 #include "barretenberg/polynomials/polynomial_arithmetic.hpp"
@@ -82,6 +83,43 @@ BENCHMARK_DEFINE_F(PippengerBench, BatchMSM)(benchmark::State& state)
     }
 }
 
+/**
+ * @brief Batch MSM benchmark variant added to investigate `AztecProtocol/barretenberg#1656`.
+ *
+ * The issue is concerned about the single-threaded final reduction step in
+ * `MSM::batch_multi_scalar_mul(...)` when the work is split across a large number of threads,
+ * e.g. `2^16` points with `256` threads.
+ *
+ * We run a single MSM (num_polys = 1) and sweep thread counts and MSM sizes to make the
+ * final reduction overhead visible in the phase breakdown reported via `BB_BENCH` counters.
+ */
+BENCHMARK_DEFINE_F(PippengerBench, BatchMSM_1656)(benchmark::State& state)
+{
+    const size_t num_threads = static_cast<size_t>(state.range(0));
+    const size_t msm_size = static_cast<size_t>(state.range(1));
+
+    std::vector<Fr> msm_scalars(msm_size);
+    for (auto& s : msm_scalars) {
+        s = Fr::random_element(&engine);
+    }
+
+    std::vector<std::span<Fr>> scalar_spans;
+    std::vector<std::span<const AffineElement>> point_spans;
+    scalar_spans.emplace_back(msm_scalars);
+    point_spans.emplace_back(srs->get_monomial_points().subspan(0, msm_size));
+
+    // This is thread-local: restore after the benchmark so other cases in this binary are unaffected.
+    const size_t original_concurrency = bb::get_num_cpus();
+    bb::set_parallel_for_concurrency(num_threads);
+
+    for (auto _ : state) {
+        GOOGLE_BB_BENCH_REPORTER(state);
+        bb::scalar_multiplication::MSM<Curve>::batch_multi_scalar_mul(point_spans, scalar_spans, false);
+    }
+
+    bb::set_parallel_for_concurrency(original_concurrency);
+}
+
 // ===================== Registration =====================
 
 // Single MSM: 2^14 to 2^20
@@ -97,6 +135,12 @@ BENCHMARK_REGISTER_F(PippengerBench, BatchMSM)
     ->Args({ 32, 1 << 19 })
     ->Args({ 32, 1 << 21 });
 
+// Issue #1656 target: {threads=256, msm_size}
+BENCHMARK_REGISTER_F(PippengerBench, BatchMSM_1656)
+    ->Unit(benchmark::kMillisecond)
+    ->Args({ 256, 1 << 16 })
+    ->Args({ 256, 1 << 20 });
+
 } // namespace
 
 BENCHMARK_MAIN();
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp
index b320789e32e8..e7f07c9ab6ff 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp
@@ -439,6 +439,7 @@ std::vector<typename Curve::AffineElement> MSM<Curve>::batch_multi_scalar_mul(
     std::span<std::span<ScalarField>> scalars,
     bool handle_edge_cases) noexcept
 {
+    BB_BENCH_NAME("MSM::batch_multi_scalar_mul");
     BB_ASSERT_EQ(points.size(), scalars.size());
     const size_t num_msms = points.size();
 
@@ -454,45 +455,56 @@ std::vector<typename Curve::AffineElement> MSM<Curve>::batch_multi_scalar_mul(
         handle_edge_cases ?
             jacobian_pippenger_with_transformed_scalars : affine_pippenger_with_transformed_scalars;
     // Once we have our work units, each thread can independently evaluate its assigned msms
-    parallel_for(num_cpus, [&](size_t thread_idx) {
-        if (!thread_work_units[thread_idx].empty()) {
-            const std::vector<MSMWorkUnit>& msms = thread_work_units[thread_idx];
-            std::vector<std::pair<Element, size_t>>& msm_results = thread_msm_results[thread_idx];
-            msm_results.reserve(msms.size());
-
-            // Point schedule buffer for this thread - avoids per-work-unit heap allocation
-            std::vector<uint64_t> point_schedule_buffer;
-
-            for (const MSMWorkUnit& msm : msms) {
-                point_schedule_buffer.resize(msm.size);
-                MSMData msm_data =
-                    MSMData::from_work_unit(scalars, points, msm_scalar_indices, point_schedule_buffer, msm);
-                Element msm_result =
-                    (msm.size < PIPPENGER_THRESHOLD) ? small_mul(msm_data) : pippenger_impl(msm_data);
-
-                msm_results.emplace_back(msm_result, msm.batch_msm_index);
-            }
-        }
-    });
+    {
+        BB_BENCH_NAME("MSM::batch_multi_scalar_mul/evaluate_work_units");
+        parallel_for(num_cpus, [&](size_t thread_idx) {
+            if (!thread_work_units[thread_idx].empty()) {
+                const std::vector<MSMWorkUnit>& msms = thread_work_units[thread_idx];
+                std::vector<std::pair<Element, size_t>>& msm_results = thread_msm_results[thread_idx];
+                msm_results.reserve(msms.size());
+
+                // Point schedule buffer for this thread - avoids per-work-unit heap allocation
+                std::vector<uint64_t> point_schedule_buffer;
+
+                for (const MSMWorkUnit& msm : msms) {
+                    point_schedule_buffer.resize(msm.size);
+                    MSMData msm_data =
+                        MSMData::from_work_unit(scalars, points, msm_scalar_indices, point_schedule_buffer, msm);
+                    Element msm_result =
+                        (msm.size < PIPPENGER_THRESHOLD) ? small_mul(msm_data) : pippenger_impl(msm_data);
+
+                    msm_results.emplace_back(msm_result, msm.batch_msm_index);
+                }
+            }
+        });
+    }
 
-    // Accumulate results. This part needs to be single threaded, but amount of work done here should be small
-    // TODO(@zac-williamson) check this? E.g. if we are doing a 2^16 MSM with 256 threads this single-threaded part
-    // will be painful.
+    // Accumulate results. Single-threaded, but negligible in practice.
+    // Benchmarked (192-core, 256 threads): ~512us for 2^16 MSM (~1.2% of total), ~207us for 2^20 (<0.1%).
     std::vector<Element> results(num_msms, Curve::Group::point_at_infinity);
-    for (const auto& single_thread_msm_results : thread_msm_results) {
-        for (const auto& [element, index] : single_thread_msm_results) {
-            results[index] += element;
-        }
-    }
+    {
+        BB_BENCH_NAME("MSM::batch_multi_scalar_mul/accumulate_results");
+        for (const auto& single_thread_msm_results : thread_msm_results) {
+            for (const auto& [element, index] : single_thread_msm_results) {
+                results[index] += element;
+            }
+        }
+    }
 
-    Element::batch_normalize(results.data(), num_msms);
+    {
+        BB_BENCH_NAME("MSM::batch_multi_scalar_mul/batch_normalize");
+        Element::batch_normalize(results.data(), num_msms);
+    }
     // Convert scalars back TO Montgomery form so they remain unchanged from caller's perspective
-    for (auto& scalar_span : scalars) {
-        parallel_for_range(scalar_span.size(), [&](size_t start, size_t end) {
-            for (size_t i = start; i < end; ++i) {
-                scalar_span[i].self_to_montgomery_form();
-            }
-        });
-    }
+    {
+        BB_BENCH_NAME("MSM::batch_multi_scalar_mul/scalars_to_montgomery");
+        for (auto& scalar_span : scalars) {
+            parallel_for_range(scalar_span.size(), [&](size_t start, size_t end) {
+                for (size_t i = start; i < end; ++i) {
+                    scalar_span[i].self_to_montgomery_form();
+                }
+            });
+        }
+    }
 
     return std::vector<AffineElement>(results.begin(), results.end());