From 61830431333d9af6a53e91f85adc532c4ac4669d Mon Sep 17 00:00:00 2001
From: Jonathan Hao <jonathan@aztec-labs.com>
Date: Mon, 30 Mar 2026 14:17:44 +0000
Subject: [PATCH 1/3] feat: add --memory_profile_out flag for Chonk memory
 profiling

Add tooling to continually assess memory consumption in Chonk proving.
The new --memory_profile_out flag on `bb prove` outputs a JSON report
with per-circuit polynomial memory breakdown by category (wires, sigmas,
selectors, etc.), CRS size, and RSS checkpoints at key proving stages.

A Python extraction script converts the JSON into benchmark dashboard
entries (stacked charts for polynomial categories, line charts for
totals and peak RSS). Integrated into ci_benchmark_ivc_flows.sh.

Refs: AztecProtocol/barretenberg#1641
---
 barretenberg/cpp/CLAUDE.md                    |  33 ++++
 .../cpp/scripts/ci_benchmark_ivc_flows.sh     |  27 ++-
 .../cpp/scripts/extract_memory_benchmarks.py  |  76 ++++++++
 barretenberg/cpp/src/barretenberg/bb/cli.cpp  |  20 +++
 .../cpp/src/barretenberg/chonk/chonk.cpp      |   7 +
 .../commitment_schemes/commitment_key.hpp     |   7 +-
 .../barretenberg/common/memory_profile.cpp    | 165 ++++++++++++++++++
 .../barretenberg/common/memory_profile.hpp    |  51 ++++++
 .../polynomials/polynomial_stats.hpp          | 100 +++++++++++
 .../ultra_honk/prover_instance.cpp            |   6 +
 .../barretenberg/ultra_honk/ultra_prover.cpp  |  16 ++
 11 files changed, 503 insertions(+), 5 deletions(-)
 create mode 100755 barretenberg/cpp/scripts/extract_memory_benchmarks.py
 create mode 100644 barretenberg/cpp/src/barretenberg/common/memory_profile.cpp
 create mode 100644 barretenberg/cpp/src/barretenberg/common/memory_profile.hpp
diff --git a/barretenberg/cpp/CLAUDE.md b/barretenberg/cpp/CLAUDE.md
index c0276f8e026a..d67e7fa7aa65 100644
--- a/barretenberg/cpp/CLAUDE.md
+++ b/barretenberg/cpp/CLAUDE.md
@@ -155,3 +155,36 @@ Typical workflow
 2. Build native code: `cd barretenberg/cpp && ./bootstrap.sh build_native`
 3. Check VKs: `cd scripts && ./test_chonk_standalone_vks_havent_changed.sh`
 4. If VKs changed intentionally: `./test_chonk_standalone_vks_havent_changed.sh --update_inputs`
+
+## Example IVC inputs
+
+Example IVC inputs (msgpack files) for `bb prove --scheme chonk` are generated by e2e benchmark tests. Run the full bootstrap from the repo root to populate them:
+
+```bash
+cd $(git rev-parse --show-toplevel) && ./bootstrap.sh
+```
+
+This creates `yarn-project/end-to-end/example-app-ivc-inputs-out/<flow>/ivc-inputs.msgpack`. The inputs are generated by the `build_bench` function in `yarn-project/end-to-end/bootstrap.sh`, which runs client flow tests with `CAPTURE_IVC_FOLDER` set. In CI, these are cached as `bb-chonk-captures-<hash>.tar.gz`.
+
+## Memory profiling
+
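+The `--memory_profile_out <file>` flag on `bb prove` writes a JSON breakdown of memory consumption: per-circuit polynomial memory by category (wires, sigmas, selectors, etc.), CRS size, and RSS checkpoints at key proving stages (after polynomial allocation, after oink, after sumcheck, after PCS, and after each accumulate step). Each checkpoint records the process's peak RSS so far (`ru_maxrss` from `getrusage`), so values never decrease from one checkpoint to the next; on platforms other than Linux, macOS and FreeBSD the checkpoints report 0.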
+
+```bash
+cd barretenberg/cpp
+./build/bin/bb prove \
+  --scheme chonk \
+  --ivc_inputs_path <path-to>/ivc-inputs.msgpack \
+  -o /tmp/proof-out \
+  -v \
+  --memory_profile_out /tmp/proof-out/memory_breakdown.json
+```
+
+The extraction script converts the JSON into dashboard-friendly benchmark entries:
+
+```bash
+echo '[]' > /tmp/proof-out/benchmarks.bench.json
+python3 scripts/extract_memory_benchmarks.py /tmp/proof-out "app-proving/flow/native"
+```
+
+This appends stacked chart entries (polynomial memory by category) and line chart entries (total polynomial MB, CRS MB, peak RSS) to `benchmarks.bench.json`. In CI, this is integrated into `ci_benchmark_ivc_flows.sh` and uploaded to the benchmark dashboard.
diff --git a/barretenberg/cpp/scripts/ci_benchmark_ivc_flows.sh b/barretenberg/cpp/scripts/ci_benchmark_ivc_flows.sh
index 4a3078f5d51d..93432a4baff3 100755
--- a/barretenberg/cpp/scripts/ci_benchmark_ivc_flows.sh
+++ b/barretenberg/cpp/scripts/ci_benchmark_ivc_flows.sh
@@ -59,15 +59,15 @@ function run_bb_cli_bench {
 
   if [[ "$runtime" == "native" ]]; then
     # Add --bench_out_hierarchical flag for native builds to capture hierarchical op counts and timings
-    memusage "./$native_build_dir/bin/bb" "$@" "--bench_out_hierarchical" "$output/benchmark_breakdown.json" || {
-      echo "bb native failed with args: $@ --bench_out_hierarchical $output/benchmark_breakdown.json"
+    memusage "./$native_build_dir/bin/bb" "$@" "--bench_out_hierarchical" "$output/benchmark_breakdown.json" "--memory_profile_out" "$output/memory_breakdown.json" || {
+      echo "bb native failed with args: $@ --bench_out_hierarchical $output/benchmark_breakdown.json --memory_profile_out $output/memory_breakdown.json"
       exit 1
     }
   else # wasm
     export WASMTIME_ALLOWED_DIRS="--dir=$flow_folder --dir=$output"
     # Add --bench_out_hierarchical flag for wasm builds to capture hierarchical op counts and timings
-    memusage scripts/wasmtime.sh $WASMTIME_ALLOWED_DIRS ./build-wasm-threads/bin/bb "$@" "--bench_out_hierarchical" "$output/benchmark_breakdown.json" || {
-      echo "bb wasm failed with args: $@ --bench_out_hierarchical $output/benchmark_breakdown.json"
+    memusage scripts/wasmtime.sh $WASMTIME_ALLOWED_DIRS ./build-wasm-threads/bin/bb "$@" "--bench_out_hierarchical" "$output/benchmark_breakdown.json" "--memory_profile_out" "$output/memory_breakdown.json" || {
+      echo "bb wasm failed with args: $@ --bench_out_hierarchical $output/benchmark_breakdown.json --memory_profile_out $output/memory_breakdown.json"
       exit 1
     }
   fi
@@ -139,6 +139,12 @@ EOF
     echo "Extracting component timings from hierarchical breakdown..."
     python3 scripts/extract_component_benchmarks.py "$output" "$name_path"
   fi
+
+  # Extract memory breakdown metrics if available
+  if [[ -f "$output/memory_breakdown.json" ]]; then
+    echo "Extracting memory breakdown metrics..."
+    python3 scripts/extract_memory_benchmarks.py "$output" "$name_path"
+  fi
 }
 
 export -f verify_ivc_flow run_bb_cli_bench
@@ -178,4 +184,17 @@ if [[ "${CI:-}" == "1" ]] && [[ "${CI_USE_BUILD_INSTANCE_KEY:-0}" == "1" ]]; the
   else
     echo "Warning: benchmark breakdown file not found at $benchmark_breakdown_file"
   fi
+
+  # Upload memory breakdown to S3 in the background; success is only reported once the transfer has finished
+  memory_breakdown_file="bench-out/app-proving/$flow_name/$runtime/memory_breakdown.json"
+  if [[ -f "$memory_breakdown_file" ]]; then
+    tmp_memory_file="/tmp/memory_breakdown_${runtime}_${flow_name}_$$.json"
+    cp "$memory_breakdown_file" "$tmp_memory_file"
+    memory_disk_key="memory-${runtime}-${flow_name}-${current_sha}"
+    {
+      gzip -c "$tmp_memory_file" | cache_s3_transfer_to "bench/bb-breakdown" "$memory_disk_key" &&
+        echo "Uploaded memory breakdown to S3: bench/bb-breakdown/$memory_disk_key"
+      rm -f "$tmp_memory_file"
+    } &
+  fi
 fi
diff --git a/barretenberg/cpp/scripts/extract_memory_benchmarks.py b/barretenberg/cpp/scripts/extract_memory_benchmarks.py
new file mode 100755
index 000000000000..cb370a4b3c32
--- /dev/null
+++ b/barretenberg/cpp/scripts/extract_memory_benchmarks.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+"""Extracts memory breakdown metrics from a memory profile JSON and appends
+them to the benchmark JSON file as dashboard entries.
+
+Usage: extract_memory_benchmarks.py <output_dir> <name_path>
+
+The output_dir must contain:
+  - memory_breakdown.json (memory profile data from bb --memory_profile_out)
+  - benchmarks.bench.json (existing benchmark results to append to)
+
+The memory profile JSON layout is defined by MemoryProfile::serialize_json in memory_profile.cpp.
+"""
+import json
+import sys
+
+if len(sys.argv) != 3:
+    print(f"Usage: {sys.argv[0]} <output_dir> <name_path>", file=sys.stderr)
+    sys.exit(1)
+
+output_dir = sys.argv[1]
+name_path = sys.argv[2]
+
+try:
+    with open(f"{output_dir}/memory_breakdown.json", "r") as f:
+        data = json.load(f)
+
+    entries = []
+
+    # Stacked chart: polynomial memory by category (peak circuit)
+    peak_circuit = data.get("peak_circuit")
+    if peak_circuit:
+        for category, stats in peak_circuit.get("categories", {}).items():
+            entries.append({
+                "name": f"{name_path}/memory/{category}_MB",
+                "unit": "MB",
+                "value": round(stats["actual_mb"], 2),
+                "extra": f"stacked:{name_path}/memory/polynomial_categories"
+            })
+
+        # Total polynomial memory (peak circuit)
+        entries.append({
+            "name": f"{name_path}/memory/total_polynomial_MB",
+            "unit": "MB",
+            "value": round(peak_circuit.get("total_polynomial_mb", 0), 2)
+        })
+
+    # CRS memory
+    crs_mb = data.get("crs_mb", 0)
+    if crs_mb > 0:
+        entries.append({
+            "name": f"{name_path}/memory/crs_MB",
+            "unit": "MB",
+            "value": round(crs_mb, 2)
+        })
+
+    # Peak RSS from checkpoints
+    peak_rss = data.get("peak_rss", {})
+    if peak_rss.get("rss_mb", 0) > 0:
+        entries.append({
+            "name": f"{name_path}/memory/peak_rss_MB",
+            "unit": "MB",
+            "value": peak_rss["rss_mb"]
+        })
+
+    # Append to existing benchmarks file
+    with open(f"{output_dir}/benchmarks.bench.json", "r") as f:
+        existing = json.load(f)
+
+    existing.extend(entries)
+
+    with open(f"{output_dir}/benchmarks.bench.json", "w") as f:
+        json.dump(existing, f, indent=2)
+
+    print(f"Extracted {len(entries)} memory breakdown metrics")
+except Exception as e:
+    print(f"Warning: Could not extract memory breakdown: {e}", file=sys.stderr)
diff --git a/barretenberg/cpp/src/barretenberg/bb/cli.cpp b/barretenberg/cpp/src/barretenberg/bb/cli.cpp
index 60ffc4c02233..399e7e12df56 100644
--- a/barretenberg/cpp/src/barretenberg/bb/cli.cpp
+++ b/barretenberg/cpp/src/barretenberg/bb/cli.cpp
@@ -29,6 +29,7 @@
 #include "barretenberg/common/assert.hpp"
 #include "barretenberg/common/bb_bench.hpp"
 #include "barretenberg/common/get_bytecode.hpp"
+#include "barretenberg/common/memory_profile.hpp"
 #include "barretenberg/common/thread.hpp"
 #include "barretenberg/common/version.hpp"
 #include "barretenberg/dsl/acir_format/serde/index.hpp"
@@ -389,6 +390,15 @@ int parse_and_run_cli_command(int argc, char* argv[])
                          "parent-child relationships) as json.")
             ->group(advanced_group);
     };
+    std::string memory_profile_out;
+    const auto add_memory_profile_out_option = [&](CLI::App* subcommand) {
+        return subcommand
+            ->add_option("--memory_profile_out",
+                         memory_profile_out,
+                         "Path to write memory profile data (polynomial breakdown by category, RSS "
+                         "checkpoints, CRS size) as json.")
+            ->group(advanced_group);
+    };
 
     /***************************************************************************************************************
      * Top-level flags
@@ -482,6 +492,7 @@ int parse_and_run_cli_command(int argc, char* argv[])
     add_print_bench_flag(prove);
     add_bench_out_option(prove);
     add_bench_out_hierarchical_option(prove);
+    add_memory_profile_out_option(prove);
     add_storage_budget_option(prove);
     add_output_format_option(prove);
 
@@ -811,6 +822,10 @@ int parse_and_run_cli_command(int argc, char* argv[])
     if (!flags.storage_budget.empty()) {
         storage_budget = parse_size_string(flags.storage_budget);
     }
+    if (!memory_profile_out.empty()) {
+        bb::detail::use_memory_profile = true;
+        vinfo("Memory profiling enabled via --memory_profile_out");
+    }
     if (print_bench || !bench_out.empty() || !bench_out_hierarchical.empty()) {
         bb::detail::use_bb_bench = true;
         vinfo("BB_BENCH enabled via --print_bench or --bench_out");
@@ -987,6 +1002,17 @@ int parse_and_run_cli_command(int argc, char* argv[])
                     bb::detail::GLOBAL_BENCH_STATS.serialize_aggregate_data_json(file);
                 }
 #endif
+                if (!memory_profile_out.empty()) {
+                    // Best-effort output: never claim success if the file could not be opened or written.
+                    std::ofstream file(memory_profile_out);
+                    bb::detail::GLOBAL_MEMORY_PROFILE.serialize_json(file);
+                    file.flush();
+                    if (file.good()) {
+                        vinfo("Memory profile written to ", memory_profile_out);
+                    } else {
+                        info("Failed to write memory profile to ", memory_profile_out);
+                    }
+                }
                 return 0;
             }
             if (check->parsed()) {
diff --git a/barretenberg/cpp/src/barretenberg/chonk/chonk.cpp b/barretenberg/cpp/src/barretenberg/chonk/chonk.cpp
index 5e242feca663..d3c93eaccc79 100644
--- a/barretenberg/cpp/src/barretenberg/chonk/chonk.cpp
+++ b/barretenberg/cpp/src/barretenberg/chonk/chonk.cpp
@@ -7,6 +7,7 @@
 #include "barretenberg/chonk/chonk.hpp"
 #include "barretenberg/chonk/chonk_verifier.hpp"
 #include "barretenberg/common/bb_bench.hpp"
+#include "barretenberg/common/memory_profile.hpp"
 #include "barretenberg/common/streams.hpp"
 #include "barretenberg/ecc/curves/grumpkin/grumpkin.hpp"
 #include "barretenberg/goblin/goblin_verifier.hpp"
@@ -538,6 +539,12 @@ void Chonk::accumulate_and_fold(ClientCircuit& circuit,
         break;
     }
 
+    if (detail::use_memory_profile) {
+        size_t circuit_idx =
+            detail::GLOBAL_MEMORY_PROFILE.circuits.empty() ? 0 : detail::GLOBAL_MEMORY_PROFILE.circuits.size() - 1;
+        detail::GLOBAL_MEMORY_PROFILE.add_rss_checkpoint("after_accumulate", circuit_idx);
+    }
+
     VerifierInputs queue_entry{ std::move(proof), precomputed_vk, queue_type, is_kernel };
     verification_queue.push_back(queue_entry);
 
diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp
index a504ed460989..02a7c16bb90a 100644
--- a/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp
+++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp
@@ -11,6 +11,7 @@
  */
 
 #include "barretenberg/common/bb_bench.hpp"
+#include "barretenberg/common/memory_profile.hpp"
 #include "barretenberg/common/ref_span.hpp"
 #include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp"
 #include "barretenberg/polynomials/polynomial.hpp"
@@ -53,7 +54,11 @@ template <class Curve> class CommitmentKey {
     CommitmentKey(const size_t num_points)
         : srs(srs::get_crs_factory<Curve>()->get_crs(num_points))
         , srs_size(num_points)
-    {}
+    {
+        if (detail::use_memory_profile) {
+            detail::GLOBAL_MEMORY_PROFILE.set_crs_size(num_points);
+        }
+    }
     /**
      * @brief Checks the commitment key is properly initialized.
      *
diff --git a/barretenberg/cpp/src/barretenberg/common/memory_profile.cpp b/barretenberg/cpp/src/barretenberg/common/memory_profile.cpp
new file mode 100644
index 000000000000..85238ac55b80
--- /dev/null
+++ b/barretenberg/cpp/src/barretenberg/common/memory_profile.cpp
@@ -0,0 +1,165 @@
+#include "memory_profile.hpp"
+
+#include <algorithm>
+#include <cstddef>
+#include <iomanip>
+#include <sstream>
+
+#if defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__)
+#include <sys/resource.h>
+#endif
+
+namespace {
+
+size_t get_peak_rss_mb()
+{
+#if defined(__linux__)
+    struct rusage usage{};
+    if (getrusage(RUSAGE_SELF, &usage) == 0) {
+        // ru_maxrss is in kilobytes on Linux
+        return static_cast<size_t>(usage.ru_maxrss) / 1024;
+    }
+#elif defined(__APPLE__) || defined(__FreeBSD__)
+    struct rusage usage{};
+    if (getrusage(RUSAGE_SELF, &usage) == 0) {
+        // ru_maxrss is in bytes on macOS/BSD
+        return static_cast<size_t>(usage.ru_maxrss) / (1024 * 1024);
+    }
+#endif
+    return 0;
+}
+
+} // namespace
+
+namespace bb::detail {
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+bool use_memory_profile = false;
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+MemoryProfile GLOBAL_MEMORY_PROFILE;
+
+void MemoryProfile::add_circuit(CircuitMemoryStats stats)
+{
+    std::lock_guard<std::mutex> lock(mutex);
+    stats.circuit_index = circuits.size();
+    circuits.push_back(std::move(stats));
+}
+
+void MemoryProfile::add_rss_checkpoint(const std::string& stage, size_t circuit_index)
+{
+    std::lock_guard<std::mutex> lock(mutex);
+    rss_checkpoints.push_back(RssCheckpoint{ stage, circuit_index, get_peak_rss_mb() });
+}
+
+void MemoryProfile::set_crs_size(size_t num_points)
+{
+    std::lock_guard<std::mutex> lock(mutex);
+    if (num_points > crs_points) {
+        crs_points = num_points;
+    }
+}
+
+void MemoryProfile::clear()
+{
+    std::lock_guard<std::mutex> lock(mutex);
+    circuits.clear();
+    rss_checkpoints.clear();
+    crs_points = 0;
+}
+
+namespace {
+
+void write_category_stats(std::ostream& os, const CategoryStats& stats, const std::string& indent)
+{
+    os << indent << "\"actual_mb\": " << std::fixed << std::setprecision(2) << stats.actual_mb << ", "
+       << "\"compressed_mb\": " << std::fixed << std::setprecision(2) << stats.compressed_mb;
+}
+
+void write_circuit_stats(std::ostream& os, const CircuitMemoryStats& circuit, const std::string& indent)
+{
+    os << indent << "{\n";
+    os << indent << "  \"index\": " << circuit.circuit_index << ",\n";
+    os << indent << "  \"total_polynomial_mb\": " << std::fixed << std::setprecision(2) << circuit.total.actual_mb
+       << ",\n";
+    os << indent << "  \"categories\": {\n";
+    bool first = true;
+    for (const auto& [name, stats] : circuit.categories) {
+        if (!first) {
+            os << ",\n";
+        }
+        first = false;
+        os << indent << "    \"" << name << "\": { ";
+        write_category_stats(os, stats, "");
+        os << " }";
+    }
+    os << "\n" << indent << "  }\n";
+    os << indent << "}";
+}
+
+} // namespace
+
+void MemoryProfile::serialize_json(std::ostream& os) const
+{
+    // Find peak circuit (largest total_polynomial_mb)
+    size_t peak_idx = 0;
+    double peak_mb = 0;
+    for (size_t i = 0; i < circuits.size(); i++) {
+        if (circuits[i].total.actual_mb > peak_mb) {
+            peak_mb = circuits[i].total.actual_mb;
+            peak_idx = i;
+        }
+    }
+
+    // Find peak RSS checkpoint
+    RssCheckpoint peak_rss{ "unknown", 0, 0 };
+    for (const auto& cp : rss_checkpoints) {
+        if (cp.rss_mb > peak_rss.rss_mb) {
+            peak_rss = cp;
+        }
+    }
+
+    // CRS memory: num_points * 128 bytes (with Pippenger point table)
+    double crs_mb = static_cast<double>(crs_points) * 128.0 / (1024.0 * 1024.0);
+
+    os << "{\n";
+
+    // Peak circuit
+    if (!circuits.empty()) {
+        os << "  \"peak_circuit\": ";
+        write_circuit_stats(os, circuits[peak_idx], "  ");
+        os << ",\n";
+    }
+
+    // All circuits
+    os << "  \"all_circuits\": [\n";
+    for (size_t i = 0; i < circuits.size(); i++) {
+        if (i > 0) {
+            os << ",\n";
+        }
+        write_circuit_stats(os, circuits[i], "    ");
+    }
+    os << "\n  ],\n";
+
+    // RSS checkpoints
+    os << "  \"rss_checkpoints\": [\n";
+    for (size_t i = 0; i < rss_checkpoints.size(); i++) {
+        if (i > 0) {
+            os << ",\n";
+        }
+        const auto& cp = rss_checkpoints[i];
+        os << "    { \"stage\": \"" << cp.stage << "\", \"circuit_index\": " << cp.circuit_index
+           << ", \"rss_mb\": " << cp.rss_mb << " }";
+    }
+    os << "\n  ],\n";
+
+    // Peak RSS
+    os << "  \"peak_rss\": { \"stage\": \"" << peak_rss.stage << "\", \"circuit_index\": " << peak_rss.circuit_index
+       << ", \"rss_mb\": " << peak_rss.rss_mb << " },\n";
+
+    // CRS
+    os << "  \"crs_mb\": " << std::fixed << std::setprecision(2) << crs_mb << "\n";
+
+    os << "}\n";
+}
+
+} // namespace bb::detail
diff --git a/barretenberg/cpp/src/barretenberg/common/memory_profile.hpp b/barretenberg/cpp/src/barretenberg/common/memory_profile.hpp
new file mode 100644
index 000000000000..2d182386ce75
--- /dev/null
+++ b/barretenberg/cpp/src/barretenberg/common/memory_profile.hpp
@@ -0,0 +1,51 @@
+#pragma once
+
+#include <cstddef>
+#include <map>
+#include <mutex>
+#include <ostream>
+#include <string>
+#include <vector>
+
+namespace bb {
+
+struct CategoryStats {
+    double actual_mb = 0;
+    double compressed_mb = 0; // ideal if using variable-width encoding
+};
+
+struct CircuitMemoryStats {
+    size_t circuit_index = 0;
+    std::map<std::string, CategoryStats> categories;
+    CategoryStats total;
+};
+
+namespace detail {
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+extern bool use_memory_profile;
+
+struct RssCheckpoint {
+    std::string stage;
+    size_t circuit_index;
+    size_t rss_mb;
+};
+
+struct MemoryProfile {
+    std::mutex mutex;
+    std::vector<CircuitMemoryStats> circuits;
+    std::vector<RssCheckpoint> rss_checkpoints;
+    size_t crs_points = 0;
+
+    void add_circuit(CircuitMemoryStats stats);
+    void add_rss_checkpoint(const std::string& stage, size_t circuit_index);
+    void set_crs_size(size_t num_points);
+    void serialize_json(std::ostream& os) const;
+    void clear();
+};
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+extern MemoryProfile GLOBAL_MEMORY_PROFILE;
+
+} // namespace detail
+} // namespace bb
diff --git a/barretenberg/cpp/src/barretenberg/polynomials/polynomial_stats.hpp b/barretenberg/cpp/src/barretenberg/polynomials/polynomial_stats.hpp
index b0662dcf86b3..eb7194af7c4f 100644
--- a/barretenberg/cpp/src/barretenberg/polynomials/polynomial_stats.hpp
+++ b/barretenberg/cpp/src/barretenberg/polynomials/polynomial_stats.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "barretenberg/common/log.hpp"
+#include "barretenberg/common/memory_profile.hpp"
 #include <cstddef>
 #include <cstdint>
 #include <cstdlib>
@@ -165,4 +166,103 @@ template <typename ProverPolynomials> void analyze_prover_polynomials(ProverPoly
     info(oss.str());
 }
 
+/**
+ * @brief Classify a polynomial label into a memory category (first matching rule wins; "other" if none match).
+ */
+inline std::string classify_polynomial(const std::string& label)
+{
+    if (label.starts_with("w_")) {
+        return "wires";
+    }
+    if (label.starts_with("sigma_")) {
+        return "sigmas";
+    }
+    if (label.starts_with("id_")) {
+        return "ids";
+    }
+    if (label.starts_with("q_")) {
+        return "selectors";
+    }
+    if (label.starts_with("table_value")) {
+        return "tables";
+    }
+    if (label.find("lookup") != std::string::npos) { // substring match also covers "z_lookup"
+        return "lookup";
+    }
+    if (label.find("ecc_op") != std::string::npos) {
+        return "ecc_op";
+    }
+    if (label.find("databus") != std::string::npos || label.find("calldata") != std::string::npos ||
+        label.find("return_data") != std::string::npos) {
+        return "databus";
+    }
+    if (label.find("lagrange") != std::string::npos) {
+        return "lagrange";
+    }
+    if (label == "z_perm") {
+        return "z_perm";
+    }
+    return "other";
+}
+
+/**
+ * @brief Analyze prover polynomials and return per-category memory statistics (used by --memory_profile_out).
+ * @details For each non-empty unshifted polynomial, actual bytes = size * sizeof(Fr) and compressed bytes = sum over
+ * non-zero coefficients of their width rounded up to 4/8/16/32 bytes; both are accumulated per classify_polynomial().
+ */
+template <typename ProverPolynomials>
+CircuitMemoryStats analyze_prover_polynomials_categorized(ProverPolynomials& polynomials)
+{
+    using Polynomial = std::remove_reference_t<decltype(*polynomials.get_unshifted().begin())>;
+    using Fr = typename Polynomial::FF;
+
+    auto unshifted = polynomials.get_unshifted();
+    auto all_labels = polynomials.get_labels(); // NOTE(review): assumes same order as get_unshifted() - confirm
+
+    auto mb = [](auto bytes) { return static_cast<double>(bytes) / (1024.0 * 1024.0); };
+
+    CircuitMemoryStats result;
+
+    size_t idx = 0;
+    for (auto& poly : unshifted) {
+        std::string label = (idx < all_labels.size()) ? all_labels[idx] : "unknown_" + std::to_string(idx);
+        idx++;
+
+        if (poly.is_empty()) {
+            continue;
+        }
+
+        std::string category = classify_polynomial(label);
+        size_t actual_bytes = poly.size() * sizeof(Fr);
+        double compressed_bytes = 0;
+
+        const Fr* data = poly.data();
+        for (size_t i = 0; i < poly.size(); ++i) {
+            const Fr& elem = data[i];
+            if (elem.data[0] == 0 && elem.data[1] == 0 && elem.data[2] == 0 && elem.data[3] == 0) {
+                continue; // all four limbs are zero: the coefficient contributes 0 compressed bytes
+            }
+            Fr standard = elem.from_montgomery_form(); // measure the converted (non-Montgomery) value
+            size_t bytes_needed = min_bytes_for_value(standard);
+            if (bytes_needed <= 4) { // round up to the next bucket: 4, 8, 16 or 32 bytes
+                compressed_bytes += 4;
+            } else if (bytes_needed <= 8) {
+                compressed_bytes += 8;
+            } else if (bytes_needed <= 16) {
+                compressed_bytes += 16;
+            } else {
+                compressed_bytes += 32;
+            }
+        }
+
+        auto& cat = result.categories[category];
+        cat.actual_mb += mb(actual_bytes);
+        cat.compressed_mb += mb(compressed_bytes);
+        result.total.actual_mb += mb(actual_bytes);
+        result.total.compressed_mb += mb(compressed_bytes);
+    }
+
+    return result;
+}
+
 } // namespace bb
diff --git a/barretenberg/cpp/src/barretenberg/ultra_honk/prover_instance.cpp b/barretenberg/cpp/src/barretenberg/ultra_honk/prover_instance.cpp
index 2e272a54de6e..4e375fec2ae9 100644
--- a/barretenberg/cpp/src/barretenberg/ultra_honk/prover_instance.cpp
+++ b/barretenberg/cpp/src/barretenberg/ultra_honk/prover_instance.cpp
@@ -100,6 +100,12 @@ template <typename Flavor> ProverInstance_<Flavor>::ProverInstance_(Circuit& cir
     if (std::getenv("BB_POLY_STATS")) {
         analyze_prover_polynomials(polynomials);
     }
+    if (detail::use_memory_profile) {
+        auto stats = analyze_prover_polynomials_categorized(polynomials);
+        detail::GLOBAL_MEMORY_PROFILE.add_circuit(std::move(stats));
+        detail::GLOBAL_MEMORY_PROFILE.add_rss_checkpoint("after_poly_allocation",
+                                                         detail::GLOBAL_MEMORY_PROFILE.circuits.size() - 1);
+    }
 }
 
 /**
diff --git a/barretenberg/cpp/src/barretenberg/ultra_honk/ultra_prover.cpp b/barretenberg/cpp/src/barretenberg/ultra_honk/ultra_prover.cpp
index 2e22ef8f93bb..902903bc987f 100644
--- a/barretenberg/cpp/src/barretenberg/ultra_honk/ultra_prover.cpp
+++ b/barretenberg/cpp/src/barretenberg/ultra_honk/ultra_prover.cpp
@@ -7,6 +7,7 @@
 #include "ultra_prover.hpp"
 #include "barretenberg/commitment_schemes/gemini/gemini.hpp"
 #include "barretenberg/commitment_schemes/shplonk/shplemini.hpp"
+#include "barretenberg/common/memory_profile.hpp"
 #include "barretenberg/flavor/mega_avm_flavor.hpp"
 #include "barretenberg/sumcheck/sumcheck.hpp"
 #include "barretenberg/ultra_honk/oink_prover.hpp"
@@ -78,15 +79,30 @@ template <typename Flavor> typename UltraProver_<Flavor>::Proof UltraProver_<Fla
     OinkProver<Flavor> oink_prover(prover_instance, honk_vk, transcript);
     oink_prover.prove();
     vinfo("created oink proof");
+    if (detail::use_memory_profile) {
+        size_t circuit_idx =
+            detail::GLOBAL_MEMORY_PROFILE.circuits.empty() ? 0 : detail::GLOBAL_MEMORY_PROFILE.circuits.size() - 1;
+        detail::GLOBAL_MEMORY_PROFILE.add_rss_checkpoint("after_oink", circuit_idx);
+    }
 
     generate_gate_challenges();
 
     // Run sumcheck
     execute_sumcheck_iop();
     vinfo("finished relation check rounds");
+    if (detail::use_memory_profile) {
+        size_t circuit_idx =
+            detail::GLOBAL_MEMORY_PROFILE.circuits.empty() ? 0 : detail::GLOBAL_MEMORY_PROFILE.circuits.size() - 1;
+        detail::GLOBAL_MEMORY_PROFILE.add_rss_checkpoint("after_sumcheck", circuit_idx);
+    }
     // Execute Shplemini PCS
     execute_pcs();
     vinfo("finished PCS rounds");
+    if (detail::use_memory_profile) {
+        size_t circuit_idx =
+            detail::GLOBAL_MEMORY_PROFILE.circuits.empty() ? 0 : detail::GLOBAL_MEMORY_PROFILE.circuits.size() - 1;
+        detail::GLOBAL_MEMORY_PROFILE.add_rss_checkpoint("after_pcs", circuit_idx);
+    }
 
     return export_proof();
 }

From 746cff47096094f3606f4547d49286514422c642 Mon Sep 17 00:00:00 2001
From: Jonathan Hao <jonathan@aztec-labs.com>
Date: Mon, 30 Mar 2026 15:05:38 +0000
Subject: [PATCH 2/3] fix: use stacked-area prefix for memory breakdown
 dashboard charts

Switch from stacked: to stacked-area: prefix in extract_memory_benchmarks.py
so the dashboard renders proper stacked area charts instead of overlaid lines.
This requires the corresponding stacked-area chart support in benchmark-page-data.
---
 barretenberg/cpp/scripts/extract_memory_benchmarks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/barretenberg/cpp/scripts/extract_memory_benchmarks.py b/barretenberg/cpp/scripts/extract_memory_benchmarks.py
index cb370a4b3c32..40527931d445 100755
--- a/barretenberg/cpp/scripts/extract_memory_benchmarks.py
+++ b/barretenberg/cpp/scripts/extract_memory_benchmarks.py
@@ -34,7 +34,7 @@
                 "name": f"{name_path}/memory/{category}_MB",
                 "unit": "MB",
                 "value": round(stats["actual_mb"], 2),
-                "extra": f"stacked:{name_path}/memory/polynomial_categories"
+                "extra": f"stacked-area:{name_path}/memory/polynomial_categories"
             })
 
         # Total polynomial memory (peak circuit)

From befc8eeb52d460f6640c809143b0578bfd2e9fc5 Mon Sep 17 00:00:00 2001
From: Jonathan Hao <jonathan@aztec-labs.com>
Date: Tue, 31 Mar 2026 16:12:45 +0000
Subject: [PATCH 3/3] feat: add per-circuit RSS timeline and remove redundant
 metrics

- Add circuit_name to RSS checkpoints (set from ChonkAccumulate)
- Extract RSS checkpoints as per-commit dashboard entries with labels
  like "06_EcdsaRAccount:entrypoint/after_accumulate"
- Remove CRS instrumentation from commitment_key.hpp (constant, not
  useful to track)
- Remove crs_MB, total_polynomial_MB, peak_rss_MB from dashboard
  metrics (redundant with existing memusage and stacked area chart)
---
 .../cpp/scripts/extract_memory_benchmarks.py  | 31 ++++++-------------
 .../src/barretenberg/bbapi/bbapi_chonk.cpp    |  4 +++
 .../commitment_schemes/commitment_key.hpp     |  7 +----
 .../barretenberg/common/memory_profile.cpp    | 22 +++++--------
 .../barretenberg/common/memory_profile.hpp    |  5 +--
 5 files changed, 25 insertions(+), 44 deletions(-)

diff --git a/barretenberg/cpp/scripts/extract_memory_benchmarks.py b/barretenberg/cpp/scripts/extract_memory_benchmarks.py
index 40527931d445..a8316f7b24ca 100755
--- a/barretenberg/cpp/scripts/extract_memory_benchmarks.py
+++ b/barretenberg/cpp/scripts/extract_memory_benchmarks.py
@@ -37,29 +37,18 @@
                 "extra": f"stacked-area:{name_path}/memory/polynomial_categories"
             })
 
-        # Total polynomial memory (peak circuit)
+    # RSS timeline: each checkpoint becomes a line on the per-commit dashboard chart
+    for cp in data.get("rss_checkpoints", []):
+        circuit_name = cp.get("circuit_name", "")
+        idx = cp["circuit_index"]
+        stage = cp["stage"]
+        # Build a stable label like "07_EcdsaRAccount:entrypoint/after_accumulate"
+        label = f"{idx:02d}_{circuit_name}/{stage}" if circuit_name else f"{idx:02d}/{stage}"
         entries.append({
-            "name": f"{name_path}/memory/total_polynomial_MB",
+            "name": f"{name_path}/memory/rss/{label}",
             "unit": "MB",
-            "value": round(peak_circuit.get("total_polynomial_mb", 0), 2)
-        })
-
-    # CRS memory
-    crs_mb = data.get("crs_mb", 0)
-    if crs_mb > 0:
-        entries.append({
-            "name": f"{name_path}/memory/crs_MB",
-            "unit": "MB",
-            "value": round(crs_mb, 2)
-        })
-
-    # Peak RSS from checkpoints
-    peak_rss = data.get("peak_rss", {})
-    if peak_rss.get("rss_mb", 0) > 0:
-        entries.append({
-            "name": f"{name_path}/memory/peak_rss_MB",
-            "unit": "MB",
-            "value": peak_rss["rss_mb"]
+            "value": cp["rss_mb"],
+            "extra": f"stacked:{name_path}/memory/rss_timeline"
         })
 
     # Append to existing benchmarks file
diff --git a/barretenberg/cpp/src/barretenberg/bbapi/bbapi_chonk.cpp b/barretenberg/cpp/src/barretenberg/bbapi/bbapi_chonk.cpp
index d23ebc6c5cce..3041efed572f 100644
--- a/barretenberg/cpp/src/barretenberg/bbapi/bbapi_chonk.cpp
+++ b/barretenberg/cpp/src/barretenberg/bbapi/bbapi_chonk.cpp
@@ -5,6 +5,7 @@
 #include "barretenberg/commitment_schemes/ipa/ipa.hpp"
 #include "barretenberg/commitment_schemes/verification_key.hpp"
 #include "barretenberg/common/log.hpp"
+#include "barretenberg/common/memory_profile.hpp"
 #include "barretenberg/common/serialize.hpp"
 #include "barretenberg/common/throw_or_abort.hpp"
 #include "barretenberg/dsl/acir_format/acir_format.hpp"
@@ -90,6 +91,9 @@ ChonkAccumulate::Response ChonkAccumulate::execute(BBApiRequest& request) &&
     }
 
     info("ChonkAccumulate - accumulating circuit '", request.loaded_circuit_name, "'");
+    if (detail::use_memory_profile) { // so RSS checkpoints recorded from here on carry this circuit's name
+        detail::GLOBAL_MEMORY_PROFILE.set_circuit_name(request.loaded_circuit_name);
+    }
     request.ivc_in_progress->accumulate(circuit, precomputed_vk);
     request.ivc_stack_depth++;
 
diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp
index 02a7c16bb90a..a504ed460989 100644
--- a/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp
+++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp
@@ -11,7 +11,6 @@
  */
 
 #include "barretenberg/common/bb_bench.hpp"
-#include "barretenberg/common/memory_profile.hpp"
 #include "barretenberg/common/ref_span.hpp"
 #include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp"
 #include "barretenberg/polynomials/polynomial.hpp"
@@ -54,11 +53,7 @@ template <class Curve> class CommitmentKey {
     CommitmentKey(const size_t num_points)
         : srs(srs::get_crs_factory<Curve>()->get_crs(num_points))
         , srs_size(num_points)
-    {
-        if (detail::use_memory_profile) {
-            detail::GLOBAL_MEMORY_PROFILE.set_crs_size(num_points);
-        }
-    }
+    {}
     /**
      * @brief Checks the commitment key is properly initialized.
      *
diff --git a/barretenberg/cpp/src/barretenberg/common/memory_profile.cpp b/barretenberg/cpp/src/barretenberg/common/memory_profile.cpp
index 85238ac55b80..d70d4b123902 100644
--- a/barretenberg/cpp/src/barretenberg/common/memory_profile.cpp
+++ b/barretenberg/cpp/src/barretenberg/common/memory_profile.cpp
@@ -48,15 +48,13 @@ void MemoryProfile::add_circuit(CircuitMemoryStats stats)
 void MemoryProfile::add_rss_checkpoint(const std::string& stage, size_t circuit_index)
 {
     std::lock_guard<std::mutex> lock(mutex);
-    rss_checkpoints.push_back(RssCheckpoint{ stage, circuit_index, get_peak_rss_mb() });
+    rss_checkpoints.push_back(RssCheckpoint{ stage, circuit_index, current_circuit_name, get_peak_rss_mb() });
 }
 
-void MemoryProfile::set_crs_size(size_t num_points)
+void MemoryProfile::set_circuit_name(const std::string& name)
 {
     std::lock_guard<std::mutex> lock(mutex);
-    if (num_points > crs_points) {
-        crs_points = num_points;
-    }
+    current_circuit_name = name; // attached to every checkpoint add_rss_checkpoint() records from now on
 }
 
 void MemoryProfile::clear()
@@ -64,7 +62,7 @@ void MemoryProfile::clear()
     std::lock_guard<std::mutex> lock(mutex);
     circuits.clear();
     rss_checkpoints.clear();
-    crs_points = 0;
+    current_circuit_name.clear(); // checkpoints recorded after clear() carry an empty name until it is set again
 }
 
 namespace {
@@ -111,16 +109,13 @@ void MemoryProfile::serialize_json(std::ostream& os) const
     }
 
     // Find peak RSS checkpoint
-    RssCheckpoint peak_rss{ "unknown", 0, 0 };
+    RssCheckpoint peak_rss{ "unknown", 0, "", 0 }; // { stage, circuit_index, circuit_name, rss_mb }
     for (const auto& cp : rss_checkpoints) {
         if (cp.rss_mb > peak_rss.rss_mb) {
             peak_rss = cp;
         }
     }
 
-    // CRS memory: num_points * 128 bytes (with Pippenger point table)
-    double crs_mb = static_cast<double>(crs_points) * 128.0 / (1024.0 * 1024.0);
-
     os << "{\n";
 
     // Peak circuit
@@ -148,16 +143,14 @@ void MemoryProfile::serialize_json(std::ostream& os) const
         }
         const auto& cp = rss_checkpoints[i];
         os << "    { \"stage\": \"" << cp.stage << "\", \"circuit_index\": " << cp.circuit_index
-           << ", \"rss_mb\": " << cp.rss_mb << " }";
+           // std::quoted escapes '"' and '\' in the request-supplied name so the emitted JSON stays parseable.
+           << ", \"circuit_name\": " << std::quoted(cp.circuit_name) << ", \"rss_mb\": " << cp.rss_mb << " }";
     }
     os << "\n  ],\n";
 
     // Peak RSS
     os << "  \"peak_rss\": { \"stage\": \"" << peak_rss.stage << "\", \"circuit_index\": " << peak_rss.circuit_index
-       << ", \"rss_mb\": " << peak_rss.rss_mb << " },\n";
-
-    // CRS
-    os << "  \"crs_mb\": " << std::fixed << std::setprecision(2) << crs_mb << "\n";
+       << ", \"rss_mb\": " << peak_rss.rss_mb << " }\n";
 
     os << "}\n";
 }
diff --git a/barretenberg/cpp/src/barretenberg/common/memory_profile.hpp b/barretenberg/cpp/src/barretenberg/common/memory_profile.hpp
index 2d182386ce75..4b9ace89418e 100644
--- a/barretenberg/cpp/src/barretenberg/common/memory_profile.hpp
+++ b/barretenberg/cpp/src/barretenberg/common/memory_profile.hpp
@@ -28,6 +28,7 @@ extern bool use_memory_profile;
 struct RssCheckpoint {
     std::string stage;
     size_t circuit_index;
+    std::string circuit_name; // circuit name in effect when the checkpoint was recorded; empty if none was set
     size_t rss_mb;
 };
 
@@ -35,11 +36,11 @@ struct MemoryProfile {
     std::mutex mutex;
     std::vector<CircuitMemoryStats> circuits;
     std::vector<RssCheckpoint> rss_checkpoints;
-    size_t crs_points = 0;
+    std::string current_circuit_name; // copied into each new RssCheckpoint; updated by set_circuit_name()
 
     void add_circuit(CircuitMemoryStats stats);
     void add_rss_checkpoint(const std::string& stage, size_t circuit_index);
-    void set_crs_size(size_t num_points);
+    void set_circuit_name(const std::string& name); // name to attach to subsequently recorded RSS checkpoints
     void serialize_json(std::ostream& os) const;
     void clear();
 };