From 61830431333d9af6a53e91f85adc532c4ac4669d Mon Sep 17 00:00:00 2001 From: Jonathan Hao Date: Mon, 30 Mar 2026 14:17:44 +0000 Subject: [PATCH 1/3] feat: add --memory_profile_out flag for Chonk memory profiling Add tooling to continually assess memory consumption in Chonk proving. The new --memory_profile_out flag on `bb prove` outputs a JSON report with per-circuit polynomial memory breakdown by category (wires, sigmas, selectors, etc.), CRS size, and RSS checkpoints at key proving stages. A Python extraction script converts the JSON into benchmark dashboard entries (stacked charts for polynomial categories, line charts for totals and peak RSS). Integrated into ci_benchmark_ivc_flows.sh. Refs: AztecProtocol/barretenberg#1641 --- barretenberg/cpp/CLAUDE.md | 33 ++++ .../cpp/scripts/ci_benchmark_ivc_flows.sh | 27 ++- .../cpp/scripts/extract_memory_benchmarks.py | 76 ++++++++ barretenberg/cpp/src/barretenberg/bb/cli.cpp | 20 +++ .../cpp/src/barretenberg/chonk/chonk.cpp | 7 + .../commitment_schemes/commitment_key.hpp | 7 +- .../barretenberg/common/memory_profile.cpp | 165 ++++++++++++++++++ .../barretenberg/common/memory_profile.hpp | 51 ++++++ .../polynomials/polynomial_stats.hpp | 100 +++++++++++ .../ultra_honk/prover_instance.cpp | 6 + .../barretenberg/ultra_honk/ultra_prover.cpp | 16 ++ 11 files changed, 503 insertions(+), 5 deletions(-) create mode 100755 barretenberg/cpp/scripts/extract_memory_benchmarks.py create mode 100644 barretenberg/cpp/src/barretenberg/common/memory_profile.cpp create mode 100644 barretenberg/cpp/src/barretenberg/common/memory_profile.hpp diff --git a/barretenberg/cpp/CLAUDE.md b/barretenberg/cpp/CLAUDE.md index c0276f8e026a..d67e7fa7aa65 100644 --- a/barretenberg/cpp/CLAUDE.md +++ b/barretenberg/cpp/CLAUDE.md @@ -155,3 +155,36 @@ Typical workflow 2. Build native code: `cd barretenberg/cpp && ./bootstrap.sh build_native` 3. Check VKs: `cd scripts && ./test_chonk_standalone_vks_havent_changed.sh` 4. 
If VKs changed intentionally: `./test_chonk_standalone_vks_havent_changed.sh --update_inputs` + +## Example IVC inputs + +Example IVC inputs (msgpack files) for `bb prove --scheme chonk` are generated by e2e benchmark tests. Run the full bootstrap from the repo root to populate them: + +```bash +cd $(git rev-parse --show-toplevel) && ./bootstrap.sh +``` + +This creates `yarn-project/end-to-end/example-app-ivc-inputs-out/<flow>/ivc-inputs.msgpack`. The inputs are generated by the `build_bench` function in `yarn-project/end-to-end/bootstrap.sh`, which runs client flow tests with `CAPTURE_IVC_FOLDER` set. In CI, these are cached as `bb-chonk-captures-<hash>.tar.gz`. + +## Memory profiling + +The `--memory_profile_out <path>` flag on `bb prove` outputs a JSON breakdown of memory consumption: per-circuit polynomial memory by category (wires, sigmas, selectors, etc.), CRS size, and RSS checkpoints at key proving stages (after polynomial allocation, after oink, after sumcheck, after PCS). + +```bash +cd barretenberg/cpp +./build/bin/bb prove \ + --scheme chonk \ + --ivc_inputs_path <flow-folder>/ivc-inputs.msgpack \ + -o /tmp/proof-out \ + -v \ + --memory_profile_out /tmp/proof-out/memory_breakdown.json +``` + +The extraction script converts the JSON into dashboard-friendly benchmark entries: + +```bash +echo '[]' > /tmp/proof-out/benchmarks.bench.json +python3 scripts/extract_memory_benchmarks.py /tmp/proof-out "app-proving/flow/native" +``` + +This appends stacked chart entries (polynomial memory by category) and line chart entries (total polynomial MB, CRS MB, peak RSS) to `benchmarks.bench.json`. In CI, this is integrated into `ci_benchmark_ivc_flows.sh` and uploaded to the benchmark dashboard.
diff --git a/barretenberg/cpp/scripts/ci_benchmark_ivc_flows.sh b/barretenberg/cpp/scripts/ci_benchmark_ivc_flows.sh index 4a3078f5d51d..93432a4baff3 100755 --- a/barretenberg/cpp/scripts/ci_benchmark_ivc_flows.sh +++ b/barretenberg/cpp/scripts/ci_benchmark_ivc_flows.sh @@ -59,15 +59,15 @@ function run_bb_cli_bench { if [[ "$runtime" == "native" ]]; then # Add --bench_out_hierarchical flag for native builds to capture hierarchical op counts and timings - memusage "./$native_build_dir/bin/bb" "$@" "--bench_out_hierarchical" "$output/benchmark_breakdown.json" || { - echo "bb native failed with args: $@ --bench_out_hierarchical $output/benchmark_breakdown.json" + memusage "./$native_build_dir/bin/bb" "$@" "--bench_out_hierarchical" "$output/benchmark_breakdown.json" "--memory_profile_out" "$output/memory_breakdown.json" || { + echo "bb native failed with args: $@ --bench_out_hierarchical $output/benchmark_breakdown.json --memory_profile_out $output/memory_breakdown.json" exit 1 } else # wasm export WASMTIME_ALLOWED_DIRS="--dir=$flow_folder --dir=$output" # Add --bench_out_hierarchical flag for wasm builds to capture hierarchical op counts and timings - memusage scripts/wasmtime.sh $WASMTIME_ALLOWED_DIRS ./build-wasm-threads/bin/bb "$@" "--bench_out_hierarchical" "$output/benchmark_breakdown.json" || { - echo "bb wasm failed with args: $@ --bench_out_hierarchical $output/benchmark_breakdown.json" + memusage scripts/wasmtime.sh $WASMTIME_ALLOWED_DIRS ./build-wasm-threads/bin/bb "$@" "--bench_out_hierarchical" "$output/benchmark_breakdown.json" "--memory_profile_out" "$output/memory_breakdown.json" || { + echo "bb wasm failed with args: $@ --bench_out_hierarchical $output/benchmark_breakdown.json --memory_profile_out $output/memory_breakdown.json" exit 1 } fi @@ -139,6 +139,12 @@ EOF echo "Extracting component timings from hierarchical breakdown..." 
python3 scripts/extract_component_benchmarks.py "$output" "$name_path" fi + + # Extract memory breakdown metrics if available + if [[ -f "$output/memory_breakdown.json" ]]; then + echo "Extracting memory breakdown metrics..." + python3 scripts/extract_memory_benchmarks.py "$output" "$name_path" + fi } export -f verify_ivc_flow run_bb_cli_bench @@ -178,4 +184,17 @@ if [[ "${CI:-}" == "1" ]] && [[ "${CI_USE_BUILD_INSTANCE_KEY:-0}" == "1" ]]; the else echo "Warning: benchmark breakdown file not found at $benchmark_breakdown_file" fi + + # Upload memory breakdown to S3 + memory_breakdown_file="bench-out/app-proving/$flow_name/$runtime/memory_breakdown.json" + if [[ -f "$memory_breakdown_file" ]]; then + tmp_memory_file="/tmp/memory_breakdown_${runtime}_${flow_name}_$$.json" + cp "$memory_breakdown_file" "$tmp_memory_file" + memory_disk_key="memory-${runtime}-${flow_name}-${current_sha}" + { + cat "$tmp_memory_file" | gzip | cache_s3_transfer_to "bench/bb-breakdown" "$memory_disk_key" + rm -f "$tmp_memory_file" + } & + echo "Uploaded memory breakdown to S3: bench/bb-breakdown/$memory_disk_key" + fi fi diff --git a/barretenberg/cpp/scripts/extract_memory_benchmarks.py b/barretenberg/cpp/scripts/extract_memory_benchmarks.py new file mode 100755 index 000000000000..cb370a4b3c32 --- /dev/null +++ b/barretenberg/cpp/scripts/extract_memory_benchmarks.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +"""Extracts memory breakdown metrics from a memory profile JSON and appends +them to the benchmark JSON file as dashboard entries. + +Usage: extract_memory_benchmarks.py <output_dir> <name_path> + +The output_dir must contain: + - memory_breakdown.json (memory profile data from bb --memory_profile_out) + - benchmarks.bench.json (existing benchmark results to append to) + +The memory profile JSON format is documented in memory_profile.cpp.
+""" +import json +import sys + +if len(sys.argv) != 3: + print(f"Usage: {sys.argv[0]} ", file=sys.stderr) + sys.exit(1) + +output_dir = sys.argv[1] +name_path = sys.argv[2] + +try: + with open(f"{output_dir}/memory_breakdown.json", "r") as f: + data = json.load(f) + + entries = [] + + # Stacked chart: polynomial memory by category (peak circuit) + peak_circuit = data.get("peak_circuit") + if peak_circuit: + for category, stats in peak_circuit.get("categories", {}).items(): + entries.append({ + "name": f"{name_path}/memory/{category}_MB", + "unit": "MB", + "value": round(stats["actual_mb"], 2), + "extra": f"stacked:{name_path}/memory/polynomial_categories" + }) + + # Total polynomial memory (peak circuit) + entries.append({ + "name": f"{name_path}/memory/total_polynomial_MB", + "unit": "MB", + "value": round(peak_circuit.get("total_polynomial_mb", 0), 2) + }) + + # CRS memory + crs_mb = data.get("crs_mb", 0) + if crs_mb > 0: + entries.append({ + "name": f"{name_path}/memory/crs_MB", + "unit": "MB", + "value": round(crs_mb, 2) + }) + + # Peak RSS from checkpoints + peak_rss = data.get("peak_rss", {}) + if peak_rss.get("rss_mb", 0) > 0: + entries.append({ + "name": f"{name_path}/memory/peak_rss_MB", + "unit": "MB", + "value": peak_rss["rss_mb"] + }) + + # Append to existing benchmarks file + with open(f"{output_dir}/benchmarks.bench.json", "r") as f: + existing = json.load(f) + + existing.extend(entries) + + with open(f"{output_dir}/benchmarks.bench.json", "w") as f: + json.dump(existing, f, indent=2) + + print(f"Extracted {len(entries)} memory breakdown metrics") +except Exception as e: + print(f"Warning: Could not extract memory breakdown: {e}", file=sys.stderr) diff --git a/barretenberg/cpp/src/barretenberg/bb/cli.cpp b/barretenberg/cpp/src/barretenberg/bb/cli.cpp index 60ffc4c02233..399e7e12df56 100644 --- a/barretenberg/cpp/src/barretenberg/bb/cli.cpp +++ b/barretenberg/cpp/src/barretenberg/bb/cli.cpp @@ -29,6 +29,7 @@ #include "barretenberg/common/assert.hpp" 
#include "barretenberg/common/bb_bench.hpp" #include "barretenberg/common/get_bytecode.hpp" +#include "barretenberg/common/memory_profile.hpp" #include "barretenberg/common/thread.hpp" #include "barretenberg/common/version.hpp" #include "barretenberg/dsl/acir_format/serde/index.hpp" @@ -389,6 +390,15 @@ int parse_and_run_cli_command(int argc, char* argv[]) "parent-child relationships) as json.") ->group(advanced_group); }; + std::string memory_profile_out; + const auto add_memory_profile_out_option = [&](CLI::App* subcommand) { + return subcommand + ->add_option("--memory_profile_out", + memory_profile_out, + "Path to write memory profile data (polynomial breakdown by category, RSS " + "checkpoints, CRS size) as json.") + ->group(advanced_group); + }; /*************************************************************************************************************** * Top-level flags @@ -482,6 +492,7 @@ int parse_and_run_cli_command(int argc, char* argv[]) add_print_bench_flag(prove); add_bench_out_option(prove); add_bench_out_hierarchical_option(prove); + add_memory_profile_out_option(prove); add_storage_budget_option(prove); add_output_format_option(prove); @@ -811,6 +822,10 @@ int parse_and_run_cli_command(int argc, char* argv[]) if (!flags.storage_budget.empty()) { storage_budget = parse_size_string(flags.storage_budget); } + if (!memory_profile_out.empty()) { + bb::detail::use_memory_profile = true; + vinfo("Memory profiling enabled via --memory_profile_out"); + } if (print_bench || !bench_out.empty() || !bench_out_hierarchical.empty()) { bb::detail::use_bb_bench = true; vinfo("BB_BENCH enabled via --print_bench or --bench_out"); @@ -987,6 +1002,11 @@ int parse_and_run_cli_command(int argc, char* argv[]) bb::detail::GLOBAL_BENCH_STATS.serialize_aggregate_data_json(file); } #endif + if (!memory_profile_out.empty()) { + std::ofstream file(memory_profile_out); + bb::detail::GLOBAL_MEMORY_PROFILE.serialize_json(file); + vinfo("Memory profile written to ", 
memory_profile_out); + } return 0; } if (check->parsed()) { diff --git a/barretenberg/cpp/src/barretenberg/chonk/chonk.cpp b/barretenberg/cpp/src/barretenberg/chonk/chonk.cpp index 5e242feca663..d3c93eaccc79 100644 --- a/barretenberg/cpp/src/barretenberg/chonk/chonk.cpp +++ b/barretenberg/cpp/src/barretenberg/chonk/chonk.cpp @@ -7,6 +7,7 @@ #include "barretenberg/chonk/chonk.hpp" #include "barretenberg/chonk/chonk_verifier.hpp" #include "barretenberg/common/bb_bench.hpp" +#include "barretenberg/common/memory_profile.hpp" #include "barretenberg/common/streams.hpp" #include "barretenberg/ecc/curves/grumpkin/grumpkin.hpp" #include "barretenberg/goblin/goblin_verifier.hpp" @@ -538,6 +539,12 @@ void Chonk::accumulate_and_fold(ClientCircuit& circuit, break; } + if (detail::use_memory_profile) { + size_t circuit_idx = + detail::GLOBAL_MEMORY_PROFILE.circuits.empty() ? 0 : detail::GLOBAL_MEMORY_PROFILE.circuits.size() - 1; + detail::GLOBAL_MEMORY_PROFILE.add_rss_checkpoint("after_accumulate", circuit_idx); + } + VerifierInputs queue_entry{ std::move(proof), precomputed_vk, queue_type, is_kernel }; verification_queue.push_back(queue_entry); diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp index a504ed460989..02a7c16bb90a 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp @@ -11,6 +11,7 @@ */ #include "barretenberg/common/bb_bench.hpp" +#include "barretenberg/common/memory_profile.hpp" #include "barretenberg/common/ref_span.hpp" #include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp" #include "barretenberg/polynomials/polynomial.hpp" @@ -53,7 +54,11 @@ template class CommitmentKey { CommitmentKey(const size_t num_points) : srs(srs::get_crs_factory()->get_crs(num_points)) , srs_size(num_points) - {} + { + if (detail::use_memory_profile) { 
+ detail::GLOBAL_MEMORY_PROFILE.set_crs_size(num_points); + } + } /** * @brief Checks the commitment key is properly initialized. * diff --git a/barretenberg/cpp/src/barretenberg/common/memory_profile.cpp b/barretenberg/cpp/src/barretenberg/common/memory_profile.cpp new file mode 100644 index 000000000000..85238ac55b80 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/common/memory_profile.cpp @@ -0,0 +1,165 @@ +#include "memory_profile.hpp" + +#include +#include +#include +#include + +#if defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) +#include +#endif + +namespace { + +size_t get_peak_rss_mb() +{ +#if defined(__linux__) + struct rusage usage{}; + if (getrusage(RUSAGE_SELF, &usage) == 0) { + // ru_maxrss is in kilobytes on Linux + return static_cast(usage.ru_maxrss) / 1024; + } +#elif defined(__APPLE__) || defined(__FreeBSD__) + struct rusage usage{}; + if (getrusage(RUSAGE_SELF, &usage) == 0) { + // ru_maxrss is in bytes on macOS/BSD + return static_cast(usage.ru_maxrss) / (1024 * 1024); + } +#endif + return 0; +} + +} // namespace + +namespace bb::detail { + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +bool use_memory_profile = false; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +MemoryProfile GLOBAL_MEMORY_PROFILE; + +void MemoryProfile::add_circuit(CircuitMemoryStats stats) +{ + std::lock_guard lock(mutex); + stats.circuit_index = circuits.size(); + circuits.push_back(std::move(stats)); +} + +void MemoryProfile::add_rss_checkpoint(const std::string& stage, size_t circuit_index) +{ + std::lock_guard lock(mutex); + rss_checkpoints.push_back(RssCheckpoint{ stage, circuit_index, get_peak_rss_mb() }); +} + +void MemoryProfile::set_crs_size(size_t num_points) +{ + std::lock_guard lock(mutex); + if (num_points > crs_points) { + crs_points = num_points; + } +} + +void MemoryProfile::clear() +{ + std::lock_guard lock(mutex); + circuits.clear(); + rss_checkpoints.clear(); + crs_points = 0; +} + 
+namespace { + +void write_category_stats(std::ostream& os, const CategoryStats& stats, const std::string& indent) +{ + os << indent << "\"actual_mb\": " << std::fixed << std::setprecision(2) << stats.actual_mb << ", " + << "\"compressed_mb\": " << std::fixed << std::setprecision(2) << stats.compressed_mb; +} + +void write_circuit_stats(std::ostream& os, const CircuitMemoryStats& circuit, const std::string& indent) +{ + os << indent << "{\n"; + os << indent << " \"index\": " << circuit.circuit_index << ",\n"; + os << indent << " \"total_polynomial_mb\": " << std::fixed << std::setprecision(2) << circuit.total.actual_mb + << ",\n"; + os << indent << " \"categories\": {\n"; + bool first = true; + for (const auto& [name, stats] : circuit.categories) { + if (!first) { + os << ",\n"; + } + first = false; + os << indent << " \"" << name << "\": { "; + write_category_stats(os, stats, ""); + os << " }"; + } + os << "\n" << indent << " }\n"; + os << indent << "}"; +} + +} // namespace + +void MemoryProfile::serialize_json(std::ostream& os) const +{ + // Find peak circuit (largest total_polynomial_mb) + size_t peak_idx = 0; + double peak_mb = 0; + for (size_t i = 0; i < circuits.size(); i++) { + if (circuits[i].total.actual_mb > peak_mb) { + peak_mb = circuits[i].total.actual_mb; + peak_idx = i; + } + } + + // Find peak RSS checkpoint + RssCheckpoint peak_rss{ "unknown", 0, 0 }; + for (const auto& cp : rss_checkpoints) { + if (cp.rss_mb > peak_rss.rss_mb) { + peak_rss = cp; + } + } + + // CRS memory: num_points * 128 bytes (with Pippenger point table) + double crs_mb = static_cast(crs_points) * 128.0 / (1024.0 * 1024.0); + + os << "{\n"; + + // Peak circuit + if (!circuits.empty()) { + os << " \"peak_circuit\": "; + write_circuit_stats(os, circuits[peak_idx], " "); + os << ",\n"; + } + + // All circuits + os << " \"all_circuits\": [\n"; + for (size_t i = 0; i < circuits.size(); i++) { + if (i > 0) { + os << ",\n"; + } + write_circuit_stats(os, circuits[i], " "); + } + os << 
"\n ],\n"; + + // RSS checkpoints + os << " \"rss_checkpoints\": [\n"; + for (size_t i = 0; i < rss_checkpoints.size(); i++) { + if (i > 0) { + os << ",\n"; + } + const auto& cp = rss_checkpoints[i]; + os << " { \"stage\": \"" << cp.stage << "\", \"circuit_index\": " << cp.circuit_index + << ", \"rss_mb\": " << cp.rss_mb << " }"; + } + os << "\n ],\n"; + + // Peak RSS + os << " \"peak_rss\": { \"stage\": \"" << peak_rss.stage << "\", \"circuit_index\": " << peak_rss.circuit_index + << ", \"rss_mb\": " << peak_rss.rss_mb << " },\n"; + + // CRS + os << " \"crs_mb\": " << std::fixed << std::setprecision(2) << crs_mb << "\n"; + + os << "}\n"; +} + +} // namespace bb::detail diff --git a/barretenberg/cpp/src/barretenberg/common/memory_profile.hpp b/barretenberg/cpp/src/barretenberg/common/memory_profile.hpp new file mode 100644 index 000000000000..2d182386ce75 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/common/memory_profile.hpp @@ -0,0 +1,51 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace bb { + +struct CategoryStats { + double actual_mb = 0; + double compressed_mb = 0; // ideal if using variable-width encoding +}; + +struct CircuitMemoryStats { + size_t circuit_index = 0; + std::map categories; + CategoryStats total; +}; + +namespace detail { + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +extern bool use_memory_profile; + +struct RssCheckpoint { + std::string stage; + size_t circuit_index; + size_t rss_mb; +}; + +struct MemoryProfile { + std::mutex mutex; + std::vector circuits; + std::vector rss_checkpoints; + size_t crs_points = 0; + + void add_circuit(CircuitMemoryStats stats); + void add_rss_checkpoint(const std::string& stage, size_t circuit_index); + void set_crs_size(size_t num_points); + void serialize_json(std::ostream& os) const; + void clear(); +}; + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +extern MemoryProfile GLOBAL_MEMORY_PROFILE; + +} // 
namespace detail +} // namespace bb diff --git a/barretenberg/cpp/src/barretenberg/polynomials/polynomial_stats.hpp b/barretenberg/cpp/src/barretenberg/polynomials/polynomial_stats.hpp index b0662dcf86b3..eb7194af7c4f 100644 --- a/barretenberg/cpp/src/barretenberg/polynomials/polynomial_stats.hpp +++ b/barretenberg/cpp/src/barretenberg/polynomials/polynomial_stats.hpp @@ -1,6 +1,7 @@ #pragma once #include "barretenberg/common/log.hpp" +#include "barretenberg/common/memory_profile.hpp" #include #include #include @@ -165,4 +166,103 @@ template void analyze_prover_polynomials(ProverPoly info(oss.str()); } +/** + * @brief Classify a polynomial label into a memory category. + */ +inline std::string classify_polynomial(const std::string& label) +{ + if (label.starts_with("w_")) { + return "wires"; + } + if (label.starts_with("sigma_")) { + return "sigmas"; + } + if (label.starts_with("id_")) { + return "ids"; + } + if (label.starts_with("q_")) { + return "selectors"; + } + if (label.starts_with("table_value")) { + return "tables"; + } + if (label.find("lookup") != std::string::npos || label == "z_lookup") { + return "lookup"; + } + if (label.find("ecc_op") != std::string::npos) { + return "ecc_op"; + } + if (label.find("databus") != std::string::npos || label.find("calldata") != std::string::npos || + label.find("return_data") != std::string::npos) { + return "databus"; + } + if (label.find("lagrange") != std::string::npos) { + return "lagrange"; + } + if (label == "z_perm") { + return "z_perm"; + } + return "other"; +} + +/** + * @brief Analyze prover polynomials and return categorized memory statistics. + * @details Groups polynomials by category (wires, sigmas, ids, selectors, etc.) and computes + * actual and compressed memory usage per category. Used by the --memory_profile_out flag. 
+ */ +template +CircuitMemoryStats analyze_prover_polynomials_categorized(ProverPolynomials& polynomials) +{ + using Polynomial = std::remove_reference_t; + using Fr = typename Polynomial::FF; + + auto unshifted = polynomials.get_unshifted(); + auto all_labels = polynomials.get_labels(); + + auto mb = [](auto bytes) { return static_cast(bytes) / (1024.0 * 1024.0); }; + + CircuitMemoryStats result; + + size_t idx = 0; + for (auto& poly : unshifted) { + std::string label = (idx < all_labels.size()) ? all_labels[idx] : "unknown_" + std::to_string(idx); + idx++; + + if (poly.is_empty()) { + continue; + } + + std::string category = classify_polynomial(label); + size_t actual_bytes = poly.size() * sizeof(Fr); + double compressed_bytes = 0; + + const Fr* data = poly.data(); + for (size_t i = 0; i < poly.size(); ++i) { + const Fr& elem = data[i]; + if (elem.data[0] == 0 && elem.data[1] == 0 && elem.data[2] == 0 && elem.data[3] == 0) { + continue; // zero contributes 0 compressed bytes + } + Fr standard = elem.from_montgomery_form(); + size_t bytes_needed = min_bytes_for_value(standard); + if (bytes_needed <= 4) { + compressed_bytes += 4; + } else if (bytes_needed <= 8) { + compressed_bytes += 8; + } else if (bytes_needed <= 16) { + compressed_bytes += 16; + } else { + compressed_bytes += 32; + } + } + + auto& cat = result.categories[category]; + cat.actual_mb += mb(actual_bytes); + cat.compressed_mb += mb(compressed_bytes); + result.total.actual_mb += mb(actual_bytes); + result.total.compressed_mb += mb(compressed_bytes); + } + + return result; +} + } // namespace bb diff --git a/barretenberg/cpp/src/barretenberg/ultra_honk/prover_instance.cpp b/barretenberg/cpp/src/barretenberg/ultra_honk/prover_instance.cpp index 2e272a54de6e..4e375fec2ae9 100644 --- a/barretenberg/cpp/src/barretenberg/ultra_honk/prover_instance.cpp +++ b/barretenberg/cpp/src/barretenberg/ultra_honk/prover_instance.cpp @@ -100,6 +100,12 @@ template ProverInstance_::ProverInstance_(Circuit& cir if 
(std::getenv("BB_POLY_STATS")) { analyze_prover_polynomials(polynomials); } + if (detail::use_memory_profile) { + auto stats = analyze_prover_polynomials_categorized(polynomials); + detail::GLOBAL_MEMORY_PROFILE.add_circuit(std::move(stats)); + detail::GLOBAL_MEMORY_PROFILE.add_rss_checkpoint("after_poly_allocation", + detail::GLOBAL_MEMORY_PROFILE.circuits.size() - 1); + } } /** diff --git a/barretenberg/cpp/src/barretenberg/ultra_honk/ultra_prover.cpp b/barretenberg/cpp/src/barretenberg/ultra_honk/ultra_prover.cpp index 2e22ef8f93bb..902903bc987f 100644 --- a/barretenberg/cpp/src/barretenberg/ultra_honk/ultra_prover.cpp +++ b/barretenberg/cpp/src/barretenberg/ultra_honk/ultra_prover.cpp @@ -7,6 +7,7 @@ #include "ultra_prover.hpp" #include "barretenberg/commitment_schemes/gemini/gemini.hpp" #include "barretenberg/commitment_schemes/shplonk/shplemini.hpp" +#include "barretenberg/common/memory_profile.hpp" #include "barretenberg/flavor/mega_avm_flavor.hpp" #include "barretenberg/sumcheck/sumcheck.hpp" #include "barretenberg/ultra_honk/oink_prover.hpp" @@ -78,15 +79,30 @@ template typename UltraProver_::Proof UltraProver_ oink_prover(prover_instance, honk_vk, transcript); oink_prover.prove(); vinfo("created oink proof"); + if (detail::use_memory_profile) { + size_t circuit_idx = + detail::GLOBAL_MEMORY_PROFILE.circuits.empty() ? 0 : detail::GLOBAL_MEMORY_PROFILE.circuits.size() - 1; + detail::GLOBAL_MEMORY_PROFILE.add_rss_checkpoint("after_oink", circuit_idx); + } generate_gate_challenges(); // Run sumcheck execute_sumcheck_iop(); vinfo("finished relation check rounds"); + if (detail::use_memory_profile) { + size_t circuit_idx = + detail::GLOBAL_MEMORY_PROFILE.circuits.empty() ? 
0 : detail::GLOBAL_MEMORY_PROFILE.circuits.size() - 1; + detail::GLOBAL_MEMORY_PROFILE.add_rss_checkpoint("after_sumcheck", circuit_idx); + } // Execute Shplemini PCS execute_pcs(); vinfo("finished PCS rounds"); + if (detail::use_memory_profile) { + size_t circuit_idx = + detail::GLOBAL_MEMORY_PROFILE.circuits.empty() ? 0 : detail::GLOBAL_MEMORY_PROFILE.circuits.size() - 1; + detail::GLOBAL_MEMORY_PROFILE.add_rss_checkpoint("after_pcs", circuit_idx); + } return export_proof(); } From 746cff47096094f3606f4547d49286514422c642 Mon Sep 17 00:00:00 2001 From: Jonathan Hao Date: Mon, 30 Mar 2026 15:05:38 +0000 Subject: [PATCH 2/3] fix: use stacked-area prefix for memory breakdown dashboard charts Switch from stacked: to stacked-area: prefix in extract_memory_benchmarks.py so the dashboard renders proper stacked area charts instead of overlaid lines. This requires the corresponding stacked-area chart support in benchmark-page-data. --- barretenberg/cpp/scripts/extract_memory_benchmarks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/barretenberg/cpp/scripts/extract_memory_benchmarks.py b/barretenberg/cpp/scripts/extract_memory_benchmarks.py index cb370a4b3c32..40527931d445 100755 --- a/barretenberg/cpp/scripts/extract_memory_benchmarks.py +++ b/barretenberg/cpp/scripts/extract_memory_benchmarks.py @@ -34,7 +34,7 @@ "name": f"{name_path}/memory/{category}_MB", "unit": "MB", "value": round(stats["actual_mb"], 2), - "extra": f"stacked:{name_path}/memory/polynomial_categories" + "extra": f"stacked-area:{name_path}/memory/polynomial_categories" }) # Total polynomial memory (peak circuit) From befc8eeb52d460f6640c809143b0578bfd2e9fc5 Mon Sep 17 00:00:00 2001 From: Jonathan Hao Date: Tue, 31 Mar 2026 16:12:45 +0000 Subject: [PATCH 3/3] feat: add per-circuit RSS timeline and remove redundant metrics - Add circuit_name to RSS checkpoints (set from ChonkAccumulate) - Extract RSS checkpoints as per-commit dashboard entries with labels like 
"06_EcdsaRAccount:entrypoint/after_accumulate" - Remove CRS instrumentation from commitment_key.hpp (constant, not useful to track) - Remove crs_MB, total_polynomial_MB, peak_rss_MB from dashboard metrics (redundant with existing memusage and stacked area chart) --- .../cpp/scripts/extract_memory_benchmarks.py | 31 ++++++------------- .../src/barretenberg/bbapi/bbapi_chonk.cpp | 4 +++ .../commitment_schemes/commitment_key.hpp | 7 +---- .../barretenberg/common/memory_profile.cpp | 22 +++++-------- .../barretenberg/common/memory_profile.hpp | 5 +-- 5 files changed, 25 insertions(+), 44 deletions(-) diff --git a/barretenberg/cpp/scripts/extract_memory_benchmarks.py b/barretenberg/cpp/scripts/extract_memory_benchmarks.py index 40527931d445..a8316f7b24ca 100755 --- a/barretenberg/cpp/scripts/extract_memory_benchmarks.py +++ b/barretenberg/cpp/scripts/extract_memory_benchmarks.py @@ -37,29 +37,18 @@ "extra": f"stacked-area:{name_path}/memory/polynomial_categories" }) - # Total polynomial memory (peak circuit) + # RSS timeline: each checkpoint becomes a line on the per-commit dashboard chart + for cp in data.get("rss_checkpoints", []): + circuit_name = cp.get("circuit_name", "") + idx = cp["circuit_index"] + stage = cp["stage"] + # Build a stable label like "07_EcdsaRAccount:entrypoint/after_accumulate" + label = f"{idx:02d}_{circuit_name}/{stage}" if circuit_name else f"{idx:02d}/{stage}" entries.append({ - "name": f"{name_path}/memory/total_polynomial_MB", + "name": f"{name_path}/memory/rss/{label}", "unit": "MB", - "value": round(peak_circuit.get("total_polynomial_mb", 0), 2) - }) - - # CRS memory - crs_mb = data.get("crs_mb", 0) - if crs_mb > 0: - entries.append({ - "name": f"{name_path}/memory/crs_MB", - "unit": "MB", - "value": round(crs_mb, 2) - }) - - # Peak RSS from checkpoints - peak_rss = data.get("peak_rss", {}) - if peak_rss.get("rss_mb", 0) > 0: - entries.append({ - "name": f"{name_path}/memory/peak_rss_MB", - "unit": "MB", - "value": peak_rss["rss_mb"] + 
"value": cp["rss_mb"], + "extra": f"stacked:{name_path}/memory/rss_timeline" }) # Append to existing benchmarks file diff --git a/barretenberg/cpp/src/barretenberg/bbapi/bbapi_chonk.cpp b/barretenberg/cpp/src/barretenberg/bbapi/bbapi_chonk.cpp index d23ebc6c5cce..3041efed572f 100644 --- a/barretenberg/cpp/src/barretenberg/bbapi/bbapi_chonk.cpp +++ b/barretenberg/cpp/src/barretenberg/bbapi/bbapi_chonk.cpp @@ -5,6 +5,7 @@ #include "barretenberg/commitment_schemes/ipa/ipa.hpp" #include "barretenberg/commitment_schemes/verification_key.hpp" #include "barretenberg/common/log.hpp" +#include "barretenberg/common/memory_profile.hpp" #include "barretenberg/common/serialize.hpp" #include "barretenberg/common/throw_or_abort.hpp" #include "barretenberg/dsl/acir_format/acir_format.hpp" @@ -90,6 +91,9 @@ ChonkAccumulate::Response ChonkAccumulate::execute(BBApiRequest& request) && } info("ChonkAccumulate - accumulating circuit '", request.loaded_circuit_name, "'"); + if (detail::use_memory_profile) { + detail::GLOBAL_MEMORY_PROFILE.set_circuit_name(request.loaded_circuit_name); + } request.ivc_in_progress->accumulate(circuit, precomputed_vk); request.ivc_stack_depth++; diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp index 02a7c16bb90a..a504ed460989 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp @@ -11,7 +11,6 @@ */ #include "barretenberg/common/bb_bench.hpp" -#include "barretenberg/common/memory_profile.hpp" #include "barretenberg/common/ref_span.hpp" #include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp" #include "barretenberg/polynomials/polynomial.hpp" @@ -54,11 +53,7 @@ template class CommitmentKey { CommitmentKey(const size_t num_points) : srs(srs::get_crs_factory()->get_crs(num_points)) , srs_size(num_points) - { - if 
(detail::use_memory_profile) { - detail::GLOBAL_MEMORY_PROFILE.set_crs_size(num_points); - } - } + {} /** * @brief Checks the commitment key is properly initialized. * diff --git a/barretenberg/cpp/src/barretenberg/common/memory_profile.cpp b/barretenberg/cpp/src/barretenberg/common/memory_profile.cpp index 85238ac55b80..d70d4b123902 100644 --- a/barretenberg/cpp/src/barretenberg/common/memory_profile.cpp +++ b/barretenberg/cpp/src/barretenberg/common/memory_profile.cpp @@ -48,15 +48,13 @@ void MemoryProfile::add_circuit(CircuitMemoryStats stats) void MemoryProfile::add_rss_checkpoint(const std::string& stage, size_t circuit_index) { std::lock_guard lock(mutex); - rss_checkpoints.push_back(RssCheckpoint{ stage, circuit_index, get_peak_rss_mb() }); + rss_checkpoints.push_back(RssCheckpoint{ stage, circuit_index, current_circuit_name, get_peak_rss_mb() }); } -void MemoryProfile::set_crs_size(size_t num_points) +void MemoryProfile::set_circuit_name(const std::string& name) { std::lock_guard lock(mutex); - if (num_points > crs_points) { - crs_points = num_points; - } + current_circuit_name = name; } void MemoryProfile::clear() @@ -64,7 +62,7 @@ void MemoryProfile::clear() std::lock_guard lock(mutex); circuits.clear(); rss_checkpoints.clear(); - crs_points = 0; + current_circuit_name.clear(); } namespace { @@ -111,16 +109,13 @@ void MemoryProfile::serialize_json(std::ostream& os) const } // Find peak RSS checkpoint - RssCheckpoint peak_rss{ "unknown", 0, 0 }; + RssCheckpoint peak_rss{ "unknown", 0, "", 0 }; for (const auto& cp : rss_checkpoints) { if (cp.rss_mb > peak_rss.rss_mb) { peak_rss = cp; } } - // CRS memory: num_points * 128 bytes (with Pippenger point table) - double crs_mb = static_cast(crs_points) * 128.0 / (1024.0 * 1024.0); - os << "{\n"; // Peak circuit @@ -148,16 +143,13 @@ void MemoryProfile::serialize_json(std::ostream& os) const } const auto& cp = rss_checkpoints[i]; os << " { \"stage\": \"" << cp.stage << "\", \"circuit_index\": " << cp.circuit_index 
- << ", \"rss_mb\": " << cp.rss_mb << " }"; + << ", \"circuit_name\": \"" << cp.circuit_name << "\", \"rss_mb\": " << cp.rss_mb << " }"; } os << "\n ],\n"; // Peak RSS os << " \"peak_rss\": { \"stage\": \"" << peak_rss.stage << "\", \"circuit_index\": " << peak_rss.circuit_index - << ", \"rss_mb\": " << peak_rss.rss_mb << " },\n"; - - // CRS - os << " \"crs_mb\": " << std::fixed << std::setprecision(2) << crs_mb << "\n"; + << ", \"rss_mb\": " << peak_rss.rss_mb << " }\n"; os << "}\n"; } diff --git a/barretenberg/cpp/src/barretenberg/common/memory_profile.hpp b/barretenberg/cpp/src/barretenberg/common/memory_profile.hpp index 2d182386ce75..4b9ace89418e 100644 --- a/barretenberg/cpp/src/barretenberg/common/memory_profile.hpp +++ b/barretenberg/cpp/src/barretenberg/common/memory_profile.hpp @@ -28,6 +28,7 @@ extern bool use_memory_profile; struct RssCheckpoint { std::string stage; size_t circuit_index; + std::string circuit_name; size_t rss_mb; }; @@ -35,11 +36,11 @@ struct MemoryProfile { std::mutex mutex; std::vector circuits; std::vector rss_checkpoints; - size_t crs_points = 0; + std::string current_circuit_name; void add_circuit(CircuitMemoryStats stats); void add_rss_checkpoint(const std::string& stage, size_t circuit_index); - void set_crs_size(size_t num_points); + void set_circuit_name(const std::string& name); void serialize_json(std::ostream& os) const; void clear(); };