diff --git a/barretenberg/cpp/src/barretenberg/bbapi/bbapi_ecc.cpp b/barretenberg/cpp/src/barretenberg/bbapi/bbapi_ecc.cpp
index 7bcc3fc2edfc..571ca7804e27 100644
--- a/barretenberg/cpp/src/barretenberg/bbapi/bbapi_ecc.cpp
+++ b/barretenberg/cpp/src/barretenberg/bbapi/bbapi_ecc.cpp
@@ -11,7 +11,7 @@ GrumpkinMul::Response GrumpkinMul::execute(BBApiRequest& request) &&
     if (!point.on_curve()) {
         BBAPI_ERROR(request, "Input point must be on the curve");
     }
-    return { point * scalar };
+    return { grumpkin::g1::element(point).mul_const_time(scalar).to_affine_const_time() };
 }
 
 GrumpkinAdd::Response GrumpkinAdd::execute(BBApiRequest& request) &&
@@ -32,7 +32,11 @@ GrumpkinBatchMul::Response GrumpkinBatchMul::execute(BBApiRequest& request) &&
             BBAPI_ERROR(request, "Input point must be on the curve");
         }
     }
-    auto output = grumpkin::g1::element::batch_mul_with_endomorphism(points, scalar);
+    std::vector<grumpkin::g1::affine_element> output;
+    output.reserve(points.size());
+    for (const auto& p : points) {
+        output.emplace_back(grumpkin::g1::element(p).mul_const_time(scalar).to_affine_const_time());
+    }
     return { std::move(output) };
 }
 
@@ -54,7 +58,7 @@ Secp256k1Mul::Response Secp256k1Mul::execute(BBApiRequest& request) &&
     if (!point.on_curve()) {
         BBAPI_ERROR(request, "Input point must be on the curve");
     }
-    return { point * scalar };
+    return { secp256k1::g1::element(point).mul_const_time(scalar).to_affine_const_time() };
 }
 
 Secp256k1GetRandomFr::Response Secp256k1GetRandomFr::execute(BB_UNUSED BBApiRequest& request) &&
@@ -87,7 +91,7 @@ Bn254G1Mul::Response Bn254G1Mul::execute(BBApiRequest& request) &&
     if (!point.on_curve()) {
         BBAPI_ERROR(request, "Input point must be on the curve");
     }
-    auto result = point * scalar;
+    auto result = bb::g1::element(point).mul_const_time(scalar).to_affine_const_time();
     if (!result.on_curve()) {
         BBAPI_ERROR(request, "Output point must be on the curve");
     }
diff --git a/barretenberg/cpp/src/barretenberg/bbapi/bbapi_ecdsa.cpp b/barretenberg/cpp/src/barretenberg/bbapi/bbapi_ecdsa.cpp
index c664e74d65a5..e2f351a302f1 100644
--- a/barretenberg/cpp/src/barretenberg/bbapi/bbapi_ecdsa.cpp
+++ b/barretenberg/cpp/src/barretenberg/bbapi/bbapi_ecdsa.cpp
@@ -10,12 +10,12 @@ namespace bb::bbapi {
 // Secp256k1 implementations
 EcdsaSecp256k1ComputePublicKey::Response EcdsaSecp256k1ComputePublicKey::execute(BB_UNUSED BBApiRequest& request) &&
 {
-    return { secp256k1::g1::one * private_key };
+    return { secp256k1::g1::element(secp256k1::g1::one).mul_const_time(private_key).to_affine_const_time() };
 }
 
 EcdsaSecp256k1ConstructSignature::Response EcdsaSecp256k1ConstructSignature::execute(BB_UNUSED BBApiRequest& request) &&
 {
-    auto pub_key = secp256k1::g1::one * private_key;
+    auto pub_key = secp256k1::g1::element(secp256k1::g1::one).mul_const_time(private_key).to_affine_const_time();
     crypto::ecdsa_key_pair<secp256k1::fr, secp256k1::g1> key_pair = { private_key, pub_key };
 
     std::string message_str(reinterpret_cast<const char*>(message.data()), message.size());
@@ -44,12 +44,12 @@ EcdsaSecp256k1VerifySignature::Response EcdsaSecp256k1VerifySignature::execute(B
 // Secp256r1 implementations
 EcdsaSecp256r1ComputePublicKey::Response EcdsaSecp256r1ComputePublicKey::execute(BB_UNUSED BBApiRequest& request) &&
 {
-    return { secp256r1::g1::one * private_key };
+    return { secp256r1::g1::element(secp256r1::g1::one).mul_const_time(private_key).to_affine_const_time() };
 }
 
 EcdsaSecp256r1ConstructSignature::Response EcdsaSecp256r1ConstructSignature::execute(BB_UNUSED BBApiRequest& request) &&
 {
-    auto pub_key = secp256r1::g1::one * private_key;
+    auto pub_key = secp256r1::g1::element(secp256r1::g1::one).mul_const_time(private_key).to_affine_const_time();
     crypto::ecdsa_key_pair<secp256r1::fr, secp256r1::g1> key_pair = { private_key, pub_key };
 
     std::string message_str(reinterpret_cast<const char*>(message.data()), message.size());
diff --git a/barretenberg/cpp/src/barretenberg/bbapi/bbapi_schnorr.cpp b/barretenberg/cpp/src/barretenberg/bbapi/bbapi_schnorr.cpp
index 68b51ea2cd8e..f5c365a23479 100644
--- a/barretenberg/cpp/src/barretenberg/bbapi/bbapi_schnorr.cpp
+++ b/barretenberg/cpp/src/barretenberg/bbapi/bbapi_schnorr.cpp
@@ -8,12 +8,13 @@ namespace bb::bbapi {
 
 SchnorrComputePublicKey::Response SchnorrComputePublicKey::execute(BB_UNUSED BBApiRequest& request) &&
 {
-    return { grumpkin::g1::one * private_key };
+    return { grumpkin::g1::element(grumpkin::g1::one).mul_const_time(private_key).to_affine_const_time() };
 }
 
 SchnorrConstructSignature::Response SchnorrConstructSignature::execute(BB_UNUSED BBApiRequest& request) &&
 {
-    grumpkin::g1::affine_element pub_key = grumpkin::g1::one * private_key;
+    grumpkin::g1::affine_element pub_key =
+        grumpkin::g1::element(grumpkin::g1::one).mul_const_time(private_key).to_affine_const_time();
     crypto::schnorr_key_pair<grumpkin::fr, grumpkin::g1> key_pair = { private_key, pub_key };
 
     auto sig = crypto::schnorr_construct_signature<grumpkin::fr, grumpkin::g1>(message_field, key_pair);
diff --git a/barretenberg/cpp/src/barretenberg/crypto/ecdsa/ecdsa_impl.hpp b/barretenberg/cpp/src/barretenberg/crypto/ecdsa/ecdsa_impl.hpp
index 8588ffae6876..a028e787fb9b 100644
--- a/barretenberg/cpp/src/barretenberg/crypto/ecdsa/ecdsa_impl.hpp
+++ b/barretenberg/cpp/src/barretenberg/crypto/ecdsa/ecdsa_impl.hpp
@@ -31,11 +31,11 @@ ecdsa_signature ecdsa_construct_signature(const std::string& message, const ecds
 
     // Compute R = k * G. k is the secret RFC6979 nonce, so use the constant-time multiplication
     // to defend against the Hamming-weight / bit-length timing leak in operator*.
-    typename G1::affine_element R(typename G1::element(G1::one).mul_const_time(k));
+    typename G1::affine_element R(typename G1::element(G1::one).mul_const_time(k).to_affine_const_time());
 
     // Compute the signature
     Fr r = Fr(R.x);
-    Fr s = (z + r * account.private_key) / k;
+    Fr s = (z + r * account.private_key) * k.invert_const_time();
     secure_erase_bytes(&k, sizeof(k));
 
     // Ensure that the value of s is "low", i.e. s := min{ s, (|Fr| - s) }
diff --git a/barretenberg/cpp/src/barretenberg/crypto/schnorr/schnorr.tcc b/barretenberg/cpp/src/barretenberg/crypto/schnorr/schnorr.tcc
index 847608a53244..16fe624c72dc 100644
--- a/barretenberg/cpp/src/barretenberg/crypto/schnorr/schnorr.tcc
+++ b/barretenberg/cpp/src/barretenberg/crypto/schnorr/schnorr.tcc
@@ -86,7 +86,7 @@ schnorr_signature schnorr_construct_signature(const typename G1::Fq& message_fie
 
     // k is a secret nonce; use the constant-time multiplication to defend against the
     // Hamming-weight / bit-length timing leak in operator*.
-    typename G1::affine_element R(typename G1::element(G1::one).mul_const_time(k));
+    typename G1::affine_element R(typename G1::element(G1::one).mul_const_time(k).to_affine_const_time());
 
     using Fq = typename G1::Fq;
     Fq e = schnorr_generate_challenge<G1>(message_field, public_key, R);
diff --git a/barretenberg/cpp/src/barretenberg/ecc/curves/invert_differential.fuzzer.cpp b/barretenberg/cpp/src/barretenberg/ecc/curves/invert_differential.fuzzer.cpp
new file mode 100644
index 000000000000..6ef8c48599fb
--- /dev/null
+++ b/barretenberg/cpp/src/barretenberg/ecc/curves/invert_differential.fuzzer.cpp
@@ -0,0 +1,240 @@
+/**
+ * @file invert_differential.fuzzer.cpp
+ * @brief Differential fuzzer for Bernstein-Yang modular inverse vs Fermat (modexp).
+ *
+ * Reuses the FieldVM driver from `multi_field.fuzzer.cpp` to generate diverse
+ * field elements via sequences of arithmetic operations.  After each VM phase
+ * it takes the last element produced (the highest-indexed non-zero slot in the
+ * VM's internal state, with a fallback to slot 0) and computes its inverse
+ * three different ways:
+ *
+ *   - A: `pow(modulus_minus_two)` — Fermat's little theorem (modexp).
+ *   - B: `invert_vartime<Native5x64>` — safegcd, 5×64-bit limb kernel
+ *        (selected on native targets, BATCH=62).
+ *   - C: `invert_vartime<Wasm9x29>` — safegcd, 9×29-bit limb kernel
+ *        (selected on WASM targets, BATCH=58).
+ *
+ * All three are compared in canonical (non-Montgomery) form.  Any discrepancy
+ * triggers an abort with full diagnostic output (field type, input, all three
+ * outputs, plus Montgomery checks `a * X ?= 1` for each).  Cross-checking the
+ * WASM kernel here gives it libFuzzer coverage even though libFuzzer itself
+ * doesn't run under WASM — both kernels are plain C++ classes.
+ *
+ * Only 254-bit primes are tested (BN254 Fr/Fq, Grumpkin shares the BN254
+ * curves), since the 5-limb signed BY state requires p < 2^255 and the
+ * production `field::invert()` dispatch also gates on this.  256-bit primes
+ * (secp256k1/r1) don't use BY and are skipped.
+ */
+
+#include "barretenberg/ecc/curves/bn254/fq.hpp"
+#include "barretenberg/ecc/curves/bn254/fr.hpp"
+#include "barretenberg/ecc/fields/bernstein_yang_inverse.hpp"
+#include "barretenberg/ecc/fields/field.fuzzer.hpp"
+#include <cassert>
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+using namespace bb;
+using numeric::uint256_t;
+
+// ---------------------------------------------------------------
+// Phase header — same 2-byte layout as multi_field.fuzzer.cpp but restricted
+// to the two 254-bit fields that actually use BY.
+// ---------------------------------------------------------------
+enum class FieldType : uint8_t {
+    BN254_FQ = 0,
+    BN254_FR = 1,
+};
+
+static constexpr size_t NUM_FIELD_TYPES = 2;
+static constexpr size_t MAX_STEPS = 64;
+static constexpr size_t PHASE_HEADER_SIZE = 2;
+
+struct VMPhaseHeader {
+    uint8_t field_type;
+    uint8_t steps;
+};
+static_assert(sizeof(VMPhaseHeader) == 2, "VMPhaseHeader must be 2 bytes");
+
+template <typename Field> static uint256_t reduce_to_modulus(const uint256_t& value)
+{
+    return (value < Field::modulus) ? value : (value % Field::modulus);
+}
+
+template <typename Field>
+static void import_state_with_reduction(FieldVM<Field>& vm, const std::vector<uint256_t>& state)
+{
+    for (size_t i = 0; i < INTERNAL_STATE_SIZE && i < state.size(); i++) {
+        vm.uint_internal_state[i] = reduce_to_modulus<Field>(state[i]);
+        vm.field_internal_state[i] = Field(vm.uint_internal_state[i]);
+    }
+}
+
+// ---------------------------------------------------------------
+// Differential oracle.
+//
+// Fetches `a_raw` (the non-Montgomery integer) from the VM's uint state and
+// computes a^{-1} two ways; aborts on mismatch.
+// ---------------------------------------------------------------
+template <typename Field> static Field raw_to_montgomery(const uint256_t& raw)
+{
+    Field f{ raw.data[0], raw.data[1], raw.data[2], raw.data[3] };
+    f.self_to_montgomery_form();
+    return f;
+}
+
+static void print_limbs(const char* label, const uint256_t& v)
+{
+    std::fprintf(stderr,
+                 "  %s = 0x%016lx%016lx%016lx%016lx\n",
+                 label,
+                 (unsigned long)v.data[3],
+                 (unsigned long)v.data[2],
+                 (unsigned long)v.data[1],
+                 (unsigned long)v.data[0]);
+}
+
+template <typename Field> static void differential_check_inverse(const Field& a_mont, const uint256_t& a_raw)
+{
+    if (a_raw == 0) {
+        return; // 0 has no inverse — skip.
+    }
+
+    // A: Fermat via pow. We bypass field::invert() (which now dispatches into
+    // BY) by calling pow(modulus_minus_two) directly, so the paths are
+    // genuinely independent implementations.
+    Field fermat_inv = a_mont.pow(Field::modulus_minus_two);
+
+    // B, C: Bernstein-Yang safegcd, called with the raw (non-Montgomery) value
+    // on both the Native5x64 (BATCH=62) and Wasm9x29 (BATCH=58) kernels.
+    // Each kernel needs its own p_inv_mod_2^BATCH constant.
+    constexpr uint256_t p_uint = Field::modulus;
+    constexpr uint64_t p_inv_native =
+        bernstein_yang::Native5x64::p_inv_mod_2k_from_montgomery_r_inv(Field::Params::r_inv);
+    constexpr uint64_t p_inv_wasm = bernstein_yang::Wasm9x29::p_inv_mod_2k_from_montgomery_r_inv(Field::Params::r_inv);
+
+    uint256_t native_inv_raw = bernstein_yang::invert_vartime<bernstein_yang::Native5x64>(a_raw, p_uint, p_inv_native);
+    uint256_t wasm_inv_raw = bernstein_yang::invert_vartime<bernstein_yang::Wasm9x29>(a_raw, p_uint, p_inv_wasm);
+
+    Field native_inv = raw_to_montgomery<Field>(native_inv_raw);
+    Field wasm_inv = raw_to_montgomery<Field>(wasm_inv_raw);
+
+    const bool native_ok = (fermat_inv == native_inv);
+    const bool wasm_ok = (fermat_inv == wasm_inv);
+    if (native_ok && wasm_ok) {
+        return;
+    }
+
+    std::fprintf(stderr, "\n[invert_differential.fuzzer] MISMATCH\n");
+    std::fprintf(stderr, "  field:       %s\n", typeid(Field).name());
+    std::fprintf(stderr, "  native_ok:   %s\n", native_ok ? "yes" : "NO");
+    std::fprintf(stderr, "  wasm_ok:     %s\n", wasm_ok ? "yes" : "NO");
+    print_limbs("a_raw     ", a_raw);
+    print_limbs("fermat    ", static_cast<uint256_t>(fermat_inv));
+    print_limbs("BY native ", static_cast<uint256_t>(native_inv));
+    print_limbs("BY wasm   ", static_cast<uint256_t>(wasm_inv));
+    print_limbs("a*fermat  ", static_cast<uint256_t>(a_mont * fermat_inv));
+    print_limbs("a*native  ", static_cast<uint256_t>(a_mont * native_inv));
+    print_limbs("a*wasm    ", static_cast<uint256_t>(a_mont * wasm_inv));
+    std::fflush(stderr);
+    std::abort();
+}
+
+// Pick the last element produced: highest-indexed non-zero slot of the VM's
+// uint state, with a fallback to slot 0 if all slots are zero.
+static size_t last_element_index(const std::vector<uint256_t>& state)
+{
+    for (size_t i = state.size(); i > 0; --i) {
+        if (state[i - 1] != uint256_t(0)) {
+            return i - 1;
+        }
+    }
+    return 0;
+}
+
+template <typename Field>
+static int run_phase_and_diff(const VMPhaseHeader& header,
+                              const unsigned char* data,
+                              size_t size,
+                              size_t& data_offset,
+                              std::vector<uint256_t>& current_state)
+{
+    FieldVM<Field> vm(false, header.steps);
+    if (!current_state.empty()) {
+        import_state_with_reduction<Field>(vm, current_state);
+    }
+    vm.set_max_steps(header.steps);
+    size_t bytes_consumed = vm.run(data + data_offset, size - data_offset, true);
+    if (bytes_consumed == 0) {
+        return 0;
+    }
+
+    if (!vm.check_internal_state()) {
+        // Internal VM invariant violation — not the inverse bug we're looking
+        // for, but still a failure of the driver.  Report and stop.
+        std::fprintf(stderr, "[invert_differential.fuzzer] VM internal state check failed\n");
+        return -1;
+    }
+
+    // Differential inverse check on the last element produced this phase,
+    // plus every non-zero slot in the final state for extra coverage.
+    auto uint_state = vm.export_uint_state();
+    size_t last_idx = last_element_index(uint_state);
+    differential_check_inverse<Field>(vm.field_internal_state[last_idx], uint_state[last_idx]);
+
+    // Extra coverage: also diff every other non-zero slot.  Same check on
+    // many more values per phase, virtually free CPU-wise.
+    for (size_t i = 0; i < uint_state.size(); ++i) {
+        if (i != last_idx && uint_state[i] != uint256_t(0)) {
+            differential_check_inverse<Field>(vm.field_internal_state[i], uint_state[i]);
+        }
+    }
+
+    current_state = uint_state;
+    data_offset += bytes_consumed;
+    return 1;
+}
+
+extern "C" int LLVMFuzzerTestOneInput(const unsigned char* data, size_t size)
+{
+    if (size < PHASE_HEADER_SIZE) {
+        return 0;
+    }
+
+    std::vector<uint256_t> current_state;
+    size_t data_offset = 0;
+
+    while (data_offset + PHASE_HEADER_SIZE <= size) {
+        const VMPhaseHeader* header_ptr = reinterpret_cast<const VMPhaseHeader*>(data + data_offset);
+        VMPhaseHeader header = *header_ptr;
+
+        FieldType selected_field_type = static_cast<FieldType>(header.field_type % NUM_FIELD_TYPES);
+        uint8_t selected_steps = header.steps % MAX_STEPS;
+        if (selected_steps == 0) {
+            selected_steps = 1;
+        }
+        header.field_type = static_cast<uint8_t>(selected_field_type);
+        header.steps = selected_steps;
+
+        int r = 0;
+        switch (selected_field_type) {
+        case FieldType::BN254_FQ:
+            r = run_phase_and_diff<fq>(header, data, size, data_offset, current_state);
+            break;
+        case FieldType::BN254_FR:
+            r = run_phase_and_diff<fr>(header, data, size, data_offset, current_state);
+            break;
+        }
+
+        if (r < 0) {
+            return 1;
+        }
+        if (r == 0) {
+            break;
+        }
+    }
+    return 0;
+}
diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse.hpp b/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse.hpp
new file mode 100644
index 000000000000..4e4190d1bdf9
--- /dev/null
+++ b/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse.hpp
@@ -0,0 +1,369 @@
+// Bernstein-Yang safegcd modular inverse.
+//
+// We want a⁻¹ mod p, where p is an odd prime.  Run an extended binary GCD on
+// the pair (f, g) starting at (p, a), with Bezout coefficients (d, e)
+// satisfying invariants  d·a ≡ f (mod p)  and  e·a ≡ g (mod p)  at all times.
+// When g reaches 0, gcd(p, a) = ±f = ±1 (since p is prime, a ≠ 0), so
+// a⁻¹ = ±d.
+//
+// The cheap operation in a binary GCD is "if g is odd swap-and-subtract to
+// make it even, then divide g by 2"; one such step is a "divstep" and
+// shrinks |g| by ~1 bit.  Doing each divstep on the full 256-bit state would
+// be slow, so we use Pornin's trick: do BATCH divsteps purely on the low 64
+// bits of (f, g), accumulating the resulting linear transform as a 2×2
+// integer matrix M, then apply M to the full-precision (f, g, d, e) in one
+// go (with an exact /2^BATCH at the end).  The (d, e) side needs a 2-adic
+// correction k·p added before dividing so the result stays integer-valued
+// — k is determined by the low BATCH bits of M·(d, e) and p⁻¹ mod 2^BATCH.
+//
+// Native5x64 and Wasm9x29 wrap the platform-specific limb representation
+// behind the same static interface so `invert_vartime` below is a
+// single algorithm for both targets.
+
+#pragma once
+
+#include "barretenberg/numeric/uint256/uint256.hpp"
+#include <cstdint>
+
+namespace bb::bernstein_yang {
+
+using numeric::uint256_t;
+using u64 = uint64_t;
+using i64 = int64_t;
+
+// The transition matrix produced by BATCH divsteps. With the implicit
+// "/ 2^BATCH" at the end of apply_divstep_matrix, it represents the linear
+// map (f, g) ↦ (M·(f, g) / 2^BATCH).  Each divstep doubles one matrix entry,
+// so after BATCH steps |u|, |v|, |q|, |r| ≤ 2^BATCH; they are signed (the
+// swap-and-subtract case introduces negatives).
+struct DivstepMatrix {
+    i64 u, v, q, r;
+};
+
+// 5 × 64-bit limbs, top limb two's-complement signed.  Products via __int128.
+class Native5x64 {
+  public:
+    // Number of divsteps folded into one matrix application.  Bigger BATCH
+    // means fewer matrix applications per inversion but bigger matrix
+    // entries.  Cap is set by the matrix-times-state product staying inside
+    // an __int128 accumulator: a single (i63 entry) × (u64 limb) is 127 bits,
+    // so BATCH ≤ 63; we use 62 to keep one bit of slack for the running sum.
+    static constexpr int BATCH = 62;
+
+    // Worst-case number of matrix applications needed before g must have
+    // reached 0.  The 735-divstep bound for 254-bit inputs is from the BY
+    // paper's convergence proof (rate ≈ 1 / 1.7 bits of g consumed per
+    // divstep).  We pick the smallest NUM_ITERATIONS so NUM_ITERATIONS *
+    // BATCH ≥ 735; ⌈735 / 62⌉ = 12.  The actual loop usually exits much
+    // earlier via the early break on g == 0.
+    static constexpr int NUM_ITERATIONS = 12;
+
+    // The Bezout coefficients (d, e) live mod p but during the iteration we
+    // hold them as signed integers in the state.  Each matrix application grows
+    // |d|, |e| by roughly a factor of 2 (matrix entry × value) plus an
+    // additive p (from the 2-adic correction k·p), so without bringing them
+    // back to [0, p) they would eventually overflow the state.
+    // reduce_to_canonical does that subtract-or-add-p reduction.  Calling it
+    // every iter is wasteful — the 5×64-bit signed state has enough room to
+    // let |d|, |e| reach ~2^K · p before they no longer fit, so we only
+    // reduce every REDUCE_INTERVAL = 4 iters (|d|, |e| stay ≤ ~32p between
+    // reductions, plenty of headroom).
+    static constexpr int REDUCE_INTERVAL = 4;
+
+    // Worst-case iteration cap inside reduce_to_canonical.  After
+    // REDUCE_INTERVAL iters between reductions, |d|, |e| ≤ (2^(REDUCE_INTERVAL+1) - 1)·p,
+    // so reducing requires that many subtractions plus one break iter.
+    static constexpr int REDUCE_TO_CANONICAL_MAX_ITERS = 36;
+    static_assert((1U << (REDUCE_INTERVAL + 1)) <= REDUCE_TO_CANONICAL_MAX_ITERS,
+                  "REDUCE_INTERVAL too large for reduce_to_canonical iteration bound");
+
+    Native5x64() noexcept
+        : l{}
+    {}
+    explicit Native5x64(const uint256_t& x) noexcept
+        : l{ x.data[0], x.data[1], x.data[2], x.data[3], 0 }
+    {}
+    static Native5x64 one() noexcept
+    {
+        Native5x64 r;
+        r.l[0] = 1;
+        return r;
+    }
+
+    uint256_t to_uint256() const noexcept { return { l[0], l[1], l[2], l[3] }; }
+    u64 low_64() const noexcept { return l[0]; }
+    bool is_zero() const noexcept { return (l[0] | l[1] | l[2] | l[3] | l[4]) == 0; }
+    bool is_negative() const noexcept { return (i64)l[4] < 0; }
+    void neg() noexcept
+    {
+        u64 c = 1;
+        for (int i = 0; i < N; ++i) {
+            u64 v = (~l[i]) + c;
+            c = (c && v == 0) ? 1 : 0;
+            l[i] = v;
+        }
+    }
+    // Iter cap chosen by the REDUCE_TO_CANONICAL_MAX_ITERS / REDUCE_INTERVAL
+    // static_assert above; see those constants for the magnitude argument.
+    void reduce_to_canonical(const Native5x64& p) noexcept
+    {
+        for (int it = 0; it < REDUCE_TO_CANONICAL_MAX_ITERS; ++it) {
+            if (is_negative()) {
+                add_inplace(p);
+            } else if (ge(p)) {
+                sub_inplace(p);
+            } else {
+                break;
+            }
+        }
+    }
+
+    // BATCH branchy divsteps on the low 64 bits of (f, g); returns the
+    // transition matrix M and updates δ.  Variable-time over the inner
+    // branches — non-secret inputs only.
+    static DivstepMatrix compute_divstep_matrix(i64& delta, u64 f_lo, u64 g_lo) noexcept;
+    // (f, g) ← M·(f, g) / 2^BATCH and (d, e) ← (M·(d, e) + k·p) / 2^BATCH,
+    // where k_i = -((M·(d, e))_i · p⁻¹) mod 2^BATCH (the 2-adic correction
+    // that makes (M·(d, e))_i + k_i·p divisible by 2^BATCH).
+    static void apply_divstep_matrix(const DivstepMatrix& m,
+                                     Native5x64& f,
+                                     Native5x64& g,
+                                     Native5x64& d,
+                                     Native5x64& e,
+                                     const Native5x64& p,
+                                     u64 p_inv_mod_2k) noexcept;
+    // r_inv = -p⁻¹ mod 2^64 (barretenberg's Montgomery constant), so p⁻¹ mod
+    // 2^BATCH is the low BATCH bits of -r_inv.
+    static constexpr u64 p_inv_mod_2k_from_montgomery_r_inv(u64 r_inv) noexcept
+    {
+        // r_inv = -p^{-1} mod 2^64, so 0 - r_inv = p^{-1} mod 2^64.
+        return (0ULL - r_inv) & ((1ULL << BATCH) - 1);
+    }
+
+  private:
+    static constexpr int N = 5;
+    u64 l[N];
+
+    void add_inplace(const Native5x64& b) noexcept
+    {
+        u64 c = 0;
+        for (int i = 0; i < N; ++i) {
+            __uint128_t s = (__uint128_t)l[i] + b.l[i] + c;
+            l[i] = (u64)s;
+            c = (u64)(s >> 64);
+        }
+    }
+    void sub_inplace(const Native5x64& b) noexcept
+    {
+        u64 borrow = 0;
+        for (int i = 0; i < N; ++i) {
+            __uint128_t s = (__uint128_t)l[i] - (__uint128_t)b.l[i] - borrow;
+            l[i] = (u64)s;
+            borrow = ((i64)(s >> 64) < 0) ? 1 : 0;
+        }
+    }
+    bool ge(const Native5x64& b) const noexcept
+    {
+        i64 a_top = (i64)l[N - 1], b_top = (i64)b.l[N - 1];
+        if (a_top != b_top) {
+            return a_top > b_top;
+        }
+        for (int i = N - 2; i >= 0; --i) {
+            if (l[i] != b.l[i]) {
+                return l[i] > b.l[i];
+            }
+        }
+        return true;
+    }
+
+    friend struct NativeMatrix;
+};
+
+// 6-limb signed product helpers used by Native5x64::apply_divstep_matrix.
+struct NativeMatrix {
+    static void signed_linear_combination(i64 a, const Native5x64& x, i64 b, const Native5x64& y, u64 out[6]) noexcept
+    {
+        __int128 c = 0;
+        for (int i = 0; i < 4; ++i) {
+            c += (__int128)a * (__int128)(u64)x.l[i] + (__int128)b * (__int128)(u64)y.l[i];
+            out[i] = (u64)c;
+            c >>= 64;
+        }
+        c += (__int128)a * (__int128)(i64)x.l[4] + (__int128)b * (__int128)(i64)y.l[4];
+        out[4] = (u64)c;
+        out[5] = (u64)(c >> 64);
+    }
+    // Sign-preserving exact /2^BATCH on the 6-limb signed `t`.  Sign of the
+    // result lives in bit 63 of r.l[4], which is bit 61 of t[5].  This is the
+    // sign of t iff t[5] is just sign-extension of the actual magnitude — i.e.,
+    // iff |t| < 2^319.  BY guarantees this: |M·(x,y) + k·p| ≤ 2^324 in the
+    // (d,e) row and ≤ 2^319 in the (f,g) row, both with |result/2^62| < 2^263 < 2^319.
+    // A future change widening the matrix entries or state without re-running
+    // this analysis will silently corrupt the sign bit.
+    static Native5x64 arithmetic_shift_by_batch(const u64 t[6]) noexcept
+    {
+        Native5x64 r;
+        for (int i = 0; i < 4; ++i) {
+            r.l[i] = (t[i] >> 62) | (t[i + 1] << 2);
+        }
+        r.l[4] = (t[4] >> 62) | (t[5] << 2);
+        return r;
+    }
+};
+
+// Each inner step shrinks |g| by ~1 bit using a binary-GCD-style move.
+// Three cases, depending on g's parity and the "δ" tracker (which decides
+// whether |f| or |g| is currently smaller):
+//   g even         : g ← g/2.
+//   g odd, δ ≤ 0   : g ← (g + f)/2  (adding f to make g+f even before /2).
+//   g odd, δ > 0   : swap roles — (f, g) ← (g, (g - f)/2).
+// The matrix (u, v, q, r) tracks the same linear transform applied
+// symbolically; doubling u, v (or q, r) corresponds to the implicit /2 each
+// inner step performs.  After BATCH steps the low BATCH bits of the
+// transformed state are guaranteed zero, so apply_divstep_matrix's implicit
+// "/ 2^BATCH" is an exact integer division.
+inline DivstepMatrix Native5x64::compute_divstep_matrix(i64& delta, u64 f_lo, u64 g_lo) noexcept
+{
+    i64 u = 1, v = 0, q = 0, r = 1;
+    for (int i = 0; i < BATCH; ++i) {
+        if (g_lo & 1) {
+            if (delta > 0) {
+                u64 nf = g_lo, ng = (g_lo - f_lo) >> 1;
+                i64 nu = q << 1, nv = r << 1, nq = q - u, nr = r - v;
+                f_lo = nf;
+                g_lo = ng;
+                u = nu;
+                v = nv;
+                q = nq;
+                r = nr;
+                delta = 1 - delta;
+            } else {
+                g_lo = (g_lo + f_lo) >> 1;
+                q = q + u;
+                r = r + v;
+                u <<= 1;
+                v <<= 1;
+                delta = delta + 1;
+            }
+        } else {
+            g_lo >>= 1;
+            u <<= 1;
+            v <<= 1;
+            delta = delta + 1;
+        }
+    }
+    return { u, v, q, r };
+}
+
+inline void Native5x64::apply_divstep_matrix(const DivstepMatrix& m,
+                                             Native5x64& f,
+                                             Native5x64& g,
+                                             Native5x64& d,
+                                             Native5x64& e,
+                                             const Native5x64& p,
+                                             u64 p_inv_mod_2k) noexcept
+{
+    constexpr u64 MASK_BATCH = (1ULL << BATCH) - 1;
+
+    u64 nf[6], ng[6];
+    NativeMatrix::signed_linear_combination(m.u, f, m.v, g, nf);
+    NativeMatrix::signed_linear_combination(m.q, f, m.r, g, ng);
+    f = NativeMatrix::arithmetic_shift_by_batch(nf);
+    g = NativeMatrix::arithmetic_shift_by_batch(ng);
+
+    // k = -t · p_inv_mod_2k mod 2^BATCH makes t + k·p divisible by 2^BATCH.
+    auto apply_corrected_row = [&](i64 a, const Native5x64& da, i64 b, const Native5x64& eb, Native5x64& out) {
+        u64 t[6];
+        NativeMatrix::signed_linear_combination(a, da, b, eb, t);
+        u64 k = ((0ULL - t[0]) * p_inv_mod_2k) & MASK_BATCH;
+        u64 kp[6] = {};
+        u64 carry = 0;
+        for (int i = 0; i < 5; ++i) {
+            __uint128_t prod = (__uint128_t)k * (u64)p.l[i] + carry;
+            kp[i] = (u64)prod;
+            carry = (u64)(prod >> 64);
+        }
+        kp[5] = carry;
+        u64 c = 0;
+        for (int i = 0; i < 6; ++i) {
+            __uint128_t s = (__uint128_t)t[i] + kp[i] + c;
+            t[i] = (u64)s;
+            c = (u64)(s >> 64);
+        }
+        out = NativeMatrix::arithmetic_shift_by_batch(t);
+    };
+    Native5x64 nd, ne;
+    apply_corrected_row(m.u, d, m.v, e, nd);
+    apply_corrected_row(m.q, d, m.r, e, ne);
+    d = nd;
+    e = ne;
+}
+
+} // namespace bb::bernstein_yang
+
+#include "./bernstein_yang_inverse_wasm.hpp"
+
+namespace bb::bernstein_yang {
+
+#if defined(__wasm__)
+using State = Wasm9x29;
+#else
+using State = Native5x64;
+#endif
+
+/**
+ * @brief Variable-time safegcd inverse (Bernstein-Yang TCHES 2019, Pornin 2020 §4).
+ *
+ * Iterates (f, g) starting at (p, a); each outer iter batches BATCH divsteps
+ * into a 2×2 matrix M and applies M to (f, g) / (d, e).  When g reaches 0,
+ * gcd(p, a) = ±f and a⁻¹ = ±d mod p.  Returns 0 for a == 0.
+ *
+ * @param p_inv_mod_2k  p⁻¹ mod 2^BATCH (used by apply_divstep_matrix's 2-adic correction).
+ * @pre p odd prime, p < 2^255, 0 ≤ a < p.
+ */
+template <class S = State>
+inline uint256_t invert_vartime(const uint256_t& a, const uint256_t& p, u64 p_inv_mod_2k) noexcept
+{
+    if (a == uint256_t(0)) {
+        return uint256_t(0);
+    }
+    S P(p), f = P, g(a), d, e = S::one();
+    // δ is Pornin's auxiliary used by the divstep rule to decide swap-vs-add cases.
+    i64 delta = 1;
+    for (int i = 0; i < S::NUM_ITERATIONS; ++i) {
+        DivstepMatrix m = S::compute_divstep_matrix(delta, f.low_64(), g.low_64());
+        S::apply_divstep_matrix(m, f, g, d, e, P, p_inv_mod_2k);
+        if (g.is_zero()) {
+            break;
+        }
+        if ((i + 1) % S::REDUCE_INTERVAL == 0) {
+            d.reduce_to_canonical(P);
+            e.reduce_to_canonical(P);
+        }
+    }
+    d.reduce_to_canonical(P);
+    if (f.is_negative()) {
+        d.neg();
+        d.reduce_to_canonical(P);
+    }
+    return d.to_uint256();
+}
+
+inline constexpr u64 p_inv_mod_2k_from_montgomery_r_inv(u64 r_inv) noexcept
+{
+    return State::p_inv_mod_2k_from_montgomery_r_inv(r_inv);
+}
+
+// True iff `invert_vartime` is usable for field params T: the active
+// kernel must be compilable on this toolchain (Native5x64 needs __int128, the
+// WASM kernel is unconditional) and T's modulus must fit BY's < 2^255
+// precondition.  Used to gate the dispatch in `field::invert()`.
+template <class T>
+inline constexpr bool supported_v =
+#if defined(__SIZEOF_INT128__) || defined(__wasm__)
+    T::modulus_3 < (1ULL << 63);
+#else
+    false;
+#endif
+
+} // namespace bb::bernstein_yang
diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse.md b/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse.md
new file mode 100644
index 000000000000..c03ca5c5bd0e
--- /dev/null
+++ b/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse.md
@@ -0,0 +1,392 @@
+# Bernstein–Yang modular inverse: the algorithm
+
+A description of the modular inverse algorithm implemented in `bernstein_yang_inverse.hpp`.
+
+Throughout, $p$ is an **odd prime**, $0 < a < p$, and we want
+$$a^{-1} \bmod p.$$
+
+All quantities are integers; reductions modulo $p$ are made explicit when used.
+
+---
+
+## 1. Setup
+
+We track a $2 \times 2$ integer matrix
+
+$$
+\Phi \;=\; \begin{pmatrix} f & d \\ g & e \end{pmatrix}
+$$
+
+initialised at
+
+$$
+\Phi_0 \;=\; \begin{pmatrix} p & 0 \\ a & 1 \end{pmatrix}, \qquad \det \Phi_0 \;=\; p.
+$$
+
+### The kernel invariant
+
+The whole algorithm preserves a single congruence:
+
+$$
+\boxed{\quad \Phi \begin{pmatrix} 1 \\ -a \end{pmatrix} \;\equiv\; \begin{pmatrix} 0 \\ 0 \end{pmatrix} \pmod p. \quad}
+$$
+
+Equivalently, the column vector $(1, -a)^T \in \mathbb{F}_p^2$ is in the kernel
+of $\Phi \bmod p$. Unpacking entry by entry, this is just two Bezout congruences:
+
+$$
+f \;\equiv\; d \cdot a \pmod p, \qquad g \;\equiv\; e \cdot a \pmod p.
+$$
+
+The initial state satisfies them ($p \equiv 0 \cdot a$, $a \equiv 1 \cdot a$).
+
+### The reduction goal
+
+We reduce $\Phi$ to upper triangular form:
+
+$$
+\Phi_N \;=\; \begin{pmatrix} \pm 1 & a^{-1} \bmod p \\ 0 & \star \end{pmatrix}.
+$$
+
+When $g = 0$, the kernel invariant gives $e a \equiv 0 \pmod p$,
+hence (since $\gcd(a, p) = 1$) $p \mid e$, putting the bottom-right at a
+multiple of $p$. Independently, $f$ at termination equals $\pm \gcd(p, a) = \pm 1$,
+and the invariant collapses to $d a \equiv \pm 1 \pmod p$, giving
+$a^{-1} \equiv \pm d \pmod p$. The sign is read off the top-left entry.
+
+So **the BY algorithm is a reduction of $\Phi_0$ to upper triangular form, with
+the inverse appearing as the top-right entry.**
+
+---
+
+## 2. The generators $L_a, L_b, L_c$
+
+The algorithm proceeds by repeated left multiplication: $\Phi \to L_n \Phi$,
+choosing $L_n$ from a fixed set of three matrices:
+
+$$
+L_a \;=\; \begin{pmatrix} 1 & 0 \\ 0 & \tfrac{1}{2} \end{pmatrix}, \qquad
+L_b \;=\; \begin{pmatrix} 1 & 0 \\ \tfrac{1}{2} & \tfrac{1}{2} \end{pmatrix}, \qquad
+L_c \;=\; \begin{pmatrix} 0 & 1 \\ -\tfrac{1}{2} & \tfrac{1}{2} \end{pmatrix}.
+$$
+
+Each has $\det L_n = \tfrac{1}{2}$, so $L_n \in \mathrm{GL}_2(\mathbb{Z}[\tfrac{1}{2}])$
+but not in $\mathrm{SL}_2$. They act on the rows of $\Phi$: equivalently, on
+$(f, g)$ and on $(d, e)$ in lockstep.
+
+### Divstep
+
+A small auxiliary integer $\delta$ is carried alongside $\Phi$, initialised at
+$\delta_0 = 1$. At every step, the parity of $g$ together with the sign of
+$\delta$ dictates which generator to apply:
+
+| Condition | Generator | $\delta$ update |
+| --- | --- | --- |
+| $g$ even | $L_a$ | $\delta \leftarrow \delta + 1$ |
+| $g$ odd, $\delta \le 0$ | $L_b$ | $\delta \leftarrow \delta + 1$ |
+| $g$ odd, $\delta > 0$ | $L_c$ | $\delta \leftarrow 1 - \delta$ |
+
+The convention $f$ odd is maintained throughout: $p$ is odd, the $L_n$ preserve
+$f \bmod 2$, so $f$ stays odd until termination.
+
+One application of this rule is one **divstep**. Each divstep multiplies $\det \Phi$
+by $\tfrac{1}{2}$ (so after $N$ divsteps $\det \Phi_N = p / 2^N$) and shrinks
+the magnitude of the lower-left entry $g$ on average.
+
+### Why the case split makes sense
+
+Unpacking $L_a, L_b, L_c$ as row operations on $(f, g)$:
+
+- $L_a$: $g \leftarrow g/2$. Valid only when $g$ is even, otherwise the result is non-integer.
+- $L_b$: $g \leftarrow (g + f)/2$. Used when $g$ is odd but $|g| \le |f|$ in spirit (tracked by $\delta \le 0$): adding $f$ first makes the sum even.
+- $L_c$: $(f, g) \leftarrow (g, (g - f)/2)$. Swap-and-subtract; used when $g$ is odd and $|g| > |f|$ in spirit ($\delta > 0$).
+
+The "in spirit" caveats are because BY tracks $|f|$ vs. $|g|$ comparisons via the
+integer $\delta$ rather than by an actual size comparison — $\delta$ effectively
+measures the running deficit "extra divisions of $g$ over extra divisions of $f$."
+
+### Right-column $p$-shift
+
+Left multiplication by $L_n$ alone would leave $\Phi$ with non-integer entries
+in the right column (because $L_n$ has $\tfrac{1}{2}$ entries acting on
+$(d, e)$). To keep the state integer-valued we add a multiple of $p$ to the
+right column before halving — a *2-adic correction* that vanishes modulo $p$
+and hence doesn't disturb the kernel invariant. The mechanics live in §6.
+
+So one full divstep is:
+$$
+\Phi \;\longmapsto\; L_n \,\Phi \;+\; p \cdot \big(\text{adjustment in right column}\big), \qquad L_n \in \{L_a, L_b, L_c\}.
+$$
+
+The left column ($f, g$) updates via clean left multiplication; the right
+column ($d, e$) updates via left multiplication plus a $p$-shift.
+
+---
+
+## 3. Convergence
+
+The Bernstein--Yang/Pornin convergence proof guarantees $g = 0$ within 735 divsteps for 254-bit inputs; native runs $12 \cdot 62 = 744$ divsteps and wasm runs $13 \cdot 58 = 754$, so both cover the bound.
+
+---
+
+## 4. Batching: products of $L_n$ as a $2 \times 2$ matrix
+
+A single divstep touches only the lowest bit of $g$ (to read parity) and is a
+linear combination of $(f, g)$. So the next $\mathrm{BATCH}$ divsteps depend
+only on the low $\mathrm{BATCH}$ bits of $(f, g)$ — the high bits do not
+influence the choice between $L_a, L_b, L_c$ within those steps.
+
+Let $\mathrm{BATCH} = 62$ (the implementation choice on native; $58$ on wasm).
+Run $\mathrm{BATCH}$ divsteps purely on the low $64$ bits of $(f, g)$,
+accumulating the result as a single $2 \times 2$ rational matrix
+
+$$
+M \;=\; L_{n_{\mathrm{BATCH}-1}} \cdots L_{n_1} L_{n_0} \;=\; \begin{pmatrix} u & v \\ q & r \end{pmatrix} \cdot 2^{-\mathrm{BATCH}}.
+$$
+
+After clearing the implicit denominator, the *integer* part $\begin{pmatrix} u & v \\ q & r \end{pmatrix} = 2^{\mathrm{BATCH}} M$ has all four entries bounded by $|u|, |v|, |q|, |r| \le 2^{\mathrm{BATCH}}$ (each individual $L_n$ at most doubles one entry of the running product). With $\mathrm{BATCH} = 62$, the four entries fit in `int64_t`.
+
+---
+
+## 5. Applying $M$ to the left column $(f, g)$
+
+The new $(f', g')^T = M \cdot (f, g)^T$ is computed as
+
+1. The two integer linear combinations
+   $$t_f = u \cdot f + v \cdot g, \qquad t_g = q \cdot f + r \cdot g$$
+   in full precision. With $|u|, \ldots, |r| \le 2^{62}$ and $|f|, |g|$
+   fitting in $256$ bits (see §7), each product fits in $318$ bits and the
+   sum in $319$ bits.
+2. Arithmetic-right-shift by $\mathrm{BATCH}$ bits:
+   $$f' \;=\; t_f \gg \mathrm{BATCH}, \qquad g' \;=\; t_g \gg \mathrm{BATCH}.$$
+
+The shift is exact: the low $\mathrm{BATCH}$ bits of $t_f$ and $t_g$ are zero
+by construction. Each divstep makes one bit of $g$ cancel, and after
+$\mathrm{BATCH}$ divsteps the bottom $\mathrm{BATCH}$ bits of the linear
+combinations are guaranteed zero — this is the "exact integer division"
+property of the divstep cascade.
+
+In the implementation this lives in `NativeMatrix::signed_linear_combination` (the
+products) and `NativeMatrix::arithmetic_shift_by_batch` (the right shift).
+
+---
+
+## 6. Applying $M$ to the right column $(d, e)$: the 2-adic correction
+
+Applying $M$ to $(d, e)$ presents the same divisibility question, but now the
+state lives modulo $p$. The integer linear combination $t = u \cdot d + v \cdot e$
+is **not** generally divisible by $2^{\mathrm{BATCH}}$. We need an integer
+correction $k$ such that $t + k \cdot p$ is divisible by $2^{\mathrm{BATCH}}$,
+then take the quotient.
+
+This is a 2-adic problem:
+
+$$
+t + k\cdot p \;\equiv\; 0 \pmod{2^{\mathrm{BATCH}}}
+\;\Longleftrightarrow\;
+k \;\equiv\; -\, t \cdot p^{-1} \pmod{2^{\mathrm{BATCH}}}.
+$$
+
+$p$ is odd, so $p^{-1} \bmod 2^{\mathrm{BATCH}}$ exists. The implementation
+precomputes it once: $p^{-1} \bmod 2^{\mathrm{BATCH}}$ is the low
+$\mathrm{BATCH}$ bits of $-r_{\mathrm{inv}}$, where $r_{\mathrm{inv}}$ is
+barretenberg's Montgomery constant $-p^{-1} \bmod 2^{64}$.
+
+The correction step:
+
+$$
+\begin{aligned}
+t  &\;=\; u \cdot d + v \cdot e, \\
+k  &\;\equiv\; -\, t \cdot p^{-1} \pmod{2^{\mathrm{BATCH}}}, \\
+d' &\;=\; (t + k \cdot p) \gg \mathrm{BATCH}.
+\end{aligned}
+$$
+
+After the shift, $d'$ is an integer (because $t + k \cdot p$ is divisible by
+$2^{\mathrm{BATCH}}$). The added $k \cdot p$ contributes only a multiple of $p$
+to the right column, so the kernel invariant $\Phi \binom{1}{-a} \equiv 0 \pmod p$
+is preserved.
+
+The same construction applied with $(q, r)$ and the second row of $M$ gives
+the new $e'$. This is `apply_corrected_row` in the implementation.
+
+---
+
+## 7. State bounds: why 5 limbs
+
+`Native5x64` stores each state value in five signed 64-bit limbs. Four limbs
+would be enough for canonical field elements, but the BY state is not always
+canonical while a batched matrix is being applied.
+
+For the left column, $f$ and $g$ are roughly 256-bit values between matrix
+applications. During §5, however, the products $u f$, $v g$, $q f$, and $r g$
+can be as large as $2^{62} \cdot 2^{256}$, and their sums need about 319 bits.
+Those sums are held in the temporary six-limb arrays inside
+`NativeMatrix::signed_linear_combination`; after the exact right shift by
+`BATCH`, $f$ and $g$ return to the normal state size.
+
+For the right column, $d$ and $e$ need extra resting headroom too. The 2-adic
+correction adds $k \cdot p$ before the shift, and repeated matrix applications
+can move   00,p)1 The implementation therefore reduces them
+to canonical form every `REDUCE_INTERVAL = 4` iterations rather than after every
+matrix application.
+
+Between reductions, $|d|$ and $|e|$ are bounded by roughly $32p$. Five signed
+64-bit limbs provide about 319 bits of magnitude, enough room for that growth.
+`Native5x64::reduce_to_canonical(p)` then repeatedly adds or subtracts $p$,
+with a loop bound of 36 to cover the same worst-case headroom.
+
+---
+
+## 8. Final answer extraction
+
+After the iteration loop:
+
+- $g = 0$.
+- $f = \pm \gcd(p, a) = \pm 1$.
+- Kernel invariant: $d \cdot a \equiv f \pmod{p}$, so $d \equiv \pm a^{-1} \pmod p$.
+
+Reduce $d$ to canonical form $[0, p)$. Then
+
+$$
+a^{-1} \bmod p \;=\;
+\begin{cases}
+d & \text{if } f > 0, \\
+{-d} \bmod p & \text{if } f < 0.
+\end{cases}
+$$
+
+In code, this is the
+`if (f.is_negative()) { d.neg(); d.reduce_to_canonical(P); }` at the end of
+`invert_vartime`.
+
+---
+
+## 9. Math-to-code map
+
+The implementation keeps the same mathematical state but hides the limb details
+behind `Native5x64` and `Wasm9x29`.
+
+| Math description | Code |
+| --- | --- |
+| $\Phi = \begin{pmatrix} f & d \\ g & e \end{pmatrix}$ | `S P(p), f = P, g(a), d, e = S::one()` in `invert_vartime` |
+| Auxiliary divstep counter $\delta$ | local `i64 delta` |
+| Product of one batch of divsteps | `S::compute_divstep_matrix(delta, f.low_64(), g.low_64())` |
+| $M = \begin{pmatrix} u & v \\ q & r \end{pmatrix} 2^{-B}$ | `DivstepMatrix { u, v, q, r }`; the denominator is the implicit `2^BATCH` |
+| $M(f,g)/2^B$ | `apply_divstep_matrix`, using `NativeMatrix::signed_linear_combination` and `arithmetic_shift_by_batch` on native |
+| $k \equiv -t p^{-1} \pmod {2^B}$ | `apply_corrected_row` on native; streamed as `k_d`, `k_e` on wasm |
+| $p^{-1} \bmod 2^B$ from Montgomery metadata | `p_inv_mod_2k_from_montgomery_r_inv` |
+| Periodic reduction of $d,e$ modulo $p$ | `reduce_to_canonical(P)` every `REDUCE_INTERVAL` iterations |
+| Final sign correction $a^{-1} = \pm d$ | final `if (f.is_negative()) { d.neg(); ... }` |
+| Platform-specific limb representation | `using State = Native5x64` or `Wasm9x29` |
+
+---
+
+## 10. Example
+
+This example uses the same batched mechanics as the implementation, but with a toy $B = 3$ instead of native $B = 62$.
+
+Let
+
+$$
+p = 17, \qquad a = 3, \qquad
+\Phi_0 = \begin{pmatrix} f & d \\ g & e \end{pmatrix}
+       = \begin{pmatrix} 17 & 0 \\ 3 & 1 \end{pmatrix}, \qquad
+\delta_0 = 1.
+$$
+
+Since $17 \equiv 1 \pmod 8$, we have $p^{-1} \equiv 1 \pmod 8$. For each row
+of the right column, the correction is
+
+$$
+k \equiv -t \cdot p^{-1} \pmod 8,
+\qquad
+x^\prime = (t + k p) / 8.
+$$
+
+### Batch 0
+
+The first three divsteps are $L_c, L_b, L_a$. Their product is
+
+$$
+M_0 = \begin{pmatrix} 0 & 8 \\ -1 & 3 \end{pmatrix} \cdot 2^{-3}.
+$$
+
+Apply it to the left column:
+
+$$
+f^\prime = \frac{0 \cdot 17 + 8 \cdot 3}{8} = 3,
+\qquad
+g^\prime = \frac{-1 \cdot 17 + 3 \cdot 3}{8} = -1.
+$$
+
+Apply it to the right column. For $d^\prime$, $t = 0 \cdot 0 + 8 \cdot 1 = 8$,
+so $k = 0$ and $d^\prime = 1$. For $e^\prime$, $t = -1 \cdot 0 + 3 \cdot 1 = 3$,
+so $k \equiv -3 \equiv 5 \pmod 8$ and
+
+$$
+e^\prime = \frac{3 + 5 \cdot 17}{8} = 11.
+$$
+
+After batch 0,
+
+$$
+(f,g,d,e;\delta) = (3,-1,1,11;2).
+$$
+
+### Batch 1
+
+The next three divsteps are $L_c, L_a, L_b$, giving
+
+$$
+M_1 = \begin{pmatrix} 0 & 8 \\ -1 & 5 \end{pmatrix} \cdot 2^{-3}.
+$$
+
+For the left column,
+
+$$
+f^\prime = \frac{0 \cdot 3 + 8 \cdot (-1)}{8} = -1,
+\qquad
+g^\prime = \frac{-1 \cdot 3 + 5 \cdot (-1)}{8} = -1.
+$$
+
+For the right column, the first row gives $t = 88$ and $k = 0$, hence
+$d^\prime = 11$. The second row gives $t = -1 \cdot 1 + 5 \cdot 11 = 54$,
+so $k \equiv -54 \equiv 2 \pmod 8$ and
+
+$$
+e^\prime = \frac{54 + 2 \cdot 17}{8} = 11.
+$$
+
+After batch 1,
+
+$$
+(f,g,d,e;\delta) = (-1,-1,11,11;1).
+$$
+
+### Batch 2
+
+The next batch starts with the terminating divstep $L_c$; the remaining two
+low-bit divsteps keep $g = 0$. The accumulated matrix is
+
+$$
+M_2 = \begin{pmatrix} 0 & 8 \\ -1 & 1 \end{pmatrix} \cdot 2^{-3}.
+$$
+
+Applying it gives
+
+$$
+f^\prime = \frac{0 \cdot (-1) + 8 \cdot (-1)}{8} = -1,
+\qquad
+g^\prime = \frac{-1 \cdot (-1) + 1 \cdot (-1)}{8} = 0,
+$$
+
+and for the right column, $d^\prime = 11$ and $e^\prime = 0$.
+
+Now $g=0$. Since $f=-1$, the inverse is $-d \bmod p$:
+
+$$
+a^{-1} \equiv -11 \equiv 6 \pmod {17},
+\qquad
+3 \cdot 6 \equiv 1 \pmod {17}.
+$$
diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse.test.cpp
new file mode 100644
index 000000000000..35f65e5c56fe
--- /dev/null
+++ b/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse.test.cpp
@@ -0,0 +1,150 @@
+#include "barretenberg/ecc/fields/bernstein_yang_inverse.hpp"
+#include "barretenberg/ecc/curves/bn254/fq.hpp"
+#include "barretenberg/ecc/curves/bn254/fr.hpp"
+#include "barretenberg/ecc/curves/secp256k1/secp256k1.hpp"
+#include "barretenberg/ecc/curves/secp256r1/secp256r1.hpp"
+#include "barretenberg/ecc/fields/bernstein_yang_inverse_wasm.hpp"
+#include "barretenberg/numeric/uint256/uint256.hpp"
+#include <gtest/gtest.h>
+
+namespace {
+
+using bb::numeric::uint256_t;
+using Native = bb::bernstein_yang::Native5x64;
+using Wasm = bb::bernstein_yang::Wasm9x29;
+
+template <class F> uint256_t to_raw(const F& a)
+{
+    F nonmont = a.from_montgomery_form_reduced();
+    return { nonmont.data[0], nonmont.data[1], nonmont.data[2], nonmont.data[3] };
+}
+
+template <class F> F from_raw(const uint256_t& a)
+{
+    F r{ a.data[0], a.data[1], a.data[2], a.data[3] };
+    r.self_to_montgomery_form();
+    return r;
+}
+
+template <class S, class F> uint256_t by_invert(const F& a)
+{
+    constexpr uint256_t p = F::modulus;
+    constexpr uint64_t p_inv_mod_2k = S::p_inv_mod_2k_from_montgomery_r_inv(F::Params::r_inv);
+    return bb::bernstein_yang::invert_vartime<S>(to_raw(a), p, p_inv_mod_2k);
+}
+
+// Random a, BY result roundtripped through Montgomery must invert a.
+template <class S, class F> void check_inverse_matches_modexp(size_t n)
+{
+    for (size_t i = 0; i < n; ++i) {
+        F a = F::random_element();
+        if (a == F::zero()) {
+            continue;
+        }
+        F got = from_raw<F>(by_invert<S>(a));
+        EXPECT_EQ(got * a, F::one()) << "iteration " << i;
+    }
+}
+
+// Same input through both kernels must produce identical canonical output.
+template <class F> void check_native_matches_wasm(size_t n)
+{
+    for (size_t i = 0; i < n; ++i) {
+        F a = F::random_element();
+        if (a == F::zero()) {
+            continue;
+        }
+        EXPECT_EQ(by_invert<Native>(a), by_invert<Wasm>(a)) << "iteration " << i;
+    }
+}
+
+template <class S, class F> void check_edge_cases()
+{
+    // invert(1) == 1
+    EXPECT_EQ(from_raw<F>(by_invert<S>(F::one())), F::one());
+
+    // invert(-1) == -1
+    F neg_one = -F::one();
+    EXPECT_EQ(from_raw<F>(by_invert<S>(neg_one)), neg_one);
+
+    // a * invert(a) == 1 for small / boundary / sparse inputs
+    F two = F::one() + F::one();
+    F neg_two = -two;
+    F top_limb_only{ 0, 0, 0, 1 }; // raw value 2^192, well below modulus for 254-bit fields
+    top_limb_only.self_to_montgomery_form();
+    for (const F& a : { two, neg_two, top_limb_only }) {
+        F inv = from_raw<F>(by_invert<S>(a));
+        EXPECT_EQ(inv * a, F::one());
+    }
+
+    // Involution: invert(invert(a)) == a, on random samples.
+    for (int i = 0; i < 64; ++i) {
+        F a = F::random_element();
+        if (a == F::zero()) {
+            continue;
+        }
+        F inv = from_raw<F>(by_invert<S>(a));
+        F inv_inv = from_raw<F>(by_invert<S>(inv));
+        EXPECT_EQ(inv_inv, a);
+    }
+}
+
+} // namespace
+
+TEST(Wasm9x29, MatchesModexp_BN254_Fr)
+{
+    check_inverse_matches_modexp<Wasm, bb::fr>(500);
+}
+TEST(Wasm9x29, MatchesModexp_BN254_Fq)
+{
+    check_inverse_matches_modexp<Wasm, bb::fq>(500);
+}
+
+TEST(Native5x64, MatchesModexp_BN254_Fr)
+{
+    check_inverse_matches_modexp<Native, bb::fr>(500);
+}
+TEST(Native5x64, MatchesModexp_BN254_Fq)
+{
+    check_inverse_matches_modexp<Native, bb::fq>(500);
+}
+
+TEST(Wasm9x29, EdgeCases_BN254_Fr)
+{
+    check_edge_cases<Wasm, bb::fr>();
+}
+TEST(Wasm9x29, EdgeCases_BN254_Fq)
+{
+    check_edge_cases<Wasm, bb::fq>();
+}
+TEST(Native5x64, EdgeCases_BN254_Fr)
+{
+    check_edge_cases<Native, bb::fr>();
+}
+TEST(Native5x64, EdgeCases_BN254_Fq)
+{
+    check_edge_cases<Native, bb::fq>();
+}
+
+TEST(BernsteinYang, NativeMatchesWasm_BN254_Fr)
+{
+    check_native_matches_wasm<bb::fr>(500);
+}
+TEST(BernsteinYang, NativeMatchesWasm_BN254_Fq)
+{
+    check_native_matches_wasm<bb::fq>(500);
+}
+
+// 256-bit moduli must keep working through field::invert() via the Fermat
+// fallback (the BY dispatch is gated by `modulus_3 < 2^63`, which excludes
+// secp256k1/r1). Pin that behavior so a future change to the gate gets caught.
+TEST(BernsteinYang, FermatFallback_Secp256k1_Fr)
+{
+    auto a = bb::secp256k1::fr::random_element();
+    EXPECT_EQ(a * a.invert(), bb::secp256k1::fr::one());
+}
+TEST(BernsteinYang, FermatFallback_Secp256r1_Fr)
+{
+    auto a = bb::secp256r1::fr::random_element();
+    EXPECT_EQ(a * a.invert(), bb::secp256r1::fr::one());
+}
diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse_wasm.hpp b/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse_wasm.hpp
new file mode 100644
index 000000000000..4d728e3de73b
--- /dev/null
+++ b/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse_wasm.hpp
@@ -0,0 +1,280 @@
+// 9 × 29-bit-limb state.  Included from bernstein_yang_inverse.hpp; uses the
+// u64 / i64 / DivstepMatrix names declared there.
+//
+// Why a different limb size from Native5x64: on wasm32 there is no native
+// 64×64→128 multiply, so i64 × u64 → __int128 lowers to a compiler-rt
+// __multi3 dispatch.  Pack the 254-bit state into 9 limbs of 29 bits each
+// instead: every limb-level product is then i29 × i29 = i58, fitting in a
+// single WASM i64.mul.  Choosing the per-iter BATCH as exactly 2 × LIMB_BITS
+// makes the "/ 2^BATCH" at the end of apply_divstep_matrix equivalent to dropping
+// the bottom two 29-bit limbs (no sub-limb shift on the intermediate).
+
+#pragma once
+
+#include "barretenberg/numeric/uint256/uint256.hpp"
+#include <cstdint>
+
+namespace bb::bernstein_yang {
+
+class Wasm9x29 {
+  public:
+    // Divsteps per matrix application; smaller than Native5x64::BATCH so
+    // the resulting "/ 2^BATCH" is limb-aligned (= drop the bottom two
+    // 29-bit limbs) and no sub-limb shift is needed on the intermediate.
+    static constexpr int BATCH = 58;
+
+    // ⌈735 / 58⌉ = 13.  Same convergence-bound logic as Native5x64; one
+    // iter more because BATCH is smaller.
+    static constexpr int NUM_ITERATIONS = 13;
+
+    // |d|, |e| can grow by ~2× + p per matrix application; after 4 iters
+    // they reach ~31p ≈ 2^259, which still fits in the 9 × 29-bit signed
+    // state (capacity ~2^260).  Reducing once every 4 iters instead of
+    // every iter saves ~3× reduce_to_canonical calls per inversion.
+    static constexpr int REDUCE_INTERVAL = 4;
+
+    // Worst-case iteration cap inside reduce_to_canonical.  After
+    // REDUCE_INTERVAL iters between reductions, |d|, |e| ≤ (2^(REDUCE_INTERVAL+1) - 1)·p,
+    // so reducing requires that many subtractions plus one break iter.
+    static constexpr int REDUCE_TO_CANONICAL_MAX_ITERS = 36;
+    static_assert((1U << (REDUCE_INTERVAL + 1)) <= REDUCE_TO_CANONICAL_MAX_ITERS,
+                  "REDUCE_INTERVAL too large for reduce_to_canonical iteration bound");
+
+    Wasm9x29() noexcept
+        : l{}
+    {}
+    explicit Wasm9x29(const uint256_t& x) noexcept
+    {
+        const u64* d = x.data;
+        l[0] = (i64)(d[0] & LIMB_MASK);
+        l[1] = (i64)((d[0] >> 29) & LIMB_MASK);
+        l[2] = (i64)(((d[0] >> 58) & 0x3FULL) | ((d[1] & 0x7FFFFFULL) << 6));
+        l[3] = (i64)((d[1] >> 23) & LIMB_MASK);
+        l[4] = (i64)(((d[1] >> 52) & 0xFFFULL) | ((d[2] & 0x1FFFFULL) << 12));
+        l[5] = (i64)((d[2] >> 17) & LIMB_MASK);
+        l[6] = (i64)(((d[2] >> 46) & 0x3FFFFULL) | ((d[3] & 0x7FFULL) << 18));
+        l[7] = (i64)((d[3] >> 11) & LIMB_MASK);
+        l[8] = (i64)((d[3] >> 40) & 0xFFFFFFULL);
+    }
+    static Wasm9x29 one() noexcept
+    {
+        Wasm9x29 r;
+        r.l[0] = 1;
+        return r;
+    }
+
+    uint256_t to_uint256() const noexcept
+    {
+        return { (u64)l[0] | ((u64)l[1] << 29) | ((u64)l[2] << 58),
+                 ((u64)l[2] >> 6) | ((u64)l[3] << 23) | ((u64)l[4] << 52),
+                 ((u64)l[4] >> 12) | ((u64)l[5] << 17) | ((u64)l[6] << 46),
+                 ((u64)l[6] >> 18) | ((u64)l[7] << 11) | ((u64)l[8] << 40) };
+    }
+    u64 low_64() const noexcept { return (u64)l[0] | ((u64)l[1] << 29) | (((u64)l[2] & 0x3F) << 58); }
+    bool is_zero() const noexcept
+    {
+        i64 a = 0;
+        for (int i = 0; i < N; ++i) {
+            a |= l[i];
+        }
+        return a == 0;
+    }
+    bool is_negative() const noexcept { return l[N - 1] < 0; }
+    void neg() noexcept
+    {
+        for (int i = 0; i < N; ++i) {
+            l[i] = -l[i];
+        }
+        normalise();
+    }
+    void reduce_to_canonical(const Wasm9x29& p) noexcept;
+
+    // See Native5x64 for the batched divstep matrix, matrix application,
+    // and p_inv_mod_2k_from_montgomery_r_inv contracts; the bodies differ only
+    // in limb representation.
+    static DivstepMatrix compute_divstep_matrix(i64& delta, u64 f_lo, u64 g_lo) noexcept;
+    static void apply_divstep_matrix(const DivstepMatrix& m,
+                                     Wasm9x29& f,
+                                     Wasm9x29& g,
+                                     Wasm9x29& d,
+                                     Wasm9x29& e,
+                                     const Wasm9x29& p,
+                                     u64 p_inv_mod_2k) noexcept;
+    static constexpr u64 p_inv_mod_2k_from_montgomery_r_inv(u64 r_inv) noexcept
+    {
+        // r_inv = -p^{-1} mod 2^64, so 0 - r_inv = p^{-1} mod 2^64.
+        return (0ULL - r_inv) & ((1ULL << BATCH) - 1);
+    }
+
+  private:
+    static constexpr int N = 9;
+    static constexpr int LIMB_BITS = 29;
+    static constexpr u64 LIMB_MASK = (1ULL << LIMB_BITS) - 1;
+    i64 l[N]; // top limb carries sign; lower limbs in [0, 2^29) post-normalise
+
+    void normalise() noexcept
+    {
+        i64 c = 0;
+        for (int i = 0; i < N - 1; ++i) {
+            i64 v = l[i] + c;
+            l[i] = v & (i64)LIMB_MASK;
+            c = v >> LIMB_BITS;
+        }
+        l[N - 1] += c;
+    }
+    void add_inplace(const Wasm9x29& b) noexcept
+    {
+        for (int i = 0; i < N; ++i) {
+            l[i] += b.l[i];
+        }
+        normalise();
+    }
+    void sub_inplace(const Wasm9x29& b) noexcept
+    {
+        for (int i = 0; i < N; ++i) {
+            l[i] -= b.l[i];
+        }
+        normalise();
+    }
+};
+
+// Iter cap chosen by the REDUCE_TO_CANONICAL_MAX_ITERS / REDUCE_INTERVAL
+// static_assert above; see those constants for the magnitude argument.
+inline void Wasm9x29::reduce_to_canonical(const Wasm9x29& p) noexcept
+{
+    normalise();
+    for (int it = 0; it < REDUCE_TO_CANONICAL_MAX_ITERS; ++it) {
+        if (is_negative()) {
+            add_inplace(p);
+            continue;
+        }
+        int cmp = 0;
+        for (int i = N - 1; i >= 0; --i) {
+            if (l[i] != p.l[i]) {
+                cmp = l[i] > p.l[i] ? 1 : -1;
+                break;
+            }
+        }
+        if (cmp < 0) {
+            break;
+        }
+        sub_inplace(p);
+    }
+}
+
+inline DivstepMatrix Wasm9x29::compute_divstep_matrix(i64& delta, u64 f_lo, u64 g_lo) noexcept
+{
+    i64 u = 1, v = 0, q = 0, r = 1;
+    for (int i = 0; i < BATCH; ++i) {
+        if (g_lo & 1) {
+            if (delta > 0) {
+                u64 nf = g_lo, ng = (g_lo - f_lo) >> 1;
+                i64 nu = q << 1, nv = r << 1, nq = q - u, nr = r - v;
+                f_lo = nf;
+                g_lo = ng;
+                u = nu;
+                v = nv;
+                q = nq;
+                r = nr;
+                delta = 1 - delta;
+            } else {
+                g_lo = (g_lo + f_lo) >> 1;
+                q = q + u;
+                r = r + v;
+                u <<= 1;
+                v <<= 1;
+                delta = delta + 1;
+            }
+        } else {
+            g_lo >>= 1;
+            u <<= 1;
+            v <<= 1;
+            delta = delta + 1;
+        }
+    }
+    return { u, v, q, r };
+}
+
+// Streamed schoolbook: for each limb position i compute
+//   nf_i = u_lo·f_i + v_lo·g_i + u_hi·f_{i-1} + v_hi·g_{i-1} + carry_in
+// (similarly ng, nd, ne), then carry_out = nf_i >> LIMB_BITS, masked low
+// 29 bits land at output position i - 2 (= exact >> BATCH).  The de row
+// derives k_d, k_e from the low two limbs up front and folds k·p into the
+// per-limb formula from position 2 onward.  No 11-limb intermediate is
+// materialised — the JIT keeps the four running carries in registers.
+inline void Wasm9x29::apply_divstep_matrix(const DivstepMatrix& m,
+                                           Wasm9x29& f,
+                                           Wasm9x29& g,
+                                           Wasm9x29& d,
+                                           Wasm9x29& e,
+                                           const Wasm9x29& p,
+                                           u64 p_inv_mod_2k) noexcept
+{
+    constexpr u64 MASK_BATCH = (1ULL << BATCH) - 1;
+    const i64 u_lo = m.u & (i64)LIMB_MASK, u_hi = m.u >> LIMB_BITS;
+    const i64 v_lo = m.v & (i64)LIMB_MASK, v_hi = m.v >> LIMB_BITS;
+    const i64 q_lo = m.q & (i64)LIMB_MASK, q_hi = m.q >> LIMB_BITS;
+    const i64 r_lo = m.r & (i64)LIMB_MASK, r_hi = m.r >> LIMB_BITS;
+
+    {
+        i64 cf = 0, cg = 0, fp = 0, gp = 0;
+        for (int i = 0; i < N; ++i) {
+            const i64 fi = f.l[i], gi = g.l[i];
+            const i64 nf = u_lo * fi + v_lo * gi + u_hi * fp + v_hi * gp + cf;
+            const i64 ng = q_lo * fi + r_lo * gi + q_hi * fp + r_hi * gp + cg;
+            cf = nf >> LIMB_BITS;
+            cg = ng >> LIMB_BITS;
+            if (i >= 2) {
+                f.l[i - 2] = nf & (i64)LIMB_MASK;
+                g.l[i - 2] = ng & (i64)LIMB_MASK;
+            }
+            fp = fi;
+            gp = gi;
+        }
+        const i64 nf9 = u_hi * fp + v_hi * gp + cf;
+        const i64 ng9 = q_hi * fp + r_hi * gp + cg;
+        f.l[N - 2] = nf9 & (i64)LIMB_MASK;
+        g.l[N - 2] = ng9 & (i64)LIMB_MASK;
+        f.l[N - 1] = nf9 >> LIMB_BITS;
+        g.l[N - 1] = ng9 >> LIMB_BITS;
+    }
+
+    // k_d, k_e (mod 2^BATCH) clear the low BATCH bits of nd, ne; fold k·p
+    // into the streaming pass from position 2 onward.
+    {
+        const i64 d0 = d.l[0], e0 = e.l[0], d1 = d.l[1], e1 = e.l[1];
+        const i64 nd0 = u_lo * d0 + v_lo * e0;
+        const i64 ne0 = q_lo * d0 + r_lo * e0;
+        const i64 nd1 = u_lo * d1 + v_lo * e1 + u_hi * d0 + v_hi * e0;
+        const i64 ne1 = q_lo * d1 + r_lo * e1 + q_hi * d0 + r_hi * e0;
+        const u64 t_d = ((u64)nd0 & LIMB_MASK) | (((u64)(nd1 + (nd0 >> LIMB_BITS)) & LIMB_MASK) << LIMB_BITS);
+        const u64 t_e = ((u64)ne0 & LIMB_MASK) | (((u64)(ne1 + (ne0 >> LIMB_BITS)) & LIMB_MASK) << LIMB_BITS);
+        const u64 k_d = ((0ULL - t_d) * p_inv_mod_2k) & MASK_BATCH;
+        const u64 k_e = ((0ULL - t_e) * p_inv_mod_2k) & MASK_BATCH;
+        const i64 kd_lo = (i64)(k_d & LIMB_MASK), kd_hi = (i64)(k_d >> LIMB_BITS);
+        const i64 ke_lo = (i64)(k_e & LIMB_MASK), ke_hi = (i64)(k_e >> LIMB_BITS);
+        i64 cd = (nd1 + kd_lo * p.l[1] + kd_hi * p.l[0] + ((nd0 + kd_lo * p.l[0]) >> LIMB_BITS)) >> LIMB_BITS;
+        i64 ce = (ne1 + ke_lo * p.l[1] + ke_hi * p.l[0] + ((ne0 + ke_lo * p.l[0]) >> LIMB_BITS)) >> LIMB_BITS;
+
+        i64 dp = d1, ep = e1;
+        for (int i = 2; i < N; ++i) {
+            const i64 di = d.l[i], ei = e.l[i];
+            const i64 nd = u_lo * di + v_lo * ei + u_hi * dp + v_hi * ep + kd_lo * p.l[i] + kd_hi * p.l[i - 1] + cd;
+            const i64 ne = q_lo * di + r_lo * ei + q_hi * dp + r_hi * ep + ke_lo * p.l[i] + ke_hi * p.l[i - 1] + ce;
+            cd = nd >> LIMB_BITS;
+            ce = ne >> LIMB_BITS;
+            d.l[i - 2] = nd & (i64)LIMB_MASK;
+            e.l[i - 2] = ne & (i64)LIMB_MASK;
+            dp = di;
+            ep = ei;
+        }
+        const i64 nd9 = u_hi * dp + v_hi * ep + kd_hi * p.l[N - 1] + cd;
+        const i64 ne9 = q_hi * dp + r_hi * ep + ke_hi * p.l[N - 1] + ce;
+        d.l[N - 2] = nd9 & (i64)LIMB_MASK;
+        e.l[N - 2] = ne9 & (i64)LIMB_MASK;
+        d.l[N - 1] = nd9 >> LIMB_BITS;
+        e.l[N - 1] = ne9 >> LIMB_BITS;
+    }
+}
+
+} // namespace bb::bernstein_yang
diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp b/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp
index fdbb7508dfa8..6362ac70e04c 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp
@@ -340,6 +340,7 @@ template <class Params_> struct alignas(32) field {
     static constexpr uint256_t modulus_minus_two =
         uint256_t(Params::modulus_0 - 2ULL, Params::modulus_1, Params::modulus_2, Params::modulus_3);
     constexpr field invert() const noexcept;
+    constexpr field invert_const_time() const noexcept;
     template <typename C>
     // has size() and operator[].
         requires requires(C& c) {
diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/field_impl.hpp b/barretenberg/cpp/src/barretenberg/ecc/fields/field_impl.hpp
index f97695d9be19..c9283b3847c6 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/fields/field_impl.hpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/fields/field_impl.hpp
@@ -15,6 +15,8 @@
 #include <type_traits>
 #include <vector>
 
+#include "./bernstein_yang_inverse.hpp"
+#include "./bernstein_yang_inverse_wasm.hpp"
 #include "./field_declarations.hpp"
 #include "barretenberg/numeric/uint256/uint256.hpp"
 
@@ -384,6 +386,29 @@ template <class T> constexpr field<T> field<T>::pow(const uint64_t exponent) con
 }
 
 template <class T> constexpr field<T> field<T>::invert() const noexcept
+{
+    if (*this == zero()) {
+        bb::assert_failure("Trying to invert zero in the field");
+    }
+    // BY uses __int128 and is not constexpr-evaluable; compile-time inversions keep Fermat.
+    if (std::is_constant_evaluated()) {
+        return pow(modulus_minus_two);
+    }
+    if constexpr (bernstein_yang::supported_v<T>) {
+        constexpr uint256_t p_uint = modulus;
+        constexpr uint64_t p_inv_mod_2k = bernstein_yang::p_inv_mod_2k_from_montgomery_r_inv(T::r_inv);
+        field non_mont = from_montgomery_form_reduced();
+        uint256_t a{ non_mont.data[0], non_mont.data[1], non_mont.data[2], non_mont.data[3] };
+        uint256_t inv = bernstein_yang::invert_vartime(a, p_uint, p_inv_mod_2k);
+        field result{ inv.data[0], inv.data[1], inv.data[2], inv.data[3] };
+        result.self_to_montgomery_form();
+        return result;
+    } else {
+        return pow(modulus_minus_two);
+    }
+}
+
+template <class T> constexpr field<T> field<T>::invert_const_time() const noexcept
 {
     if (*this == zero()) {
         bb::assert_failure("Trying to invert zero in the field");
diff --git a/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp b/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp
index 03c24a5acb91..7c6fbb8d1c5b 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp
@@ -113,6 +113,8 @@ template <class Fq, class Fr, class Params> class alignas(32) element {
     // constexpr Fr operator/(const element& other) noexcept {}
 
     constexpr element normalize() const noexcept;
+    constexpr element normalize_const_time() const noexcept;
+    constexpr affine_element<Fq, Fr, Params> to_affine_const_time() const noexcept;
     static element infinity();
     BB_INLINE constexpr element set_infinity() const noexcept;
     BB_INLINE constexpr void self_set_infinity() noexcept;
diff --git a/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp b/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp
index bf3687445ad6..7c7f299f5fda 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp
@@ -63,6 +63,10 @@ constexpr element<Fq, Fr, T>& element<Fq, Fr, T>::operator=(element&& other) noe
     return *this;
 }
 
+// Warning: variable-time — calls `z.invert()` (Bernstein-Yang safegcd).  Do not
+// use on points derived from secret material (signing nonces, private keys, DH
+// shared secrets).  For those, call `to_affine_const_time()` explicitly; the
+// implicit conversion does NOT pick up the const-time path.
 template <class Fq, class Fr, class T> constexpr element<Fq, Fr, T>::operator affine_element<Fq, Fr, T>() const noexcept
 {
     if (is_point_at_infinity()) {
@@ -79,6 +83,23 @@ template <class Fq, class Fr, class T> constexpr element<Fq, Fr, T>::operator af
     return result;
 }
 
+template <class Fq, class Fr, class T>
+constexpr affine_element<Fq, Fr, T> element<Fq, Fr, T>::to_affine_const_time() const noexcept
+{
+    if (is_point_at_infinity()) {
+        affine_element<Fq, Fr, T> result;
+        result.x = Fq(0);
+        result.y = Fq(0);
+        result.self_set_infinity();
+        return result;
+    }
+    Fq z_inv = z.invert_const_time();
+    Fq zz_inv = z_inv.sqr();
+    Fq zzz_inv = zz_inv * z_inv;
+    affine_element<Fq, Fr, T> result(x * zz_inv, y * zzz_inv);
+    return result;
+}
+
 template <class Fq, class Fr, class T> constexpr void element<Fq, Fr, T>::self_dbl() noexcept
 {
     if constexpr (Fq::modulus.data[3] >= MODULUS_TOP_LIMB_LARGE_THRESHOLD) {
@@ -461,12 +482,20 @@ element<Fq, Fr, T> element<Fq, Fr, T>::mul_const_time(const Fr& scalar, numeric:
     return R0;
 }
 
+// Warning: variable-time via the implicit affine conversion above.  For
+// secret-input points use `normalize_const_time()`.
 template <class Fq, class Fr, class T> constexpr element<Fq, Fr, T> element<Fq, Fr, T>::normalize() const noexcept
 {
     const affine_element<Fq, Fr, T> converted = *this;
     return element(converted);
 }
 
+template <class Fq, class Fr, class T>
+constexpr element<Fq, Fr, T> element<Fq, Fr, T>::normalize_const_time() const noexcept
+{
+    return element(to_affine_const_time());
+}
+
 template <class Fq, class Fr, class T> element<Fq, Fr, T> element<Fq, Fr, T>::infinity()
 {
     element<Fq, Fr, T> e{};