diff --git a/barretenberg/cpp/src/barretenberg/bbapi/bbapi_ecc.cpp b/barretenberg/cpp/src/barretenberg/bbapi/bbapi_ecc.cpp index 7bcc3fc2edfc..571ca7804e27 100644 --- a/barretenberg/cpp/src/barretenberg/bbapi/bbapi_ecc.cpp +++ b/barretenberg/cpp/src/barretenberg/bbapi/bbapi_ecc.cpp @@ -11,7 +11,7 @@ GrumpkinMul::Response GrumpkinMul::execute(BBApiRequest& request) && if (!point.on_curve()) { BBAPI_ERROR(request, "Input point must be on the curve"); } - return { point * scalar }; + return { grumpkin::g1::element(point).mul_const_time(scalar).to_affine_const_time() }; } GrumpkinAdd::Response GrumpkinAdd::execute(BBApiRequest& request) && @@ -32,7 +32,11 @@ GrumpkinBatchMul::Response GrumpkinBatchMul::execute(BBApiRequest& request) && BBAPI_ERROR(request, "Input point must be on the curve"); } } - auto output = grumpkin::g1::element::batch_mul_with_endomorphism(points, scalar); + std::vector output; + output.reserve(points.size()); + for (const auto& p : points) { + output.emplace_back(grumpkin::g1::element(p).mul_const_time(scalar).to_affine_const_time()); + } return { std::move(output) }; } @@ -54,7 +58,7 @@ Secp256k1Mul::Response Secp256k1Mul::execute(BBApiRequest& request) && if (!point.on_curve()) { BBAPI_ERROR(request, "Input point must be on the curve"); } - return { point * scalar }; + return { secp256k1::g1::element(point).mul_const_time(scalar).to_affine_const_time() }; } Secp256k1GetRandomFr::Response Secp256k1GetRandomFr::execute(BB_UNUSED BBApiRequest& request) && @@ -87,7 +91,7 @@ Bn254G1Mul::Response Bn254G1Mul::execute(BBApiRequest& request) && if (!point.on_curve()) { BBAPI_ERROR(request, "Input point must be on the curve"); } - auto result = point * scalar; + auto result = bb::g1::element(point).mul_const_time(scalar).to_affine_const_time(); if (!result.on_curve()) { BBAPI_ERROR(request, "Output point must be on the curve"); } diff --git a/barretenberg/cpp/src/barretenberg/bbapi/bbapi_ecdsa.cpp b/barretenberg/cpp/src/barretenberg/bbapi/bbapi_ecdsa.cpp index c664e74d65a5..e2f351a302f1 100644 --- a/barretenberg/cpp/src/barretenberg/bbapi/bbapi_ecdsa.cpp +++ b/barretenberg/cpp/src/barretenberg/bbapi/bbapi_ecdsa.cpp @@ -10,12 +10,12 @@ namespace bb::bbapi { // Secp256k1 implementations EcdsaSecp256k1ComputePublicKey::Response EcdsaSecp256k1ComputePublicKey::execute(BB_UNUSED BBApiRequest& request) && { - return { secp256k1::g1::one * private_key }; + return { secp256k1::g1::element(secp256k1::g1::one).mul_const_time(private_key).to_affine_const_time() }; } EcdsaSecp256k1ConstructSignature::Response EcdsaSecp256k1ConstructSignature::execute(BB_UNUSED BBApiRequest& request) && { - auto pub_key = secp256k1::g1::one * private_key; + auto pub_key = secp256k1::g1::element(secp256k1::g1::one).mul_const_time(private_key).to_affine_const_time(); crypto::ecdsa_key_pair key_pair = { private_key, pub_key }; std::string message_str(reinterpret_cast(message.data()), message.size()); @@ -44,12 +44,12 @@ EcdsaSecp256k1VerifySignature::Response EcdsaSecp256k1VerifySignature::execute(B // Secp256r1 implementations EcdsaSecp256r1ComputePublicKey::Response EcdsaSecp256r1ComputePublicKey::execute(BB_UNUSED BBApiRequest& request) && { - return { secp256r1::g1::one * private_key }; + return { secp256r1::g1::element(secp256r1::g1::one).mul_const_time(private_key).to_affine_const_time() }; } EcdsaSecp256r1ConstructSignature::Response EcdsaSecp256r1ConstructSignature::execute(BB_UNUSED BBApiRequest& request) && { - auto pub_key = secp256r1::g1::one * private_key; + auto pub_key = secp256r1::g1::element(secp256r1::g1::one).mul_const_time(private_key).to_affine_const_time(); crypto::ecdsa_key_pair key_pair = { private_key, pub_key }; std::string message_str(reinterpret_cast(message.data()), message.size()); diff --git a/barretenberg/cpp/src/barretenberg/bbapi/bbapi_schnorr.cpp b/barretenberg/cpp/src/barretenberg/bbapi/bbapi_schnorr.cpp index 68b51ea2cd8e..f5c365a23479 100644 --- a/barretenberg/cpp/src/barretenberg/bbapi/bbapi_schnorr.cpp +++ b/barretenberg/cpp/src/barretenberg/bbapi/bbapi_schnorr.cpp @@ -8,12 +8,13 @@ namespace bb::bbapi { SchnorrComputePublicKey::Response SchnorrComputePublicKey::execute(BB_UNUSED BBApiRequest& request) && { - return { grumpkin::g1::one * private_key }; + return { grumpkin::g1::element(grumpkin::g1::one).mul_const_time(private_key).to_affine_const_time() }; } SchnorrConstructSignature::Response SchnorrConstructSignature::execute(BB_UNUSED BBApiRequest& request) && { - grumpkin::g1::affine_element pub_key = grumpkin::g1::one * private_key; + grumpkin::g1::affine_element pub_key = + grumpkin::g1::element(grumpkin::g1::one).mul_const_time(private_key).to_affine_const_time(); crypto::schnorr_key_pair key_pair = { private_key, pub_key }; auto sig = crypto::schnorr_construct_signature(message_field, key_pair); diff --git a/barretenberg/cpp/src/barretenberg/crypto/ecdsa/ecdsa_impl.hpp b/barretenberg/cpp/src/barretenberg/crypto/ecdsa/ecdsa_impl.hpp index 8588ffae6876..a028e787fb9b 100644 --- a/barretenberg/cpp/src/barretenberg/crypto/ecdsa/ecdsa_impl.hpp +++ b/barretenberg/cpp/src/barretenberg/crypto/ecdsa/ecdsa_impl.hpp @@ -31,11 +31,11 @@ ecdsa_signature ecdsa_construct_signature(const std::string& message, const ecds // Compute R = k * G. k is the secret RFC6979 nonce, so use the constant-time multiplication // to defend against the Hamming-weight / bit-length timing leak in operator*. - typename G1::affine_element R(typename G1::element(G1::one).mul_const_time(k)); + typename G1::affine_element R(typename G1::element(G1::one).mul_const_time(k).to_affine_const_time()); // Compute the signature Fr r = Fr(R.x); - Fr s = (z + r * account.private_key) / k; + Fr s = (z + r * account.private_key) * k.invert_const_time(); secure_erase_bytes(&k, sizeof(k)); // Ensure that the value of s is "low", i.e. s := min{ s, (|Fr| - s) } diff --git a/barretenberg/cpp/src/barretenberg/crypto/schnorr/schnorr.tcc b/barretenberg/cpp/src/barretenberg/crypto/schnorr/schnorr.tcc index 847608a53244..16fe624c72dc 100644 --- a/barretenberg/cpp/src/barretenberg/crypto/schnorr/schnorr.tcc +++ b/barretenberg/cpp/src/barretenberg/crypto/schnorr/schnorr.tcc @@ -86,7 +86,7 @@ schnorr_signature schnorr_construct_signature(const typename G1::Fq& message_fie // k is a secret nonce; use the constant-time multiplication to defend against the // Hamming-weight / bit-length timing leak in operator*. - typename G1::affine_element R(typename G1::element(G1::one).mul_const_time(k)); + typename G1::affine_element R(typename G1::element(G1::one).mul_const_time(k).to_affine_const_time()); using Fq = typename G1::Fq; Fq e = schnorr_generate_challenge(message_field, public_key, R); diff --git a/barretenberg/cpp/src/barretenberg/ecc/curves/invert_differential.fuzzer.cpp b/barretenberg/cpp/src/barretenberg/ecc/curves/invert_differential.fuzzer.cpp new file mode 100644 index 000000000000..6ef8c48599fb --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/ecc/curves/invert_differential.fuzzer.cpp @@ -0,0 +1,240 @@ +/** + * @file invert_differential.fuzzer.cpp + * @brief Differential fuzzer for Bernstein-Yang modular inverse vs Fermat (modexp). + * + * Reuses the FieldVM driver from `multi_field.fuzzer.cpp` to generate diverse + * field elements via sequences of arithmetic operations. After each VM phase + * it takes the last element produced (the highest-indexed non-zero slot in the + * VM's internal state, with a fallback to slot 0) and computes its inverse + * three different ways: + * + * - A: `pow(modulus_minus_two)` — Fermat's little theorem (modexp). + * - B: `invert_vartime` — safegcd, 5×64-bit limb kernel + * (selected on native targets, BATCH=62). + * - C: `invert_vartime` — safegcd, 9×29-bit limb kernel + * (selected on WASM targets, BATCH=58). + * + * All three are compared in canonical (non-Montgomery) form. Any discrepancy + * triggers an abort with full diagnostic output (field type, input, all three + * outputs, plus Montgomery checks `a * X ?= 1` for each). Cross-checking the + * WASM kernel here gives it libFuzzer coverage even though libFuzzer itself + * doesn't run under WASM — both kernels are plain C++ classes. + * + * Only 254-bit primes are tested (BN254 Fr/Fq, Grumpkin shares the BN254 + * curves), since the 5-limb signed BY state requires p < 2^255 and the + * production `field::invert()` dispatch also gates on this. 256-bit primes + * (secp256k1/r1) don't use BY and are skipped. + */ + +#include "barretenberg/ecc/curves/bn254/fq.hpp" +#include "barretenberg/ecc/curves/bn254/fr.hpp" +#include "barretenberg/ecc/fields/bernstein_yang_inverse.hpp" +#include "barretenberg/ecc/fields/field.fuzzer.hpp" +#include +#include +#include +#include +#include +#include + +using namespace bb; +using numeric::uint256_t; + +// --------------------------------------------------------------- +// Phase header — same 2-byte layout as multi_field.fuzzer.cpp but restricted +// to the two 254-bit fields that actually use BY. +// --------------------------------------------------------------- +enum class FieldType : uint8_t { + BN254_FQ = 0, + BN254_FR = 1, +}; + +static constexpr size_t NUM_FIELD_TYPES = 2; +static constexpr size_t MAX_STEPS = 64; +static constexpr size_t PHASE_HEADER_SIZE = 2; + +struct VMPhaseHeader { + uint8_t field_type; + uint8_t steps; +}; +static_assert(sizeof(VMPhaseHeader) == 2, "VMPhaseHeader must be 2 bytes"); + +template static uint256_t reduce_to_modulus(const uint256_t& value) +{ + return (value < Field::modulus) ? value : (value % Field::modulus); +} + +template +static void import_state_with_reduction(FieldVM& vm, const std::vector& state) +{ + for (size_t i = 0; i < INTERNAL_STATE_SIZE && i < state.size(); i++) { + vm.uint_internal_state[i] = reduce_to_modulus(state[i]); + vm.field_internal_state[i] = Field(vm.uint_internal_state[i]); + } +} + +// --------------------------------------------------------------- +// Differential oracle. +// +// Fetches `a_raw` (the non-Montgomery integer) from the VM's uint state and +// computes a^{-1} two ways; aborts on mismatch. +// --------------------------------------------------------------- +template static Field raw_to_montgomery(const uint256_t& raw) +{ + Field f{ raw.data[0], raw.data[1], raw.data[2], raw.data[3] }; + f.self_to_montgomery_form(); + return f; +} + +static void print_limbs(const char* label, const uint256_t& v) +{ + std::fprintf(stderr, + " %s = 0x%016lx%016lx%016lx%016lx\n", + label, + (unsigned long)v.data[3], + (unsigned long)v.data[2], + (unsigned long)v.data[1], + (unsigned long)v.data[0]); +} + +template static void differential_check_inverse(const Field& a_mont, const uint256_t& a_raw) +{ + if (a_raw == 0) { + return; // 0 has no inverse — skip. + } + + // A: Fermat via pow. We bypass field::invert() (which now dispatches into + // BY) by calling pow(modulus_minus_two) directly, so the paths are + // genuinely independent implementations. + Field fermat_inv = a_mont.pow(Field::modulus_minus_two); + + // B, C: Bernstein-Yang safegcd, called with the raw (non-Montgomery) value + // on both the Native5x64 (BATCH=62) and Wasm9x29 (BATCH=58) kernels. + // Each kernel needs its own p_inv_mod_2^BATCH constant. + constexpr uint256_t p_uint = Field::modulus; + constexpr uint64_t p_inv_native = + bernstein_yang::Native5x64::p_inv_mod_2k_from_montgomery_r_inv(Field::Params::r_inv); + constexpr uint64_t p_inv_wasm = bernstein_yang::Wasm9x29::p_inv_mod_2k_from_montgomery_r_inv(Field::Params::r_inv); + + uint256_t native_inv_raw = bernstein_yang::invert_vartime(a_raw, p_uint, p_inv_native); + uint256_t wasm_inv_raw = bernstein_yang::invert_vartime(a_raw, p_uint, p_inv_wasm); + + Field native_inv = raw_to_montgomery(native_inv_raw); + Field wasm_inv = raw_to_montgomery(wasm_inv_raw); + + const bool native_ok = (fermat_inv == native_inv); + const bool wasm_ok = (fermat_inv == wasm_inv); + if (native_ok && wasm_ok) { + return; + } + + std::fprintf(stderr, "\n[invert_differential.fuzzer] MISMATCH\n"); + std::fprintf(stderr, " field: %s\n", typeid(Field).name()); + std::fprintf(stderr, " native_ok: %s\n", native_ok ? "yes" : "NO"); + std::fprintf(stderr, " wasm_ok: %s\n", wasm_ok ? "yes" : "NO"); + print_limbs("a_raw ", a_raw); + print_limbs("fermat ", static_cast(fermat_inv)); + print_limbs("BY native ", static_cast(native_inv)); + print_limbs("BY wasm ", static_cast(wasm_inv)); + print_limbs("a*fermat ", static_cast(a_mont * fermat_inv)); + print_limbs("a*native ", static_cast(a_mont * native_inv)); + print_limbs("a*wasm ", static_cast(a_mont * wasm_inv)); + std::fflush(stderr); + std::abort(); +} + +// Pick the last element produced: highest-indexed non-zero slot of the VM's +// uint state, with a fallback to slot 0 if all slots are zero. +static size_t last_element_index(const std::vector& state) +{ + for (size_t i = state.size(); i > 0; --i) { + if (state[i - 1] != uint256_t(0)) { + return i - 1; + } + } + return 0; +} + +template +static int run_phase_and_diff(const VMPhaseHeader& header, + const unsigned char* data, + size_t size, + size_t& data_offset, + std::vector& current_state) +{ + FieldVM vm(false, header.steps); + if (!current_state.empty()) { + import_state_with_reduction(vm, current_state); + } + vm.set_max_steps(header.steps); + size_t bytes_consumed = vm.run(data + data_offset, size - data_offset, true); + if (bytes_consumed == 0) { + return 0; + } + + if (!vm.check_internal_state()) { + // Internal VM invariant violation — not the inverse bug we're looking + // for, but still a failure of the driver. Report and stop. + std::fprintf(stderr, "[invert_differential.fuzzer] VM internal state check failed\n"); + return -1; + } + + // Differential inverse check on the last element produced this phase, + // plus every non-zero slot in the final state for extra coverage. + auto uint_state = vm.export_uint_state(); + size_t last_idx = last_element_index(uint_state); + differential_check_inverse(vm.field_internal_state[last_idx], uint_state[last_idx]); + + // Extra coverage: also diff every other non-zero slot. Same check on + // many more values per phase, virtually free CPU-wise. + for (size_t i = 0; i < uint_state.size(); ++i) { + if (i != last_idx && uint_state[i] != uint256_t(0)) { + differential_check_inverse(vm.field_internal_state[i], uint_state[i]); + } + } + + current_state = uint_state; + data_offset += bytes_consumed; + return 1; +} + +extern "C" int LLVMFuzzerTestOneInput(const unsigned char* data, size_t size) +{ + if (size < PHASE_HEADER_SIZE) { + return 0; + } + + std::vector current_state; + size_t data_offset = 0; + + while (data_offset + PHASE_HEADER_SIZE <= size) { + const VMPhaseHeader* header_ptr = reinterpret_cast(data + data_offset); + VMPhaseHeader header = *header_ptr; + + FieldType selected_field_type = static_cast(header.field_type % NUM_FIELD_TYPES); + uint8_t selected_steps = header.steps % MAX_STEPS; + if (selected_steps == 0) { + selected_steps = 1; + } + header.field_type = static_cast(selected_field_type); + header.steps = selected_steps; + + int r = 0; + switch (selected_field_type) { + case FieldType::BN254_FQ: + r = run_phase_and_diff(header, data, size, data_offset, current_state); + break; + case FieldType::BN254_FR: + r = run_phase_and_diff(header, data, size, data_offset, current_state); + break; + } + + if (r < 0) { + return 1; + } + if (r == 0) { + break; + } + } + return 0; +} diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse.hpp b/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse.hpp new file mode 100644 index 000000000000..4e4190d1bdf9 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse.hpp @@ -0,0 +1,369 @@ +// Bernstein-Yang safegcd modular inverse. +// +// We want a⁻¹ mod p, where p is an odd prime. Run an extended binary GCD on +// the pair (f, g) starting at (p, a), with Bezout coefficients (d, e) +// satisfying invariants d·a ≡ f (mod p) and e·a ≡ g (mod p) at all times. +// When g reaches 0, gcd(p, a) = ±f = ±1 (since p is prime, a ≠ 0), so +// a⁻¹ = ±d. +// +// The cheap operation in a binary GCD is "if g is odd swap-and-subtract to +// make it even, then divide g by 2"; one such step is a "divstep" and +// shrinks |g| by ~1 bit. Doing each divstep on the full 256-bit state would +// be slow, so we use Pornin's trick: do BATCH divsteps purely on the low 64 +// bits of (f, g), accumulating the resulting linear transform as a 2×2 +// integer matrix M, then apply M to the full-precision (f, g, d, e) in one +// go (with an exact /2^BATCH at the end). The (d, e) side needs a 2-adic +// correction k·p added before dividing so the result stays integer-valued +// — k is determined by the low BATCH bits of M·(d, e) and p⁻¹ mod 2^BATCH. +// +// Native5x64 and Wasm9x29 wrap the platform-specific limb representation +// behind the same static interface so `invert_vartime` below is a +// single algorithm for both targets. + +#pragma once + +#include "barretenberg/numeric/uint256/uint256.hpp" +#include + +namespace bb::bernstein_yang { + +using numeric::uint256_t; +using u64 = uint64_t; +using i64 = int64_t; + +// The transition matrix produced by BATCH divsteps. With the implicit +// "/ 2^BATCH" at the end of apply_divstep_matrix, it represents the linear +// map (f, g) ↦ (M·(f, g) / 2^BATCH). Each divstep doubles one matrix entry, +// so after BATCH steps |u|, |v|, |q|, |r| ≤ 2^BATCH; they are signed (the +// swap-and-subtract case introduces negatives). +struct DivstepMatrix { + i64 u, v, q, r; +}; + +// 5 × 64-bit limbs, top limb two's-complement signed. Products via __int128. +class Native5x64 { + public: + // Number of divsteps folded into one matrix application. Bigger BATCH + // means fewer matrix applications per inversion but bigger matrix + // entries. Cap is set by the matrix-times-state product staying inside + // an __int128 accumulator: a single (i63 entry) × (u64 limb) is 127 bits, + // so BATCH ≤ 63; we use 62 to keep one bit of slack for the running sum. + static constexpr int BATCH = 62; + + // Worst-case number of matrix applications needed before g must have + // reached 0. The 735-divstep bound for 254-bit inputs is from the BY + // paper's convergence proof (rate ≈ 1 / 1.7 bits of g consumed per + // divstep). We pick the smallest NUM_ITERATIONS so NUM_ITERATIONS * + // BATCH ≥ 735; ⌈735 / 62⌉ = 12. The actual loop usually exits much + // earlier via the early break on g == 0. + static constexpr int NUM_ITERATIONS = 12; + + // The Bezout coefficients (d, e) live mod p but during the iteration we + // hold them as signed integers in the state. Each matrix application grows + // |d|, |e| by roughly a factor of 2 (matrix entry × value) plus an + // additive p (from the 2-adic correction k·p), so without bringing them + // back to [0, p) they would eventually overflow the state. + // reduce_to_canonical does that subtract-or-add-p reduction. Calling it + // every iter is wasteful — the 5×64-bit signed state has enough room to + // let |d|, |e| reach ~2^K · p before they no longer fit, so we only + // reduce every REDUCE_INTERVAL = 4 iters (|d|, |e| stay ≤ ~32p between + // reductions, plenty of headroom). + static constexpr int REDUCE_INTERVAL = 4; + + // Worst-case iteration cap inside reduce_to_canonical. After + // REDUCE_INTERVAL iters between reductions, |d|, |e| ≤ (2^(REDUCE_INTERVAL+1) - 1)·p, + // so reducing requires that many subtractions plus one break iter. + static constexpr int REDUCE_TO_CANONICAL_MAX_ITERS = 36; + static_assert((1U << (REDUCE_INTERVAL + 1)) <= REDUCE_TO_CANONICAL_MAX_ITERS, + "REDUCE_INTERVAL too large for reduce_to_canonical iteration bound"); + + Native5x64() noexcept + : l{} + {} + explicit Native5x64(const uint256_t& x) noexcept + : l{ x.data[0], x.data[1], x.data[2], x.data[3], 0 } + {} + static Native5x64 one() noexcept + { + Native5x64 r; + r.l[0] = 1; + return r; + } + + uint256_t to_uint256() const noexcept { return { l[0], l[1], l[2], l[3] }; } + u64 low_64() const noexcept { return l[0]; } + bool is_zero() const noexcept { return (l[0] | l[1] | l[2] | l[3] | l[4]) == 0; } + bool is_negative() const noexcept { return (i64)l[4] < 0; } + void neg() noexcept + { + u64 c = 1; + for (int i = 0; i < N; ++i) { + u64 v = (~l[i]) + c; + c = (c && v == 0) ? 1 : 0; + l[i] = v; + } + } + // Iter cap chosen by the REDUCE_TO_CANONICAL_MAX_ITERS / REDUCE_INTERVAL + // static_assert above; see those constants for the magnitude argument. + void reduce_to_canonical(const Native5x64& p) noexcept + { + for (int it = 0; it < REDUCE_TO_CANONICAL_MAX_ITERS; ++it) { + if (is_negative()) { + add_inplace(p); + } else if (ge(p)) { + sub_inplace(p); + } else { + break; + } + } + } + + // BATCH branchy divsteps on the low 64 bits of (f, g); returns the + // transition matrix M and updates δ. Variable-time over the inner + // branches — non-secret inputs only. + static DivstepMatrix compute_divstep_matrix(i64& delta, u64 f_lo, u64 g_lo) noexcept; + // (f, g) ← M·(f, g) / 2^BATCH and (d, e) ← (M·(d, e) + k·p) / 2^BATCH, + // where k_i = -((M·(d, e))_i · p⁻¹) mod 2^BATCH (the 2-adic correction + // that makes (M·(d, e))_i + k_i·p divisible by 2^BATCH). + static void apply_divstep_matrix(const DivstepMatrix& m, + Native5x64& f, + Native5x64& g, + Native5x64& d, + Native5x64& e, + const Native5x64& p, + u64 p_inv_mod_2k) noexcept; + // r_inv = -p⁻¹ mod 2^64 (barretenberg's Montgomery constant), so p⁻¹ mod + // 2^BATCH is the low BATCH bits of -r_inv. + static constexpr u64 p_inv_mod_2k_from_montgomery_r_inv(u64 r_inv) noexcept + { + // r_inv = -p^{-1} mod 2^64, so 0 - r_inv = p^{-1} mod 2^64. + return (0ULL - r_inv) & ((1ULL << BATCH) - 1); + } + + private: + static constexpr int N = 5; + u64 l[N]; + + void add_inplace(const Native5x64& b) noexcept + { + u64 c = 0; + for (int i = 0; i < N; ++i) { + __uint128_t s = (__uint128_t)l[i] + b.l[i] + c; + l[i] = (u64)s; + c = (u64)(s >> 64); + } + } + void sub_inplace(const Native5x64& b) noexcept + { + u64 borrow = 0; + for (int i = 0; i < N; ++i) { + __uint128_t s = (__uint128_t)l[i] - (__uint128_t)b.l[i] - borrow; + l[i] = (u64)s; + borrow = ((i64)(s >> 64) < 0) ? 1 : 0; + } + } + bool ge(const Native5x64& b) const noexcept + { + i64 a_top = (i64)l[N - 1], b_top = (i64)b.l[N - 1]; + if (a_top != b_top) { + return a_top > b_top; + } + for (int i = N - 2; i >= 0; --i) { + if (l[i] != b.l[i]) { + return l[i] > b.l[i]; + } + } + return true; + } + + friend struct NativeMatrix; +}; + +// 6-limb signed product helpers used by Native5x64::apply_divstep_matrix. +struct NativeMatrix { + static void signed_linear_combination(i64 a, const Native5x64& x, i64 b, const Native5x64& y, u64 out[6]) noexcept + { + __int128 c = 0; + for (int i = 0; i < 4; ++i) { + c += (__int128)a * (__int128)(u64)x.l[i] + (__int128)b * (__int128)(u64)y.l[i]; + out[i] = (u64)c; + c >>= 64; + } + c += (__int128)a * (__int128)(i64)x.l[4] + (__int128)b * (__int128)(i64)y.l[4]; + out[4] = (u64)c; + out[5] = (u64)(c >> 64); + } + // Sign-preserving exact /2^BATCH on the 6-limb signed `t`. Sign of the + // result lives in bit 63 of r.l[4], which is bit 61 of t[5]. This is the + // sign of t iff t[5] is just sign-extension of the actual magnitude — i.e., + // iff |t| < 2^319. BY guarantees this: |M·(x,y) + k·p| ≤ 2^324 in the + // (d,e) row and ≤ 2^319 in the (f,g) row, both with |result/2^62| < 2^263 < 2^319. + // A future change widening the matrix entries or state without re-running + // this analysis will silently corrupt the sign bit. + static Native5x64 arithmetic_shift_by_batch(const u64 t[6]) noexcept + { + Native5x64 r; + for (int i = 0; i < 4; ++i) { + r.l[i] = (t[i] >> 62) | (t[i + 1] << 2); + } + r.l[4] = (t[4] >> 62) | (t[5] << 2); + return r; + } +}; + +// Each inner step shrinks |g| by ~1 bit using a binary-GCD-style move. +// Three cases, depending on g's parity and the "δ" tracker (which decides +// whether |f| or |g| is currently smaller): +// g even : g ← g/2. +// g odd, δ ≤ 0 : g ← (g + f)/2 (adding f to make g+f even before /2). +// g odd, δ > 0 : swap roles — (f, g) ← (g, (g - f)/2). +// The matrix (u, v, q, r) tracks the same linear transform applied +// symbolically; doubling u, v (or q, r) corresponds to the implicit /2 each +// inner step performs. After BATCH steps the low BATCH bits of the +// transformed state are guaranteed zero, so apply_divstep_matrix's implicit +// "/ 2^BATCH" is an exact integer division. +inline DivstepMatrix Native5x64::compute_divstep_matrix(i64& delta, u64 f_lo, u64 g_lo) noexcept +{ + i64 u = 1, v = 0, q = 0, r = 1; + for (int i = 0; i < BATCH; ++i) { + if (g_lo & 1) { + if (delta > 0) { + u64 nf = g_lo, ng = (g_lo - f_lo) >> 1; + i64 nu = q << 1, nv = r << 1, nq = q - u, nr = r - v; + f_lo = nf; + g_lo = ng; + u = nu; + v = nv; + q = nq; + r = nr; + delta = 1 - delta; + } else { + g_lo = (g_lo + f_lo) >> 1; + q = q + u; + r = r + v; + u <<= 1; + v <<= 1; + delta = delta + 1; + } + } else { + g_lo >>= 1; + u <<= 1; + v <<= 1; + delta = delta + 1; + } + } + return { u, v, q, r }; +} + +inline void Native5x64::apply_divstep_matrix(const DivstepMatrix& m, + Native5x64& f, + Native5x64& g, + Native5x64& d, + Native5x64& e, + const Native5x64& p, + u64 p_inv_mod_2k) noexcept +{ + constexpr u64 MASK_BATCH = (1ULL << BATCH) - 1; + + u64 nf[6], ng[6]; + NativeMatrix::signed_linear_combination(m.u, f, m.v, g, nf); + NativeMatrix::signed_linear_combination(m.q, f, m.r, g, ng); + f = NativeMatrix::arithmetic_shift_by_batch(nf); + g = NativeMatrix::arithmetic_shift_by_batch(ng); + + // k = -t · p_inv_mod_2k mod 2^BATCH makes t + k·p divisible by 2^BATCH. + auto apply_corrected_row = [&](i64 a, const Native5x64& da, i64 b, const Native5x64& eb, Native5x64& out) { + u64 t[6]; + NativeMatrix::signed_linear_combination(a, da, b, eb, t); + u64 k = ((0ULL - t[0]) * p_inv_mod_2k) & MASK_BATCH; + u64 kp[6] = {}; + u64 carry = 0; + for (int i = 0; i < 5; ++i) { + __uint128_t prod = (__uint128_t)k * (u64)p.l[i] + carry; + kp[i] = (u64)prod; + carry = (u64)(prod >> 64); + } + kp[5] = carry; + u64 c = 0; + for (int i = 0; i < 6; ++i) { + __uint128_t s = (__uint128_t)t[i] + kp[i] + c; + t[i] = (u64)s; + c = (u64)(s >> 64); + } + out = NativeMatrix::arithmetic_shift_by_batch(t); + }; + Native5x64 nd, ne; + apply_corrected_row(m.u, d, m.v, e, nd); + apply_corrected_row(m.q, d, m.r, e, ne); + d = nd; + e = ne; +} + +} // namespace bb::bernstein_yang + +#include "./bernstein_yang_inverse_wasm.hpp" + +namespace bb::bernstein_yang { + +#if defined(__wasm__) +using State = Wasm9x29; +#else +using State = Native5x64; +#endif + +/** + * @brief Variable-time safegcd inverse (Bernstein-Yang TCHES 2019, Pornin 2020 §4). + * + * Iterates (f, g) starting at (p, a); each outer iter batches BATCH divsteps + * into a 2×2 matrix M and applies M to (f, g) / (d, e). When g reaches 0, + * gcd(p, a) = ±f and a⁻¹ = ±d mod p. Returns 0 for a == 0. + * + * @param p_inv_mod_2k p⁻¹ mod 2^BATCH (used by apply_divstep_matrix's 2-adic correction). + * @pre p odd prime, p < 2^255, 0 ≤ a < p. + */ +template +inline uint256_t invert_vartime(const uint256_t& a, const uint256_t& p, u64 p_inv_mod_2k) noexcept +{ + if (a == uint256_t(0)) { + return uint256_t(0); + } + S P(p), f = P, g(a), d, e = S::one(); + // δ is Pornin's auxiliary used by the divstep rule to decide swap-vs-add cases. + i64 delta = 1; + for (int i = 0; i < S::NUM_ITERATIONS; ++i) { + DivstepMatrix m = S::compute_divstep_matrix(delta, f.low_64(), g.low_64()); + S::apply_divstep_matrix(m, f, g, d, e, P, p_inv_mod_2k); + if (g.is_zero()) { + break; + } + if ((i + 1) % S::REDUCE_INTERVAL == 0) { + d.reduce_to_canonical(P); + e.reduce_to_canonical(P); + } + } + d.reduce_to_canonical(P); + if (f.is_negative()) { + d.neg(); + d.reduce_to_canonical(P); + } + return d.to_uint256(); +} + +inline constexpr u64 p_inv_mod_2k_from_montgomery_r_inv(u64 r_inv) noexcept +{ + return State::p_inv_mod_2k_from_montgomery_r_inv(r_inv); +} + +// True iff `invert_vartime` is usable for field params T: the active +// kernel must be compilable on this toolchain (Native5x64 needs __int128, the +// WASM kernel is unconditional) and T's modulus must fit BY's < 2^255 +// precondition. Used to gate the dispatch in `field::invert()`. +template +inline constexpr bool supported_v = +#if defined(__SIZEOF_INT128__) || defined(__wasm__) + T::modulus_3 < (1ULL << 63); +#else + false; +#endif + +} // namespace bb::bernstein_yang diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse.md b/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse.md new file mode 100644 index 000000000000..c03ca5c5bd0e --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse.md @@ -0,0 +1,392 @@ +# Bernstein–Yang modular inverse: the algorithm + +A description of the modular inverse algorithm implemented in `bernstein_yang_inverse.hpp`. + +Throughout, $p$ is an **odd prime**, $0 < a < p$, and we want +$$a^{-1} \bmod p.$$ + +All quantities are integers; reductions modulo $p$ are made explicit when used. + +--- + +## 1. Setup + +We track a $2 \times 2$ integer matrix + +$$ +\Phi \;=\; \begin{pmatrix} f & d \\ g & e \end{pmatrix} +$$ + +initialised at + +$$ +\Phi_0 \;=\; \begin{pmatrix} p & 0 \\ a & 1 \end{pmatrix}, \qquad \det \Phi_0 \;=\; p. +$$ + +### The kernel invariant + +The whole algorithm preserves a single congruence: + +$$ +\boxed{\quad \Phi \begin{pmatrix} 1 \\ -a \end{pmatrix} \;\equiv\; \begin{pmatrix} 0 \\ 0 \end{pmatrix} \pmod p. \quad} +$$ + +Equivalently, the column vector $(1, -a)^T \in \mathbb{F}_p^2$ is in the kernel +of $\Phi \bmod p$. Unpacking entry by entry, this is just two Bezout congruences: + +$$ +f \;\equiv\; d \cdot a \pmod p, \qquad g \;\equiv\; e \cdot a \pmod p. +$$ + +The initial state satisfies them ($p \equiv 0 \cdot a$, $a \equiv 1 \cdot a$). + +### The reduction goal + +We reduce $\Phi$ to upper triangular form: + +$$ +\Phi_N \;=\; \begin{pmatrix} \pm 1 & a^{-1} \bmod p \\ 0 & \star \end{pmatrix}. +$$ + +When $g = 0$, the kernel invariant gives $e a \equiv 0 \pmod p$, +hence (since $\gcd(a, p) = 1$) $p \mid e$, putting the bottom-right at a +multiple of $p$. Independently, $f$ at termination equals $\pm \gcd(p, a) = \pm 1$, +and the invariant collapses to $d a \equiv \pm 1 \pmod p$, giving +$a^{-1} \equiv \pm d \pmod p$. The sign is read off the top-left entry. + +So **the BY algorithm is a reduction of $\Phi_0$ to upper triangular form, with +the inverse appearing as the top-right entry.** + +--- + +## 2. The generators $L_a, L_b, L_c$ + +The algorithm proceeds by repeated left multiplication: $\Phi \to L_n \Phi$, +choosing $L_n$ from a fixed set of three matrices: + +$$ +L_a \;=\; \begin{pmatrix} 1 & 0 \\ 0 & \tfrac{1}{2} \end{pmatrix}, \qquad +L_b \;=\; \begin{pmatrix} 1 & 0 \\ \tfrac{1}{2} & \tfrac{1}{2} \end{pmatrix}, \qquad +L_c \;=\; \begin{pmatrix} 0 & 1 \\ -\tfrac{1}{2} & \tfrac{1}{2} \end{pmatrix}. +$$ + +Each has $\det L_n = \tfrac{1}{2}$, so $L_n \in \mathrm{GL}_2(\mathbb{Z}[\tfrac{1}{2}])$ +but not in $\mathrm{SL}_2$. They act on the rows of $\Phi$: equivalently, on +$(f, g)$ and on $(d, e)$ in lockstep. + +### Divstep + +A small auxiliary integer $\delta$ is carried alongside $\Phi$, initialised at +$\delta_0 = 1$. At every step, the parity of $g$ together with the sign of +$\delta$ dictates which generator to apply: + +| Condition | Generator | $\delta$ update | +| --- | --- | --- | +| $g$ even | $L_a$ | $\delta \leftarrow \delta + 1$ | +| $g$ odd, $\delta \le 0$ | $L_b$ | $\delta \leftarrow \delta + 1$ | +| $g$ odd, $\delta > 0$ | $L_c$ | $\delta \leftarrow 1 - \delta$ | + +The convention $f$ odd is maintained throughout: $p$ is odd, the $L_n$ preserve +$f \bmod 2$, so $f$ stays odd until termination. + +One application of this rule is one **divstep**. Each divstep multiplies $\det \Phi$ +by $\tfrac{1}{2}$ (so after $N$ divsteps $\det \Phi_N = p / 2^N$) and shrinks +the magnitude of the lower-left entry $g$ on average. + +### Why the case split makes sense + +Unpacking $L_a, L_b, L_c$ as row operations on $(f, g)$: + +- $L_a$: $g \leftarrow g/2$. Valid only when $g$ is even, otherwise the result is non-integer. +- $L_b$: $g \leftarrow (g + f)/2$. Used when $g$ is odd but $|g| \le |f|$ in spirit (tracked by $\delta \le 0$): adding $f$ first makes the sum even. +- $L_c$: $(f, g) \leftarrow (g, (g - f)/2)$. Swap-and-subtract; used when $g$ is odd and $|g| > |f|$ in spirit ($\delta > 0$). + +The "in spirit" caveats are because BY tracks $|f|$ vs. $|g|$ comparisons via the +integer $\delta$ rather than by an actual size comparison — $\delta$ effectively +measures the running deficit "extra divisions of $g$ over extra divisions of $f$." + +### Right-column $p$-shift + +Left multiplication by $L_n$ alone would leave $\Phi$ with non-integer entries +in the right column (because $L_n$ has $\tfrac{1}{2}$ entries acting on +$(d, e)$). To keep the state integer-valued we add a multiple of $p$ to the +right column before halving — a *2-adic correction* that vanishes modulo $p$ +and hence doesn't disturb the kernel invariant. The mechanics live in §6. + +So one full divstep is: +$$ +\Phi \;\longmapsto\; L_n \,\Phi \;+\; p \cdot \big(\text{adjustment in right column}\big), \qquad L_n \in \{L_a, L_b, L_c\}. +$$ + +The left column ($f, g$) updates via clean left multiplication; the right +column ($d, e$) updates via left multiplication plus a $p$-shift. + +--- + +## 3. Convergence + +The Bernstein--Yang/Pornin convergence proof guarantees $g = 0$ within 735 divsteps for 254-bit inputs; native runs $12 \cdot 62 = 744$ divsteps and wasm runs $13 \cdot 58 = 754$, so both cover the bound. + +--- + +## 4. Batching: products of $L_n$ as a $2 \times 2$ matrix + +A single divstep touches only the lowest bit of $g$ (to read parity) and is a +linear combination of $(f, g)$. So the next $\mathrm{BATCH}$ divsteps depend +only on the low $\mathrm{BATCH}$ bits of $(f, g)$ — the high bits do not +influence the choice between $L_a, L_b, L_c$ within those steps. + +Let $\mathrm{BATCH} = 62$ (the implementation choice on native; $58$ on wasm). +Run $\mathrm{BATCH}$ divsteps purely on the low $64$ bits of $(f, g)$, +accumulating the result as a single $2 \times 2$ rational matrix + +$$ +M \;=\; L_{n_{\mathrm{BATCH}-1}} \cdots L_{n_1} L_{n_0} \;=\; \begin{pmatrix} u & v \\ q & r \end{pmatrix} \cdot 2^{-\mathrm{BATCH}}. +$$ + +After clearing the implicit denominator, the *integer* part $\begin{pmatrix} u & v \\ q & r \end{pmatrix} = 2^{\mathrm{BATCH}} M$ has all four entries bounded by $|u|, |v|, |q|, |r| \le 2^{\mathrm{BATCH}}$ (each individual $L_n$ at most doubles one entry of the running product). With $\mathrm{BATCH} = 62$, the four entries fit in `int64_t`. + +--- + +## 5. Applying $M$ to the left column $(f, g)$ + +The new $(f', g')^T = M \cdot (f, g)^T$ is computed as + +1. The two integer linear combinations + $$t_f = u \cdot f + v \cdot g, \qquad t_g = q \cdot f + r \cdot g$$ + in full precision. With $|u|, \ldots, |r| \le 2^{62}$ and $|f|, |g|$ + fitting in $256$ bits (see §7), each product fits in $318$ bits and the + sum in $319$ bits. +2. Arithmetic-right-shift by $\mathrm{BATCH}$ bits: + $$f' \;=\; t_f \gg \mathrm{BATCH}, \qquad g' \;=\; t_g \gg \mathrm{BATCH}.$$ + +The shift is exact: the low $\mathrm{BATCH}$ bits of $t_f$ and $t_g$ are zero +by construction. Each divstep makes one bit of $g$ cancel, and after +$\mathrm{BATCH}$ divsteps the bottom $\mathrm{BATCH}$ bits of the linear +combinations are guaranteed zero — this is the "exact integer division" +property of the divstep cascade. + +In the implementation this lives in `NativeMatrix::signed_linear_combination` (the +products) and `NativeMatrix::arithmetic_shift_by_batch` (the right shift). + +--- + +## 6. Applying $M$ to the right column $(d, e)$: the 2-adic correction + +Applying $M$ to $(d, e)$ presents the same divisibility question, but now the +state lives modulo $p$. The integer linear combination $t = u \cdot d + v \cdot e$ +is **not** generally divisible by $2^{\mathrm{BATCH}}$. We need an integer +correction $k$ such that $t + k \cdot p$ is divisible by $2^{\mathrm{BATCH}}$, +then take the quotient. + +This is a 2-adic problem: + +$$ +t + k\cdot p \;\equiv\; 0 \pmod{2^{\mathrm{BATCH}}} +\;\Longleftrightarrow\; +k \;\equiv\; -\, t \cdot p^{-1} \pmod{2^{\mathrm{BATCH}}}. +$$ + +$p$ is odd, so $p^{-1} \bmod 2^{\mathrm{BATCH}}$ exists. The implementation +precomputes it once: $p^{-1} \bmod 2^{\mathrm{BATCH}}$ is the low +$\mathrm{BATCH}$ bits of $-r_{\mathrm{inv}}$, where $r_{\mathrm{inv}}$ is +barretenberg's Montgomery constant $-p^{-1} \bmod 2^{64}$. + +The correction step: + +$$ +\begin{aligned} +t &\;=\; u \cdot d + v \cdot e, \\ +k &\;\equiv\; -\, t \cdot p^{-1} \pmod{2^{\mathrm{BATCH}}}, \\ +d' &\;=\; (t + k \cdot p) \gg \mathrm{BATCH}. +\end{aligned} +$$ + +After the shift, $d'$ is an integer (because $t + k \cdot p$ is divisible by +$2^{\mathrm{BATCH}}$). The added $k \cdot p$ contributes only a multiple of $p$ +to the right column, so the kernel invariant $\Phi \binom{1}{-a} \equiv 0 \pmod p$ +is preserved. + +The same construction applied with $(q, r)$ and the second row of $M$ gives +the new $e'$. This is `apply_corrected_row` in the implementation. + +--- + +## 7. State bounds: why 5 limbs + +`Native5x64` stores each state value in five signed 64-bit limbs. Four limbs +would be enough for canonical field elements, but the BY state is not always +canonical while a batched matrix is being applied. + +For the left column, $f$ and $g$ are roughly 256-bit values between matrix +applications. During §5, however, the products $u f$, $v g$, $q f$, and $r g$ +can be as large as $2^{62} \cdot 2^{256}$, and their sums need about 319 bits. +Those sums are held in the temporary six-limb arrays inside +`NativeMatrix::signed_linear_combination`; after the exact right shift by +`BATCH`, $f$ and $g$ return to the normal state size. + +For the right column, $d$ and $e$ need extra resting headroom too. The 2-adic +correction adds $k \cdot p$ before the shift, and repeated matrix applications +can move 00,p)1 The implementation therefore reduces them +to canonical form every `REDUCE_INTERVAL = 4` iterations rather than after every +matrix application. + +Between reductions, $|d|$ and $|e|$ are bounded by roughly $32p$. Five signed +64-bit limbs provide about 319 bits of magnitude, enough room for that growth. +`Native5x64::reduce_to_canonical(p)` then repeatedly adds or subtracts $p$, +with a loop bound of 36 to cover the same worst-case headroom. + +--- + +## 8. Final answer extraction + +After the iteration loop: + +- $g = 0$. +- $f = \pm \gcd(p, a) = \pm 1$. +- Kernel invariant: $d \cdot a \equiv f \pmod{p}$, so $d \equiv \pm a^{-1} \pmod p$. + +Reduce $d$ to canonical form $[0, p)$. Then + +$$ +a^{-1} \bmod p \;=\; +\begin{cases} +d & \text{if } f > 0, \\ +{-d} \bmod p & \text{if } f < 0. +\end{cases} +$$ + +In code, this is the +`if (f.is_negative()) { d.neg(); d.reduce_to_canonical(P); }` at the end of +`invert_vartime`. + +--- + +## 9. Math-to-code map + +The implementation keeps the same mathematical state but hides the limb details +behind `Native5x64` and `Wasm9x29`. + +| Math description | Code | +| --- | --- | +| $\Phi = \begin{pmatrix} f & d \\ g & e \end{pmatrix}$ | `S P(p), f = P, g(a), d, e = S::one()` in `invert_vartime` | +| Auxiliary divstep counter $\delta$ | local `i64 delta` | +| Product of one batch of divsteps | `S::compute_divstep_matrix(delta, f.low_64(), g.low_64())` | +| $M = \begin{pmatrix} u & v \\ q & r \end{pmatrix} 2^{-B}$ | `DivstepMatrix { u, v, q, r }`; the denominator is the implicit `2^BATCH` | +| $M(f,g)/2^B$ | `apply_divstep_matrix`, using `NativeMatrix::signed_linear_combination` and `arithmetic_shift_by_batch` on native | +| $k \equiv -t p^{-1} \pmod {2^B}$ | `apply_corrected_row` on native; streamed as `k_d`, `k_e` on wasm | +| $p^{-1} \bmod 2^B$ from Montgomery metadata | `p_inv_mod_2k_from_montgomery_r_inv` | +| Periodic reduction of $d,e$ modulo $p$ | `reduce_to_canonical(P)` every `REDUCE_INTERVAL` iterations | +| Final sign correction $a^{-1} = \pm d$ | final `if (f.is_negative()) { d.neg(); ... }` | +| Platform-specific limb representation | `using State = Native5x64` or `Wasm9x29` | + +--- + +## 10. Example + +This example uses the same batched mechanics as the implementation, but with a toy $B = 3$ instead of native $B = 62$. + +Let + +$$ +p = 17, \qquad a = 3, \qquad +\Phi_0 = \begin{pmatrix} f & d \\ g & e \end{pmatrix} + = \begin{pmatrix} 17 & 0 \\ 3 & 1 \end{pmatrix}, \qquad +\delta_0 = 1. +$$ + +Since $17 \equiv 1 \pmod 8$, we have $p^{-1} \equiv 1 \pmod 8$. For each row +of the right column, the correction is + +$$ +k \equiv -t \cdot p^{-1} \pmod 8, +\qquad +x^\prime = (t + k p) / 8. +$$ + +### Batch 0 + +The first three divsteps are $L_c, L_b, L_a$. Their product is + +$$ +M_0 = \begin{pmatrix} 0 & 8 \\ -1 & 3 \end{pmatrix} \cdot 2^{-3}. +$$ + +Apply it to the left column: + +$$ +f^\prime = \frac{0 \cdot 17 + 8 \cdot 3}{8} = 3, +\qquad +g^\prime = \frac{-1 \cdot 17 + 3 \cdot 3}{8} = -1. +$$ + +Apply it to the right column. For $d^\prime$, $t = 0 \cdot 0 + 8 \cdot 1 = 8$, +so $k = 0$ and $d^\prime = 1$. For $e^\prime$, $t = -1 \cdot 0 + 3 \cdot 1 = 3$, +so $k \equiv -3 \equiv 5 \pmod 8$ and + +$$ +e^\prime = \frac{3 + 5 \cdot 17}{8} = 11. +$$ + +After batch 0, + +$$ +(f,g,d,e;\delta) = (3,-1,1,11;2). +$$ + +### Batch 1 + +The next three divsteps are $L_c, L_a, L_b$, giving + +$$ +M_1 = \begin{pmatrix} 0 & 8 \\ -1 & 5 \end{pmatrix} \cdot 2^{-3}. +$$ + +For the left column, + +$$ +f^\prime = \frac{0 \cdot 3 + 8 \cdot (-1)}{8} = -1, +\qquad +g^\prime = \frac{-1 \cdot 3 + 5 \cdot (-1)}{8} = -1. +$$ + +For the right column, the first row gives $t = 88$ and $k = 0$, hence +$d^\prime = 11$. The second row gives $t = -1 \cdot 1 + 5 \cdot 11 = 54$, +so $k \equiv -54 \equiv 2 \pmod 8$ and + +$$ +e^\prime = \frac{54 + 2 \cdot 17}{8} = 11. +$$ + +After batch 1, + +$$ +(f,g,d,e;\delta) = (-1,-1,11,11;1). +$$ + +### Batch 2 + +The next batch starts with the terminating divstep $L_c$; the remaining two +low-bit divsteps keep $g = 0$. The accumulated matrix is + +$$ +M_2 = \begin{pmatrix} 0 & 8 \\ -1 & 1 \end{pmatrix} \cdot 2^{-3}. +$$ + +Applying it gives + +$$ +f^\prime = \frac{0 \cdot (-1) + 8 \cdot (-1)}{8} = -1, +\qquad +g^\prime = \frac{-1 \cdot (-1) + 1 \cdot (-1)}{8} = 0, +$$ + +and for the right column, $d^\prime = 11$ and $e^\prime = 0$. + +Now $g=0$. Since $f=-1$, the inverse is $-d \bmod p$: + +$$ +a^{-1} \equiv -11 \equiv 6 \pmod {17}, +\qquad +3 \cdot 6 \equiv 1 \pmod {17}. +$$ diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse.test.cpp new file mode 100644 index 000000000000..35f65e5c56fe --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse.test.cpp @@ -0,0 +1,150 @@ +#include "barretenberg/ecc/fields/bernstein_yang_inverse.hpp" +#include "barretenberg/ecc/curves/bn254/fq.hpp" +#include "barretenberg/ecc/curves/bn254/fr.hpp" +#include "barretenberg/ecc/curves/secp256k1/secp256k1.hpp" +#include "barretenberg/ecc/curves/secp256r1/secp256r1.hpp" +#include "barretenberg/ecc/fields/bernstein_yang_inverse_wasm.hpp" +#include "barretenberg/numeric/uint256/uint256.hpp" +#include + +namespace { + +using bb::numeric::uint256_t; +using Native = bb::bernstein_yang::Native5x64; +using Wasm = bb::bernstein_yang::Wasm9x29; + +template uint256_t to_raw(const F& a) +{ + F nonmont = a.from_montgomery_form_reduced(); + return { nonmont.data[0], nonmont.data[1], nonmont.data[2], nonmont.data[3] }; +} + +template F from_raw(const uint256_t& a) +{ + F r{ a.data[0], a.data[1], a.data[2], a.data[3] }; + r.self_to_montgomery_form(); + return r; +} + +template uint256_t by_invert(const F& a) +{ + constexpr uint256_t p = F::modulus; + constexpr uint64_t p_inv_mod_2k = S::p_inv_mod_2k_from_montgomery_r_inv(F::Params::r_inv); + return bb::bernstein_yang::invert_vartime(to_raw(a), p, p_inv_mod_2k); +} + +// Random a, BY result roundtripped through Montgomery must invert a. +template void check_inverse_matches_modexp(size_t n) +{ + for (size_t i = 0; i < n; ++i) { + F a = F::random_element(); + if (a == F::zero()) { + continue; + } + F got = from_raw(by_invert(a)); + EXPECT_EQ(got * a, F::one()) << "iteration " << i; + } +} + +// Same input through both kernels must produce identical canonical output. +template void check_native_matches_wasm(size_t n) +{ + for (size_t i = 0; i < n; ++i) { + F a = F::random_element(); + if (a == F::zero()) { + continue; + } + EXPECT_EQ(by_invert(a), by_invert(a)) << "iteration " << i; + } +} + +template void check_edge_cases() +{ + // invert(1) == 1 + EXPECT_EQ(from_raw(by_invert(F::one())), F::one()); + + // invert(-1) == -1 + F neg_one = -F::one(); + EXPECT_EQ(from_raw(by_invert(neg_one)), neg_one); + + // a * invert(a) == 1 for small / boundary / sparse inputs + F two = F::one() + F::one(); + F neg_two = -two; + F top_limb_only{ 0, 0, 0, 1 }; // raw value 2^192, well below modulus for 254-bit fields + top_limb_only.self_to_montgomery_form(); + for (const F& a : { two, neg_two, top_limb_only }) { + F inv = from_raw(by_invert(a)); + EXPECT_EQ(inv * a, F::one()); + } + + // Involution: invert(invert(a)) == a, on random samples. + for (int i = 0; i < 64; ++i) { + F a = F::random_element(); + if (a == F::zero()) { + continue; + } + F inv = from_raw(by_invert(a)); + F inv_inv = from_raw(by_invert(inv)); + EXPECT_EQ(inv_inv, a); + } +} + +} // namespace + +TEST(Wasm9x29, MatchesModexp_BN254_Fr) +{ + check_inverse_matches_modexp(500); +} +TEST(Wasm9x29, MatchesModexp_BN254_Fq) +{ + check_inverse_matches_modexp(500); +} + +TEST(Native5x64, MatchesModexp_BN254_Fr) +{ + check_inverse_matches_modexp(500); +} +TEST(Native5x64, MatchesModexp_BN254_Fq) +{ + check_inverse_matches_modexp(500); +} + +TEST(Wasm9x29, EdgeCases_BN254_Fr) +{ + check_edge_cases(); +} +TEST(Wasm9x29, EdgeCases_BN254_Fq) +{ + check_edge_cases(); +} +TEST(Native5x64, EdgeCases_BN254_Fr) +{ + check_edge_cases(); +} +TEST(Native5x64, EdgeCases_BN254_Fq) +{ + check_edge_cases(); +} + +TEST(BernsteinYang, NativeMatchesWasm_BN254_Fr) +{ + check_native_matches_wasm(500); +} +TEST(BernsteinYang, NativeMatchesWasm_BN254_Fq) +{ + check_native_matches_wasm(500); +} + +// 256-bit moduli must keep working through field::invert() via the Fermat +// fallback (the BY dispatch is gated by `modulus_3 < 2^63`, which excludes +// secp256k1/r1). Pin that behavior so a future change to the gate gets caught. +TEST(BernsteinYang, FermatFallback_Secp256k1_Fr) +{ + auto a = bb::secp256k1::fr::random_element(); + EXPECT_EQ(a * a.invert(), bb::secp256k1::fr::one()); +} +TEST(BernsteinYang, FermatFallback_Secp256r1_Fr) +{ + auto a = bb::secp256r1::fr::random_element(); + EXPECT_EQ(a * a.invert(), bb::secp256r1::fr::one()); +} diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse_wasm.hpp b/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse_wasm.hpp new file mode 100644 index 000000000000..4d728e3de73b --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/ecc/fields/bernstein_yang_inverse_wasm.hpp @@ -0,0 +1,280 @@ +// 9 × 29-bit-limb state. Included from bernstein_yang_inverse.hpp; uses the +// u64 / i64 / DivstepMatrix names declared there. +// +// Why a different limb size from Native5x64: on wasm32 there is no native +// 64×64→128 multiply, so i64 × u64 → __int128 lowers to a compiler-rt +// __multi3 dispatch. Pack the 254-bit state into 9 limbs of 29 bits each +// instead: every limb-level product is then i29 × i29 = i58, fitting in a +// single WASM i64.mul. Choosing the per-iter BATCH as exactly 2 × LIMB_BITS +// makes the "/ 2^BATCH" at the end of apply_divstep_matrix equivalent to dropping +// the bottom two 29-bit limbs (no sub-limb shift on the intermediate). + +#pragma once + +#include "barretenberg/numeric/uint256/uint256.hpp" +#include + +namespace bb::bernstein_yang { + +class Wasm9x29 { + public: + // Divsteps per matrix application; smaller than Native5x64::BATCH so + // the resulting "/ 2^BATCH" is limb-aligned (= drop the bottom two + // 29-bit limbs) and no sub-limb shift is needed on the intermediate. + static constexpr int BATCH = 58; + + // ⌈735 / 58⌉ = 13. Same convergence-bound logic as Native5x64; one + // iter more because BATCH is smaller. + static constexpr int NUM_ITERATIONS = 13; + + // |d|, |e| can grow by ~2× + p per matrix application; after 4 iters + // they reach ~31p ≈ 2^259, which still fits in the 9 × 29-bit signed + // state (capacity ~2^260). Reducing once every 4 iters instead of + // every iter saves ~3× reduce_to_canonical calls per inversion. + static constexpr int REDUCE_INTERVAL = 4; + + // Worst-case iteration cap inside reduce_to_canonical. After + // REDUCE_INTERVAL iters between reductions, |d|, |e| ≤ (2^(REDUCE_INTERVAL+1) - 1)·p, + // so reducing requires that many subtractions plus one break iter. + static constexpr int REDUCE_TO_CANONICAL_MAX_ITERS = 36; + static_assert((1U << (REDUCE_INTERVAL + 1)) <= REDUCE_TO_CANONICAL_MAX_ITERS, + "REDUCE_INTERVAL too large for reduce_to_canonical iteration bound"); + + Wasm9x29() noexcept + : l{} + {} + explicit Wasm9x29(const uint256_t& x) noexcept + { + const u64* d = x.data; + l[0] = (i64)(d[0] & LIMB_MASK); + l[1] = (i64)((d[0] >> 29) & LIMB_MASK); + l[2] = (i64)(((d[0] >> 58) & 0x3FULL) | ((d[1] & 0x7FFFFFULL) << 6)); + l[3] = (i64)((d[1] >> 23) & LIMB_MASK); + l[4] = (i64)(((d[1] >> 52) & 0xFFFULL) | ((d[2] & 0x1FFFFULL) << 12)); + l[5] = (i64)((d[2] >> 17) & LIMB_MASK); + l[6] = (i64)(((d[2] >> 46) & 0x3FFFFULL) | ((d[3] & 0x7FFULL) << 18)); + l[7] = (i64)((d[3] >> 11) & LIMB_MASK); + l[8] = (i64)((d[3] >> 40) & 0xFFFFFFULL); + } + static Wasm9x29 one() noexcept + { + Wasm9x29 r; + r.l[0] = 1; + return r; + } + + uint256_t to_uint256() const noexcept + { + return { (u64)l[0] | ((u64)l[1] << 29) | ((u64)l[2] << 58), + ((u64)l[2] >> 6) | ((u64)l[3] << 23) | ((u64)l[4] << 52), + ((u64)l[4] >> 12) | ((u64)l[5] << 17) | ((u64)l[6] << 46), + ((u64)l[6] >> 18) | ((u64)l[7] << 11) | ((u64)l[8] << 40) }; + } + u64 low_64() const noexcept { return (u64)l[0] | ((u64)l[1] << 29) | (((u64)l[2] & 0x3F) << 58); } + bool is_zero() const noexcept + { + i64 a = 0; + for (int i = 0; i < N; ++i) { + a |= l[i]; + } + return a == 0; + } + bool is_negative() const noexcept { return l[N - 1] < 0; } + void neg() noexcept + { + for (int i = 0; i < N; ++i) { + l[i] = -l[i]; + } + normalise(); + } + void reduce_to_canonical(const Wasm9x29& p) noexcept; + + // See Native5x64 for the batched divstep matrix, matrix application, + // and p_inv_mod_2k_from_montgomery_r_inv contracts; the bodies differ only + // in limb representation. + static DivstepMatrix compute_divstep_matrix(i64& delta, u64 f_lo, u64 g_lo) noexcept; + static void apply_divstep_matrix(const DivstepMatrix& m, + Wasm9x29& f, + Wasm9x29& g, + Wasm9x29& d, + Wasm9x29& e, + const Wasm9x29& p, + u64 p_inv_mod_2k) noexcept; + static constexpr u64 p_inv_mod_2k_from_montgomery_r_inv(u64 r_inv) noexcept + { + // r_inv = -p^{-1} mod 2^64, so 0 - r_inv = p^{-1} mod 2^64. + return (0ULL - r_inv) & ((1ULL << BATCH) - 1); + } + + private: + static constexpr int N = 9; + static constexpr int LIMB_BITS = 29; + static constexpr u64 LIMB_MASK = (1ULL << LIMB_BITS) - 1; + i64 l[N]; // top limb carries sign; lower limbs in [0, 2^29) post-normalise + + void normalise() noexcept + { + i64 c = 0; + for (int i = 0; i < N - 1; ++i) { + i64 v = l[i] + c; + l[i] = v & (i64)LIMB_MASK; + c = v >> LIMB_BITS; + } + l[N - 1] += c; + } + void add_inplace(const Wasm9x29& b) noexcept + { + for (int i = 0; i < N; ++i) { + l[i] += b.l[i]; + } + normalise(); + } + void sub_inplace(const Wasm9x29& b) noexcept + { + for (int i = 0; i < N; ++i) { + l[i] -= b.l[i]; + } + normalise(); + } +}; + +// Iter cap chosen by the REDUCE_TO_CANONICAL_MAX_ITERS / REDUCE_INTERVAL +// static_assert above; see those constants for the magnitude argument. +inline void Wasm9x29::reduce_to_canonical(const Wasm9x29& p) noexcept +{ + normalise(); + for (int it = 0; it < REDUCE_TO_CANONICAL_MAX_ITERS; ++it) { + if (is_negative()) { + add_inplace(p); + continue; + } + int cmp = 0; + for (int i = N - 1; i >= 0; --i) { + if (l[i] != p.l[i]) { + cmp = l[i] > p.l[i] ? 1 : -1; + break; + } + } + if (cmp < 0) { + break; + } + sub_inplace(p); + } +} + +inline DivstepMatrix Wasm9x29::compute_divstep_matrix(i64& delta, u64 f_lo, u64 g_lo) noexcept +{ + i64 u = 1, v = 0, q = 0, r = 1; + for (int i = 0; i < BATCH; ++i) { + if (g_lo & 1) { + if (delta > 0) { + u64 nf = g_lo, ng = (g_lo - f_lo) >> 1; + i64 nu = q << 1, nv = r << 1, nq = q - u, nr = r - v; + f_lo = nf; + g_lo = ng; + u = nu; + v = nv; + q = nq; + r = nr; + delta = 1 - delta; + } else { + g_lo = (g_lo + f_lo) >> 1; + q = q + u; + r = r + v; + u <<= 1; + v <<= 1; + delta = delta + 1; + } + } else { + g_lo >>= 1; + u <<= 1; + v <<= 1; + delta = delta + 1; + } + } + return { u, v, q, r }; +} + +// Streamed schoolbook: for each limb position i compute +// nf_i = u_lo·f_i + v_lo·g_i + u_hi·f_{i-1} + v_hi·g_{i-1} + carry_in +// (similarly ng, nd, ne), then carry_out = nf_i >> LIMB_BITS, masked low +// 29 bits land at output position i - 2 (= exact >> BATCH). The de row +// derives k_d, k_e from the low two limbs up front and folds k·p into the +// per-limb formula from position 2 onward. No 11-limb intermediate is +// materialised — the JIT keeps the four running carries in registers. +inline void Wasm9x29::apply_divstep_matrix(const DivstepMatrix& m, + Wasm9x29& f, + Wasm9x29& g, + Wasm9x29& d, + Wasm9x29& e, + const Wasm9x29& p, + u64 p_inv_mod_2k) noexcept +{ + constexpr u64 MASK_BATCH = (1ULL << BATCH) - 1; + const i64 u_lo = m.u & (i64)LIMB_MASK, u_hi = m.u >> LIMB_BITS; + const i64 v_lo = m.v & (i64)LIMB_MASK, v_hi = m.v >> LIMB_BITS; + const i64 q_lo = m.q & (i64)LIMB_MASK, q_hi = m.q >> LIMB_BITS; + const i64 r_lo = m.r & (i64)LIMB_MASK, r_hi = m.r >> LIMB_BITS; + + { + i64 cf = 0, cg = 0, fp = 0, gp = 0; + for (int i = 0; i < N; ++i) { + const i64 fi = f.l[i], gi = g.l[i]; + const i64 nf = u_lo * fi + v_lo * gi + u_hi * fp + v_hi * gp + cf; + const i64 ng = q_lo * fi + r_lo * gi + q_hi * fp + r_hi * gp + cg; + cf = nf >> LIMB_BITS; + cg = ng >> LIMB_BITS; + if (i >= 2) { + f.l[i - 2] = nf & (i64)LIMB_MASK; + g.l[i - 2] = ng & (i64)LIMB_MASK; + } + fp = fi; + gp = gi; + } + const i64 nf9 = u_hi * fp + v_hi * gp + cf; + const i64 ng9 = q_hi * fp + r_hi * gp + cg; + f.l[N - 2] = nf9 & (i64)LIMB_MASK; + g.l[N - 2] = ng9 & (i64)LIMB_MASK; + f.l[N - 1] = nf9 >> LIMB_BITS; + g.l[N - 1] = ng9 >> LIMB_BITS; + } + + // k_d, k_e (mod 2^BATCH) clear the low BATCH bits of nd, ne; fold k·p + // into the streaming pass from position 2 onward. + { + const i64 d0 = d.l[0], e0 = e.l[0], d1 = d.l[1], e1 = e.l[1]; + const i64 nd0 = u_lo * d0 + v_lo * e0; + const i64 ne0 = q_lo * d0 + r_lo * e0; + const i64 nd1 = u_lo * d1 + v_lo * e1 + u_hi * d0 + v_hi * e0; + const i64 ne1 = q_lo * d1 + r_lo * e1 + q_hi * d0 + r_hi * e0; + const u64 t_d = ((u64)nd0 & LIMB_MASK) | (((u64)(nd1 + (nd0 >> LIMB_BITS)) & LIMB_MASK) << LIMB_BITS); + const u64 t_e = ((u64)ne0 & LIMB_MASK) | (((u64)(ne1 + (ne0 >> LIMB_BITS)) & LIMB_MASK) << LIMB_BITS); + const u64 k_d = ((0ULL - t_d) * p_inv_mod_2k) & MASK_BATCH; + const u64 k_e = ((0ULL - t_e) * p_inv_mod_2k) & MASK_BATCH; + const i64 kd_lo = (i64)(k_d & LIMB_MASK), kd_hi = (i64)(k_d >> LIMB_BITS); + const i64 ke_lo = (i64)(k_e & LIMB_MASK), ke_hi = (i64)(k_e >> LIMB_BITS); + i64 cd = (nd1 + kd_lo * p.l[1] + kd_hi * p.l[0] + ((nd0 + kd_lo * p.l[0]) >> LIMB_BITS)) >> LIMB_BITS; + i64 ce = (ne1 + ke_lo * p.l[1] + ke_hi * p.l[0] + ((ne0 + ke_lo * p.l[0]) >> LIMB_BITS)) >> LIMB_BITS; + + i64 dp = d1, ep = e1; + for (int i = 2; i < N; ++i) { + const i64 di = d.l[i], ei = e.l[i]; + const i64 nd = u_lo * di + v_lo * ei + u_hi * dp + v_hi * ep + kd_lo * p.l[i] + kd_hi * p.l[i - 1] + cd; + const i64 ne = q_lo * di + r_lo * ei + q_hi * dp + r_hi * ep + ke_lo * p.l[i] + ke_hi * p.l[i - 1] + ce; + cd = nd >> LIMB_BITS; + ce = ne >> LIMB_BITS; + d.l[i - 2] = nd & (i64)LIMB_MASK; + e.l[i - 2] = ne & (i64)LIMB_MASK; + dp = di; + ep = ei; + } + const i64 nd9 = u_hi * dp + v_hi * ep + kd_hi * p.l[N - 1] + cd; + const i64 ne9 = q_hi * dp + r_hi * ep + ke_hi * p.l[N - 1] + ce; + d.l[N - 2] = nd9 & (i64)LIMB_MASK; + e.l[N - 2] = ne9 & (i64)LIMB_MASK; + d.l[N - 1] = nd9 >> LIMB_BITS; + e.l[N - 1] = ne9 >> LIMB_BITS; + } +} + +} // namespace bb::bernstein_yang diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp b/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp index fdbb7508dfa8..6362ac70e04c 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/fields/field_declarations.hpp @@ -340,6 +340,7 @@ template struct alignas(32) field { static constexpr uint256_t modulus_minus_two = uint256_t(Params::modulus_0 - 2ULL, Params::modulus_1, Params::modulus_2, Params::modulus_3); constexpr field invert() const noexcept; + constexpr field invert_const_time() const noexcept; template // has size() and operator[]. requires requires(C& c) { diff --git a/barretenberg/cpp/src/barretenberg/ecc/fields/field_impl.hpp b/barretenberg/cpp/src/barretenberg/ecc/fields/field_impl.hpp index f97695d9be19..c9283b3847c6 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/fields/field_impl.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/fields/field_impl.hpp @@ -15,6 +15,8 @@ #include #include +#include "./bernstein_yang_inverse.hpp" +#include "./bernstein_yang_inverse_wasm.hpp" #include "./field_declarations.hpp" #include "barretenberg/numeric/uint256/uint256.hpp" @@ -384,6 +386,29 @@ template constexpr field field::pow(const uint64_t exponent) con } template constexpr field field::invert() const noexcept +{ + if (*this == zero()) { + bb::assert_failure("Trying to invert zero in the field"); + } + // BY uses __int128 and is not constexpr-evaluable; compile-time inversions keep Fermat. + if (std::is_constant_evaluated()) { + return pow(modulus_minus_two); + } + if constexpr (bernstein_yang::supported_v) { + constexpr uint256_t p_uint = modulus; + constexpr uint64_t p_inv_mod_2k = bernstein_yang::p_inv_mod_2k_from_montgomery_r_inv(T::r_inv); + field non_mont = from_montgomery_form_reduced(); + uint256_t a{ non_mont.data[0], non_mont.data[1], non_mont.data[2], non_mont.data[3] }; + uint256_t inv = bernstein_yang::invert_vartime(a, p_uint, p_inv_mod_2k); + field result{ inv.data[0], inv.data[1], inv.data[2], inv.data[3] }; + result.self_to_montgomery_form(); + return result; + } else { + return pow(modulus_minus_two); + } +} + +template constexpr field field::invert_const_time() const noexcept { if (*this == zero()) { bb::assert_failure("Trying to invert zero in the field"); diff --git a/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp b/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp index 03c24a5acb91..7c6fbb8d1c5b 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/groups/element.hpp @@ -113,6 +113,8 @@ template class alignas(32) element { // constexpr Fr operator/(const element& other) noexcept {} constexpr element normalize() const noexcept; + constexpr element normalize_const_time() const noexcept; + constexpr affine_element to_affine_const_time() const noexcept; static element infinity(); BB_INLINE constexpr element set_infinity() const noexcept; BB_INLINE constexpr void self_set_infinity() noexcept; diff --git a/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp b/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp index bf3687445ad6..7c7f299f5fda 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp @@ -63,6 +63,10 @@ constexpr element& element::operator=(element&& other) noe return *this; } +// Warning: variable-time — calls `z.invert()` (Bernstein-Yang safegcd). Do not +// use on points derived from secret material (signing nonces, private keys, DH +// shared secrets). For those, call `to_affine_const_time()` explicitly; the +// implicit conversion does NOT pick up the const-time path. template constexpr element::operator affine_element() const noexcept { if (is_point_at_infinity()) { @@ -79,6 +83,23 @@ template constexpr element::operator af return result; } +template +constexpr affine_element element::to_affine_const_time() const noexcept +{ + if (is_point_at_infinity()) { + affine_element result; + result.x = Fq(0); + result.y = Fq(0); + result.self_set_infinity(); + return result; + } + Fq z_inv = z.invert_const_time(); + Fq zz_inv = z_inv.sqr(); + Fq zzz_inv = zz_inv * z_inv; + affine_element result(x * zz_inv, y * zzz_inv); + return result; +} + template constexpr void element::self_dbl() noexcept { if constexpr (Fq::modulus.data[3] >= MODULUS_TOP_LIMB_LARGE_THRESHOLD) { @@ -461,12 +482,20 @@ element element::mul_const_time(const Fr& scalar, numeric: return R0; } +// Warning: variable-time via the implicit affine conversion above. For +// secret-input points use `normalize_const_time()`. template constexpr element element::normalize() const noexcept { const affine_element converted = *this; return element(converted); } +template +constexpr element element::normalize_const_time() const noexcept +{ + return element(to_affine_const_time()); +} + template element element::infinity() { element e{};