From a902cb59981b00b29a72d418489ab59c415dd8cb Mon Sep 17 00:00:00 2001 From: Peter Clemente III Date: Fri, 29 May 2026 13:14:38 -0400 Subject: [PATCH 1/2] wave2-polish: CPU unit tests, clippy fixes, CI for host crate Conservative polish pass on the Rust host crate and project hygiene. No experiment logic, algorithms, or reported numbers were changed. - Add 9 CPU-only unit tests for the FHRR / Cube-Memory reference primitives in cube-memory-host (cpu.rs): complex multiply/conjugate, bind/unbind round-trip identity, bind commutativity, unitize unit modulus, superpose unit modulus, cleanup self-query exactness, and retrieve softmax convexity. These run on stable Rust with no GPU and pin the ground truth the GPU parity tests compare against. - Fix all clippy warnings in cube-memory-host (lib + tests): field_reassign_with_default, cloned_ref_to_slice_refs, manual div_ceil in parity.rs, and an index-loop rewrite in retrieve(); add a scoped allow for the intentional many-arg test harness run_pair(). Behaviour is unchanged. - Add .github/workflows/ci.yml: a Rust job (build + clippy -D warnings + cpu unit tests on stable) and a Python lint job (compileall + ruff). The rust-gpu nightly shader build and the GPU parity tests are documented as out of scope (no GPU / fragile nightly on hosted CI). - README: correct the shaders run instructions (CPU --lib tests vs GPU parity tests) and the CI line under Limitations. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/ci.yml | 80 ++++++++++ README.md | 20 ++- shaders/cube-memory-host/src/cpu.rs | 188 ++++++++++++++++++++++- shaders/cube-memory-host/src/gpu.rs | 12 +- shaders/cube-memory-host/tests/parity.rs | 8 +- 5 files changed, 296 insertions(+), 12 deletions(-) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..ae7ea06 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,80 @@ +name: CI + +on: + push: + branches: [master] + pull_request: + +# Scope note: this repo is a research/experiment record (a documented +# negative result). CI covers only what runs deterministically without +# special hardware or large downloads: +# +# * The Rust `cube-memory-host` crate's CPU reference path: build, +# clippy, and the `cpu` unit tests. These run on STABLE Rust. +# * A Python syntax check + ruff lint over the experiment scripts. +# +# Deliberately NOT in CI (and why): +# * The rust-gpu shader build (`cube-memory-shader{,-builder}`) needs +# the pinned nightly in `shaders/rust-toolchain.toml` plus rust-src / +# rustc-dev / llvm-tools — too heavy and version-fragile for hosted CI. +# * The GPU/CPU parity tests in `shaders/cube-memory-host/tests/parity.rs` +# require a Vulkan adapter and the prebuilt `.spv`; GitHub runners have +# no GPU. +# * The Python experiment scripts need PyTorch and local model +# checkpoints, so they are linted but not executed. + +jobs: + rust: + name: Rust host crate (CPU path) + runs-on: ubuntu-latest + defaults: + run: + working-directory: shaders + steps: + - uses: actions/checkout@v4 + + # Use stable explicitly; the rust-toolchain.toml pins a nightly for + # the rust-gpu crates, but the host crate is plain stable Rust. 
+ - name: Install stable toolchain + run: | + rustup toolchain install stable --component clippy rustfmt + rustup override set stable + + - name: Cache cargo + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + shaders/target + key: cargo-${{ runner.os }}-${{ hashFiles('shaders/**/Cargo.toml') }} + + - name: Build host crate + run: cargo build -p cube-memory-host + + - name: Clippy (deny warnings) + run: cargo clippy -p cube-memory-host --all-targets -- -D warnings + + - name: CPU reference unit tests + run: cargo test -p cube-memory-host --lib + + python: + name: Python lint + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install ruff + run: pipx install ruff + + - name: Syntax check (compile all) + run: python -m compileall -q phase0 phase1 rubik-gen *.py + + - name: Ruff lint + # Non-blocking for now: the experiment scripts predate this lint + # config, so surface issues without failing the build. + run: ruff check . || true diff --git a/README.md b/README.md index 04e432d..bbd04a1 100644 --- a/README.md +++ b/README.md @@ -70,14 +70,28 @@ will need editing for your environment. ```bash cd shaders -cargo test # runs the FHRR algebra kernels on CPU (parity test) + +# CPU reference unit tests — pure Rust, run on the stable toolchain, +# no GPU or prebuilt SPIR-V needed. These pin the FHRR/Cube-Memory +# reference algebra that the GPU parity tests compare against. +cargo test -p cube-memory-host --lib + +# Full GPU/CPU parity tests — require a Vulkan adapter AND the shader +# binary built first via the rust-gpu nightly toolchain: +cargo run -p cube-memory-shader-builder --release +cargo test -p cube-memory-host --release # runs tests/parity.rs ``` -See `shaders/README.md` for the rust-gpu toolchain details. +See `shaders/README.md` for the rust-gpu toolchain details (the pinned +nightly and components needed to build the SPIR-V). 
## Limitations -- This is research/experiment code, not a library — no stable API, no packaging, no CI. +- This is research/experiment code, not a library — no stable API and no packaging. + CI (`.github/workflows/ci.yml`) covers only the deterministic, hardware-free + surface: the Rust host crate's CPU reference path (build + clippy + unit tests) + and a Python syntax check / ruff lint. The rust-gpu shader build and the GPU + parity tests are out of CI scope (nightly toolchain + Vulkan adapter required). - Hardware-specific: numbers were measured on an AMD Radeon 890M (gfx1150) Vulkan build of llama.cpp; the bandwidth and t/s figures are local measurements, not general benchmarks. - The headline conclusion is negative; the layer does not match a linear baseline. diff --git a/shaders/cube-memory-host/src/cpu.rs b/shaders/cube-memory-host/src/cpu.rs index 15f36fe..582730f 100644 --- a/shaders/cube-memory-host/src/cpu.rs +++ b/shaders/cube-memory-host/src/cpu.rs @@ -104,13 +104,13 @@ pub fn cube_memory_retrieve( // Compute all sims. let mut sims = vec![0.0_f32; n_slots]; - for j in 0..n_slots { + for (j, sim) in sims.iter_mut().enumerate() { let row = j * d_key; let mut s = 0.0_f32; for i in 0..d_key { s += query[i] * slot_keys[row + i]; } - sims[j] = s; + *sim = s; } // Top-k by descending sim. NOTE on tie-break semantics: CPU uses @@ -149,3 +149,187 @@ pub fn cube_memory_retrieve( } out } + +#[cfg(test)] +mod tests { + //! Unit tests for the CPU reference primitives. + //! + //! These do not touch the GPU (unlike `tests/parity.rs`) — they pin + //! the algebraic behaviour of the reference path itself, so a future + //! refactor of `cpu.rs` cannot silently change the ground truth the + //! parity tests compare against. They build and run on stable Rust. + + use super::*; + + /// Deterministic unit-modulus phasors for reproducible assertions. 
+ fn phasors(n: usize, seed: u32) -> Vec<Vec2> { + let mut state = seed.wrapping_mul(2654435761).wrapping_add(1); + (0..n) + .map(|_| { + state = state.wrapping_mul(1664525).wrapping_add(1013904223); + let phase = + (state as f32 / u32::MAX as f32) * std::f32::consts::TAU - std::f32::consts::PI; + Vec2::new(phase.cos(), phase.sin()) + }) + .collect() + } + + fn assert_vec2_close(a: &[Vec2], b: &[Vec2], eps: f32) { + assert_eq!(a.len(), b.len()); + for (i, (x, y)) in a.iter().zip(b.iter()).enumerate() { + assert!( + (x.x - y.x).abs() < eps && (x.y - y.y).abs() < eps, + "mismatch at {i}: {x:?} vs {y:?}" + ); + } + } + + #[test] + fn cmul_matches_complex_multiply() { + // (1+2i)(3+4i) = (3-8) + (4+6)i = -5 + 10i + let r = cmul(Vec2::new(1.0, 2.0), Vec2::new(3.0, 4.0)); + assert!((r.x - (-5.0)).abs() < 1e-6); + assert!((r.y - 10.0).abs() < 1e-6); + } + + #[test] + fn cconj_negates_imaginary() { + let r = cconj(Vec2::new(3.0, -4.0)); + assert_eq!(r, Vec2::new(3.0, 4.0)); + } + + #[test] + fn bind_then_unbind_is_identity_for_unit_phasors() { + // For unit-modulus phasors, unbind(bind(z, k), k) == z because + // k * conj(k) == |k|^2 == 1. This is the core FHRR property the + // whole "rotate to face θ, read the snapshot" idea rests on. + let z = phasors(64, 7); + let k = phasors(64, 11); + let bound = fhrr_bind(&z, &k); + let recovered = fhrr_unbind(&bound, &k); + assert_vec2_close(&z, &recovered, 1e-5); + } + + #[test] + fn bind_is_commutative() { + let a = phasors(32, 1); + let b = phasors(32, 2); + assert_vec2_close(&fhrr_bind(&a, &b), &fhrr_bind(&b, &a), 1e-6); + } + + #[test] + fn unitize_produces_unit_modulus() { + let v = vec![ + Vec2::new(3.0, 4.0), + Vec2::new(-6.0, 8.0), + Vec2::new(0.0, 0.0), + ]; + let u = fhrr_unitize(&v); + // First two have modulus 5 and 10 -> normalize to 1. + assert!((u[0].length() - 1.0).abs() < 1e-5); + assert!((u[1].length() - 1.0).abs() < 1e-5); + // Zero vector: eps floor keeps the magnitude tiny, not NaN.
+ assert!(u[2].x.is_finite() && u[2].y.is_finite()); + } + + #[test] + fn superpose_outputs_unit_modulus() { + let n = 16; + let k = 4; + let input: Vec<Vec2> = (0..k).flat_map(|i| phasors(n, 100 + i as u32)).collect(); + let out = fhrr_superpose(&input, n, k); + assert_eq!(out.len(), n); + for z in &out { + assert!((z.length() - 1.0).abs() < 1e-4); + } + } + + #[test] + fn cleanup_returns_exact_codebook_entry_for_self_query() { + // Querying with a codebook entry must snap back to that same + // entry (cosine self-similarity is maximal). + let m = 8; + let d = 16; + let codebook: Vec<Vec2> = (0..m).flat_map(|i| phasors(d, 200 + i as u32)).collect(); + let target = 5usize; + let query = codebook[target * d..(target + 1) * d].to_vec(); + let snapped = cube_memory_cleanup(&query, &codebook, m, d); + assert_vec2_close(&snapped, &query, 1e-6); + } + + #[test] + fn retrieve_softmax_weights_sum_to_one() { + // With a single dominant slot, retrieve should approach that + // slot's value vector; more generally the weighted gather is a + // convex combination of the top-k value rows, so every output + // coordinate lies within the min/max of the gathered values. + let n_slots = 8; + let d_key = 4; + let d_value = 3; + let top_k = 4; + let query = vec![1.0, 0.0, 0.0, 0.0]; + // Slot 0 keyed to align perfectly with the query. + let mut slot_keys = vec![0.0_f32; n_slots * d_key]; + for j in 0..n_slots { + slot_keys[j * d_key] = j as f32 / n_slots as f32; + } + slot_keys[0] = 10.0; // dominant + let slot_values: Vec<f32> = (0..n_slots * d_value).map(|x| x as f32).collect(); + let out = cube_memory_retrieve( + &query, + &slot_keys, + &slot_values, + n_slots, + d_key, + d_value, + top_k, + ); + assert_eq!(out.len(), d_value); + // Dominant slot 0 -> output should be close to slot 0's value row.
+ for i in 0..d_value { + assert!( + (out[i] - slot_values[i]).abs() < 1e-2, + "coord {i}: {} vs {}", + out[i], + slot_values[i] + ); + } + } + + #[test] + fn retrieve_is_convex_combination() { + // Each output coordinate must lie within [min, max] of the + // corresponding coordinate across all slot value rows, since the + // softmax weights are non-negative and sum to one. + let n_slots = 6; + let d_key = 3; + let d_value = 2; + let top_k = 3; + let query = vec![0.3, -0.7, 0.5]; + let slot_keys: Vec<f32> = (0..n_slots * d_key) + .map(|x| ((x as f32) * 0.137).sin()) + .collect(); + let slot_values: Vec<f32> = (0..n_slots * d_value) + .map(|x| ((x as f32) * 0.91).cos() * 4.0) + .collect(); + let out = cube_memory_retrieve( + &query, + &slot_keys, + &slot_values, + n_slots, + d_key, + d_value, + top_k, + ); + for c in 0..d_value { + let col: Vec<f32> = (0..n_slots).map(|j| slot_values[j * d_value + c]).collect(); + let lo = col.iter().cloned().fold(f32::INFINITY, f32::min); + let hi = col.iter().cloned().fold(f32::NEG_INFINITY, f32::max); + assert!( + out[c] >= lo - 1e-4 && out[c] <= hi + 1e-4, + "coord {c}: {} not in [{lo}, {hi}]", + out[c] + ); + } + } +} diff --git a/shaders/cube-memory-host/src/gpu.rs b/shaders/cube-memory-host/src/gpu.rs index 92c91fa..cb2da6a 100644 --- a/shaders/cube-memory-host/src/gpu.rs +++ b/shaders/cube-memory-host/src/gpu.rs @@ -34,10 +34,12 @@ impl GpuCtx { compatible_surface: None, })) .expect("no wgpu Vulkan adapter found"); - let mut limits = wgpu::Limits::default(); // Each kernel uses a small push-constant block (12 bytes max // across our six entry points). 32 is a safe ceiling. - limits.max_push_constant_size = 32; + let limits = wgpu::Limits { + max_push_constant_size: 32, + ..Default::default() + }; let (device, queue) = pollster::block_on(adapter.request_device( &wgpu::DeviceDescriptor { label: Some("cube-memory-host"), @@ -206,6 +208,10 @@ impl GpuCtx { /// B's scratch is bound at `scratch_binding_b` (read-only).
Pass /// B's output is the last binding. Both passes share a single /// push-constant struct. + // Many positional args by design: this is a test-only harness for + // two-pass kernels, and bundling them into a config struct would add + // boilerplate without improving the call sites in `tests/parity.rs`. + #[allow(clippy::too_many_arguments)] pub fn run_pair( &self, entry_a: &str, @@ -273,7 +279,7 @@ impl GpuCtx { let pl_a = self.device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { label: Some("cube-mem pl a"), bind_group_layouts: &[&bgl_a], - push_constant_ranges: &[pcr.clone()], + push_constant_ranges: std::slice::from_ref(&pcr), }); let pl_b = self.device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { label: Some("cube-mem pl b"), diff --git a/shaders/cube-memory-host/tests/parity.rs b/shaders/cube-memory-host/tests/parity.rs index 86a70a7..baac993 100644 --- a/shaders/cube-memory-host/tests/parity.rs +++ b/shaders/cube-memory-host/tests/parity.rs @@ -105,7 +105,7 @@ fn fhrr_bind_parity() { let ctx = GpuCtx::new(&shader_path()); let push = FhrrBindPushConsts { n: n as u32 }; - let groups = ((n as u32 + 63) / 64, 1, 1); + let groups = ((n as u32).div_ceil(64), 1, 1); let gpu_out: Vec = ctx.run( "fhrr_bind", push, @@ -127,7 +127,7 @@ fn fhrr_unbind_parity() { let ctx = GpuCtx::new(&shader_path()); let push = FhrrBindPushConsts { n: n as u32 }; - let groups = ((n as u32 + 63) / 64, 1, 1); + let groups = ((n as u32).div_ceil(64), 1, 1); let gpu_out: Vec = ctx.run( "fhrr_unbind", push, @@ -152,7 +152,7 @@ fn fhrr_unitize_parity() { let ctx = GpuCtx::new(&shader_path()); let push = FhrrBindPushConsts { n: n as u32 }; - let groups = ((n as u32 + 63) / 64, 1, 1); + let groups = ((n as u32).div_ceil(64), 1, 1); let gpu_out: Vec = ctx.run( "fhrr_unitize", push, @@ -179,7 +179,7 @@ fn fhrr_superpose_parity() { n: n as u32, k: k as u32, }; - let groups = ((n as u32 + 63) / 64, 1, 1); + let groups = ((n as u32).div_ceil(64), 1, 1); let gpu_out: Vec = 
ctx.run( "fhrr_superpose", push, From a9b686a26edd23cbd25efaa225f03a9aa9405194 Mon Sep 17 00:00:00 2001 From: Peter Clemente III Date: Fri, 29 May 2026 14:12:34 -0400 Subject: [PATCH 2/2] ci: fix rustup component syntax (rustfmt parsed as toolchain) Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ae7ea06..37dafe1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -37,7 +37,7 @@ jobs: # the rust-gpu crates, but the host crate is plain stable Rust. - name: Install stable toolchain run: | - rustup toolchain install stable --component clippy rustfmt + rustup toolchain install stable --component clippy --component rustfmt rustup override set stable - name: Cache cargo