From a902cb59981b00b29a72d418489ab59c415dd8cb Mon Sep 17 00:00:00 2001
From: Peter Clemente III <peterc3@live.com>
Date: Fri, 29 May 2026 13:14:38 -0400
Subject: [PATCH 1/2] wave2-polish: CPU unit tests, clippy fixes, CI for host
 crate

Conservative polish pass on the Rust host crate and project hygiene.
No experiment logic, algorithms, or reported numbers were changed.

- Add 9 CPU-only unit tests for the FHRR / Cube-Memory reference
  primitives in cube-memory-host (cpu.rs): complex multiply/conjugate,
  bind/unbind round-trip identity, bind commutativity, unitize unit
  modulus, superpose unit modulus, cleanup self-query exactness,
  retrieve dominant-slot recovery, and retrieve softmax convexity.
  These run on stable Rust with no GPU and pin the ground truth the
  GPU parity tests compare against.
- Fix all clippy warnings in cube-memory-host (lib + tests):
  field_reassign_with_default, cloned_ref_to_slice_refs, manual
  div_ceil in parity.rs, and an index-loop rewrite in retrieve(); add a
  scoped allow for the intentional many-arg test harness run_pair().
  Behaviour is unchanged.
- Add .github/workflows/ci.yml: a Rust job (build + clippy -D warnings
  + cpu unit tests on stable) and a Python lint job (compileall + ruff).
  The rust-gpu nightly shader build and the GPU parity tests are
  documented as out of scope (no GPU / fragile nightly on hosted CI).
- README: correct the shaders run instructions (CPU --lib tests vs
  GPU parity tests) and the CI line under Limitations.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/ci.yml                 |  80 ++++++++++
 README.md                                |  20 ++-
 shaders/cube-memory-host/src/cpu.rs      | 188 ++++++++++++++++++++++-
 shaders/cube-memory-host/src/gpu.rs      |  12 +-
 shaders/cube-memory-host/tests/parity.rs |   8 +-
 5 files changed, 296 insertions(+), 12 deletions(-)
 create mode 100644 .github/workflows/ci.yml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..ae7ea06
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,80 @@
+name: CI
+
+on:
+  push:
+    branches: [master]
+  pull_request:
+
+# Scope note: this repo is a research/experiment record (a documented
+# negative result). CI covers only what runs deterministically without
+# special hardware or large downloads:
+#
+#   * The Rust `cube-memory-host` crate's CPU reference path: build,
+#     clippy, and the `cpu` unit tests. These run on STABLE Rust.
+#   * A Python syntax check + ruff lint over the experiment scripts.
+#
+# Deliberately NOT in CI (and why):
+#   * The rust-gpu shader build (`cube-memory-shader{,-builder}`) needs
+#     the pinned nightly in `shaders/rust-toolchain.toml` plus rust-src /
+#     rustc-dev / llvm-tools — too heavy and version-fragile for hosted CI.
+#   * The GPU/CPU parity tests in `shaders/cube-memory-host/tests/parity.rs`
+#     require a Vulkan adapter and the prebuilt `.spv`; GitHub runners have
+#     no GPU.
+#   * The Python experiment scripts need PyTorch and local model
+#     checkpoints, so they are linted but not executed.
+
+jobs:
+  rust:
+    name: Rust host crate (CPU path)
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: shaders
+    steps:
+      - uses: actions/checkout@v4
+
+      # Use stable explicitly; the rust-toolchain.toml pins a nightly for
+      # the rust-gpu crates, but the host crate is plain stable Rust.
+      - name: Install stable toolchain
+        run: |
+          rustup toolchain install stable --component clippy rustfmt
+          rustup override set stable
+
+      - name: Cache cargo
+        uses: actions/cache@v4 # key covers manifests and lockfiles under shaders/
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            shaders/target
+          key: cargo-${{ runner.os }}-${{ hashFiles('shaders/**/Cargo.toml', 'shaders/**/Cargo.lock') }}
+
+      - name: Build host crate
+        run: cargo build -p cube-memory-host
+
+      - name: Clippy (deny warnings)
+        run: cargo clippy -p cube-memory-host --all-targets -- -D warnings
+
+      - name: CPU reference unit tests
+        run: cargo test -p cube-memory-host --lib
+
+  python:
+    name: Python lint
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install ruff
+        run: pipx install ruff
+
+      - name: Syntax check (compile all)
+        run: python -m compileall -q phase0 phase1 rubik-gen *.py
+
+      - name: Ruff lint
+        # Non-blocking: scripts predate this lint config; a failure is recorded, the job passes.
+        continue-on-error: true
+        run: ruff check .
diff --git a/README.md b/README.md
index 04e432d..bbd04a1 100644
--- a/README.md
+++ b/README.md
@@ -70,14 +70,28 @@ will need editing for your environment.
 
 ```bash
 cd shaders
-cargo test            # runs the FHRR algebra kernels on CPU (parity test)
+
+# CPU reference unit tests — pure Rust, run on the stable toolchain,
+# no GPU or prebuilt SPIR-V needed. These pin the FHRR/Cube-Memory
+# reference algebra that the GPU parity tests compare against.
+cargo test -p cube-memory-host --lib
+
+# Full GPU/CPU parity tests — require a Vulkan adapter AND the shader
+# binary built first via the rust-gpu nightly toolchain:
+cargo run  -p cube-memory-shader-builder --release
+cargo test -p cube-memory-host --release   # unit tests + tests/parity.rs
 ```
 
-See `shaders/README.md` for the rust-gpu toolchain details.
+See `shaders/README.md` for the rust-gpu toolchain details (the pinned
+nightly and components needed to build the SPIR-V).
 
 ## Limitations
 
-- This is research/experiment code, not a library — no stable API, no packaging, no CI.
+- This is research/experiment code, not a library — no stable API and no packaging.
+  CI (`.github/workflows/ci.yml`) covers only the deterministic, hardware-free
+  surface: the Rust host crate's CPU reference path (build + clippy + unit tests)
+  and a Python syntax check / ruff lint. The rust-gpu shader build and the GPU
+  parity tests are out of CI scope (nightly toolchain + Vulkan adapter required).
 - Hardware-specific: numbers were measured on an AMD Radeon 890M (gfx1150) Vulkan build of
   llama.cpp; the bandwidth and t/s figures are local measurements, not general benchmarks.
 - The headline conclusion is negative; the layer does not match a linear baseline.
diff --git a/shaders/cube-memory-host/src/cpu.rs b/shaders/cube-memory-host/src/cpu.rs
index 15f36fe..582730f 100644
--- a/shaders/cube-memory-host/src/cpu.rs
+++ b/shaders/cube-memory-host/src/cpu.rs
@@ -104,13 +104,13 @@ pub fn cube_memory_retrieve(
 
     // Compute all sims.
     let mut sims = vec![0.0_f32; n_slots];
-    for j in 0..n_slots {
+    for (j, sim) in sims.iter_mut().enumerate() {
         let row = j * d_key;
         let mut s = 0.0_f32;
         for i in 0..d_key {
             s += query[i] * slot_keys[row + i];
         }
-        sims[j] = s;
+        *sim = s;
     }
 
     // Top-k by descending sim. NOTE on tie-break semantics: CPU uses
@@ -149,3 +149,187 @@ pub fn cube_memory_retrieve(
     }
     out
 }
+
+#[cfg(test)]
+mod tests {
+    //! Unit tests for the CPU reference primitives.
+    //!
+    //! These do not touch the GPU (unlike `tests/parity.rs`) — they pin
+    //! the algebraic behaviour of the reference path itself, so a future
+    //! refactor of `cpu.rs` cannot silently change the ground truth the
+    //! parity tests compare against. They build and run on stable Rust.
+
+    use super::*;
+
+    /// Deterministic unit-modulus phasors for reproducible assertions.
+    fn phasors(n: usize, seed: u32) -> Vec<Vec2> {
+        let mut state = seed.wrapping_mul(2654435761).wrapping_add(1);
+        (0..n)
+            .map(|_| {
+                state = state.wrapping_mul(1664525).wrapping_add(1013904223);
+                let phase =
+                    (state as f32 / u32::MAX as f32) * std::f32::consts::TAU - std::f32::consts::PI;
+                Vec2::new(phase.cos(), phase.sin())
+            })
+            .collect()
+    }
+
+    fn assert_vec2_close(a: &[Vec2], b: &[Vec2], eps: f32) {
+        assert_eq!(a.len(), b.len());
+        for (i, (x, y)) in a.iter().zip(b.iter()).enumerate() {
+            assert!(
+                (x.x - y.x).abs() < eps && (x.y - y.y).abs() < eps,
+                "mismatch at {i}: {x:?} vs {y:?}"
+            );
+        }
+    }
+
+    #[test]
+    fn cmul_matches_complex_multiply() {
+        // (1+2i)(3+4i) = (3-8) + (4+6)i = -5 + 10i
+        let r = cmul(Vec2::new(1.0, 2.0), Vec2::new(3.0, 4.0));
+        assert!((r.x - (-5.0)).abs() < 1e-6);
+        assert!((r.y - 10.0).abs() < 1e-6);
+    }
+
+    #[test]
+    fn cconj_negates_imaginary() {
+        let r = cconj(Vec2::new(3.0, -4.0));
+        assert_eq!(r, Vec2::new(3.0, 4.0));
+    }
+
+    #[test]
+    fn bind_then_unbind_is_identity_for_unit_phasors() {
+        // For unit-modulus phasors, unbind(bind(z, k), k) == z because
+        // k * conj(k) == |k|^2 == 1. This is the core FHRR property the
+        // whole "rotate to face θ, read the snapshot" idea rests on.
+        let z = phasors(64, 7);
+        let k = phasors(64, 11);
+        let bound = fhrr_bind(&z, &k);
+        let recovered = fhrr_unbind(&bound, &k);
+        assert_vec2_close(&z, &recovered, 1e-5);
+    }
+
+    #[test]
+    fn bind_is_commutative() {
+        let a = phasors(32, 1);
+        let b = phasors(32, 2);
+        assert_vec2_close(&fhrr_bind(&a, &b), &fhrr_bind(&b, &a), 1e-6);
+    }
+
+    #[test]
+    fn unitize_produces_unit_modulus() {
+        let v = vec![
+            Vec2::new(3.0, 4.0),
+            Vec2::new(-6.0, 8.0),
+            Vec2::new(0.0, 0.0),
+        ];
+        let u = fhrr_unitize(&v);
+        // First two have modulus 5 and 10 -> normalize to 1.
+        assert!((u[0].length() - 1.0).abs() < 1e-5);
+        assert!((u[1].length() - 1.0).abs() < 1e-5);
+        // Zero vector: eps floor keeps the magnitude tiny, not NaN.
+        assert!(u[2].x.is_finite() && u[2].y.is_finite());
+    }
+
+    #[test]
+    fn superpose_outputs_unit_modulus() {
+        let n = 16;
+        let k = 4;
+        let input: Vec<Vec2> = (0..k).flat_map(|i| phasors(n, 100 + i as u32)).collect();
+        let out = fhrr_superpose(&input, n, k);
+        assert_eq!(out.len(), n);
+        for z in &out {
+            assert!((z.length() - 1.0).abs() < 1e-4);
+        }
+    }
+
+    #[test]
+    fn cleanup_returns_exact_codebook_entry_for_self_query() {
+        // Querying with a codebook entry must snap back to that same
+        // entry (cosine self-similarity is maximal).
+        let m = 8;
+        let d = 16;
+        let codebook: Vec<Vec2> = (0..m).flat_map(|i| phasors(d, 200 + i as u32)).collect();
+        let target = 5usize;
+        let query = codebook[target * d..(target + 1) * d].to_vec();
+        let snapped = cube_memory_cleanup(&query, &codebook, m, d);
+        assert_vec2_close(&snapped, &query, 1e-6);
+    }
+
+    #[test]
+    fn retrieve_softmax_weights_sum_to_one() {
+        // Indirect normalization check: slot 0's key dominates the
+        // similarities, so if the softmax weights over the top-k slots
+        // sum to one, the output must be ~slot 0's value row. The general
+        // convex-hull bound lives in `retrieve_is_convex_combination`.
+        let n_slots = 8;
+        let d_key = 4;
+        let d_value = 3;
+        let top_k = 4;
+        let query = vec![1.0, 0.0, 0.0, 0.0];
+        // Small query-axis key (j / n_slots) for every slot; slot 0 is boosted below.
+        let mut slot_keys = vec![0.0_f32; n_slots * d_key];
+        for j in 0..n_slots {
+            slot_keys[j * d_key] = j as f32 / n_slots as f32;
+        }
+        slot_keys[0] = 10.0; // dominant
+        let slot_values: Vec<f32> = (0..n_slots * d_value).map(|x| x as f32).collect();
+        let out = cube_memory_retrieve(
+            &query,
+            &slot_keys,
+            &slot_values,
+            n_slots,
+            d_key,
+            d_value,
+            top_k,
+        );
+        assert_eq!(out.len(), d_value);
+        // Dominant slot 0 -> output should be close to slot 0's value row.
+        for i in 0..d_value {
+            assert!(
+                (out[i] - slot_values[i]).abs() < 1e-2,
+                "coord {i}: {} vs {}",
+                out[i],
+                slot_values[i]
+            );
+        }
+    }
+
+    #[test]
+    fn retrieve_is_convex_combination() {
+        // Each output coordinate must lie within [min, max] of the
+        // corresponding coordinate across all slot value rows, since the
+        // softmax weights are non-negative and sum to one.
+        let n_slots = 6;
+        let d_key = 3;
+        let d_value = 2;
+        let top_k = 3;
+        let query = vec![0.3, -0.7, 0.5];
+        let slot_keys: Vec<f32> = (0..n_slots * d_key)
+            .map(|x| ((x as f32) * 0.137).sin())
+            .collect();
+        let slot_values: Vec<f32> = (0..n_slots * d_value)
+            .map(|x| ((x as f32) * 0.91).cos() * 4.0)
+            .collect();
+        let out = cube_memory_retrieve(
+            &query,
+            &slot_keys,
+            &slot_values,
+            n_slots,
+            d_key,
+            d_value,
+            top_k,
+        );
+        for c in 0..d_value {
+            let col: Vec<f32> = (0..n_slots).map(|j| slot_values[j * d_value + c]).collect();
+            let lo = col.iter().cloned().fold(f32::INFINITY, f32::min);
+            let hi = col.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
+            assert!(
+                out[c] >= lo - 1e-4 && out[c] <= hi + 1e-4,
+                "coord {c}: {} not in [{lo}, {hi}]",
+                out[c]
+            );
+        }
+    }
+}
diff --git a/shaders/cube-memory-host/src/gpu.rs b/shaders/cube-memory-host/src/gpu.rs
index 92c91fa..cb2da6a 100644
--- a/shaders/cube-memory-host/src/gpu.rs
+++ b/shaders/cube-memory-host/src/gpu.rs
@@ -34,10 +34,12 @@ impl GpuCtx {
             compatible_surface: None,
         }))
         .expect("no wgpu Vulkan adapter found");
-        let mut limits = wgpu::Limits::default();
         // Each kernel uses a small push-constant block (12 bytes max
         // across our six entry points). 32 is a safe ceiling.
-        limits.max_push_constant_size = 32;
+        let limits = wgpu::Limits {
+            max_push_constant_size: 32,
+            ..Default::default()
+        };
         let (device, queue) = pollster::block_on(adapter.request_device(
             &wgpu::DeviceDescriptor {
                 label: Some("cube-memory-host"),
@@ -206,6 +208,10 @@ impl GpuCtx {
     /// B's scratch is bound at `scratch_binding_b` (read-only). Pass
     /// B's output is the last binding. Both passes share a single
     /// push-constant struct.
+    // Many positional args by design: this is a test-only harness for
+    // two-pass kernels, and bundling them into a config struct would add
+    // boilerplate without improving the call sites in `tests/parity.rs`.
+    #[allow(clippy::too_many_arguments)]
     pub fn run_pair<P, T>(
         &self,
         entry_a: &str,
@@ -273,7 +279,7 @@ impl GpuCtx {
         let pl_a = self.device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
             label: Some("cube-mem pl a"),
             bind_group_layouts: &[&bgl_a],
-            push_constant_ranges: &[pcr.clone()],
+            push_constant_ranges: std::slice::from_ref(&pcr),
         });
         let pl_b = self.device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
             label: Some("cube-mem pl b"),
diff --git a/shaders/cube-memory-host/tests/parity.rs b/shaders/cube-memory-host/tests/parity.rs
index 86a70a7..baac993 100644
--- a/shaders/cube-memory-host/tests/parity.rs
+++ b/shaders/cube-memory-host/tests/parity.rs
@@ -105,7 +105,7 @@ fn fhrr_bind_parity() {
 
     let ctx = GpuCtx::new(&shader_path());
     let push = FhrrBindPushConsts { n: n as u32 };
-    let groups = ((n as u32 + 63) / 64, 1, 1);
+    let groups = ((n as u32).div_ceil(64), 1, 1);
     let gpu_out: Vec<Vec2> = ctx.run(
         "fhrr_bind",
         push,
@@ -127,7 +127,7 @@ fn fhrr_unbind_parity() {
 
     let ctx = GpuCtx::new(&shader_path());
     let push = FhrrBindPushConsts { n: n as u32 };
-    let groups = ((n as u32 + 63) / 64, 1, 1);
+    let groups = ((n as u32).div_ceil(64), 1, 1);
     let gpu_out: Vec<Vec2> = ctx.run(
         "fhrr_unbind",
         push,
@@ -152,7 +152,7 @@ fn fhrr_unitize_parity() {
 
     let ctx = GpuCtx::new(&shader_path());
     let push = FhrrBindPushConsts { n: n as u32 };
-    let groups = ((n as u32 + 63) / 64, 1, 1);
+    let groups = ((n as u32).div_ceil(64), 1, 1);
     let gpu_out: Vec<Vec2> = ctx.run(
         "fhrr_unitize",
         push,
@@ -179,7 +179,7 @@ fn fhrr_superpose_parity() {
         n: n as u32,
         k: k as u32,
     };
-    let groups = ((n as u32 + 63) / 64, 1, 1);
+    let groups = ((n as u32).div_ceil(64), 1, 1);
     let gpu_out: Vec<Vec2> = ctx.run(
         "fhrr_superpose",
         push,

From a9b686a26edd23cbd25efaa225f03a9aa9405194 Mon Sep 17 00:00:00 2001
From: Peter Clemente III <peterc3@live.com>
Date: Fri, 29 May 2026 14:12:34 -0400
Subject: [PATCH 2/2] ci: fix rustup component syntax (rustfmt parsed as
 toolchain)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ae7ea06..37dafe1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -37,7 +37,7 @@ jobs:
       # the rust-gpu crates, but the host crate is plain stable Rust.
       - name: Install stable toolchain
         run: |
-          rustup toolchain install stable --component clippy rustfmt
+          rustup toolchain install stable --component clippy --component rustfmt
           rustup override set stable
 
       - name: Cache cargo