From 8786760ccd72b4ce055b7cb5f63885a3ef6fd3be Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 01:00:48 +0100
Subject: [PATCH] Codex-driven sketch of what it would take to put x86 CPU
 levels behind feature flags

---
 fearless_simd/Cargo.toml                  |   7 +-
 fearless_simd/README.md                   |   2 +
 fearless_simd/src/generated.rs            |   8 +-
 fearless_simd/src/generated/avx2.rs       | 204 ++++++++++++++++++++++
 fearless_simd/src/generated/simd_trait.rs |  14 +-
 fearless_simd/src/generated/sse4_2.rs     |   2 +
 fearless_simd/src/kernel_macros.rs        |  78 +++++++--
 fearless_simd/src/lib.rs                  | 137 ++++++++++++---
 fearless_simd/src/macros.rs               |  79 +++++++--
 fearless_simd_gen/src/level.rs            |  24 ++-
 fearless_simd_gen/src/mk_x86.rs           |  19 ++
 fearless_simd_gen/src/ops.rs              |   2 +-
 12 files changed, 501 insertions(+), 75 deletions(-)

diff --git a/fearless_simd/Cargo.toml b/fearless_simd/Cargo.toml
index c04827177..cc045dae6 100644
--- a/fearless_simd/Cargo.toml
+++ b/fearless_simd/Cargo.toml
@@ -26,13 +26,18 @@ rustdoc-args = [
 
 
 [features]
-default = ["std"]
+default = ["std", "sse4_2", "avx2"]
 # Get floating point functions from the standard library (likely using your targets libc).
 # Also allows using `Level::new` on all platforms, to detect which target features are enabled
 std = []
 # Use floating point implementations from libm
 libm = ["dep:libm"]
 
+# Enable the SSE4.2/x86-64-v2 runtime SIMD level on x86 and x86_64.
+sse4_2 = []
+# Enable the AVX2/x86-64-v3 runtime SIMD level on x86 and x86_64.
+avx2 = []
+
 # Force the "fallback" SIMD level to be supported
 # This is primarily used for tests
 force_support_fallback = []
diff --git a/fearless_simd/README.md b/fearless_simd/README.md
index 22da184a3..d1c1178a4 100644
--- a/fearless_simd/README.md
+++ b/fearless_simd/README.md
@@ -158,6 +158,8 @@ The following crate [feature flags](https://doc.rust-lang.org/cargo/reference/fe
 - `std` (enabled by default): Get floating point functions from the standard library (likely using your target's libc).
   Also allows using [`Level::new`] on all platforms, to detect which target features are enabled.
 - `libm`: Use floating point implementations from [libm].
+- `sse4_2` (enabled by default): Enable the SSE4.2/x86-64-v2 runtime SIMD level on `x86` and `x86_64`.
+- `avx2` (enabled by default): Enable the AVX2/x86-64-v3 runtime SIMD level on `x86` and `x86_64`.
 - `force_support_fallback`: Force scalar fallback, to be supported, even if your compilation target has a better baseline.
 
 At least one of `std` and `libm` is required; `std` overrides `libm`.
diff --git a/fearless_simd/src/generated.rs b/fearless_simd/src/generated.rs
index 9d342539a..0e110589b 100644
--- a/fearless_simd/src/generated.rs
+++ b/fearless_simd/src/generated.rs
@@ -44,7 +44,7 @@
 //!
 //! All files in this subdirectory are autogenerated by the `fearless_simd_gen` crate.
 
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[cfg(all(feature = "avx2", any(target_arch = "x86", target_arch = "x86_64")))]
 mod avx2;
 mod fallback;
 #[cfg(target_arch = "aarch64")]
@@ -52,19 +52,19 @@ mod neon;
 mod ops;
 pub(crate) mod simd_trait;
 mod simd_types;
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[cfg(all(feature = "sse4_2", any(target_arch = "x86", target_arch = "x86_64")))]
 mod sse4_2;
 #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
 mod wasm;
 
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[cfg(all(feature = "avx2", any(target_arch = "x86", target_arch = "x86_64")))]
 pub use avx2::*;
 pub use fallback::*;
 #[cfg(target_arch = "aarch64")]
 pub use neon::*;
 pub use simd_trait::*;
 pub use simd_types::*;
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[cfg(all(feature = "sse4_2", any(target_arch = "x86", target_arch = "x86_64")))]
 pub use sse4_2::*;
 #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
 pub use wasm::*;
diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
index 0a71b8de3..ccb27277a 100644
--- a/fearless_simd/src/generated/avx2.rs
+++ b/fearless_simd/src/generated/avx2.rs
@@ -8038,6 +8038,210 @@ impl Simd for Avx2 {
         )
     }
 }
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128, S> for f32x4<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<f32x4<S>> for __m128 {
+    #[inline(always)]
+    fn from(value: f32x4<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128i, S> for i8x16<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<i8x16<S>> for __m128i {
+    #[inline(always)]
+    fn from(value: i8x16<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128i, S> for u8x16<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<u8x16<S>> for __m128i {
+    #[inline(always)]
+    fn from(value: u8x16<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128i, S> for mask8x16<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<mask8x16<S>> for __m128i {
+    #[inline(always)]
+    fn from(value: mask8x16<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128i, S> for i16x8<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<i16x8<S>> for __m128i {
+    #[inline(always)]
+    fn from(value: i16x8<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128i, S> for u16x8<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<u16x8<S>> for __m128i {
+    #[inline(always)]
+    fn from(value: u16x8<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128i, S> for mask16x8<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<mask16x8<S>> for __m128i {
+    #[inline(always)]
+    fn from(value: mask16x8<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128i, S> for i32x4<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<i32x4<S>> for __m128i {
+    #[inline(always)]
+    fn from(value: i32x4<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128i, S> for u32x4<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<u32x4<S>> for __m128i {
+    #[inline(always)]
+    fn from(value: u32x4<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128i, S> for mask32x4<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<mask32x4<S>> for __m128i {
+    #[inline(always)]
+    fn from(value: mask32x4<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128d, S> for f64x2<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128d) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<f64x2<S>> for __m128d {
+    #[inline(always)]
+    fn from(value: f64x2<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> SimdFrom<__m128i, S> for mask64x2<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128i) -> Self {
+        Self {
+            val: unsafe { core::mem::transmute_copy(&arch) },
+            simd,
+        }
+    }
+}
+#[cfg(not(feature = "sse4_2"))]
+impl<S: Simd> From<mask64x2<S>> for __m128i {
+    #[inline(always)]
+    fn from(value: mask64x2<S>) -> Self {
+        unsafe { core::mem::transmute_copy(&value.val) }
+    }
+}
 impl<S: Simd> SimdFrom<__m256, S> for f32x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256) -> Self {
diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs
index 90e5894fe..8f47a027e 100644
--- a/fearless_simd/src/generated/simd_trait.rs
+++ b/fearless_simd/src/generated/simd_trait.rs
@@ -150,7 +150,7 @@ pub trait Simd:
     fn neg_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self>;
@@ -857,7 +857,7 @@ pub trait Simd:
     fn neg_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self>;
@@ -984,7 +984,7 @@ pub trait Simd:
     fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self>;
@@ -1713,7 +1713,7 @@ pub trait Simd:
     fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self>;
@@ -1844,7 +1844,7 @@ pub trait Simd:
     fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self>;
@@ -2567,7 +2567,7 @@ pub trait Simd:
     fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self>;
@@ -2813,7 +2813,7 @@ pub trait SimdFloat<S: Simd>:
     fn abs(self) -> Self;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt(self) -> Self;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip(self) -> Self;
     #[doc = "Return a vector with the magnitude of `self` and the sign of `rhs` for each element.\n\nThis operation copies the sign bit, so if an input element is NaN, the output element will be a NaN with the same payload and a copied sign bit."]
     fn copysign(self, rhs: impl SimdInto<Self, S>) -> Self;
diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs
index fb15d17b1..0aab08fc2 100644
--- a/fearless_simd/src/generated/sse4_2.rs
+++ b/fearless_simd/src/generated/sse4_2.rs
@@ -86,6 +86,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn level(self) -> Level {
         #[cfg(not(all(
+            feature = "avx2",
             target_feature = "avx2",
             target_feature = "bmi1",
             target_feature = "bmi2",
@@ -99,6 +100,7 @@ impl Simd for Sse4_2 {
         )))]
         return Level::Sse4_2(self);
         #[cfg(all(
+            feature = "avx2",
             target_feature = "avx2",
             target_feature = "bmi1",
             target_feature = "bmi2",
diff --git a/fearless_simd/src/kernel_macros.rs b/fearless_simd/src/kernel_macros.rs
index c713657b9..c8a990fdc 100644
--- a/fearless_simd/src/kernel_macros.rs
+++ b/fearless_simd/src/kernel_macros.rs
@@ -55,6 +55,7 @@
 ///
 /// The SIMD token type must be written as a bare supported name:
 /// literally `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`. No paths or aliases.
+/// The x86 token names are accepted only when their matching crate features are enabled.
 ///
 /// For soundness, this macro only accepts safe functions.
 ///
@@ -131,26 +132,14 @@ macro_rules! __fearless_simd_kernel_dispatch {
         Sse4_2,
         $($body:tt)*
     ) => {
-        $crate::__fearless_simd_kernel_impl! {
-            @cfg any(target_arch = "x86", target_arch = "x86_64");
-            @token_ty $crate::Sse4_2;
-            @kernel_attrs #[target_feature(enable = "sse4.2,cmpxchg16b,popcnt")];
-            $($body)*
-        }
+        $crate::__fearless_simd_kernel_dispatch_sse4_2! { $($body)* }
     };
 
     (
         Avx2,
         $($body:tt)*
     ) => {
-        $crate::__fearless_simd_kernel_impl! {
-            @cfg any(target_arch = "x86", target_arch = "x86_64");
-            @token_ty $crate::Avx2;
-            @kernel_attrs #[target_feature(
-                enable = "avx2,bmi1,bmi2,cmpxchg16b,f16c,fma,lzcnt,movbe,popcnt,xsave"
-            )];
-            $($body)*
-        }
+        $crate::__fearless_simd_kernel_dispatch_avx2! { $($body)* }
     };
 
     (
@@ -166,6 +155,56 @@ macro_rules! __fearless_simd_kernel_dispatch {
     };
 }
 
+#[doc(hidden)]
+#[macro_export]
+#[cfg(feature = "sse4_2")]
+macro_rules! __fearless_simd_kernel_dispatch_sse4_2 {
+    ($($body:tt)*) => {
+        $crate::__fearless_simd_kernel_impl! {
+            @cfg any(target_arch = "x86", target_arch = "x86_64");
+            @token_ty $crate::Sse4_2;
+            @kernel_attrs #[target_feature(enable = "sse4.2,cmpxchg16b,popcnt")];
+            $($body)*
+        }
+    };
+}
+
+#[doc(hidden)]
+#[macro_export]
+#[cfg(not(feature = "sse4_2"))]
+macro_rules! __fearless_simd_kernel_dispatch_sse4_2 {
+    ($($body:tt)*) => {
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        compile_error!("fearless_simd::kernel! token `Sse4_2` requires the `sse4_2` crate feature");
+    };
+}
+
+#[doc(hidden)]
+#[macro_export]
+#[cfg(feature = "avx2")]
+macro_rules! __fearless_simd_kernel_dispatch_avx2 {
+    ($($body:tt)*) => {
+        $crate::__fearless_simd_kernel_impl! {
+            @cfg any(target_arch = "x86", target_arch = "x86_64");
+            @token_ty $crate::Avx2;
+            @kernel_attrs #[target_feature(
+                enable = "avx2,bmi1,bmi2,cmpxchg16b,f16c,fma,lzcnt,movbe,popcnt,xsave"
+            )];
+            $($body)*
+        }
+    };
+}
+
+#[doc(hidden)]
+#[macro_export]
+#[cfg(not(feature = "avx2"))]
+macro_rules! __fearless_simd_kernel_dispatch_avx2 {
+    ($($body:tt)*) => {
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        compile_error!("fearless_simd::kernel! token `Avx2` requires the `avx2` crate feature");
+    };
+}
+
 #[doc(hidden)]
 #[macro_export]
 macro_rules! __fearless_simd_kernel_impl {
@@ -205,8 +244,8 @@ macro_rules! __fearless_simd_kernel_impl {
 mod tests {
     #[cfg(any(
         target_arch = "aarch64",
-        target_arch = "x86",
-        target_arch = "x86_64",
+        all(feature = "avx2", target_arch = "x86"),
+        all(feature = "avx2", target_arch = "x86_64"),
         all(target_arch = "wasm32", target_feature = "simd128")
     ))]
     use crate::prelude::*;
@@ -215,9 +254,9 @@ mod tests {
     use core::arch::aarch64::{float32x4_t, vaddq_f32};
     #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
     use core::arch::wasm32::{f32x4_add, v128};
-    #[cfg(target_arch = "x86")]
+    #[cfg(all(feature = "avx2", target_arch = "x86"))]
     use core::arch::x86::{__m256i, _mm256_add_epi32};
-    #[cfg(target_arch = "x86_64")]
+    #[cfg(all(feature = "avx2", target_arch = "x86_64"))]
     use core::arch::x86_64::{__m256i, _mm256_add_epi32};
 
     crate::kernel! {
@@ -232,6 +271,7 @@ mod tests {
         }
     }
 
+    #[cfg(all(feature = "avx2", any(target_arch = "x86", target_arch = "x86_64")))]
     crate::kernel! {
         fn add_i32x8_avx2(avx2: Avx2, a: __m256i, b: __m256i) -> __m256i {
             _mm256_add_epi32(a, b)
@@ -274,7 +314,7 @@ mod tests {
         );
     }
 
-    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    #[cfg(all(feature = "avx2", any(target_arch = "x86", target_arch = "x86_64")))]
     #[test]
     fn kernel_instantiates_for_avx2() {
         let Some(avx2) = crate::Level::new().as_avx2() else {
diff --git a/fearless_simd/src/lib.rs b/fearless_simd/src/lib.rs
index 84e91269e..da4153b65 100644
--- a/fearless_simd/src/lib.rs
+++ b/fearless_simd/src/lib.rs
@@ -120,6 +120,8 @@
 //! - `std` (enabled by default): Get floating point functions from the standard library (likely using your target's libc).
 //!   Also allows using [`Level::new`] on all platforms, to detect which target features are enabled.
 //! - `libm`: Use floating point implementations from [libm].
+//! - `sse4_2` (enabled by default): Enable the SSE4.2/x86-64-v2 runtime SIMD level on `x86` and `x86_64`.
+//! - `avx2` (enabled by default): Enable the AVX2/x86-64-v3 runtime SIMD level on `x86` and `x86_64`.
 //! - `force_support_fallback`: Force scalar fallback, to be supported, even if your compilation target has a better baseline.
 //!
 //! At least one of `std` and `libm` is required; `std` overrides `libm`.
@@ -179,9 +181,14 @@ pub mod wasm32 {
 }
 
 /// Implementations of [`Simd`] on x86 architectures (both 32 and 64 bit).
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[cfg(all(
+    any(target_arch = "x86", target_arch = "x86_64"),
+    any(feature = "sse4_2", feature = "avx2")
+))]
 pub mod x86 {
+    #[cfg(feature = "avx2")]
     pub use crate::generated::Avx2;
+    #[cfg(feature = "sse4_2")]
     pub use crate::generated::Sse4_2;
 }
 
@@ -203,10 +210,26 @@ pub enum Level {
         all(target_arch = "aarch64", not(target_feature = "neon")),
         all(
             any(target_arch = "x86", target_arch = "x86_64"),
-            not(all(
-                target_feature = "sse4.2",
-                target_feature = "cmpxchg16b",
-                target_feature = "popcnt"
+            not(any(
+                all(
+                    feature = "avx2",
+                    target_feature = "avx2",
+                    target_feature = "bmi1",
+                    target_feature = "bmi2",
+                    target_feature = "cmpxchg16b",
+                    target_feature = "f16c",
+                    target_feature = "fma",
+                    target_feature = "lzcnt",
+                    target_feature = "movbe",
+                    target_feature = "popcnt",
+                    target_feature = "xsave"
+                ),
+                all(
+                    feature = "sse4_2",
+                    target_feature = "sse4.2",
+                    target_feature = "cmpxchg16b",
+                    target_feature = "popcnt"
+                )
             ))
         ),
         all(target_arch = "wasm32", not(target_feature = "simd128")),
@@ -232,7 +255,9 @@ pub enum Level {
     // We don't need to support this if the compilation target definitely supports something better.
     #[cfg(all(
         any(target_arch = "x86", target_arch = "x86_64"),
+        feature = "sse4_2",
         not(all(
+            feature = "avx2",
             target_feature = "avx2",
             target_feature = "bmi1",
             target_feature = "bmi2",
@@ -247,7 +272,7 @@ pub enum Level {
     ))]
     Sse4_2(Sse4_2),
     /// The x86-64-v3 instruction set on (32 and 64 bit) x86, including AVX2 and FMA.
-    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    #[cfg(all(feature = "avx2", any(target_arch = "x86", target_arch = "x86_64")))]
     Avx2(Avx2),
     // If new variants are added, make sure to handle them in `Level::dispatch`
     // and `dispatch!()`
@@ -302,6 +327,7 @@ impl Level {
             // avx,fxsr,sse,sse2,sse3,sse4.1,sse4.2,ssse3
             // This can be verified by running:
             // rustc --print=cfg --target x86_64-unknown-linux-gnu -C target-feature='+avx2'
+            #[cfg(feature = "avx2")]
             if std::arch::is_x86_feature_detected!("avx2")
                 && std::arch::is_x86_feature_detected!("bmi1")
                 && std::arch::is_x86_feature_detected!("bmi2")
@@ -314,14 +340,15 @@ impl Level {
                 && std::arch::is_x86_feature_detected!("xsave")
             {
                 return unsafe { Self::Avx2(Avx2::new_unchecked()) };
+            }
+
             // All x86 CPUs that ever shipped with sse4.2 also have cmpxchg16b and popcnt:
             // Intel Nehalem, AMD Bulldozer and VIA Isaiah II were the first with SSE4.2
             // and have these extensions already.
-            } else if std::arch::is_x86_feature_detected!("sse4.2")
-                && std::arch::is_x86_feature_detected!("cmpxchg16b")
-                && std::arch::is_x86_feature_detected!("popcnt")
-            {
-                #[cfg(not(all(
+            #[cfg(all(
+                feature = "sse4_2",
+                not(all(
+                    feature = "avx2",
                     target_feature = "avx2",
                     target_feature = "bmi1",
                     target_feature = "bmi2",
@@ -332,7 +359,12 @@ impl Level {
                     target_feature = "movbe",
                     target_feature = "popcnt",
                     target_feature = "xsave"
-                )))]
+                ))
+            ))]
+            if std::arch::is_x86_feature_detected!("sse4.2")
+                && std::arch::is_x86_feature_detected!("cmpxchg16b")
+                && std::arch::is_x86_feature_detected!("popcnt")
+            {
                 return unsafe { Self::Sse4_2(Sse4_2::new_unchecked()) };
             }
         }
@@ -340,10 +372,26 @@ impl Level {
             all(target_arch = "aarch64", not(target_feature = "neon")),
             all(
                 any(target_arch = "x86", target_arch = "x86_64"),
-                not(all(
-                    target_feature = "sse4.2",
-                    target_feature = "cmpxchg16b",
-                    target_feature = "popcnt"
+                not(any(
+                    all(
+                        feature = "avx2",
+                        target_feature = "avx2",
+                        target_feature = "bmi1",
+                        target_feature = "bmi2",
+                        target_feature = "cmpxchg16b",
+                        target_feature = "f16c",
+                        target_feature = "fma",
+                        target_feature = "lzcnt",
+                        target_feature = "movbe",
+                        target_feature = "popcnt",
+                        target_feature = "xsave"
+                    ),
+                    all(
+                        feature = "sse4_2",
+                        target_feature = "sse4.2",
+                        target_feature = "cmpxchg16b",
+                        target_feature = "popcnt"
+                    )
                 ))
             ),
             all(target_arch = "wasm32", not(target_feature = "simd128")),
@@ -392,10 +440,26 @@ impl Level {
             all(target_arch = "aarch64", not(target_feature = "neon")),
             all(
                 any(target_arch = "x86", target_arch = "x86_64"),
-                not(all(
-                    target_feature = "sse4.2",
-                    target_feature = "cmpxchg16b",
-                    target_feature = "popcnt"
+                not(any(
+                    all(
+                        feature = "avx2",
+                        target_feature = "avx2",
+                        target_feature = "bmi1",
+                        target_feature = "bmi2",
+                        target_feature = "cmpxchg16b",
+                        target_feature = "f16c",
+                        target_feature = "fma",
+                        target_feature = "lzcnt",
+                        target_feature = "movbe",
+                        target_feature = "popcnt",
+                        target_feature = "xsave"
+                    ),
+                    all(
+                        feature = "sse4_2",
+                        target_feature = "sse4.2",
+                        target_feature = "cmpxchg16b",
+                        target_feature = "popcnt"
+                    )
                 ))
             ),
             all(target_arch = "wasm32", not(target_feature = "simd128")),
@@ -466,14 +530,16 @@ impl Level {
     ///
     /// This can be used in combination with the [kernel] macro to safely access level-specific
     /// SIMD intrinsics.
-    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    #[cfg(all(feature = "sse4_2", any(target_arch = "x86", target_arch = "x86_64")))]
     #[inline]
     pub fn as_sse4_2(self) -> Option<Sse4_2> {
         match self {
             // Safety: The Avx2 struct represents the x86-64-v3 feature set being enabled, which
             // includes the `sse4.2`, `cmpxchg16b`, and `popcnt` features required by Sse4_2.
+            #[cfg(feature = "avx2")]
             Self::Avx2(_avx) => unsafe { Some(Sse4_2::new_unchecked()) },
             #[cfg(not(all(
+                feature = "avx2",
                 target_feature = "avx2",
                 target_feature = "bmi1",
                 target_feature = "bmi2",
@@ -505,7 +571,7 @@ impl Level {
     ///
     /// This can be used in combination with the [kernel] macro to safely access level-specific
     /// SIMD intrinsics.
-    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    #[cfg(all(feature = "avx2", any(target_arch = "x86", target_arch = "x86_64")))]
     #[inline]
     pub fn as_avx2(self) -> Option<Avx2> {
         #[allow(
@@ -561,6 +627,7 @@ impl Level {
         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
         {
             #[cfg(all(
+                feature = "avx2",
                 target_feature = "avx2",
                 target_feature = "bmi1",
                 target_feature = "bmi2",
@@ -574,12 +641,14 @@ impl Level {
             ))]
             return unsafe { Self::Avx2(Avx2::new_unchecked()) };
             #[cfg(all(
+                feature = "sse4_2",
                 all(
                     target_feature = "sse4.2",
                     target_feature = "cmpxchg16b",
                     target_feature = "popcnt"
                 ),
                 not(all(
+                    feature = "avx2",
                     target_feature = "avx2",
                     target_feature = "bmi1",
                     target_feature = "bmi2",
@@ -593,10 +662,26 @@ impl Level {
                 ))
             ))]
             return unsafe { Self::Sse4_2(Sse4_2::new_unchecked()) };
-            #[cfg(not(all(
-                target_feature = "sse4.2",
-                target_feature = "cmpxchg16b",
-                target_feature = "popcnt"
+            #[cfg(not(any(
+                all(
+                    feature = "avx2",
+                    target_feature = "avx2",
+                    target_feature = "bmi1",
+                    target_feature = "bmi2",
+                    target_feature = "cmpxchg16b",
+                    target_feature = "f16c",
+                    target_feature = "fma",
+                    target_feature = "lzcnt",
+                    target_feature = "movbe",
+                    target_feature = "popcnt",
+                    target_feature = "xsave"
+                ),
+                all(
+                    feature = "sse4_2",
+                    target_feature = "sse4.2",
+                    target_feature = "cmpxchg16b",
+                    target_feature = "popcnt"
+                )
             )))]
             return Self::Fallback(Fallback::new());
         }
diff --git a/fearless_simd/src/macros.rs b/fearless_simd/src/macros.rs
index 346913862..839792681 100644
--- a/fearless_simd/src/macros.rs
+++ b/fearless_simd/src/macros.rs
@@ -50,7 +50,7 @@ macro_rules! dispatch {
     // This falls through to the next branch, but with `forced_fallback_arm` turned into a boolean literal
     // indicating whether or not the `force_support_fallback` crate feature is enabled.
     ($level:expr, $simd:pat => $op:expr) => {{ $crate::internal_unstable_dispatch_inner!($level, $simd => $op) }};
-    (@impl $level:expr, $simd:pat => $op:expr; $forced_fallback_arm: literal) => {{
+    (@impl $level:expr, $simd:pat => $op:expr; $sse4_2_arm: literal, $avx2_arm: literal, $forced_fallback_arm: literal) => {{
         /// Convert the `Simd` value into an `impl Simd`, which enforces that
         /// it is correctly handled.
         // TODO: Just make into a `pub` function in fearless_simd itself?
@@ -81,7 +81,9 @@ macro_rules! dispatch {
             // This fallthrough logic is documented at the definition site of `Level`.
             #[cfg(all(
                 any(target_arch = "x86", target_arch = "x86_64"),
+                $sse4_2_arm,
                 not(all(
+                    $avx2_arm,
                     target_feature = "avx2",
                     target_feature = "bmi1",
                     target_feature = "bmi2",
@@ -102,7 +104,10 @@ macro_rules! dispatch {
                     || $op,
                 )
             }
-            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            #[cfg(all(
+                any(target_arch = "x86", target_arch = "x86_64"),
+                $avx2_arm
+            ))]
             $crate::Level::Avx2(avx2) => {
                 let $simd = launder(avx2);
                 $crate::Simd::vectorize(
@@ -115,10 +120,26 @@ macro_rules! dispatch {
                 all(target_arch = "aarch64", not(target_feature = "neon")),
                 all(
                     any(target_arch = "x86", target_arch = "x86_64"),
-                    not(all(
-                        target_feature = "sse4.2",
-                        target_feature = "cmpxchg16b",
-                        target_feature = "popcnt"
+                    not(any(
+                        all(
+                            $avx2_arm,
+                            target_feature = "avx2",
+                            target_feature = "bmi1",
+                            target_feature = "bmi2",
+                            target_feature = "cmpxchg16b",
+                            target_feature = "f16c",
+                            target_feature = "fma",
+                            target_feature = "lzcnt",
+                            target_feature = "movbe",
+                            target_feature = "popcnt",
+                            target_feature = "xsave"
+                        ),
+                        all(
+                            $sse4_2_arm,
+                            target_feature = "sse4.2",
+                            target_feature = "cmpxchg16b",
+                            target_feature = "popcnt"
+                        )
                     ))
                 ),
                 all(target_arch = "wasm32", not(target_feature = "simd128")),
@@ -155,9 +176,7 @@ macro_rules! dispatch {
 #[cfg(feature = "force_support_fallback")]
 macro_rules! internal_unstable_dispatch_inner {
     ($level:expr, $simd:pat => $op:expr) => {
-        $crate::dispatch!(
-            @impl $level, $simd => $op; true
-        )
+        $crate::internal_unstable_dispatch_x86_features!($level, $simd => $op; true)
     };
 }
 
@@ -167,7 +186,47 @@ macro_rules! internal_unstable_dispatch_inner {
 #[cfg(not(feature = "force_support_fallback"))]
 macro_rules! internal_unstable_dispatch_inner {
     ($level:expr, $simd:pat => $op:expr) => {
-        $crate::dispatch!(@impl $level, $simd => $op; false)
+        $crate::internal_unstable_dispatch_x86_features!($level, $simd => $op; false)
+    };
+}
+
+/// Implementation detail of [`crate::dispatch`]; this is not public API.
+#[macro_export]
+#[doc(hidden)]
+#[cfg(all(feature = "sse4_2", feature = "avx2"))]
+macro_rules! internal_unstable_dispatch_x86_features {
+    ($level:expr, $simd:pat => $op:expr; $forced_fallback_arm:literal) => {
+        $crate::dispatch!(@impl $level, $simd => $op; true, true, $forced_fallback_arm)
+    };
+}
+
+/// Implementation detail of [`crate::dispatch`]; this is not public API.
+#[macro_export]
+#[doc(hidden)]
+#[cfg(all(feature = "sse4_2", not(feature = "avx2")))]
+macro_rules! internal_unstable_dispatch_x86_features {
+    ($level:expr, $simd:pat => $op:expr; $forced_fallback_arm:literal) => {
+        $crate::dispatch!(@impl $level, $simd => $op; true, false, $forced_fallback_arm)
+    };
+}
+
+/// Implementation detail of [`crate::dispatch`]; this is not public API.
+#[macro_export]
+#[doc(hidden)]
+#[cfg(all(not(feature = "sse4_2"), feature = "avx2"))]
+macro_rules! internal_unstable_dispatch_x86_features {
+    ($level:expr, $simd:pat => $op:expr; $forced_fallback_arm:literal) => {
+        $crate::dispatch!(@impl $level, $simd => $op; false, true, $forced_fallback_arm)
+    };
+}
+
+/// Implementation detail of [`crate::dispatch`]; this is not public API.
+#[macro_export]
+#[doc(hidden)]
+#[cfg(all(not(feature = "sse4_2"), not(feature = "avx2")))]
+macro_rules! internal_unstable_dispatch_x86_features {
+    ($level:expr, $simd:pat => $op:expr; $forced_fallback_arm:literal) => {
+        $crate::dispatch!(@impl $level, $simd => $op; false, false, $forced_fallback_arm)
     };
 }
 
diff --git a/fearless_simd_gen/src/level.rs b/fearless_simd_gen/src/level.rs
index c4800698f..7c3ce3657 100644
--- a/fearless_simd_gen/src/level.rs
+++ b/fearless_simd_gen/src/level.rs
@@ -75,6 +75,19 @@ pub(crate) trait Level {
         TokenStream::new()
     }
 
+    /// Optional cfg attributes for native architecture conversion impls for this vector type.
+    ///
+    /// Returning `None` skips the impl. Returning an empty token stream emits an unconditional impl.
+    fn arch_type_impl_cfg(&self, vec_ty: &VecType) -> Option<TokenStream> {
+        let n_bits = vec_ty.n_bits();
+        // If n_bits is below our native width (e.g. 128 bits for AVX2), another module will have already
+        // implemented the conversion.
+        if n_bits > self.max_block_size() || n_bits < self.native_width() {
+            return None;
+        }
+        Some(TokenStream::new())
+    }
+
     /// The body of the `Simd::level` function. This can be overridden, e.g. to return `Level::baseline()` if we know a
     /// higher SIMD level is statically enabled.
     fn make_level_body(&self) -> TokenStream {
@@ -175,19 +188,15 @@ pub(crate) trait Level {
     }
 
     fn make_type_impl(&self) -> TokenStream {
-        let native_width = self.native_width();
-        let max_block_size = self.max_block_size();
         let mut result = vec![];
         for ty in SIMD_TYPES {
-            let n_bits = ty.n_bits();
-            // If n_bits is below our native width (e.g. 128 bits for AVX2), another module will have already
-            // implemented the conversion.
-            if n_bits > max_block_size || n_bits < native_width {
+            let Some(cfg) = self.arch_type_impl_cfg(ty) else {
                 continue;
-            }
+            };
             let simd = ty.rust();
             let arch = self.arch_ty(ty);
             result.push(quote! {
+                #cfg
                 impl<S: Simd> SimdFrom<#arch, S> for #simd<S> {
                     #[inline(always)]
                     fn simd_from(simd: S, arch: #arch) -> Self {
@@ -197,6 +206,7 @@ pub(crate) trait Level {
                         }
                     }
                 }
+                #cfg
                 impl<S: Simd> From<#simd<S>> for #arch {
                     #[inline(always)]
                     fn from(value: #simd<S>) -> Self {
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index a9d52d8ee..76ea6fdfc 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -99,11 +99,29 @@ impl Level for X86 {
         }
     }
 
+    fn arch_type_impl_cfg(&self, vec_ty: &VecType) -> Option<TokenStream> {
+        let n_bits = vec_ty.n_bits();
+        if n_bits > self.max_block_size() {
+            return None;
+        }
+
+        match (self, n_bits) {
+            (Self::Sse4_2, 128) | (Self::Avx2, 256) => Some(TokenStream::new()),
+            // The AVX2 backend uses 128-bit intrinsics internally. Normally the SSE4.2 backend
+            // provides those conversion impls, but AVX2 still needs them when SSE4.2 is disabled.
+            (Self::Avx2, 128) => Some(quote! {
+                #[cfg(not(feature = "sse4_2"))]
+            }),
+            _ => None,
+        }
+    }
+
     fn make_level_body(&self) -> TokenStream {
         let level_tok = self.token();
         match self {
             Self::Sse4_2 => quote! {
                 #[cfg(not(all(
+                    feature = "avx2",
                     target_feature = "avx2",
                     target_feature = "bmi1",
                     target_feature = "bmi2",
@@ -117,6 +135,7 @@ impl Level for X86 {
                 )))]
                 return Level::#level_tok(self);
                 #[cfg(all(
+                    feature = "avx2",
                     target_feature = "avx2",
                     target_feature = "bmi1",
                     target_feature = "bmi2",
diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs
index 08ba196c6..b54b94d49 100644
--- a/fearless_simd_gen/src/ops.rs
+++ b/fearless_simd_gen/src/ops.rs
@@ -587,7 +587,7 @@ const FLOAT_OPS: &[Op] = &[
         "Compute an approximate reciprocal (`1. / x`) for each element.\n\n\
          This uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\n\
          On x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. \
-         On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. \
+         On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. \
          The precision of this operation may change as new platform support is added.",
     ),
     Op::new(