From 8786760ccd72b4ce055b7cb5f63885a3ef6fd3be Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 01:00:48 +0100 Subject: [PATCH] Codex-driven sketch of what it would take to put x86 CPU levels behind feature flags --- fearless_simd/Cargo.toml | 7 +- fearless_simd/README.md | 2 + fearless_simd/src/generated.rs | 8 +- fearless_simd/src/generated/avx2.rs | 204 ++++++++++++++++++++++ fearless_simd/src/generated/simd_trait.rs | 14 +- fearless_simd/src/generated/sse4_2.rs | 2 + fearless_simd/src/kernel_macros.rs | 78 +++++++-- fearless_simd/src/lib.rs | 137 ++++++++++++--- fearless_simd/src/macros.rs | 79 +++++++-- fearless_simd_gen/src/level.rs | 24 ++- fearless_simd_gen/src/mk_x86.rs | 19 ++ fearless_simd_gen/src/ops.rs | 2 +- 12 files changed, 501 insertions(+), 75 deletions(-) diff --git a/fearless_simd/Cargo.toml b/fearless_simd/Cargo.toml index c04827177..cc045dae6 100644 --- a/fearless_simd/Cargo.toml +++ b/fearless_simd/Cargo.toml @@ -26,13 +26,18 @@ rustdoc-args = [ [features] -default = ["std"] +default = ["std", "sse4_2", "avx2"] # Get floating point functions from the standard library (likely using your targets libc). # Also allows using `Level::new` on all platforms, to detect which target features are enabled std = [] # Use floating point implementations from libm libm = ["dep:libm"] +# Enable the SSE4.2/x86-64-v2 runtime SIMD level on x86 and x86_64. +sse4_2 = [] +# Enable the AVX2/x86-64-v3 runtime SIMD level on x86 and x86_64. +avx2 = [] + # Force the "fallback" SIMD level to be supported # This is primarily used for tests force_support_fallback = [] diff --git a/fearless_simd/README.md b/fearless_simd/README.md index 22da184a3..d1c1178a4 100644 --- a/fearless_simd/README.md +++ b/fearless_simd/README.md @@ -158,6 +158,8 @@ The following crate [feature flags](https://doc.rust-lang.org/cargo/reference/fe - `std` (enabled by default): Get floating point functions from the standard library (likely using your target's libc). Also allows using [`Level::new`] on all platforms, to detect which target features are enabled. - `libm`: Use floating point implementations from [libm]. +- `sse4_2` (enabled by default): Enable the SSE4.2/x86-64-v2 runtime SIMD level on `x86` and `x86_64`. +- `avx2` (enabled by default): Enable the AVX2/x86-64-v3 runtime SIMD level on `x86` and `x86_64`. - `force_support_fallback`: Force scalar fallback, to be supported, even if your compilation target has a better baseline. At least one of `std` and `libm` is required; `std` overrides `libm`. diff --git a/fearless_simd/src/generated.rs b/fearless_simd/src/generated.rs index 9d342539a..0e110589b 100644 --- a/fearless_simd/src/generated.rs +++ b/fearless_simd/src/generated.rs @@ -44,7 +44,7 @@ //! //! All files in this subdirectory are autogenerated by the `fearless_simd_gen` crate. -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[cfg(all(feature = "avx2", any(target_arch = "x86", target_arch = "x86_64")))] mod avx2; mod fallback; #[cfg(target_arch = "aarch64")] @@ -52,19 +52,19 @@ mod neon; mod ops; pub(crate) mod simd_trait; mod simd_types; -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[cfg(all(feature = "sse4_2", any(target_arch = "x86", target_arch = "x86_64")))] mod sse4_2; #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] mod wasm; -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[cfg(all(feature = "avx2", any(target_arch = "x86", target_arch = "x86_64")))] pub use avx2::*; pub use fallback::*; #[cfg(target_arch = "aarch64")] pub use neon::*; pub use simd_trait::*; pub use simd_types::*; -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[cfg(all(feature = "sse4_2", any(target_arch = "x86", target_arch = "x86_64")))] pub use sse4_2::*; #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] pub use wasm::*; diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index 0a71b8de3..ccb27277a 100644 --- a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -8038,6 +8038,210 @@ impl Simd for Avx2 { ) } } +#[cfg(not(feature = "sse4_2"))] +impl SimdFrom<__m128, S> for f32x4 { + #[inline(always)] + fn simd_from(simd: S, arch: __m128) -> Self { + Self { + val: unsafe { core::mem::transmute_copy(&arch) }, + simd, + } + } +} +#[cfg(not(feature = "sse4_2"))] +impl From> for __m128 { + #[inline(always)] + fn from(value: f32x4) -> Self { + unsafe { core::mem::transmute_copy(&value.val) } + } +} +#[cfg(not(feature = "sse4_2"))] +impl SimdFrom<__m128i, S> for i8x16 { + #[inline(always)] + fn simd_from(simd: S, arch: __m128i) -> Self { + Self { + val: unsafe { core::mem::transmute_copy(&arch) }, + simd, + } + } +} +#[cfg(not(feature = "sse4_2"))] +impl From> for __m128i { + #[inline(always)] + fn from(value: i8x16) -> Self { + unsafe { core::mem::transmute_copy(&value.val) } + } +} +#[cfg(not(feature = "sse4_2"))] +impl SimdFrom<__m128i, S> for u8x16 { + #[inline(always)] + fn simd_from(simd: S, arch: __m128i) -> Self { + Self { + val: unsafe { core::mem::transmute_copy(&arch) }, + simd, + } + } +} +#[cfg(not(feature = "sse4_2"))] +impl From> for __m128i { + #[inline(always)] + fn from(value: u8x16) -> Self { + unsafe { core::mem::transmute_copy(&value.val) } + } +} +#[cfg(not(feature = "sse4_2"))] +impl SimdFrom<__m128i, S> for mask8x16 { + #[inline(always)] + fn simd_from(simd: S, arch: __m128i) -> Self { + Self { + val: unsafe { core::mem::transmute_copy(&arch) }, + simd, + } + } +} +#[cfg(not(feature = "sse4_2"))] +impl From> for __m128i { + #[inline(always)] + fn from(value: mask8x16) -> Self { + unsafe { core::mem::transmute_copy(&value.val) } + } +} +#[cfg(not(feature = "sse4_2"))] +impl SimdFrom<__m128i, S> for i16x8 { + #[inline(always)] + fn simd_from(simd: S, arch: __m128i) -> Self { + Self { + val: unsafe { core::mem::transmute_copy(&arch) }, + simd, + } + } +} +#[cfg(not(feature = "sse4_2"))] +impl From> for __m128i { + #[inline(always)] + fn from(value: i16x8) -> Self { + unsafe { core::mem::transmute_copy(&value.val) } + } +} +#[cfg(not(feature = "sse4_2"))] +impl SimdFrom<__m128i, S> for u16x8 { + #[inline(always)] + fn simd_from(simd: S, arch: __m128i) -> Self { + Self { + val: unsafe { core::mem::transmute_copy(&arch) }, + simd, + } + } +} +#[cfg(not(feature = "sse4_2"))] +impl From> for __m128i { + #[inline(always)] + fn from(value: u16x8) -> Self { + unsafe { core::mem::transmute_copy(&value.val) } + } +} +#[cfg(not(feature = "sse4_2"))] +impl SimdFrom<__m128i, S> for mask16x8 { + #[inline(always)] + fn simd_from(simd: S, arch: __m128i) -> Self { + Self { + val: unsafe { core::mem::transmute_copy(&arch) }, + simd, + } + } +} +#[cfg(not(feature = "sse4_2"))] +impl From> for __m128i { + #[inline(always)] + fn from(value: mask16x8) -> Self { + unsafe { core::mem::transmute_copy(&value.val) } + } +} +#[cfg(not(feature = "sse4_2"))] +impl SimdFrom<__m128i, S> for i32x4 { + #[inline(always)] + fn simd_from(simd: S, arch: __m128i) -> Self { + Self { + val: unsafe { core::mem::transmute_copy(&arch) }, + simd, + } + } +} +#[cfg(not(feature = "sse4_2"))] +impl From> for __m128i { + #[inline(always)] + fn from(value: i32x4) -> Self { + unsafe { core::mem::transmute_copy(&value.val) } + } +} +#[cfg(not(feature = "sse4_2"))] +impl SimdFrom<__m128i, S> for u32x4 { + #[inline(always)] + fn simd_from(simd: S, arch: __m128i) -> Self { + Self { + val: unsafe { core::mem::transmute_copy(&arch) }, + simd, + } + } +} +#[cfg(not(feature = "sse4_2"))] +impl From> for __m128i { + #[inline(always)] + fn from(value: u32x4) -> Self { + unsafe { core::mem::transmute_copy(&value.val) } + } +} +#[cfg(not(feature = "sse4_2"))] +impl SimdFrom<__m128i, S> for mask32x4 { + #[inline(always)] + fn simd_from(simd: S, arch: __m128i) -> Self { + Self { + val: unsafe { core::mem::transmute_copy(&arch) }, + simd, + } + } +} +#[cfg(not(feature = "sse4_2"))] +impl From> for __m128i { + #[inline(always)] + fn from(value: mask32x4) -> Self { + unsafe { core::mem::transmute_copy(&value.val) } + } +} +#[cfg(not(feature = "sse4_2"))] +impl SimdFrom<__m128d, S> for f64x2 { + #[inline(always)] + fn simd_from(simd: S, arch: __m128d) -> Self { + Self { + val: unsafe { core::mem::transmute_copy(&arch) }, + simd, + } + } +} +#[cfg(not(feature = "sse4_2"))] +impl From> for __m128d { + #[inline(always)] + fn from(value: f64x2) -> Self { + unsafe { core::mem::transmute_copy(&value.val) } + } +} +#[cfg(not(feature = "sse4_2"))] +impl SimdFrom<__m128i, S> for mask64x2 { + #[inline(always)] + fn simd_from(simd: S, arch: __m128i) -> Self { + Self { + val: unsafe { core::mem::transmute_copy(&arch) }, + simd, + } + } +} +#[cfg(not(feature = "sse4_2"))] +impl From> for __m128i { + #[inline(always)] + fn from(value: mask64x2) -> Self { + unsafe { core::mem::transmute_copy(&value.val) } + } +} impl SimdFrom<__m256, S> for f32x8 { #[inline(always)] fn simd_from(simd: S, arch: __m256) -> Self { diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs index 90e5894fe..8f47a027e 100644 --- a/fearless_simd/src/generated/simd_trait.rs +++ b/fearless_simd/src/generated/simd_trait.rs @@ -150,7 +150,7 @@ pub trait Simd: fn neg_f32x4(self, a: f32x4) -> f32x4; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f32x4(self, a: f32x4) -> f32x4; - #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] fn approximate_recip_f32x4(self, a: f32x4) -> f32x4; #[doc = "Add two vectors element-wise."] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4; @@ -857,7 +857,7 @@ pub trait Simd: fn neg_f64x2(self, a: f64x2) -> f64x2; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f64x2(self, a: f64x2) -> f64x2; - #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] fn approximate_recip_f64x2(self, a: f64x2) -> f64x2; #[doc = "Add two vectors element-wise."] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2; @@ -984,7 +984,7 @@ pub trait Simd: fn neg_f32x8(self, a: f32x8) -> f32x8; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f32x8(self, a: f32x8) -> f32x8; - #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] fn approximate_recip_f32x8(self, a: f32x8) -> f32x8; #[doc = "Add two vectors element-wise."] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8; @@ -1713,7 +1713,7 @@ pub trait Simd: fn neg_f64x4(self, a: f64x4) -> f64x4; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f64x4(self, a: f64x4) -> f64x4; - #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] fn approximate_recip_f64x4(self, a: f64x4) -> f64x4; #[doc = "Add two vectors element-wise."] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4; @@ -1844,7 +1844,7 @@ pub trait Simd: fn neg_f32x16(self, a: f32x16) -> f32x16; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f32x16(self, a: f32x16) -> f32x16; - #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] fn approximate_recip_f32x16(self, a: f32x16) -> f32x16; #[doc = "Add two vectors element-wise."] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16; @@ -2567,7 +2567,7 @@ pub trait Simd: fn neg_f64x8(self, a: f64x8) -> f64x8; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f64x8(self, a: f64x8) -> f64x8; - #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] fn approximate_recip_f64x8(self, a: f64x8) -> f64x8; #[doc = "Add two vectors element-wise."] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8; @@ -2813,7 +2813,7 @@ pub trait SimdFloat: fn abs(self) -> Self; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt(self) -> Self; - #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] fn approximate_recip(self) -> Self; #[doc = "Return a vector with the magnitude of `self` and the sign of `rhs` for each element.\n\nThis operation copies the sign bit, so if an input element is NaN, the output element will be a NaN with the same payload and a copied sign bit."] fn copysign(self, rhs: impl SimdInto) -> Self; diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs index fb15d17b1..0aab08fc2 100644 --- a/fearless_simd/src/generated/sse4_2.rs +++ b/fearless_simd/src/generated/sse4_2.rs @@ -86,6 +86,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn level(self) -> Level { #[cfg(not(all( + feature = "avx2", target_feature = "avx2", target_feature = "bmi1", target_feature = "bmi2", @@ -99,6 +100,7 @@ impl Simd for Sse4_2 { )))] return Level::Sse4_2(self); #[cfg(all( + feature = "avx2", target_feature = "avx2", target_feature = "bmi1", target_feature = "bmi2", diff --git a/fearless_simd/src/kernel_macros.rs b/fearless_simd/src/kernel_macros.rs index c713657b9..c8a990fdc 100644 --- a/fearless_simd/src/kernel_macros.rs +++ b/fearless_simd/src/kernel_macros.rs @@ -55,6 +55,7 @@ /// /// The SIMD token type must be written as a bare supported name: /// literally `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`. No paths or aliases. +/// The x86 token names are accepted only when their matching crate features are enabled. /// /// For soundness, this macro only accepts safe functions. /// @@ -131,26 +132,14 @@ macro_rules! __fearless_simd_kernel_dispatch { Sse4_2, $($body:tt)* ) => { - $crate::__fearless_simd_kernel_impl! { - @cfg any(target_arch = "x86", target_arch = "x86_64"); - @token_ty $crate::Sse4_2; - @kernel_attrs #[target_feature(enable = "sse4.2,cmpxchg16b,popcnt")]; - $($body)* - } + $crate::__fearless_simd_kernel_dispatch_sse4_2! { $($body)* } }; ( Avx2, $($body:tt)* ) => { - $crate::__fearless_simd_kernel_impl! { - @cfg any(target_arch = "x86", target_arch = "x86_64"); - @token_ty $crate::Avx2; - @kernel_attrs #[target_feature( - enable = "avx2,bmi1,bmi2,cmpxchg16b,f16c,fma,lzcnt,movbe,popcnt,xsave" - )]; - $($body)* - } + $crate::__fearless_simd_kernel_dispatch_avx2! { $($body)* } }; ( @@ -166,6 +155,56 @@ macro_rules! __fearless_simd_kernel_dispatch { }; } +#[doc(hidden)] +#[macro_export] +#[cfg(feature = "sse4_2")] +macro_rules! __fearless_simd_kernel_dispatch_sse4_2 { + ($($body:tt)*) => { + $crate::__fearless_simd_kernel_impl! { + @cfg any(target_arch = "x86", target_arch = "x86_64"); + @token_ty $crate::Sse4_2; + @kernel_attrs #[target_feature(enable = "sse4.2,cmpxchg16b,popcnt")]; + $($body)* + } + }; +} + +#[doc(hidden)] +#[macro_export] +#[cfg(not(feature = "sse4_2"))] +macro_rules! __fearless_simd_kernel_dispatch_sse4_2 { + ($($body:tt)*) => { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + compile_error!("fearless_simd::kernel! token `Sse4_2` requires the `sse4_2` crate feature"); + }; +} + +#[doc(hidden)] +#[macro_export] +#[cfg(feature = "avx2")] +macro_rules! __fearless_simd_kernel_dispatch_avx2 { + ($($body:tt)*) => { + $crate::__fearless_simd_kernel_impl! { + @cfg any(target_arch = "x86", target_arch = "x86_64"); + @token_ty $crate::Avx2; + @kernel_attrs #[target_feature( + enable = "avx2,bmi1,bmi2,cmpxchg16b,f16c,fma,lzcnt,movbe,popcnt,xsave" + )]; + $($body)* + } + }; +} + +#[doc(hidden)] +#[macro_export] +#[cfg(not(feature = "avx2"))] +macro_rules! __fearless_simd_kernel_dispatch_avx2 { + ($($body:tt)*) => { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + compile_error!("fearless_simd::kernel! token `Avx2` requires the `avx2` crate feature"); + }; +} + #[doc(hidden)] #[macro_export] macro_rules! __fearless_simd_kernel_impl { @@ -205,8 +244,8 @@ macro_rules! __fearless_simd_kernel_impl { mod tests { #[cfg(any( target_arch = "aarch64", - target_arch = "x86", - target_arch = "x86_64", + all(feature = "avx2", target_arch = "x86"), + all(feature = "avx2", target_arch = "x86_64"), all(target_arch = "wasm32", target_feature = "simd128") ))] use crate::prelude::*; @@ -215,9 +254,9 @@ mod tests { use core::arch::aarch64::{float32x4_t, vaddq_f32}; #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] use core::arch::wasm32::{f32x4_add, v128}; - #[cfg(target_arch = "x86")] + #[cfg(all(feature = "avx2", target_arch = "x86"))] use core::arch::x86::{__m256i, _mm256_add_epi32}; - #[cfg(target_arch = "x86_64")] + #[cfg(all(feature = "avx2", target_arch = "x86_64"))] use core::arch::x86_64::{__m256i, _mm256_add_epi32}; crate::kernel! { @@ -232,6 +271,7 @@ mod tests { } } + #[cfg(all(feature = "avx2", any(target_arch = "x86", target_arch = "x86_64")))] crate::kernel! { fn add_i32x8_avx2(avx2: Avx2, a: __m256i, b: __m256i) -> __m256i { _mm256_add_epi32(a, b) @@ -274,7 +314,7 @@ mod tests { ); } - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[cfg(all(feature = "avx2", any(target_arch = "x86", target_arch = "x86_64")))] #[test] fn kernel_instantiates_for_avx2() { let Some(avx2) = crate::Level::new().as_avx2() else { diff --git a/fearless_simd/src/lib.rs b/fearless_simd/src/lib.rs index 84e91269e..da4153b65 100644 --- a/fearless_simd/src/lib.rs +++ b/fearless_simd/src/lib.rs @@ -120,6 +120,8 @@ //! - `std` (enabled by default): Get floating point functions from the standard library (likely using your target's libc). //! Also allows using [`Level::new`] on all platforms, to detect which target features are enabled. //! - `libm`: Use floating point implementations from [libm]. +//! - `sse4_2` (enabled by default): Enable the SSE4.2/x86-64-v2 runtime SIMD level on `x86` and `x86_64`. +//! - `avx2` (enabled by default): Enable the AVX2/x86-64-v3 runtime SIMD level on `x86` and `x86_64`. //! - `force_support_fallback`: Force scalar fallback, to be supported, even if your compilation target has a better baseline. //! //! At least one of `std` and `libm` is required; `std` overrides `libm`. @@ -179,9 +181,14 @@ pub mod wasm32 { } /// Implementations of [`Simd`] on x86 architectures (both 32 and 64 bit). -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + any(feature = "sse4_2", feature = "avx2") +))] pub mod x86 { + #[cfg(feature = "avx2")] pub use crate::generated::Avx2; + #[cfg(feature = "sse4_2")] pub use crate::generated::Sse4_2; } @@ -203,10 +210,26 @@ pub enum Level { all(target_arch = "aarch64", not(target_feature = "neon")), all( any(target_arch = "x86", target_arch = "x86_64"), - not(all( - target_feature = "sse4.2", - target_feature = "cmpxchg16b", - target_feature = "popcnt" + not(any( + all( + feature = "avx2", + target_feature = "avx2", + target_feature = "bmi1", + target_feature = "bmi2", + target_feature = "cmpxchg16b", + target_feature = "f16c", + target_feature = "fma", + target_feature = "lzcnt", + target_feature = "movbe", + target_feature = "popcnt", + target_feature = "xsave" + ), + all( + feature = "sse4_2", + target_feature = "sse4.2", + target_feature = "cmpxchg16b", + target_feature = "popcnt" + ) )) ), all(target_arch = "wasm32", not(target_feature = "simd128")), @@ -232,7 +255,9 @@ pub enum Level { // We don't need to support this if the compilation target definitely supports something better. #[cfg(all( any(target_arch = "x86", target_arch = "x86_64"), + feature = "sse4_2", not(all( + feature = "avx2", target_feature = "avx2", target_feature = "bmi1", target_feature = "bmi2", @@ -247,7 +272,7 @@ pub enum Level { ))] Sse4_2(Sse4_2), /// The x86-64-v3 instruction set on (32 and 64 bit) x86, including AVX2 and FMA. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[cfg(all(feature = "avx2", any(target_arch = "x86", target_arch = "x86_64")))] Avx2(Avx2), // If new variants are added, make sure to handle them in `Level::dispatch` // and `dispatch!()` @@ -302,6 +327,7 @@ impl Level { // avx,fxsr,sse,sse2,sse3,sse4.1,sse4.2,ssse3 // This can be verified by running: // rustc --print=cfg --target x86_64-unknown-linux-gnu -C target-feature='+avx2' + #[cfg(feature = "avx2")] if std::arch::is_x86_feature_detected!("avx2") && std::arch::is_x86_feature_detected!("bmi1") && std::arch::is_x86_feature_detected!("bmi2") @@ -314,14 +340,15 @@ impl Level { && std::arch::is_x86_feature_detected!("xsave") { return unsafe { Self::Avx2(Avx2::new_unchecked()) }; + } + // All x86 CPUs that ever shipped with sse4.2 also have cmpxchg16b and popcnt: // Intel Nehalem, AMD Bulldozer and VIA Isaiah II were the first with SSE4.2 // and have these extensions already. - } else if std::arch::is_x86_feature_detected!("sse4.2") - && std::arch::is_x86_feature_detected!("cmpxchg16b") - && std::arch::is_x86_feature_detected!("popcnt") - { - #[cfg(not(all( + #[cfg(all( + feature = "sse4_2", + not(all( + feature = "avx2", target_feature = "avx2", target_feature = "bmi1", target_feature = "bmi2", @@ -332,7 +359,12 @@ impl Level { target_feature = "movbe", target_feature = "popcnt", target_feature = "xsave" - )))] + )) + ))] + if std::arch::is_x86_feature_detected!("sse4.2") + && std::arch::is_x86_feature_detected!("cmpxchg16b") + && std::arch::is_x86_feature_detected!("popcnt") + { return unsafe { Self::Sse4_2(Sse4_2::new_unchecked()) }; } } @@ -340,10 +372,26 @@ impl Level { all(target_arch = "aarch64", not(target_feature = "neon")), all( any(target_arch = "x86", target_arch = "x86_64"), - not(all( - target_feature = "sse4.2", - target_feature = "cmpxchg16b", - target_feature = "popcnt" + not(any( + all( + feature = "avx2", + target_feature = "avx2", + target_feature = "bmi1", + target_feature = "bmi2", + target_feature = "cmpxchg16b", + target_feature = "f16c", + target_feature = "fma", + target_feature = "lzcnt", + target_feature = "movbe", + target_feature = "popcnt", + target_feature = "xsave" + ), + all( + feature = "sse4_2", + target_feature = "sse4.2", + target_feature = "cmpxchg16b", + target_feature = "popcnt" + ) )) ), all(target_arch = "wasm32", not(target_feature = "simd128")), @@ -392,10 +440,26 @@ impl Level { all(target_arch = "aarch64", not(target_feature = "neon")), all( any(target_arch = "x86", target_arch = "x86_64"), - not(all( - target_feature = "sse4.2", - target_feature = "cmpxchg16b", - target_feature = "popcnt" + not(any( + all( + feature = "avx2", + target_feature = "avx2", + target_feature = "bmi1", + target_feature = "bmi2", + target_feature = "cmpxchg16b", + target_feature = "f16c", + target_feature = "fma", + target_feature = "lzcnt", + target_feature = "movbe", + target_feature = "popcnt", + target_feature = "xsave" + ), + all( + feature = "sse4_2", + target_feature = "sse4.2", + target_feature = "cmpxchg16b", + target_feature = "popcnt" + ) )) ), all(target_arch = "wasm32", not(target_feature = "simd128")), @@ -466,14 +530,16 @@ impl Level { /// /// This can be used in combination with the [kernel] macro to safely access level-specific /// SIMD intrinsics. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[cfg(all(feature = "sse4_2", any(target_arch = "x86", target_arch = "x86_64")))] #[inline] pub fn as_sse4_2(self) -> Option { match self { // Safety: The Avx2 struct represents the x86-64-v3 feature set being enabled, which // includes the `sse4.2`, `cmpxchg16b`, and `popcnt` features required by Sse4_2. + #[cfg(feature = "avx2")] Self::Avx2(_avx) => unsafe { Some(Sse4_2::new_unchecked()) }, #[cfg(not(all( + feature = "avx2", target_feature = "avx2", target_feature = "bmi1", target_feature = "bmi2", @@ -505,7 +571,7 @@ impl Level { /// /// This can be used in combination with the [kernel] macro to safely access level-specific /// SIMD intrinsics. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[cfg(all(feature = "avx2", any(target_arch = "x86", target_arch = "x86_64")))] #[inline] pub fn as_avx2(self) -> Option { #[allow( @@ -561,6 +627,7 @@ impl Level { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { #[cfg(all( + feature = "avx2", target_feature = "avx2", target_feature = "bmi1", target_feature = "bmi2", @@ -574,12 +641,14 @@ impl Level { ))] return unsafe { Self::Avx2(Avx2::new_unchecked()) }; #[cfg(all( + feature = "sse4_2", all( target_feature = "sse4.2", target_feature = "cmpxchg16b", target_feature = "popcnt" ), not(all( + feature = "avx2", target_feature = "avx2", target_feature = "bmi1", target_feature = "bmi2", @@ -593,10 +662,26 @@ impl Level { )) ))] return unsafe { Self::Sse4_2(Sse4_2::new_unchecked()) }; - #[cfg(not(all( - target_feature = "sse4.2", - target_feature = "cmpxchg16b", - target_feature = "popcnt" + #[cfg(not(any( + all( + feature = "avx2", + target_feature = "avx2", + target_feature = "bmi1", + target_feature = "bmi2", + target_feature = "cmpxchg16b", + target_feature = "f16c", + target_feature = "fma", + target_feature = "lzcnt", + target_feature = "movbe", + target_feature = "popcnt", + target_feature = "xsave" + ), + all( + feature = "sse4_2", + target_feature = "sse4.2", + target_feature = "cmpxchg16b", + target_feature = "popcnt" + ) )))] return Self::Fallback(Fallback::new()); } diff --git a/fearless_simd/src/macros.rs b/fearless_simd/src/macros.rs index 346913862..839792681 100644 --- a/fearless_simd/src/macros.rs +++ b/fearless_simd/src/macros.rs @@ -50,7 +50,7 @@ macro_rules! dispatch { // This falls through to the next branch, but with `forced_fallback_arm` turned into a boolean literal // indicating whether or not the `force_support_fallback` crate feature is enabled. ($level:expr, $simd:pat => $op:expr) => {{ $crate::internal_unstable_dispatch_inner!($level, $simd => $op) }}; - (@impl $level:expr, $simd:pat => $op:expr; $forced_fallback_arm: literal) => {{ + (@impl $level:expr, $simd:pat => $op:expr; $sse4_2_arm: literal, $avx2_arm: literal, $forced_fallback_arm: literal) => {{ /// Convert the `Simd` value into an `impl Simd`, which enforces that /// it is correctly handled. // TODO: Just make into a `pub` function in fearless_simd itself? @@ -81,7 +81,9 @@ macro_rules! dispatch { // This fallthrough logic is documented at the definition site of `Level`. #[cfg(all( any(target_arch = "x86", target_arch = "x86_64"), + $sse4_2_arm, not(all( + $avx2_arm, target_feature = "avx2", target_feature = "bmi1", target_feature = "bmi2", @@ -102,7 +104,10 @@ macro_rules! dispatch { || $op, ) } - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + $avx2_arm + ))] $crate::Level::Avx2(avx2) => { let $simd = launder(avx2); $crate::Simd::vectorize( @@ -115,10 +120,26 @@ macro_rules! dispatch { all(target_arch = "aarch64", not(target_feature = "neon")), all( any(target_arch = "x86", target_arch = "x86_64"), - not(all( - target_feature = "sse4.2", - target_feature = "cmpxchg16b", - target_feature = "popcnt" + not(any( + all( + $avx2_arm, + target_feature = "avx2", + target_feature = "bmi1", + target_feature = "bmi2", + target_feature = "cmpxchg16b", + target_feature = "f16c", + target_feature = "fma", + target_feature = "lzcnt", + target_feature = "movbe", + target_feature = "popcnt", + target_feature = "xsave" + ), + all( + $sse4_2_arm, + target_feature = "sse4.2", + target_feature = "cmpxchg16b", + target_feature = "popcnt" + ) )) ), all(target_arch = "wasm32", not(target_feature = "simd128")), @@ -155,9 +176,7 @@ macro_rules! dispatch { #[cfg(feature = "force_support_fallback")] macro_rules! internal_unstable_dispatch_inner { ($level:expr, $simd:pat => $op:expr) => { - $crate::dispatch!( - @impl $level, $simd => $op; true - ) + $crate::internal_unstable_dispatch_x86_features!($level, $simd => $op; true) }; } @@ -167,7 +186,47 @@ macro_rules! internal_unstable_dispatch_inner { #[cfg(not(feature = "force_support_fallback"))] macro_rules! internal_unstable_dispatch_inner { ($level:expr, $simd:pat => $op:expr) => { - $crate::dispatch!(@impl $level, $simd => $op; false) + $crate::internal_unstable_dispatch_x86_features!($level, $simd => $op; false) + }; +} + +/// Implementation detail of [`crate::dispatch`]; this is not public API. +#[macro_export] +#[doc(hidden)] +#[cfg(all(feature = "sse4_2", feature = "avx2"))] +macro_rules! internal_unstable_dispatch_x86_features { + ($level:expr, $simd:pat => $op:expr; $forced_fallback_arm:literal) => { + $crate::dispatch!(@impl $level, $simd => $op; true, true, $forced_fallback_arm) + }; +} + +/// Implementation detail of [`crate::dispatch`]; this is not public API. +#[macro_export] +#[doc(hidden)] +#[cfg(all(feature = "sse4_2", not(feature = "avx2")))] +macro_rules! internal_unstable_dispatch_x86_features { + ($level:expr, $simd:pat => $op:expr; $forced_fallback_arm:literal) => { + $crate::dispatch!(@impl $level, $simd => $op; true, false, $forced_fallback_arm) + }; +} + +/// Implementation detail of [`crate::dispatch`]; this is not public API. +#[macro_export] +#[doc(hidden)] +#[cfg(all(not(feature = "sse4_2"), feature = "avx2"))] +macro_rules! internal_unstable_dispatch_x86_features { + ($level:expr, $simd:pat => $op:expr; $forced_fallback_arm:literal) => { + $crate::dispatch!(@impl $level, $simd => $op; false, true, $forced_fallback_arm) + }; +} + +/// Implementation detail of [`crate::dispatch`]; this is not public API. +#[macro_export] +#[doc(hidden)] +#[cfg(all(not(feature = "sse4_2"), not(feature = "avx2")))] +macro_rules! internal_unstable_dispatch_x86_features { + ($level:expr, $simd:pat => $op:expr; $forced_fallback_arm:literal) => { + $crate::dispatch!(@impl $level, $simd => $op; false, false, $forced_fallback_arm) }; } diff --git a/fearless_simd_gen/src/level.rs b/fearless_simd_gen/src/level.rs index c4800698f..7c3ce3657 100644 --- a/fearless_simd_gen/src/level.rs +++ b/fearless_simd_gen/src/level.rs @@ -75,6 +75,19 @@ pub(crate) trait Level { TokenStream::new() } + /// Optional cfg attributes for native architecture conversion impls for this vector type. + /// + /// Returning `None` skips the impl. Returning an empty token stream emits an unconditional impl. + fn arch_type_impl_cfg(&self, vec_ty: &VecType) -> Option { + let n_bits = vec_ty.n_bits(); + // If n_bits is below our native width (e.g. 128 bits for AVX2), another module will have already + // implemented the conversion. + if n_bits > self.max_block_size() || n_bits < self.native_width() { + return None; + } + Some(TokenStream::new()) + } + /// The body of the `Simd::level` function. This can be overridden, e.g. to return `Level::baseline()` if we know a /// higher SIMD level is statically enabled. fn make_level_body(&self) -> TokenStream { @@ -175,19 +188,15 @@ pub(crate) trait Level { } fn make_type_impl(&self) -> TokenStream { - let native_width = self.native_width(); - let max_block_size = self.max_block_size(); let mut result = vec![]; for ty in SIMD_TYPES { - let n_bits = ty.n_bits(); - // If n_bits is below our native width (e.g. 128 bits for AVX2), another module will have already - // implemented the conversion. - if n_bits > max_block_size || n_bits < native_width { + let Some(cfg) = self.arch_type_impl_cfg(ty) else { continue; - } + }; let simd = ty.rust(); let arch = self.arch_ty(ty); result.push(quote! { + #cfg impl SimdFrom<#arch, S> for #simd { #[inline(always)] fn simd_from(simd: S, arch: #arch) -> Self { @@ -197,6 +206,7 @@ pub(crate) trait Level { } } } + #cfg impl From<#simd> for #arch { #[inline(always)] fn from(value: #simd) -> Self { diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index a9d52d8ee..76ea6fdfc 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -99,11 +99,29 @@ impl Level for X86 { } } + fn arch_type_impl_cfg(&self, vec_ty: &VecType) -> Option { + let n_bits = vec_ty.n_bits(); + if n_bits > self.max_block_size() { + return None; + } + + match (self, n_bits) { + (Self::Sse4_2, 128) | (Self::Avx2, 256) => Some(TokenStream::new()), + // The AVX2 backend uses 128-bit intrinsics internally. Normally the SSE4.2 backend + // provides those conversion impls, but AVX2 still needs them when SSE4.2 is disabled. + (Self::Avx2, 128) => Some(quote! { + #[cfg(not(feature = "sse4_2"))] + }), + _ => None, + } + } + fn make_level_body(&self) -> TokenStream { let level_tok = self.token(); match self { Self::Sse4_2 => quote! { #[cfg(not(all( + feature = "avx2", target_feature = "avx2", target_feature = "bmi1", target_feature = "bmi2", @@ -117,6 +135,7 @@ impl Level for X86 { )))] return Level::#level_tok(self); #[cfg(all( + feature = "avx2", target_feature = "avx2", target_feature = "bmi1", target_feature = "bmi2", diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs index 08ba196c6..b54b94d49 100644 --- a/fearless_simd_gen/src/ops.rs +++ b/fearless_simd_gen/src/ops.rs @@ -587,7 +587,7 @@ const FLOAT_OPS: &[Op] = &[ "Compute an approximate reciprocal (`1. / x`) for each element.\n\n\ This uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\n\ On x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. \ - On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. \ + On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. \ The precision of this operation may change as new platform support is added.", ), Op::new(