diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 01cd78a21..906b886c5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3,12 +3,12 @@ env:
   # version like 1.70. Note that we only specify MAJOR.MINOR and not PATCH so that bugfixes still
   # come automatically. If the version specified here is no longer the latest stable version,
   # then please feel free to submit a PR that adjusts it along with the potential clippy fixes.
-  RUST_STABLE_VER: "1.88" # In quotes because otherwise (e.g.) 1.70 would be interpreted as 1.7
+  RUST_STABLE_VER: "1.89" # In quotes because otherwise (e.g.) 1.70 would be interpreted as 1.7
   # The purpose of checking with the minimum supported Rust toolchain is to detect its staleness.
   # If the compilation fails, then the version specified here needs to be bumped up to reality.
   # Be sure to also update the rust-version property in the workspace Cargo.toml file,
   # plus all the README.md files of the affected packages.
-  RUST_MIN_VER: "1.88"
+  RUST_MIN_VER: "1.89"
   # List of packages that will be checked with the minimum supported Rust version.
   # This should be limited to packages that are intended for publishing.
   RUST_MIN_VER_PKGS: "-p fearless_simd"
@@ -268,8 +268,7 @@ jobs:
       - name: run tests on CPU with AVX-512
         # Github Actions doesn't give us AVX-512 so this is the only way to exercise AVX-512 codepaths on CI.
         # -icl stands for Ice Lake. Technically Skylake added AVX-512 first, but it's mostly useless there due to
-        # downclocking. When we do eventually add explicit AVX-512 support, we'll likely target the Ice Lake feature
-        # level.
+        # downclocking, so our explicit AVX-512 level targets Ice Lake.
         run: ${SDE_PKG}/sde64 -icl -- cargo test $CARGO_TEST_ARGS
 
   test-aarch64-qemu:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index facb8b857..7638028ac 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,7 +11,15 @@ You can find its changes [documented below](#041-2026-05-16).
 
 ## [Unreleased]
 
-This release has an [MSRV][] of 1.88.
+This release has an [MSRV][] of 1.89.
+
+### Added
+
+- Added Ice Lake-class AVX-512 support with a generated `Avx512` level and 512-bit native-width vector types.
+
+### Changed
+
+- The MSRV is now Rust 1.89.
 
 ## [0.4.1][] (2026-05-16)
 
diff --git a/Cargo.toml b/Cargo.toml
index 0158a30a3..398c2c514 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,7 @@ license = "Apache-2.0 OR MIT"
 repository = "https://github.com/linebender/fearless_simd"
 # Keep in sync with RUST_MIN_VER in .github/workflows/ci.yml, with the relevant README.md files
 # and with the MSRV in the `Unreleased` section of CHANGELOG.md.
-rust-version = "1.88"
+rust-version = "1.89"
 
 [workspace.lints]
 
@@ -44,10 +44,11 @@ clippy.collection_is_never_read = "warn"
 clippy.default_trait_access = "warn"
 clippy.dbg_macro = "warn"
 clippy.debug_assert_with_mut_call = "warn"
+clippy.disallowed_methods = "deny"
 clippy.doc_markdown = "warn"
 clippy.fn_to_numeric_cast_any = "warn"
 clippy.infinite_loop = "warn"
-clippy.large_stack_arrays = "warn"
+clippy.large_stack_arrays = "allow"             # Lint appears to be buggy as of Rust 1.93 (fixed in 1.95). TODO: re-enable once the toolchain is 1.95 or newer.
 clippy.mismatching_type_param_order = "warn"
 clippy.missing_assert_message = "warn"
 clippy.missing_fields_in_debug = "warn"
diff --git a/README.md b/README.md
index 3e7243a11..b94d5beb5 100644
--- a/README.md
+++ b/README.md
@@ -59,7 +59,7 @@ It benefited from conversations with Luca Versari, though he is not responsible
 
 ## Minimum supported Rust Version (MSRV)
 
-This version of Fearless SIMD has been verified to compile with **Rust 1.88** and later.
+This version of Fearless SIMD has been verified to compile with **Rust 1.89** and later.
 
 Future versions of Fearless SIMD might increase the Rust version requirement.
 It will not be treated as a breaking change and as such can even happen with small patch releases.
diff --git a/check_targets.sh b/check_targets.sh
index 90b09fb7f..98e61c22c 100644
--- a/check_targets.sh
+++ b/check_targets.sh
@@ -15,6 +15,8 @@ cargo check -p fearless_simd --target aarch64-linux-android  --features force_su
 cargo check -p fearless_simd --target aarch64-linux-android
 
 # x86_64, at all supported static SIMD levels.
+RUSTFLAGS=-Ctarget-cpu=icelake-server cargo check -p fearless_simd --target x86_64-unknown-linux-gnu
+RUSTFLAGS=-Ctarget-cpu=icelake-server cargo check -p fearless_simd --target x86_64-unknown-linux-gnu --features force_support_fallback
 RUSTFLAGS=-Ctarget-feature=+avx2,+fma cargo check -p fearless_simd --target x86_64-unknown-linux-gnu
 RUSTFLAGS=-Ctarget-feature=+avx2,+fma cargo check -p fearless_simd --target x86_64-unknown-linux-gnu --features force_support_fallback
 RUSTFLAGS=-Ctarget-feature=+sse4.2 cargo check -p fearless_simd --target x86_64-unknown-linux-gnu
diff --git a/fearless_simd/README.md b/fearless_simd/README.md
index 22da184a3..1c4c4410a 100644
--- a/fearless_simd/README.md
+++ b/fearless_simd/README.md
@@ -168,7 +168,7 @@ At least one of `std` and `libm` is required; `std` overrides `libm`.
 
 ## Minimum supported Rust Version (MSRV)
 
-This version of Fearless SIMD has been verified to compile with **Rust 1.88** and later.
+This version of Fearless SIMD has been verified to compile with **Rust 1.89** and later.
 
 Future versions of Fearless SIMD might increase the Rust version requirement.
 It will not be treated as a breaking change and as such can even happen with small patch releases.
diff --git a/fearless_simd/src/generated.rs b/fearless_simd/src/generated.rs
index 381ffadf8..e5d417ece 100644
--- a/fearless_simd/src/generated.rs
+++ b/fearless_simd/src/generated.rs
@@ -41,6 +41,8 @@
 
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 mod avx2;
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+mod avx512;
 mod fallback;
 #[cfg(target_arch = "aarch64")]
 mod neon;
@@ -54,6 +56,8 @@ mod wasm;
 
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 pub use avx2::*;
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+pub use avx512::*;
 pub use fallback::*;
 #[cfg(target_arch = "aarch64")]
 pub use neon::*;
diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
index 188b0e696..1ff2ee186 100644
--- a/fearless_simd/src/generated/avx2.rs
+++ b/fearless_simd/src/generated/avx2.rs
@@ -907,6 +907,17 @@ impl Simd for Avx2 {
         unsafe { _mm_movemask_epi8(a.into()) as u32 as u64 }
     }
     #[inline(always)]
+    fn set_mask8x16(self, a: &mut mask8x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask8x16(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x16(lanes);
+    }
+    #[inline(always)]
     fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -1410,6 +1421,17 @@ impl Simd for Avx2 {
         }
     }
     #[inline(always)]
+    fn set_mask16x8(self, a: &mut mask16x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask16x8(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask16x8(lanes);
+    }
+    #[inline(always)]
     fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -1918,6 +1940,17 @@ impl Simd for Avx2 {
         unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 as u64 }
     }
     #[inline(always)]
+    fn set_mask32x4(self, a: &mut mask32x4<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 4usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4usize
+        );
+        let mut lanes = self.as_array_mask32x4(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask32x4(lanes);
+    }
+    #[inline(always)]
     fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -2235,6 +2268,17 @@ impl Simd for Avx2 {
         unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64 }
     }
     #[inline(always)]
+    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 2usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            2usize
+        );
+        let mut lanes = self.as_array_mask64x2(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x2(lanes);
+    }
+    #[inline(always)]
     fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -3333,6 +3377,17 @@ impl Simd for Avx2 {
         unsafe { _mm256_movemask_epi8(a.into()) as u32 as u64 }
     }
     #[inline(always)]
+    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        let mut lanes = self.as_array_mask8x32(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x32(lanes);
+    }
+    #[inline(always)]
     fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
         unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
     }
@@ -4041,6 +4096,17 @@ impl Simd for Avx2 {
         }
     }
     #[inline(always)]
+    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask16x16(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask16x16(lanes);
+    }
+    #[inline(always)]
     fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
         unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
     }
@@ -4686,6 +4752,17 @@ impl Simd for Avx2 {
         unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 as u64 }
     }
     #[inline(always)]
+    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask32x8(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask32x8(lanes);
+    }
+    #[inline(always)]
     fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
         unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
     }
@@ -5078,6 +5155,17 @@ impl Simd for Avx2 {
         unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 as u64 }
     }
     #[inline(always)]
+    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 4usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4usize
+        );
+        let mut lanes = self.as_array_mask64x4(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x4(lanes);
+    }
+    #[inline(always)]
     fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
         unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
     }
@@ -6230,6 +6318,17 @@ impl Simd for Avx2 {
         lo | (hi << 32usize)
     }
     #[inline(always)]
+    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 64usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            64usize
+        );
+        let mut lanes = self.as_array_mask8x64(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x64(lanes);
+    }
+    #[inline(always)]
     fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
         let (a0, a1) = self.split_mask8x64(a);
         let (b0, b1) = self.split_mask8x64(b);
@@ -6985,6 +7084,17 @@ impl Simd for Avx2 {
         }
     }
     #[inline(always)]
+    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        let mut lanes = self.as_array_mask16x32(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask16x32(lanes);
+    }
+    #[inline(always)]
     fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
         let (a0, a1) = self.split_mask16x32(a);
         let (b0, b1) = self.split_mask16x32(b);
@@ -7715,6 +7825,17 @@ impl Simd for Avx2 {
         lo | (hi << 8usize)
     }
     #[inline(always)]
+    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask32x16(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask32x16(lanes);
+    }
+    #[inline(always)]
     fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
         let (a0, a1) = self.split_mask32x16(a);
         let (b0, b1) = self.split_mask32x16(b);
@@ -8160,6 +8281,17 @@ impl Simd for Avx2 {
         lo | (hi << 4usize)
     }
     #[inline(always)]
+    fn set_mask64x8(self, a: &mut mask64x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask64x8(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x8(lanes);
+    }
+    #[inline(always)]
     fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
         let (a0, a1) = self.split_mask64x8(a);
         let (b0, b1) = self.split_mask64x8(b);
@@ -8285,16 +8417,15 @@ impl<S: Simd> From<u8x32<S>> for __m256i {
 impl<S: Simd> SimdFrom<__m256i, S> for mask8x32<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256i) -> Self {
-        Self {
-            val: crate::transmute::checked_transmute_copy(&arch),
-            simd,
-        }
+        let lanes: [i8; 32usize] = crate::transmute::checked_transmute_copy(&arch);
+        lanes.simd_into(simd)
     }
 }
 impl<S: Simd> From<mask8x32<S>> for __m256i {
     #[inline(always)]
     fn from(value: mask8x32<S>) -> Self {
-        crate::transmute::checked_transmute_copy(&value.val)
+        let lanes: [i8; 32usize] = value.into();
+        crate::transmute::checked_transmute_copy(&lanes)
     }
 }
 impl<S: Simd> SimdFrom<__m256i, S> for i16x16<S> {
@@ -8330,16 +8461,15 @@ impl<S: Simd> From<u16x16<S>> for __m256i {
 impl<S: Simd> SimdFrom<__m256i, S> for mask16x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256i) -> Self {
-        Self {
-            val: crate::transmute::checked_transmute_copy(&arch),
-            simd,
-        }
+        let lanes: [i16; 16usize] = crate::transmute::checked_transmute_copy(&arch);
+        lanes.simd_into(simd)
     }
 }
 impl<S: Simd> From<mask16x16<S>> for __m256i {
     #[inline(always)]
     fn from(value: mask16x16<S>) -> Self {
-        crate::transmute::checked_transmute_copy(&value.val)
+        let lanes: [i16; 16usize] = value.into();
+        crate::transmute::checked_transmute_copy(&lanes)
     }
 }
 impl<S: Simd> SimdFrom<__m256i, S> for i32x8<S> {
@@ -8375,16 +8505,15 @@ impl<S: Simd> From<u32x8<S>> for __m256i {
 impl<S: Simd> SimdFrom<__m256i, S> for mask32x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256i) -> Self {
-        Self {
-            val: crate::transmute::checked_transmute_copy(&arch),
-            simd,
-        }
+        let lanes: [i32; 8usize] = crate::transmute::checked_transmute_copy(&arch);
+        lanes.simd_into(simd)
     }
 }
 impl<S: Simd> From<mask32x8<S>> for __m256i {
     #[inline(always)]
     fn from(value: mask32x8<S>) -> Self {
-        crate::transmute::checked_transmute_copy(&value.val)
+        let lanes: [i32; 8usize] = value.into();
+        crate::transmute::checked_transmute_copy(&lanes)
     }
 }
 impl<S: Simd> SimdFrom<__m256d, S> for f64x4<S> {
@@ -8405,16 +8534,15 @@ impl<S: Simd> From<f64x4<S>> for __m256d {
 impl<S: Simd> SimdFrom<__m256i, S> for mask64x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256i) -> Self {
-        Self {
-            val: crate::transmute::checked_transmute_copy(&arch),
-            simd,
-        }
+        let lanes: [i64; 4usize] = crate::transmute::checked_transmute_copy(&arch);
+        lanes.simd_into(simd)
     }
 }
 impl<S: Simd> From<mask64x4<S>> for __m256i {
     #[inline(always)]
     fn from(value: mask64x4<S>) -> Self {
-        crate::transmute::checked_transmute_copy(&value.val)
+        let lanes: [i64; 4usize] = value.into();
+        crate::transmute::checked_transmute_copy(&lanes)
     }
 }
 #[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
new file mode 100644
index 000000000..10c2a9658
--- /dev/null
+++ b/fearless_simd/src/generated/avx512.rs
@@ -0,0 +1,9824 @@
+// Copyright 2025 the Fearless_SIMD Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+// This file is autogenerated by fearless_simd_gen
+
+#![allow(
+    clippy::identity_op,
+    reason = "AVX-512 mask code is generated uniformly for all __mmask widths"
+)]
+#![allow(
+    clippy::useless_conversion,
+    reason = "AVX-512 mask code is generated uniformly for all __mmask widths"
+)]
+use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal};
+use crate::{
+    f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
+    i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4,
+    mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32,
+    u32x4, u32x8, u32x16,
+};
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+#[doc = "A token for AVX-512 intrinsics on `x86` and `x86_64`, representing an Ice Lake feature level."]
+#[derive(Clone, Copy, Debug)]
+pub struct Avx512 {
+    _private: (),
+}
+impl Avx512 {
+    #[doc = r" Create a SIMD token."]
+    #[doc = r""]
+    #[doc = r" # Safety"]
+    #[doc = r""]
+    #[doc = r" The Ice Lake AVX-512 CPU feature set must be available."]
+    #[inline]
+    pub const unsafe fn new_unchecked() -> Self {
+        Self { _private: () }
+    }
+}
+impl Seal for Avx512 {}
+impl ArchTypes for Avx512 {
+    type f32x4 = crate::support::Aligned128<__m128>;
+    type i8x16 = crate::support::Aligned128<__m128i>;
+    type u8x16 = crate::support::Aligned128<__m128i>;
+    type mask8x16 = __mmask16;
+    type i16x8 = crate::support::Aligned128<__m128i>;
+    type u16x8 = crate::support::Aligned128<__m128i>;
+    type mask16x8 = __mmask8;
+    type i32x4 = crate::support::Aligned128<__m128i>;
+    type u32x4 = crate::support::Aligned128<__m128i>;
+    type mask32x4 = __mmask8;
+    type f64x2 = crate::support::Aligned128<__m128d>;
+    type mask64x2 = __mmask8;
+    type f32x8 = crate::support::Aligned256<__m256>;
+    type i8x32 = crate::support::Aligned256<__m256i>;
+    type u8x32 = crate::support::Aligned256<__m256i>;
+    type mask8x32 = __mmask32;
+    type i16x16 = crate::support::Aligned256<__m256i>;
+    type u16x16 = crate::support::Aligned256<__m256i>;
+    type mask16x16 = __mmask16;
+    type i32x8 = crate::support::Aligned256<__m256i>;
+    type u32x8 = crate::support::Aligned256<__m256i>;
+    type mask32x8 = __mmask8;
+    type f64x4 = crate::support::Aligned256<__m256d>;
+    type mask64x4 = __mmask8;
+    type f32x16 = crate::support::Aligned512<__m512>;
+    type i8x64 = crate::support::Aligned512<__m512i>;
+    type u8x64 = crate::support::Aligned512<__m512i>;
+    type mask8x64 = __mmask64;
+    type i16x32 = crate::support::Aligned512<__m512i>;
+    type u16x32 = crate::support::Aligned512<__m512i>;
+    type mask16x32 = __mmask32;
+    type i32x16 = crate::support::Aligned512<__m512i>;
+    type u32x16 = crate::support::Aligned512<__m512i>;
+    type mask32x16 = __mmask16;
+    type f64x8 = crate::support::Aligned512<__m512d>;
+    type mask64x8 = __mmask8;
+}
+impl Simd for Avx512 {
+    type f32s = f32x16<Self>;
+    type f64s = f64x8<Self>;
+    type u8s = u8x64<Self>;
+    type i8s = i8x64<Self>;
+    type u16s = u16x32<Self>;
+    type i16s = i16x32<Self>;
+    type u32s = u32x16<Self>;
+    type i32s = i32x16<Self>;
+    type mask8s = mask8x64<Self>;
+    type mask16s = mask16x32<Self>;
+    type mask32s = mask32x16<Self>;
+    type mask64s = mask64x8<Self>;
+    #[inline(always)]
+    fn level(self) -> Level {
+        Level::Avx512(self)
+    }
+    #[inline]
+    fn vectorize<F: FnOnce() -> R, R>(self, f: F) -> R {
+        #[target_feature(
+            enable = "adx,aes,avx512bitalg,avx512bw,avx512cd,avx512dq,avx512f,avx512ifma,avx512vbmi,avx512vbmi2,avx512vl,avx512vnni,avx512vpopcntdq,bmi1,bmi2,cmpxchg16b,fma,gfni,lzcnt,movbe,pclmulqdq,popcnt,rdrand,rdseed,sha,vaes,vpclmulqdq,xsave,xsavec,xsaveopt,xsaves"
+        )]
+        unsafe fn vectorize_avx512<F: FnOnce() -> R, R>(f: F) -> R {
+            f()
+        }
+        unsafe { vectorize_avx512(f) }
+    }
+    #[inline(always)]
+    fn splat_f32x4(self, val: f32) -> f32x4<Self> {
+        unsafe { _mm_set1_ps(val).simd_into(self) }
+    }
+    #[inline(always)]
+    fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4<Self> {
+        f32x4 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4<Self> {
+        f32x4 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_f32x4(self, a: f32x4<Self>) -> [f32; 4usize] {
+        crate::transmute::checked_transmute_copy::<__m128, [f32; 4usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_ref_f32x4(self, a: &f32x4<Self>) -> &[f32; 4usize] {
+        crate::transmute::checked_cast_ref::<__m128, [f32; 4usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_mut_f32x4(self, a: &mut f32x4<Self>) -> &mut [f32; 4usize] {
+        crate::transmute::checked_cast_mut::<__m128, [f32; 4usize]>(&mut a.val.0)
+    }
+    #[inline(always)]
+    fn store_array_f32x4(self, a: f32x4<Self>, dest: &mut [f32; 4usize]) -> () {
+        unsafe {
+            core::ptr::copy_nonoverlapping(
+                (&raw const a.val.0) as *const f32,
+                dest.as_mut_ptr(),
+                4usize,
+            );
+        }
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_f32x4(self, a: u8x16<Self>) -> f32x4<Self> {
+        f32x4 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_f32x4(self, a: f32x4<Self>) -> u8x16<Self> {
+        u8x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_f32x4<const SHIFT: usize>(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe {
+            if SHIFT >= 4usize {
+                return b;
+            }
+            let result = dyn_alignr_128(
+                self.cvt_to_bytes_f32x4(b).val.0,
+                self.cvt_to_bytes_f32x4(a).val.0,
+                SHIFT * 4usize,
+            );
+            self.cvt_from_bytes_f32x4(u8x16 {
+                val: crate::support::Aligned128(result),
+                simd: self,
+            })
+        }
+    }
+    #[inline(always)]
+    fn slide_within_blocks_f32x4<const SHIFT: usize>(
+        self,
+        a: f32x4<Self>,
+        b: f32x4<Self>,
+    ) -> f32x4<Self> {
+        self.slide_f32x4::<SHIFT>(a, b)
+    }
+    #[inline(always)]
+    fn abs_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn neg_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_xor_ps(a.into(), _mm_set1_ps(-0.0)).simd_into(self) }
+    }
+    #[inline(always)]
+    fn sqrt_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_sqrt_ps(a.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn approximate_recip_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_rcp14_ps(a.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn sub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_sub_ps(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn mul_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_mul_ps(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn div_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_div_ps(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn copysign_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe {
+            let mask = _mm_set1_ps(-0.0);
+            _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, a.into())).simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn simd_eq_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
+        unsafe {
+            mask32x4 {
+                val: _mm_cmp_ps_mask::<0i32>(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_lt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
+        unsafe {
+            mask32x4 {
+                val: _mm_cmp_ps_mask::<17i32>(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_le_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
+        unsafe {
+            mask32x4 {
+                val: _mm_cmp_ps_mask::<18i32>(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_ge_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
+        unsafe {
+            mask32x4 {
+                val: _mm_cmp_ps_mask::<29i32>(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_gt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
+        unsafe {
+            mask32x4 {
+                val: _mm_cmp_ps_mask::<30i32>(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn zip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_unpacklo_ps(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn zip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_unpackhi_ps(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn unzip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_shuffle_ps::<0b10_00_10_00>(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn unzip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_shuffle_ps::<0b11_01_11_01>(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn interleave_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> (f32x4<Self>, f32x4<Self>) {
+        (self.zip_low_f32x4(a, b), self.zip_high_f32x4(a, b))
+    }
+    #[inline(always)]
+    fn deinterleave_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> (f32x4<Self>, f32x4<Self>) {
+        (self.unzip_low_f32x4(a, b), self.unzip_high_f32x4(a, b))
+    }
+    #[inline(always)]
+    fn max_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn min_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn max_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_range_ps::<5i32>(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn min_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_range_ps::<4i32>(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn mul_add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_fmadd_ps(a.into(), b.into(), c.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn mul_sub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn floor_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
+        unsafe {
+            _mm_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn ceil_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
+        unsafe {
+            _mm_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn round_ties_even_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
+        unsafe {
+            _mm_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
+                .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn fract_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
+        a - self.trunc_f32x4(a)
+    }
+    #[inline(always)]
+    fn trunc_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
+        unsafe {
+            _mm_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn select_f32x4(self, a: mask32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_mask_blend_ps(a.val, c.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn combine_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x8<Self> {
+        unsafe { _mm256_setr_m128(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn reinterpret_f64_f32x4(self, a: f32x4<Self>) -> f64x2<Self> {
+        unsafe { _mm_castps_pd(a.into()).simd_into(self) }
+    }
+    /// Reinterprets the 128 bits of `a` as four `i32` lanes (`_mm_castps_si128`); no conversion.
+    #[inline(always)]
+    fn reinterpret_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
+        let bits = unsafe { _mm_castps_si128(a.into()) };
+        bits.simd_into(self)
+    }
+    /// Reinterprets the 128 bits of `a` as sixteen `u8` lanes (`_mm_castps_si128`); no conversion.
+    #[inline(always)]
+    fn reinterpret_u8_f32x4(self, a: f32x4<Self>) -> u8x16<Self> {
+        let bits = unsafe { _mm_castps_si128(a.into()) };
+        bits.simd_into(self)
+    }
+    /// Reinterprets the 128 bits of `a` as four `u32` lanes (`_mm_castps_si128`); no conversion.
+    #[inline(always)]
+    fn reinterpret_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
+        let bits = unsafe { _mm_castps_si128(a.into()) };
+        bits.simd_into(self)
+    }
+    /// Converts each `f32` lane to `u32` with truncation, using `_mm_cvttps_epu32` directly.
+    #[inline(always)]
+    fn cvt_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
+        let truncated = unsafe { _mm_cvttps_epu32(a.into()) };
+        truncated.simd_into(self)
+    }
+    /// Converts each `f32` lane to `u32` with truncation, clamping out-of-range inputs.
+    ///
+    /// Lanes are first raised to at least zero; lanes greater than `4294967040.0` are forced to
+    /// `u32::MAX`.
+    #[inline(always)]
+    fn cvt_u32_precise_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
+        unsafe {
+            // NOTE(review): `_mm_max_ps` presumably yields the second operand (zero) for NaN
+            // lanes, which would map NaN to 0 - confirm against the intrinsic's documentation.
+            let clamped = _mm_max_ps(a.into(), _mm_setzero_ps());
+            let truncated = _mm_cvttps_epu32(clamped);
+            // Predicate 17 is `_CMP_LT_OQ`: selects lanes where `4294967040.0 < clamped`.
+            let too_large = _mm_cmp_ps_mask::<17i32>(_mm_set1_ps(4294967040.0), clamped);
+            let saturated = _mm_set1_epi32(u32::MAX.cast_signed());
+            _mm_mask_blend_epi32(too_large, truncated, saturated).simd_into(self)
+        }
+    }
+    /// Converts each `f32` lane to `i32` with truncation, using `_mm_cvttps_epi32` directly.
+    #[inline(always)]
+    fn cvt_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
+        let truncated = unsafe { _mm_cvttps_epi32(a.into()) };
+        truncated.simd_into(self)
+    }
+    /// Converts each `f32` lane to `i32` with truncation, patching up out-of-range inputs.
+    ///
+    /// Lanes that are not below `2147483648.0` become `i32::MAX`, and lanes that are NaN become 0.
+    /// The fix-up is skipped entirely when every lane is below the limit.
+    #[inline(always)]
+    fn cvt_i32_precise_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
+        unsafe {
+            let src = a.into();
+            let truncated = _mm_cvttps_epi32(src);
+            let below_limit = _mm_cmplt_ps(src, _mm_set1_ps(2147483648.0));
+            // Fast path: all four comparison bits set means no lane needs correcting.
+            if _mm_movemask_ps(below_limit) == 0b1111 {
+                return truncated.simd_into(self);
+            }
+            // Keep the truncated value where the lane was below the limit, `i32::MAX` elsewhere.
+            let clamped = _mm_blendv_epi8(
+                _mm_set1_epi32(i32::MAX),
+                truncated,
+                _mm_castps_si128(below_limit),
+            );
+            // A lane compares ordered with itself only when it is not NaN; AND zeroes NaN lanes.
+            let ordered = _mm_castps_si128(_mm_cmpord_ps(src, src));
+            _mm_and_si128(clamped, ordered).simd_into(self)
+        }
+    }
+    /// Broadcasts `val` into all sixteen `i8` lanes.
+    #[inline(always)]
+    fn splat_i8x16(self, val: i8) -> i8x16<Self> {
+        let filled = unsafe { _mm_set1_epi8(val) };
+        filled.simd_into(self)
+    }
+    /// Builds an `i8x16` from an owned array by copying its bytes via `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(&val);
+        i8x16 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Builds an `i8x16` from a borrowed array by copying its bytes via `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(val);
+        i8x16 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Copies the lanes of `a` out into a plain `[i8; 16]`.
+    #[inline(always)]
+    fn as_array_i8x16(self, a: i8x16<Self>) -> [i8; 16usize] {
+        let raw: &__m128i = &a.val.0;
+        crate::transmute::checked_transmute_copy::<__m128i, [i8; 16usize]>(raw)
+    }
+    /// Views the lanes of `a` as a `&[i8; 16]` without copying.
+    #[inline(always)]
+    fn as_array_ref_i8x16(self, a: &i8x16<Self>) -> &[i8; 16usize] {
+        let raw: &__m128i = &a.val.0;
+        crate::transmute::checked_cast_ref::<__m128i, [i8; 16usize]>(raw)
+    }
+    /// Views the lanes of `a` as a `&mut [i8; 16]` without copying.
+    #[inline(always)]
+    fn as_array_mut_i8x16(self, a: &mut i8x16<Self>) -> &mut [i8; 16usize] {
+        let raw: &mut __m128i = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<__m128i, [i8; 16usize]>(raw)
+    }
+    /// Writes the sixteen lanes of `a` into `dest`.
+    #[inline(always)]
+    fn store_array_i8x16(self, a: i8x16<Self>, dest: &mut [i8; 16usize]) -> () {
+        let src = (&raw const a.val.0) as *const i8;
+        let dst = dest.as_mut_ptr();
+        // SAFETY: `src` addresses the 16-byte `__m128i` held by the local `a`, and `dst` addresses
+        // a 16-element `i8` array behind a unique borrow, so both are valid for 16 elements and
+        // cannot overlap.
+        unsafe {
+            core::ptr::copy_nonoverlapping(src, dst, 16usize);
+        }
+    }
+    /// Reinterprets a `u8x16` byte vector as an `i8x16` by copying its storage bit-for-bit.
+    #[inline(always)]
+    fn cvt_from_bytes_i8x16(self, a: u8x16<Self>) -> i8x16<Self> {
+        let storage = crate::transmute::checked_transmute_copy(&a.val);
+        i8x16 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Reinterprets an `i8x16` as a `u8x16` byte vector by copying its storage bit-for-bit.
+    #[inline(always)]
+    fn cvt_to_bytes_i8x16(self, a: i8x16<Self>) -> u8x16<Self> {
+        let storage = crate::transmute::checked_transmute_copy(&a.val);
+        u8x16 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Slides across the pair `a`, `b` by `SHIFT` lanes.
+    ///
+    /// Returns `b` unchanged when `SHIFT >= 16`; otherwise the work is done by `dyn_alignr_128`
+    /// on the byte views of `b` and `a` with a byte offset of `SHIFT`.
+    #[inline(always)]
+    fn slide_i8x16<const SHIFT: usize>(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        if SHIFT >= 16usize {
+            return b;
+        }
+        let b_bytes = self.cvt_to_bytes_i8x16(b).val.0;
+        let a_bytes = self.cvt_to_bytes_i8x16(a).val.0;
+        let shifted = unsafe { dyn_alignr_128(b_bytes, a_bytes, SHIFT) };
+        let bytes = u8x16 {
+            val: crate::support::Aligned128(shifted),
+            simd: self,
+        };
+        self.cvt_from_bytes_i8x16(bytes)
+    }
+    /// Block-wise slide for `i8x16`; it forwards unchanged to `slide_i8x16`.
+    #[inline(always)]
+    fn slide_within_blocks_i8x16<const SHIFT: usize>(
+        self,
+        a: i8x16<Self>,
+        b: i8x16<Self>,
+    ) -> i8x16<Self> {
+        self.slide_i8x16::<SHIFT>(a, b)
+    }
+    /// Lane-wise `a + b` for `i8x16`, implemented with `_mm_add_epi8`.
+    #[inline(always)]
+    fn add_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        let sum = unsafe { _mm_add_epi8(a.into(), b.into()) };
+        sum.simd_into(self)
+    }
+    /// Lane-wise `a - b` for `i8x16`, implemented with `_mm_sub_epi8`.
+    #[inline(always)]
+    fn sub_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        let difference = unsafe { _mm_sub_epi8(a.into(), b.into()) };
+        difference.simd_into(self)
+    }
+    /// Lane-wise `a * b` for `i8x16`, keeping the low 8 bits of each product.
+    ///
+    /// The multiply is done with two 16-bit multiplies, one for the even-indexed bytes and one
+    /// for the odd-indexed bytes, whose results are merged.
+    #[inline(always)]
+    fn mul_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        unsafe {
+            let (lhs, rhs) = (a.into(), b.into());
+            // Even bytes sit in the low half of each 16-bit lane; only the low byte of the
+            // product is kept.
+            let even = _mm_and_si128(_mm_mullo_epi16(lhs, rhs), _mm_set1_epi16(0xFF));
+            // Odd bytes are moved down, multiplied, and the product is moved back up.
+            let odd = _mm_mullo_epi16(_mm_srli_epi16::<8>(lhs), _mm_srli_epi16::<8>(rhs));
+            _mm_or_si128(_mm_slli_epi16::<8>(odd), even).simd_into(self)
+        }
+    }
+    /// Bitwise AND of `a` and `b`.
+    #[inline(always)]
+    fn and_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        let bits = unsafe { _mm_and_si128(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise OR of `a` and `b`.
+    #[inline(always)]
+    fn or_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        let bits = unsafe { _mm_or_si128(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise XOR of `a` and `b`.
+    #[inline(always)]
+    fn xor_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        let bits = unsafe { _mm_xor_si128(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise NOT of `a`, expressed as XOR with an all-ones scalar.
+    #[inline(always)]
+    fn not_i8x16(self, a: i8x16<Self>) -> i8x16<Self> {
+        const ALL_ONES: i8 = !0;
+        a ^ ALL_ONES
+    }
+    /// Shifts every `i8` lane of `a` left by `shift` bits, discarding bits shifted out of the lane.
+    ///
+    /// There is no 8-bit shift instruction, so each lane is widened to 16 bits, shifted with
+    /// `_mm_sll_epi16`, reduced to its low byte and narrowed again. This matches the wrapping
+    /// result produced by `shlv_i8x16`.
+    #[inline(always)]
+    fn shl_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
+        unsafe {
+            let val = a.into();
+            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
+            let zero = _mm_setzero_si128();
+            // The upper byte of each widened lane is masked off below, so zero-extension is
+            // sufficient for a left shift.
+            let lo_16 = _mm_unpacklo_epi8(val, zero);
+            let hi_16 = _mm_unpackhi_epi8(val, zero);
+            // Keep only the low byte before packing: the pack instruction saturates, so without
+            // the mask an overflowing shift such as `64 << 1` would clamp instead of wrapping.
+            let byte_mask = _mm_set1_epi16(0x00ff);
+            let lo_shifted = _mm_and_si128(_mm_sll_epi16(lo_16, shift_count), byte_mask);
+            let hi_shifted = _mm_and_si128(_mm_sll_epi16(hi_16, shift_count), byte_mask);
+            _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
+    }
+    /// Shifts each `i8` lane of `a` left by the count held in the matching lane of `b`.
+    ///
+    /// Values and counts are zero-extended to 16 bits, shifted with `_mm_sllv_epi16`, masked to
+    /// their low byte and packed back to 8 bits.
+    #[inline(always)]
+    fn shlv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        unsafe {
+            let (src, counts) = (a.into(), b.into());
+            let zero = _mm_setzero_si128();
+            let byte_mask = _mm_set1_epi16(0x00ff);
+            let lo = _mm_sllv_epi16(_mm_unpacklo_epi8(src, zero), _mm_unpacklo_epi8(counts, zero));
+            let hi = _mm_sllv_epi16(_mm_unpackhi_epi8(src, zero), _mm_unpackhi_epi8(counts, zero));
+            // Masking first keeps every 16-bit lane within 0..=255, so the pack cannot saturate.
+            _mm_packus_epi16(_mm_and_si128(lo, byte_mask), _mm_and_si128(hi, byte_mask))
+                .simd_into(self)
+        }
+    }
+    /// Arithmetic right shift of every `i8` lane of `a` by `shift` bits.
+    ///
+    /// Lanes are sign-extended to 16 bits, shifted with `_mm_sra_epi16` and packed back to 8 bits.
+    #[inline(always)]
+    fn shr_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
+        unsafe {
+            let src = a.into();
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            // All-ones for negative lanes, zero otherwise; interleaving it with the value
+            // produces the sign-extended 16-bit lane.
+            let sign = _mm_cmpgt_epi8(_mm_setzero_si128(), src);
+            let lo = _mm_sra_epi16(_mm_unpacklo_epi8(src, sign), count);
+            let hi = _mm_sra_epi16(_mm_unpackhi_epi8(src, sign), count);
+            _mm_packs_epi16(lo, hi).simd_into(self)
+        }
+    }
+    /// Arithmetic right shift of each `i8` lane of `a` by the count in the matching lane of `b`.
+    ///
+    /// Values are sign-extended and counts zero-extended to 16 bits, shifted with
+    /// `_mm_srav_epi16`, masked to their low byte and packed back to 8 bits.
+    #[inline(always)]
+    fn shrv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        unsafe {
+            let (src, counts) = (a.into(), b.into());
+            let zero = _mm_setzero_si128();
+            // All-ones for negative lanes, zero otherwise.
+            let sign = _mm_cmpgt_epi8(zero, src);
+            let byte_mask = _mm_set1_epi16(0x00ff);
+            let lo = _mm_srav_epi16(_mm_unpacklo_epi8(src, sign), _mm_unpacklo_epi8(counts, zero));
+            let hi = _mm_srav_epi16(_mm_unpackhi_epi8(src, sign), _mm_unpackhi_epi8(counts, zero));
+            _mm_packus_epi16(_mm_and_si128(lo, byte_mask), _mm_and_si128(hi, byte_mask))
+                .simd_into(self)
+        }
+    }
+    /// Signed lane-wise `a == b`, returned as the bitmask from `_mm_cmpeq_epi8_mask`.
+    #[inline(always)]
+    fn simd_eq_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
+        let val = unsafe { _mm_cmpeq_epi8_mask(a.into(), b.into()) };
+        mask8x16 { val, simd: self }
+    }
+    /// Signed lane-wise `a < b`, returned as the bitmask from `_mm_cmplt_epi8_mask`.
+    #[inline(always)]
+    fn simd_lt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
+        let val = unsafe { _mm_cmplt_epi8_mask(a.into(), b.into()) };
+        mask8x16 { val, simd: self }
+    }
+    /// Signed lane-wise `a <= b`, returned as the bitmask from `_mm_cmple_epi8_mask`.
+    #[inline(always)]
+    fn simd_le_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
+        let val = unsafe { _mm_cmple_epi8_mask(a.into(), b.into()) };
+        mask8x16 { val, simd: self }
+    }
+    /// Signed lane-wise `a >= b`, returned as the bitmask from `_mm_cmpge_epi8_mask`.
+    #[inline(always)]
+    fn simd_ge_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
+        let val = unsafe { _mm_cmpge_epi8_mask(a.into(), b.into()) };
+        mask8x16 { val, simd: self }
+    }
+    /// Signed lane-wise `a > b`, returned as the bitmask from `_mm_cmpgt_epi8_mask`.
+    #[inline(always)]
+    fn simd_gt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
+        let val = unsafe { _mm_cmpgt_epi8_mask(a.into(), b.into()) };
+        mask8x16 { val, simd: self }
+    }
+    /// Interleaves the low eight lanes of `a` and `b` (`_mm_unpacklo_epi8`).
+    #[inline(always)]
+    fn zip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        let zipped = unsafe { _mm_unpacklo_epi8(a.into(), b.into()) };
+        zipped.simd_into(self)
+    }
+    /// Interleaves the high eight lanes of `a` and `b` (`_mm_unpackhi_epi8`).
+    #[inline(always)]
+    fn zip_high_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        let zipped = unsafe { _mm_unpackhi_epi8(a.into(), b.into()) };
+        zipped.simd_into(self)
+    }
+    /// Collects the even-indexed lanes of `a` followed by the even-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        unsafe {
+            // Moves even-indexed bytes into the low 64 bits and odd-indexed bytes into the high.
+            let evens_then_odds =
+                _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+            let a_split = _mm_shuffle_epi8(a.into(), evens_then_odds);
+            let b_split = _mm_shuffle_epi8(b.into(), evens_then_odds);
+            // The low 64-bit halves hold the even-indexed bytes of each input.
+            _mm_unpacklo_epi64(a_split, b_split).simd_into(self)
+        }
+    }
+    /// Collects the odd-indexed lanes of `a` followed by the odd-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_high_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        unsafe {
+            // Moves even-indexed bytes into the low 64 bits and odd-indexed bytes into the high.
+            let evens_then_odds =
+                _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+            let a_split = _mm_shuffle_epi8(a.into(), evens_then_odds);
+            let b_split = _mm_shuffle_epi8(b.into(), evens_then_odds);
+            // The high 64-bit halves hold the odd-indexed bytes of each input.
+            _mm_unpackhi_epi64(a_split, b_split).simd_into(self)
+        }
+    }
+    /// Returns the pair `(zip_low(a, b), zip_high(a, b))`.
+    #[inline(always)]
+    fn interleave_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> (i8x16<Self>, i8x16<Self>) {
+        let low = self.zip_low_i8x16(a, b);
+        let high = self.zip_high_i8x16(a, b);
+        (low, high)
+    }
+    /// Returns the pair `(unzip_low(a, b), unzip_high(a, b))`.
+    #[inline(always)]
+    fn deinterleave_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> (i8x16<Self>, i8x16<Self>) {
+        let evens = self.unzip_low_i8x16(a, b);
+        let odds = self.unzip_high_i8x16(a, b);
+        (evens, odds)
+    }
+    /// Per lane, yields `b` where the corresponding bit of mask `a` is set and `c` otherwise.
+    #[inline(always)]
+    fn select_i8x16(self, a: mask8x16<Self>, b: i8x16<Self>, c: i8x16<Self>) -> i8x16<Self> {
+        // `_mm_mask_blend_epi8` takes the "mask clear" operand first, hence `c` before `b`.
+        let picked = unsafe { _mm_mask_blend_epi8(a.val, c.into(), b.into()) };
+        picked.simd_into(self)
+    }
+    /// Signed lane-wise minimum of `a` and `b` (`_mm_min_epi8`).
+    #[inline(always)]
+    fn min_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        let smaller = unsafe { _mm_min_epi8(a.into(), b.into()) };
+        smaller.simd_into(self)
+    }
+    /// Signed lane-wise maximum of `a` and `b` (`_mm_max_epi8`).
+    #[inline(always)]
+    fn max_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        let larger = unsafe { _mm_max_epi8(a.into(), b.into()) };
+        larger.simd_into(self)
+    }
+    /// Joins two `i8x16` vectors into one `i8x32`, with `a` as the low half and `b` as the high.
+    #[inline(always)]
+    fn combine_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x32<Self> {
+        let joined = unsafe { _mm256_setr_m128i(a.into(), b.into()) };
+        joined.simd_into(self)
+    }
+    /// Lane-wise negation, computed as `0 - a` with `_mm_sub_epi8`.
+    #[inline(always)]
+    fn neg_i8x16(self, a: i8x16<Self>) -> i8x16<Self> {
+        let negated = unsafe { _mm_sub_epi8(_mm_setzero_si128(), a.into()) };
+        negated.simd_into(self)
+    }
+    /// Reinterprets the 128 bits of `a` as sixteen `u8` lanes; no value conversion takes place.
+    #[inline(always)]
+    fn reinterpret_u8_i8x16(self, a: i8x16<Self>) -> u8x16<Self> {
+        let raw: __m128i = a.into();
+        raw.simd_into(self)
+    }
+    /// Reinterprets the 128 bits of `a` as four `u32` lanes; no value conversion takes place.
+    #[inline(always)]
+    fn reinterpret_u32_i8x16(self, a: i8x16<Self>) -> u32x4<Self> {
+        let raw: __m128i = a.into();
+        raw.simd_into(self)
+    }
+    /// Broadcasts `val` into all sixteen `u8` lanes.
+    #[inline(always)]
+    fn splat_u8x16(self, val: u8) -> u8x16<Self> {
+        // The intrinsic takes an `i8`; `cast_signed` keeps the bit pattern.
+        let filled = unsafe { _mm_set1_epi8(val.cast_signed()) };
+        filled.simd_into(self)
+    }
+    /// Builds a `u8x16` from an owned array by copying its bytes via `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(&val);
+        u8x16 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Builds a `u8x16` from a borrowed array by copying its bytes via `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(val);
+        u8x16 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Copies the lanes of `a` out into a plain `[u8; 16]`.
+    #[inline(always)]
+    fn as_array_u8x16(self, a: u8x16<Self>) -> [u8; 16usize] {
+        let raw: &__m128i = &a.val.0;
+        crate::transmute::checked_transmute_copy::<__m128i, [u8; 16usize]>(raw)
+    }
+    /// Views the lanes of `a` as a `&[u8; 16]` without copying.
+    #[inline(always)]
+    fn as_array_ref_u8x16(self, a: &u8x16<Self>) -> &[u8; 16usize] {
+        let raw: &__m128i = &a.val.0;
+        crate::transmute::checked_cast_ref::<__m128i, [u8; 16usize]>(raw)
+    }
+    /// Views the lanes of `a` as a `&mut [u8; 16]` without copying.
+    #[inline(always)]
+    fn as_array_mut_u8x16(self, a: &mut u8x16<Self>) -> &mut [u8; 16usize] {
+        let raw: &mut __m128i = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<__m128i, [u8; 16usize]>(raw)
+    }
+    /// Writes the sixteen lanes of `a` into `dest`.
+    #[inline(always)]
+    fn store_array_u8x16(self, a: u8x16<Self>, dest: &mut [u8; 16usize]) -> () {
+        let src = (&raw const a.val.0) as *const u8;
+        let dst = dest.as_mut_ptr();
+        // SAFETY: `src` addresses the 16-byte `__m128i` held by the local `a`, and `dst` addresses
+        // a 16-element `u8` array behind a unique borrow, so both are valid for 16 elements and
+        // cannot overlap.
+        unsafe {
+            core::ptr::copy_nonoverlapping(src, dst, 16usize);
+        }
+    }
+    /// Byte-vector to `u8x16` conversion; the storage is copied over bit-for-bit.
+    #[inline(always)]
+    fn cvt_from_bytes_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
+        let storage = crate::transmute::checked_transmute_copy(&a.val);
+        u8x16 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// `u8x16` to byte-vector conversion; the storage is copied over bit-for-bit.
+    #[inline(always)]
+    fn cvt_to_bytes_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
+        let storage = crate::transmute::checked_transmute_copy(&a.val);
+        u8x16 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Slides across the pair `a`, `b` by `SHIFT` lanes.
+    ///
+    /// Returns `b` unchanged when `SHIFT >= 16`; otherwise the work is done by `dyn_alignr_128`
+    /// on the byte views of `b` and `a` with a byte offset of `SHIFT`.
+    #[inline(always)]
+    fn slide_u8x16<const SHIFT: usize>(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        if SHIFT >= 16usize {
+            return b;
+        }
+        let b_bytes = self.cvt_to_bytes_u8x16(b).val.0;
+        let a_bytes = self.cvt_to_bytes_u8x16(a).val.0;
+        let shifted = unsafe { dyn_alignr_128(b_bytes, a_bytes, SHIFT) };
+        let bytes = u8x16 {
+            val: crate::support::Aligned128(shifted),
+            simd: self,
+        };
+        self.cvt_from_bytes_u8x16(bytes)
+    }
+    /// Block-wise slide for `u8x16`; it forwards unchanged to `slide_u8x16`.
+    #[inline(always)]
+    fn slide_within_blocks_u8x16<const SHIFT: usize>(
+        self,
+        a: u8x16<Self>,
+        b: u8x16<Self>,
+    ) -> u8x16<Self> {
+        self.slide_u8x16::<SHIFT>(a, b)
+    }
+    /// Lane-wise `a + b` for `u8x16`, implemented with `_mm_add_epi8`.
+    #[inline(always)]
+    fn add_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        let sum = unsafe { _mm_add_epi8(a.into(), b.into()) };
+        sum.simd_into(self)
+    }
+    /// Lane-wise `a - b` for `u8x16`, implemented with `_mm_sub_epi8`.
+    #[inline(always)]
+    fn sub_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        let difference = unsafe { _mm_sub_epi8(a.into(), b.into()) };
+        difference.simd_into(self)
+    }
+    /// Lane-wise `a * b` for `u8x16`, keeping the low 8 bits of each product.
+    ///
+    /// The multiply is done with two 16-bit multiplies, one for the even-indexed bytes and one
+    /// for the odd-indexed bytes, whose results are merged.
+    #[inline(always)]
+    fn mul_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        unsafe {
+            let (lhs, rhs) = (a.into(), b.into());
+            // Even bytes sit in the low half of each 16-bit lane; only the low byte of the
+            // product is kept.
+            let even = _mm_and_si128(_mm_mullo_epi16(lhs, rhs), _mm_set1_epi16(0xFF));
+            // Odd bytes are moved down, multiplied, and the product is moved back up.
+            let odd = _mm_mullo_epi16(_mm_srli_epi16::<8>(lhs), _mm_srli_epi16::<8>(rhs));
+            _mm_or_si128(_mm_slli_epi16::<8>(odd), even).simd_into(self)
+        }
+    }
+    /// Bitwise AND of `a` and `b`.
+    #[inline(always)]
+    fn and_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        let bits = unsafe { _mm_and_si128(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise OR of `a` and `b`.
+    #[inline(always)]
+    fn or_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        let bits = unsafe { _mm_or_si128(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise XOR of `a` and `b`.
+    #[inline(always)]
+    fn xor_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        let bits = unsafe { _mm_xor_si128(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise NOT of `a`, expressed as XOR with an all-ones scalar.
+    #[inline(always)]
+    fn not_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
+        const ALL_ONES: u8 = !0;
+        a ^ ALL_ONES
+    }
+    /// Shifts every `u8` lane of `a` left by `shift` bits, discarding bits shifted out of the lane.
+    ///
+    /// There is no 8-bit shift instruction, so each lane is zero-extended to 16 bits, shifted
+    /// with `_mm_sll_epi16`, reduced to its low byte and narrowed again. This matches the
+    /// wrapping result produced by `shlv_u8x16`.
+    #[inline(always)]
+    fn shl_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
+        unsafe {
+            let val = a.into();
+            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
+            let zero = _mm_setzero_si128();
+            let lo_16 = _mm_unpacklo_epi8(val, zero);
+            let hi_16 = _mm_unpackhi_epi8(val, zero);
+            // Keep only the low byte before packing: `_mm_packus_epi16` saturates, so without the
+            // mask an overflowing shift such as `128 << 1` would yield 255 instead of 0.
+            let byte_mask = _mm_set1_epi16(0x00ff);
+            let lo_shifted = _mm_and_si128(_mm_sll_epi16(lo_16, shift_count), byte_mask);
+            let hi_shifted = _mm_and_si128(_mm_sll_epi16(hi_16, shift_count), byte_mask);
+            _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
+    }
+    /// Shifts each `u8` lane of `a` left by the count held in the matching lane of `b`.
+    ///
+    /// Values and counts are zero-extended to 16 bits, shifted with `_mm_sllv_epi16`, masked to
+    /// their low byte and packed back to 8 bits.
+    #[inline(always)]
+    fn shlv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        unsafe {
+            let (src, counts) = (a.into(), b.into());
+            let zero = _mm_setzero_si128();
+            let byte_mask = _mm_set1_epi16(0x00ff);
+            let lo = _mm_sllv_epi16(_mm_unpacklo_epi8(src, zero), _mm_unpacklo_epi8(counts, zero));
+            let hi = _mm_sllv_epi16(_mm_unpackhi_epi8(src, zero), _mm_unpackhi_epi8(counts, zero));
+            // Masking first keeps every 16-bit lane within 0..=255, so the pack cannot saturate.
+            _mm_packus_epi16(_mm_and_si128(lo, byte_mask), _mm_and_si128(hi, byte_mask))
+                .simd_into(self)
+        }
+    }
+    /// Logical right shift of every `u8` lane of `a` by `shift` bits.
+    ///
+    /// Lanes are zero-extended to 16 bits, shifted with `_mm_srl_epi16` and packed back to 8 bits.
+    #[inline(always)]
+    fn shr_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
+        unsafe {
+            let zero = _mm_setzero_si128();
+            let src = a.into();
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            let lo = _mm_srl_epi16(_mm_unpacklo_epi8(src, zero), count);
+            let hi = _mm_srl_epi16(_mm_unpackhi_epi8(src, zero), count);
+            _mm_packus_epi16(lo, hi).simd_into(self)
+        }
+    }
+    /// Logical right shift of each `u8` lane of `a` by the count in the matching lane of `b`.
+    ///
+    /// Values and counts are zero-extended to 16 bits, shifted with `_mm_srlv_epi16`, masked to
+    /// their low byte and packed back to 8 bits.
+    #[inline(always)]
+    fn shrv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        unsafe {
+            let (src, counts) = (a.into(), b.into());
+            let zero = _mm_setzero_si128();
+            let byte_mask = _mm_set1_epi16(0x00ff);
+            let lo = _mm_srlv_epi16(_mm_unpacklo_epi8(src, zero), _mm_unpacklo_epi8(counts, zero));
+            let hi = _mm_srlv_epi16(_mm_unpackhi_epi8(src, zero), _mm_unpackhi_epi8(counts, zero));
+            _mm_packus_epi16(_mm_and_si128(lo, byte_mask), _mm_and_si128(hi, byte_mask))
+                .simd_into(self)
+        }
+    }
+    /// Unsigned lane-wise `a == b`, returned as the bitmask from `_mm_cmpeq_epu8_mask`.
+    #[inline(always)]
+    fn simd_eq_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
+        let val = unsafe { _mm_cmpeq_epu8_mask(a.into(), b.into()) };
+        mask8x16 { val, simd: self }
+    }
+    /// Unsigned lane-wise `a < b`, returned as the bitmask from `_mm_cmplt_epu8_mask`.
+    #[inline(always)]
+    fn simd_lt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
+        let val = unsafe { _mm_cmplt_epu8_mask(a.into(), b.into()) };
+        mask8x16 { val, simd: self }
+    }
+    /// Unsigned lane-wise `a <= b`, returned as the bitmask from `_mm_cmple_epu8_mask`.
+    #[inline(always)]
+    fn simd_le_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
+        let val = unsafe { _mm_cmple_epu8_mask(a.into(), b.into()) };
+        mask8x16 { val, simd: self }
+    }
+    /// Unsigned lane-wise `a >= b`, returned as the bitmask from `_mm_cmpge_epu8_mask`.
+    #[inline(always)]
+    fn simd_ge_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
+        let val = unsafe { _mm_cmpge_epu8_mask(a.into(), b.into()) };
+        mask8x16 { val, simd: self }
+    }
+    /// Unsigned lane-wise `a > b`, returned as the bitmask from `_mm_cmpgt_epu8_mask`.
+    #[inline(always)]
+    fn simd_gt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
+        let val = unsafe { _mm_cmpgt_epu8_mask(a.into(), b.into()) };
+        mask8x16 { val, simd: self }
+    }
+    /// Interleaves the low eight lanes of `a` and `b` (`_mm_unpacklo_epi8`).
+    #[inline(always)]
+    fn zip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        let zipped = unsafe { _mm_unpacklo_epi8(a.into(), b.into()) };
+        zipped.simd_into(self)
+    }
+    /// Interleaves the high eight lanes of `a` and `b` (`_mm_unpackhi_epi8`).
+    #[inline(always)]
+    fn zip_high_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        let zipped = unsafe { _mm_unpackhi_epi8(a.into(), b.into()) };
+        zipped.simd_into(self)
+    }
+    /// Collects the even-indexed lanes of `a` followed by the even-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        unsafe {
+            // Moves even-indexed bytes into the low 64 bits and odd-indexed bytes into the high.
+            let evens_then_odds =
+                _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+            let a_split = _mm_shuffle_epi8(a.into(), evens_then_odds);
+            let b_split = _mm_shuffle_epi8(b.into(), evens_then_odds);
+            // The low 64-bit halves hold the even-indexed bytes of each input.
+            _mm_unpacklo_epi64(a_split, b_split).simd_into(self)
+        }
+    }
+    /// Collects the odd-indexed lanes of `a` followed by the odd-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_high_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        unsafe {
+            // Moves even-indexed bytes into the low 64 bits and odd-indexed bytes into the high.
+            let evens_then_odds =
+                _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+            let a_split = _mm_shuffle_epi8(a.into(), evens_then_odds);
+            let b_split = _mm_shuffle_epi8(b.into(), evens_then_odds);
+            // The high 64-bit halves hold the odd-indexed bytes of each input.
+            _mm_unpackhi_epi64(a_split, b_split).simd_into(self)
+        }
+    }
+    /// Returns the pair `(zip_low(a, b), zip_high(a, b))`.
+    #[inline(always)]
+    fn interleave_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> (u8x16<Self>, u8x16<Self>) {
+        let low = self.zip_low_u8x16(a, b);
+        let high = self.zip_high_u8x16(a, b);
+        (low, high)
+    }
+    /// Returns the pair `(unzip_low(a, b), unzip_high(a, b))`.
+    #[inline(always)]
+    fn deinterleave_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> (u8x16<Self>, u8x16<Self>) {
+        let evens = self.unzip_low_u8x16(a, b);
+        let odds = self.unzip_high_u8x16(a, b);
+        (evens, odds)
+    }
+    /// Per lane, yields `b` where the corresponding bit of mask `a` is set and `c` otherwise.
+    #[inline(always)]
+    fn select_u8x16(self, a: mask8x16<Self>, b: u8x16<Self>, c: u8x16<Self>) -> u8x16<Self> {
+        // `_mm_mask_blend_epi8` takes the "mask clear" operand first, hence `c` before `b`.
+        let picked = unsafe { _mm_mask_blend_epi8(a.val, c.into(), b.into()) };
+        picked.simd_into(self)
+    }
+    /// Unsigned lane-wise minimum of `a` and `b` (`_mm_min_epu8`).
+    #[inline(always)]
+    fn min_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        let smaller = unsafe { _mm_min_epu8(a.into(), b.into()) };
+        smaller.simd_into(self)
+    }
+    /// Unsigned lane-wise maximum of `a` and `b` (`_mm_max_epu8`).
+    #[inline(always)]
+    fn max_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        let larger = unsafe { _mm_max_epu8(a.into(), b.into()) };
+        larger.simd_into(self)
+    }
+    /// Joins two `u8x16` vectors into one `u8x32`, with `a` as the low half and `b` as the high.
+    #[inline(always)]
+    fn combine_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x32<Self> {
+        let joined = unsafe { _mm256_setr_m128i(a.into(), b.into()) };
+        joined.simd_into(self)
+    }
+    /// Zero-extends each `u8` lane to a `u16` lane with `_mm256_cvtepu8_epi16`.
+    #[inline(always)]
+    fn widen_u8x16(self, a: u8x16<Self>) -> u16x16<Self> {
+        let widened = unsafe { _mm256_cvtepu8_epi16(a.into()) };
+        widened.simd_into(self)
+    }
+    /// Reinterprets the 128 bits of `a` as four `u32` lanes; no value conversion takes place.
+    #[inline(always)]
+    fn reinterpret_u32_u8x16(self, a: u8x16<Self>) -> u32x4<Self> {
+        let raw: __m128i = a.into();
+        raw.simd_into(self)
+    }
+    /// Creates a mask with all sixteen lane bits set when `val` is true, or all clear otherwise.
+    #[inline(always)]
+    fn splat_mask8x16(self, val: bool) -> mask8x16<Self> {
+        let bits = if val { 65535u64 } else { 0 };
+        mask8x16 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Builds a bitmask from an array of `i8` lanes using `_mm_movepi8_mask`.
+    #[inline(always)]
+    fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(&val);
+        let bits = unsafe { _mm_movepi8_mask(lanes) };
+        mask8x16 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Expands the bitmask into an array of `i8` lanes using `_mm_movm_epi8`.
+    #[inline(always)]
+    fn as_array_mask8x16(self, a: mask8x16<Self>) -> [i8; 16usize] {
+        let expanded = unsafe { _mm_movm_epi8(a.val) };
+        crate::transmute::checked_transmute_copy(&expanded)
+    }
+    /// Builds a mask from the low 16 bits of `bits`; higher bits are ignored.
+    #[inline(always)]
+    fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16<Self> {
+        let low16 = bits & 65535u64;
+        mask8x16 {
+            val: low16 as _,
+            simd: self,
+        }
+    }
+    /// Returns the mask as a `u64` whose low 16 bits hold one bit per lane.
+    #[inline(always)]
+    fn to_bitmask_mask8x16(self, a: mask8x16<Self>) -> u64 {
+        let raw = u64::from(a.val);
+        raw & 65535u64
+    }
+    /// Sets or clears the bit for lane `index` in `a`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `index` is not below 16.
+    #[inline(always)]
+    fn set_mask8x16(self, a: &mut mask8x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let lane_bit = 1u64 << index;
+        // Clear the lane's bit first, then set it again only when requested.
+        let cleared = u64::from(a.val) & !lane_bit;
+        let updated = if value { cleared | lane_bit } else { cleared };
+        *a = mask8x16 {
+            val: updated as _,
+            simd: self,
+        };
+    }
+    /// Bitwise AND of two masks, restricted to the 16 lane bits.
+    #[inline(always)]
+    fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
+        let (lhs, rhs) = (u64::from(a.val), u64::from(b.val));
+        let bits = (lhs & rhs) & 65535u64;
+        mask8x16 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Bitwise OR of two masks, restricted to the 16 lane bits.
+    #[inline(always)]
+    fn or_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
+        let (lhs, rhs) = (u64::from(a.val), u64::from(b.val));
+        let bits = (lhs | rhs) & 65535u64;
+        mask8x16 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Bitwise XOR of two masks, restricted to the 16 lane bits.
+    #[inline(always)]
+    fn xor_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
+        let (lhs, rhs) = (u64::from(a.val), u64::from(b.val));
+        let bits = (lhs ^ rhs) & 65535u64;
+        mask8x16 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Inverts every lane bit of the mask; bits above the 16 lanes stay clear.
+    #[inline(always)]
+    fn not_mask8x16(self, a: mask8x16<Self>) -> mask8x16<Self> {
+        let inverted = !u64::from(a.val);
+        let bits = inverted & 65535u64;
+        mask8x16 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Per lane, takes the bit from `b` where `a` is set and the bit from `c` where it is clear.
+    #[inline(always)]
+    fn select_mask8x16(
+        self,
+        a: mask8x16<Self>,
+        b: mask8x16<Self>,
+        c: mask8x16<Self>,
+    ) -> mask8x16<Self> {
+        let selector = u64::from(a.val);
+        let from_b = selector & u64::from(b.val);
+        let from_c = !selector & u64::from(c.val);
+        let bits = (from_b | from_c) & 65535u64;
+        mask8x16 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise equality of two masks: a result bit is set where `a` and `b` agree.
+    #[inline(always)]
+    fn simd_eq_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
+        // XOR marks the lanes that differ; inverting it marks the lanes that match.
+        let differing = u64::from(a.val ^ b.val);
+        let bits = !differing & 65535u64;
+        mask8x16 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Returns true when at least one of the 16 lane bits is set.
+    #[inline(always)]
+    fn any_true_mask8x16(self, a: mask8x16<Self>) -> bool {
+        (u64::from(a.val) & 65535u64) != 0
+    }
+    /// Returns true when all 16 lane bits are set.
+    #[inline(always)]
+    fn all_true_mask8x16(self, a: mask8x16<Self>) -> bool {
+        (u64::from(a.val) & 65535u64) == 65535u64
+    }
+    /// Returns true when at least one of the 16 lane bits is clear.
+    #[inline(always)]
+    fn any_false_mask8x16(self, a: mask8x16<Self>) -> bool {
+        (u64::from(a.val) & 65535u64) != 65535u64
+    }
+    /// Returns true when none of the 16 lane bits is set.
+    #[inline(always)]
+    fn all_false_mask8x16(self, a: mask8x16<Self>) -> bool {
+        (u64::from(a.val) & 65535u64) == 0
+    }
+    /// Joins two 16-lane masks into a 32-lane mask: `a` fills bits 0..16, `b` fills bits 16..32.
+    #[inline(always)]
+    fn combine_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x32<Self> {
+        let low = u64::from(a.val);
+        let high = u64::from(b.val) << 16usize;
+        let joined = (low | high) & 4294967295u64;
+        mask8x32 {
+            val: joined as _,
+            simd: self,
+        }
+    }
+    /// Broadcasts `val` into all eight `i16` lanes.
+    #[inline(always)]
+    fn splat_i16x8(self, val: i16) -> i16x8<Self> {
+        let filled = unsafe { _mm_set1_epi16(val) };
+        filled.simd_into(self)
+    }
+    /// Builds an `i16x8` from an owned array by copying its bytes via `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(&val);
+        i16x8 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Builds an `i16x8` from a borrowed array by copying its bytes via `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(val);
+        i16x8 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Copies the lanes of `a` out into a plain `[i16; 8]`.
+    #[inline(always)]
+    fn as_array_i16x8(self, a: i16x8<Self>) -> [i16; 8usize] {
+        let raw: &__m128i = &a.val.0;
+        crate::transmute::checked_transmute_copy::<__m128i, [i16; 8usize]>(raw)
+    }
+    /// Views the lanes of `a` as a `&[i16; 8]` without copying.
+    #[inline(always)]
+    fn as_array_ref_i16x8(self, a: &i16x8<Self>) -> &[i16; 8usize] {
+        let raw: &__m128i = &a.val.0;
+        crate::transmute::checked_cast_ref::<__m128i, [i16; 8usize]>(raw)
+    }
+    /// Views the lanes of `a` as a `&mut [i16; 8]` without copying.
+    #[inline(always)]
+    fn as_array_mut_i16x8(self, a: &mut i16x8<Self>) -> &mut [i16; 8usize] {
+        let raw: &mut __m128i = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<__m128i, [i16; 8usize]>(raw)
+    }
+    /// Writes the eight lanes of `a` into `dest`.
+    #[inline(always)]
+    fn store_array_i16x8(self, a: i16x8<Self>, dest: &mut [i16; 8usize]) -> () {
+        let src = (&raw const a.val.0) as *const i16;
+        let dst = dest.as_mut_ptr();
+        // SAFETY: `src` addresses the 16-byte `__m128i` held by the local `a`, and `dst` addresses
+        // an 8-element `i16` array behind a unique borrow, so both are valid for 8 elements and
+        // cannot overlap.
+        unsafe {
+            core::ptr::copy_nonoverlapping(src, dst, 8usize);
+        }
+    }
+    /// Reinterprets a `u8x16` byte vector as an `i16x8` by copying its storage bit-for-bit.
+    #[inline(always)]
+    fn cvt_from_bytes_i16x8(self, a: u8x16<Self>) -> i16x8<Self> {
+        let storage = crate::transmute::checked_transmute_copy(&a.val);
+        i16x8 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Reinterprets an `i16x8` as a `u8x16` byte vector by copying its storage bit-for-bit.
+    #[inline(always)]
+    fn cvt_to_bytes_i16x8(self, a: i16x8<Self>) -> u8x16<Self> {
+        let storage = crate::transmute::checked_transmute_copy(&a.val);
+        u8x16 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Slides across the pair `a`, `b` by `SHIFT` lanes.
+    ///
+    /// Returns `b` unchanged when `SHIFT >= 8`; otherwise the work is done by `dyn_alignr_128`
+    /// on the byte views of `b` and `a` with a byte offset of `SHIFT * 2` (two bytes per lane).
+    #[inline(always)]
+    fn slide_i16x8<const SHIFT: usize>(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        if SHIFT >= 8usize {
+            return b;
+        }
+        let b_bytes = self.cvt_to_bytes_i16x8(b).val.0;
+        let a_bytes = self.cvt_to_bytes_i16x8(a).val.0;
+        let shifted = unsafe { dyn_alignr_128(b_bytes, a_bytes, SHIFT * 2usize) };
+        let bytes = u8x16 {
+            val: crate::support::Aligned128(shifted),
+            simd: self,
+        };
+        self.cvt_from_bytes_i16x8(bytes)
+    }
+    /// Block-wise slide for `i16x8`; it forwards unchanged to `slide_i16x8`.
+    #[inline(always)]
+    fn slide_within_blocks_i16x8<const SHIFT: usize>(
+        self,
+        a: i16x8<Self>,
+        b: i16x8<Self>,
+    ) -> i16x8<Self> {
+        self.slide_i16x8::<SHIFT>(a, b)
+    }
+    /// Lane-wise `a + b` for `i16x8`, implemented with `_mm_add_epi16`.
+    #[inline(always)]
+    fn add_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let sum = unsafe { _mm_add_epi16(a.into(), b.into()) };
+        sum.simd_into(self)
+    }
+    /// Lane-wise `a - b` for `i16x8`, implemented with `_mm_sub_epi16`.
+    #[inline(always)]
+    fn sub_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let difference = unsafe { _mm_sub_epi16(a.into(), b.into()) };
+        difference.simd_into(self)
+    }
+    /// Lane-wise `a * b` for `i16x8`, keeping the low 16 bits of each product (`_mm_mullo_epi16`).
+    #[inline(always)]
+    fn mul_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let product = unsafe { _mm_mullo_epi16(a.into(), b.into()) };
+        product.simd_into(self)
+    }
+    /// Bitwise AND of `a` and `b`.
+    #[inline(always)]
+    fn and_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let bits = unsafe { _mm_and_si128(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise OR of `a` and `b`.
+    #[inline(always)]
+    fn or_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let bits = unsafe { _mm_or_si128(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise XOR of `a` and `b`.
+    #[inline(always)]
+    fn xor_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let bits = unsafe { _mm_xor_si128(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise NOT of `a`, expressed as an XOR with an all-ones scalar.
+    #[inline(always)]
+    fn not_i16x8(self, a: i16x8<Self>) -> i16x8<Self> {
+        let all_ones = !0;
+        a ^ all_ones
+    }
+    /// Shifts every lane of `a` left by the scalar `shift` (`_mm_sll_epi16`).
+    #[inline(always)]
+    fn shl_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
+        let lanes: __m128i = a.into();
+        let shifted = unsafe {
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm_sll_epi16(lanes, count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Shifts each lane of `a` left by the matching lane of `b` (`_mm_sllv_epi16`).
+    #[inline(always)]
+    fn shlv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let lanes: __m128i = a.into();
+        let counts: __m128i = b.into();
+        unsafe { _mm_sllv_epi16(lanes, counts) }.simd_into(self)
+    }
+    /// Arithmetic right shift of every lane by the scalar `shift` (`_mm_sra_epi16`).
+    #[inline(always)]
+    fn shr_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
+        let lanes: __m128i = a.into();
+        let shifted = unsafe {
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm_sra_epi16(lanes, count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Arithmetic right shift of each lane of `a` by the matching lane of `b` (`_mm_srav_epi16`).
+    #[inline(always)]
+    fn shrv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let lanes: __m128i = a.into();
+        let counts: __m128i = b.into();
+        unsafe { _mm_srav_epi16(lanes, counts) }.simd_into(self)
+    }
+    /// Per-lane `a == b` via `_mm_cmpeq_epi16_mask`; the result is stored as the mask value.
+    #[inline(always)]
+    fn simd_eq_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let val = unsafe { _mm_cmpeq_epi16_mask(lhs, rhs) };
+        mask16x8 { val, simd: self }
+    }
+    /// Per-lane signed `a < b` via `_mm_cmplt_epi16_mask`.
+    #[inline(always)]
+    fn simd_lt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let val = unsafe { _mm_cmplt_epi16_mask(lhs, rhs) };
+        mask16x8 { val, simd: self }
+    }
+    /// Per-lane signed `a <= b` via `_mm_cmple_epi16_mask`.
+    #[inline(always)]
+    fn simd_le_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let val = unsafe { _mm_cmple_epi16_mask(lhs, rhs) };
+        mask16x8 { val, simd: self }
+    }
+    /// Per-lane signed `a >= b` via `_mm_cmpge_epi16_mask`.
+    #[inline(always)]
+    fn simd_ge_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let val = unsafe { _mm_cmpge_epi16_mask(lhs, rhs) };
+        mask16x8 { val, simd: self }
+    }
+    /// Per-lane signed `a > b` via `_mm_cmpgt_epi16_mask`.
+    #[inline(always)]
+    fn simd_gt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let val = unsafe { _mm_cmpgt_epi16_mask(lhs, rhs) };
+        mask16x8 { val, simd: self }
+    }
+    /// Interleaves the low four lanes of `a` and `b` (`_mm_unpacklo_epi16`).
+    #[inline(always)]
+    fn zip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let first: __m128i = a.into();
+        let second: __m128i = b.into();
+        unsafe { _mm_unpacklo_epi16(first, second) }.simd_into(self)
+    }
+    /// Interleaves the high four lanes of `a` and `b` (`_mm_unpackhi_epi16`).
+    #[inline(always)]
+    fn zip_high_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let first: __m128i = a.into();
+        let second: __m128i = b.into();
+        unsafe { _mm_unpackhi_epi16(first, second) }.simd_into(self)
+    }
+    /// Collects the even-indexed lanes of `a` followed by the even-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let first: __m128i = a.into();
+        let second: __m128i = b.into();
+        unsafe {
+            // Even lanes go to the low 64 bits, odd lanes to the high 64 bits.
+            let even_odd = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+            let first_split = _mm_shuffle_epi8(first, even_odd);
+            let second_split = _mm_shuffle_epi8(second, even_odd);
+            _mm_unpacklo_epi64(first_split, second_split)
+        }
+        .simd_into(self)
+    }
+    /// Collects the odd-indexed lanes of `a` followed by the odd-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_high_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let first: __m128i = a.into();
+        let second: __m128i = b.into();
+        unsafe {
+            // Even lanes go to the low 64 bits, odd lanes to the high 64 bits.
+            let even_odd = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+            let first_split = _mm_shuffle_epi8(first, even_odd);
+            let second_split = _mm_shuffle_epi8(second, even_odd);
+            _mm_unpackhi_epi64(first_split, second_split)
+        }
+        .simd_into(self)
+    }
+    /// Returns the pair `(zip_low_i16x8(a, b), zip_high_i16x8(a, b))`.
+    #[inline(always)]
+    fn interleave_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> (i16x8<Self>, i16x8<Self>) {
+        let low = self.zip_low_i16x8(a, b);
+        let high = self.zip_high_i16x8(a, b);
+        (low, high)
+    }
+    /// Returns the pair `(unzip_low_i16x8(a, b), unzip_high_i16x8(a, b))`.
+    #[inline(always)]
+    fn deinterleave_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> (i16x8<Self>, i16x8<Self>) {
+        let evens = self.unzip_low_i16x8(a, b);
+        let odds = self.unzip_high_i16x8(a, b);
+        (evens, odds)
+    }
+    /// Blends with `_mm_mask_blend_epi16`: lanes whose bit in `a` is set come from `b`,
+    /// the remaining lanes come from `c`.
+    #[inline(always)]
+    fn select_i16x8(self, a: mask16x8<Self>, b: i16x8<Self>, c: i16x8<Self>) -> i16x8<Self> {
+        let if_clear: __m128i = c.into();
+        let if_set: __m128i = b.into();
+        unsafe { _mm_mask_blend_epi16(a.val, if_clear, if_set) }.simd_into(self)
+    }
+    /// Lane-wise signed minimum (`_mm_min_epi16`).
+    #[inline(always)]
+    fn min_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_min_epi16(lhs, rhs) }.simd_into(self)
+    }
+    /// Lane-wise signed maximum (`_mm_max_epi16`).
+    #[inline(always)]
+    fn max_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_max_epi16(lhs, rhs) }.simd_into(self)
+    }
+    /// Builds a 256-bit vector with `a` as the low half and `b` as the high half.
+    #[inline(always)]
+    fn combine_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x16<Self> {
+        let low: __m128i = a.into();
+        let high: __m128i = b.into();
+        unsafe { _mm256_setr_m128i(low, high) }.simd_into(self)
+    }
+    /// Negates every lane by subtracting `a` from a zero vector.
+    #[inline(always)]
+    fn neg_i16x8(self, a: i16x8<Self>) -> i16x8<Self> {
+        let lanes: __m128i = a.into();
+        unsafe { _mm_sub_epi16(_mm_setzero_si128(), lanes) }.simd_into(self)
+    }
+    /// Re-wraps the underlying `__m128i` of `a` as a `u8x16`.
+    #[inline(always)]
+    fn reinterpret_u8_i16x8(self, a: i16x8<Self>) -> u8x16<Self> {
+        let raw: __m128i = a.into();
+        raw.simd_into(self)
+    }
+    /// Re-wraps the underlying `__m128i` of `a` as a `u32x4`.
+    #[inline(always)]
+    fn reinterpret_u32_i16x8(self, a: i16x8<Self>) -> u32x4<Self> {
+        let raw: __m128i = a.into();
+        raw.simd_into(self)
+    }
+    /// Broadcasts `val` to all eight lanes (`_mm_set1_epi16` on the bit-identical `i16`).
+    #[inline(always)]
+    fn splat_u16x8(self, val: u16) -> u16x8<Self> {
+        let bits = val.cast_signed();
+        unsafe { _mm_set1_epi16(bits) }.simd_into(self)
+    }
+    /// Builds a vector from an owned array via `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8<Self> {
+        let val = crate::transmute::checked_transmute_copy(&val);
+        u16x8 { val, simd: self }
+    }
+    /// Builds a vector from a borrowed array via `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(val);
+        u16x8 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Copies the lanes of `a` out as a `[u16; 8]`.
+    #[inline(always)]
+    fn as_array_u16x8(self, a: u16x8<Self>) -> [u16; 8usize] {
+        let raw: &__m128i = &a.val.0;
+        crate::transmute::checked_transmute_copy::<__m128i, [u16; 8usize]>(raw)
+    }
+    /// Views the lanes of `a` in place as a shared `[u16; 8]` reference.
+    #[inline(always)]
+    fn as_array_ref_u16x8(self, a: &u16x8<Self>) -> &[u16; 8usize] {
+        let raw: &__m128i = &a.val.0;
+        crate::transmute::checked_cast_ref::<__m128i, [u16; 8usize]>(raw)
+    }
+    /// Views the lanes of `a` in place as an exclusive `[u16; 8]` reference.
+    #[inline(always)]
+    fn as_array_mut_u16x8(self, a: &mut u16x8<Self>) -> &mut [u16; 8usize] {
+        let raw: &mut __m128i = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<__m128i, [u16; 8usize]>(raw)
+    }
+    /// Writes the eight lanes of `a` into `dest`.
+    #[inline(always)]
+    fn store_array_u16x8(self, a: u16x8<Self>, dest: &mut [u16; 8usize]) -> () {
+        let src = (&raw const a.val.0) as *const u16;
+        let dst = dest.as_mut_ptr();
+        // SAFETY: `src` points at the local by-value copy `a` and `dst` at an exclusively
+        // borrowed 8-element array, so the two regions cannot overlap.
+        unsafe { core::ptr::copy_nonoverlapping(src, dst, 8usize) }
+    }
+    /// Reinterprets the storage of a `u8x16` as a `u16x8` via `checked_transmute_copy`.
+    #[inline(always)]
+    fn cvt_from_bytes_u16x8(self, a: u8x16<Self>) -> u16x8<Self> {
+        let val = crate::transmute::checked_transmute_copy(&a.val);
+        u16x8 { val, simd: self }
+    }
+    /// Reinterprets the storage of a `u16x8` as a `u8x16` via `checked_transmute_copy`.
+    #[inline(always)]
+    fn cvt_to_bytes_u16x8(self, a: u16x8<Self>) -> u8x16<Self> {
+        let val = crate::transmute::checked_transmute_copy(&a.val);
+        u8x16 { val, simd: self }
+    }
+    /// Returns `b` untouched when `SHIFT >= 8`; otherwise hands the byte views of `b` and `a`
+    /// to `dyn_alignr_128` with a byte offset of `SHIFT * 2` and reinterprets the result.
+    #[inline(always)]
+    fn slide_u16x8<const SHIFT: usize>(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        if SHIFT >= 8usize {
+            return b;
+        }
+        let b_bytes = self.cvt_to_bytes_u16x8(b).val.0;
+        let a_bytes = self.cvt_to_bytes_u16x8(a).val.0;
+        let shifted = unsafe { dyn_alignr_128(b_bytes, a_bytes, SHIFT * 2usize) };
+        let bytes = u8x16 {
+            val: crate::support::Aligned128(shifted),
+            simd: self,
+        };
+        self.cvt_from_bytes_u16x8(bytes)
+    }
+    /// Forwards directly to `slide_u16x8` with the same `SHIFT`.
+    #[inline(always)]
+    fn slide_within_blocks_u16x8<const SHIFT: usize>(
+        self,
+        a: u16x8<Self>,
+        b: u16x8<Self>,
+    ) -> u16x8<Self> {
+        Self::slide_u16x8::<SHIFT>(self, a, b)
+    }
+    /// Lane-wise `a + b`, computed with `_mm_add_epi16`.
+    #[inline(always)]
+    fn add_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_add_epi16(lhs, rhs) }.simd_into(self)
+    }
+    /// Lane-wise `a - b`, computed with `_mm_sub_epi16`.
+    #[inline(always)]
+    fn sub_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_sub_epi16(lhs, rhs) }.simd_into(self)
+    }
+    /// Lane-wise product keeping the low 16 bits of each result (`_mm_mullo_epi16`).
+    #[inline(always)]
+    fn mul_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_mullo_epi16(lhs, rhs) }.simd_into(self)
+    }
+    /// Bitwise AND of the two vectors (`_mm_and_si128`).
+    #[inline(always)]
+    fn and_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_and_si128(lhs, rhs) }.simd_into(self)
+    }
+    /// Bitwise OR of the two vectors (`_mm_or_si128`).
+    #[inline(always)]
+    fn or_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_or_si128(lhs, rhs) }.simd_into(self)
+    }
+    /// Bitwise XOR of the two vectors (`_mm_xor_si128`).
+    #[inline(always)]
+    fn xor_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_xor_si128(lhs, rhs) }.simd_into(self)
+    }
+    /// Bitwise NOT of `a`, expressed as an XOR with an all-ones scalar.
+    #[inline(always)]
+    fn not_u16x8(self, a: u16x8<Self>) -> u16x8<Self> {
+        let all_ones = !0;
+        a ^ all_ones
+    }
+    /// Shifts every lane of `a` left by the scalar `shift` (`_mm_sll_epi16`).
+    #[inline(always)]
+    fn shl_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
+        let lanes: __m128i = a.into();
+        let shifted = unsafe {
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm_sll_epi16(lanes, count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Shifts each lane of `a` left by the matching lane of `b` (`_mm_sllv_epi16`).
+    #[inline(always)]
+    fn shlv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let lanes: __m128i = a.into();
+        let counts: __m128i = b.into();
+        unsafe { _mm_sllv_epi16(lanes, counts) }.simd_into(self)
+    }
+    /// Logical right shift of every lane by the scalar `shift` (`_mm_srl_epi16`).
+    #[inline(always)]
+    fn shr_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
+        let lanes: __m128i = a.into();
+        let shifted = unsafe {
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm_srl_epi16(lanes, count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Logical right shift of each lane of `a` by the matching lane of `b` (`_mm_srlv_epi16`).
+    #[inline(always)]
+    fn shrv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let lanes: __m128i = a.into();
+        let counts: __m128i = b.into();
+        unsafe { _mm_srlv_epi16(lanes, counts) }.simd_into(self)
+    }
+    /// Per-lane `a == b` via `_mm_cmpeq_epu16_mask`; the result is stored as the mask value.
+    #[inline(always)]
+    fn simd_eq_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let val = unsafe { _mm_cmpeq_epu16_mask(lhs, rhs) };
+        mask16x8 { val, simd: self }
+    }
+    /// Per-lane unsigned `a < b` via `_mm_cmplt_epu16_mask`.
+    #[inline(always)]
+    fn simd_lt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let val = unsafe { _mm_cmplt_epu16_mask(lhs, rhs) };
+        mask16x8 { val, simd: self }
+    }
+    /// Per-lane unsigned `a <= b` via `_mm_cmple_epu16_mask`.
+    #[inline(always)]
+    fn simd_le_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let val = unsafe { _mm_cmple_epu16_mask(lhs, rhs) };
+        mask16x8 { val, simd: self }
+    }
+    /// Per-lane unsigned `a >= b` via `_mm_cmpge_epu16_mask`.
+    #[inline(always)]
+    fn simd_ge_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let val = unsafe { _mm_cmpge_epu16_mask(lhs, rhs) };
+        mask16x8 { val, simd: self }
+    }
+    /// Per-lane unsigned `a > b` via `_mm_cmpgt_epu16_mask`.
+    #[inline(always)]
+    fn simd_gt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let val = unsafe { _mm_cmpgt_epu16_mask(lhs, rhs) };
+        mask16x8 { val, simd: self }
+    }
+    /// Interleaves the low four lanes of `a` and `b` (`_mm_unpacklo_epi16`).
+    #[inline(always)]
+    fn zip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let first: __m128i = a.into();
+        let second: __m128i = b.into();
+        unsafe { _mm_unpacklo_epi16(first, second) }.simd_into(self)
+    }
+    /// Interleaves the high four lanes of `a` and `b` (`_mm_unpackhi_epi16`).
+    #[inline(always)]
+    fn zip_high_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let first: __m128i = a.into();
+        let second: __m128i = b.into();
+        unsafe { _mm_unpackhi_epi16(first, second) }.simd_into(self)
+    }
+    /// Collects the even-indexed lanes of `a` followed by the even-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let first: __m128i = a.into();
+        let second: __m128i = b.into();
+        unsafe {
+            // Even lanes go to the low 64 bits, odd lanes to the high 64 bits.
+            let even_odd = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+            let first_split = _mm_shuffle_epi8(first, even_odd);
+            let second_split = _mm_shuffle_epi8(second, even_odd);
+            _mm_unpacklo_epi64(first_split, second_split)
+        }
+        .simd_into(self)
+    }
+    /// Collects the odd-indexed lanes of `a` followed by the odd-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_high_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let first: __m128i = a.into();
+        let second: __m128i = b.into();
+        unsafe {
+            // Even lanes go to the low 64 bits, odd lanes to the high 64 bits.
+            let even_odd = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+            let first_split = _mm_shuffle_epi8(first, even_odd);
+            let second_split = _mm_shuffle_epi8(second, even_odd);
+            _mm_unpackhi_epi64(first_split, second_split)
+        }
+        .simd_into(self)
+    }
+    /// Returns the pair `(zip_low_u16x8(a, b), zip_high_u16x8(a, b))`.
+    #[inline(always)]
+    fn interleave_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> (u16x8<Self>, u16x8<Self>) {
+        let low = self.zip_low_u16x8(a, b);
+        let high = self.zip_high_u16x8(a, b);
+        (low, high)
+    }
+    /// Returns the pair `(unzip_low_u16x8(a, b), unzip_high_u16x8(a, b))`.
+    #[inline(always)]
+    fn deinterleave_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> (u16x8<Self>, u16x8<Self>) {
+        let evens = self.unzip_low_u16x8(a, b);
+        let odds = self.unzip_high_u16x8(a, b);
+        (evens, odds)
+    }
+    /// Blends with `_mm_mask_blend_epi16`: lanes whose bit in `a` is set come from `b`,
+    /// the remaining lanes come from `c`.
+    #[inline(always)]
+    fn select_u16x8(self, a: mask16x8<Self>, b: u16x8<Self>, c: u16x8<Self>) -> u16x8<Self> {
+        let if_clear: __m128i = c.into();
+        let if_set: __m128i = b.into();
+        unsafe { _mm_mask_blend_epi16(a.val, if_clear, if_set) }.simd_into(self)
+    }
+    /// Lane-wise unsigned minimum (`_mm_min_epu16`).
+    #[inline(always)]
+    fn min_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_min_epu16(lhs, rhs) }.simd_into(self)
+    }
+    /// Lane-wise unsigned maximum (`_mm_max_epu16`).
+    #[inline(always)]
+    fn max_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_max_epu16(lhs, rhs) }.simd_into(self)
+    }
+    /// Builds a 256-bit vector with `a` as the low half and `b` as the high half.
+    #[inline(always)]
+    fn combine_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x16<Self> {
+        let low: __m128i = a.into();
+        let high: __m128i = b.into();
+        unsafe { _mm256_setr_m128i(low, high) }.simd_into(self)
+    }
+    /// Re-wraps the underlying `__m128i` of `a` as a `u8x16`.
+    #[inline(always)]
+    fn reinterpret_u8_u16x8(self, a: u16x8<Self>) -> u8x16<Self> {
+        let raw: __m128i = a.into();
+        raw.simd_into(self)
+    }
+    /// Re-wraps the underlying `__m128i` of `a` as a `u32x4`.
+    #[inline(always)]
+    fn reinterpret_u32_u16x8(self, a: u16x8<Self>) -> u32x4<Self> {
+        let raw: __m128i = a.into();
+        raw.simd_into(self)
+    }
+    /// Creates a mask with all eight lane bits set (`255`) when `val` is true, else cleared.
+    #[inline(always)]
+    fn splat_mask16x8(self, val: bool) -> mask16x8<Self> {
+        let bits = match val {
+            true => 255u64,
+            false => 0,
+        };
+        mask16x8 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Converts an array of lane values into a bitmask with `_mm_movepi16_mask`.
+    #[inline(always)]
+    fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(&val);
+        let val = unsafe { _mm_movepi16_mask(lanes) };
+        mask16x8 { val, simd: self }
+    }
+    /// Expands the bitmask into a vector with `_mm_movm_epi16` and copies it out as an array.
+    #[inline(always)]
+    fn as_array_mask16x8(self, a: mask16x8<Self>) -> [i16; 8usize] {
+        let lanes = unsafe { _mm_movm_epi16(a.val) };
+        crate::transmute::checked_transmute_copy(&lanes)
+    }
+    /// Builds a mask from the low 8 bits of `bits`; higher bits are discarded.
+    #[inline(always)]
+    fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8<Self> {
+        let lane_bits = bits & 255u64;
+        mask16x8 {
+            val: lane_bits as _,
+            simd: self,
+        }
+    }
+    /// Returns the mask as a `u64`, keeping only the low 8 bits.
+    #[inline(always)]
+    fn to_bitmask_mask16x8(self, a: mask16x8<Self>) -> u64 {
+        let raw = u64::from(a.val);
+        raw & 255u64
+    }
+    /// Sets or clears the bit for lane `index` in `a`.
+    ///
+    /// # Panics
+    ///
+    /// Panics when `index` is not below 8.
+    #[inline(always)]
+    fn set_mask16x8(self, a: &mut mask16x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let lane_bit = 1u64 << index;
+        let current = u64::from(a.val);
+        let updated = match value {
+            true => current | lane_bit,
+            false => current & !lane_bit,
+        };
+        *a = mask16x8 {
+            val: updated as _,
+            simd: self,
+        };
+    }
+    /// Bitwise AND of the two masks, truncated to the low 8 bits.
+    #[inline(always)]
+    fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        mask16x8 {
+            val: ((lhs & rhs) & 255u64) as _,
+            simd: self,
+        }
+    }
+    /// Bitwise OR of the two masks, truncated to the low 8 bits.
+    #[inline(always)]
+    fn or_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        mask16x8 {
+            val: ((lhs | rhs) & 255u64) as _,
+            simd: self,
+        }
+    }
+    /// Bitwise XOR of the two masks, truncated to the low 8 bits.
+    #[inline(always)]
+    fn xor_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        mask16x8 {
+            val: ((lhs ^ rhs) & 255u64) as _,
+            simd: self,
+        }
+    }
+    /// Inverts the mask bits, then truncates to the low 8 bits.
+    #[inline(always)]
+    fn not_mask16x8(self, a: mask16x8<Self>) -> mask16x8<Self> {
+        let inverted = !u64::from(a.val);
+        mask16x8 {
+            val: (inverted & 255u64) as _,
+            simd: self,
+        }
+    }
+    /// Bitwise select: takes bits of `b` where `a` is set and bits of `c` where `a` is clear,
+    /// truncated to the low 8 bits.
+    #[inline(always)]
+    fn select_mask16x8(
+        self,
+        a: mask16x8<Self>,
+        b: mask16x8<Self>,
+        c: mask16x8<Self>,
+    ) -> mask16x8<Self> {
+        let cond = u64::from(a.val);
+        let from_b = cond & u64::from(b.val);
+        let from_c = !cond & u64::from(c.val);
+        mask16x8 {
+            val: ((from_b | from_c) & 255u64) as _,
+            simd: self,
+        }
+    }
+    /// Sets each of the low 8 bits where `a` and `b` agree (inverted XOR).
+    #[inline(always)]
+    fn simd_eq_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
+        let differing = u64::from(a.val ^ b.val);
+        let agreeing = !differing & 255u64;
+        mask16x8 {
+            val: agreeing as _,
+            simd: self,
+        }
+    }
+    /// True when at least one of the low 8 mask bits is set.
+    #[inline(always)]
+    fn any_true_mask16x8(self, a: mask16x8<Self>) -> bool {
+        (u64::from(a.val) & 255u64) != 0
+    }
+    /// True when all of the low 8 mask bits are set.
+    #[inline(always)]
+    fn all_true_mask16x8(self, a: mask16x8<Self>) -> bool {
+        (u64::from(a.val) & 255u64) == 255u64
+    }
+    /// True when at least one of the low 8 mask bits is clear.
+    #[inline(always)]
+    fn any_false_mask16x8(self, a: mask16x8<Self>) -> bool {
+        (u64::from(a.val) & 255u64) != 255u64
+    }
+    /// True when none of the low 8 mask bits is set.
+    #[inline(always)]
+    fn all_false_mask16x8(self, a: mask16x8<Self>) -> bool {
+        (u64::from(a.val) & 255u64) == 0
+    }
+    /// Concatenates two masks: `a` fills bits 0..8 and `b` fills bits 8..16.
+    #[inline(always)]
+    fn combine_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x16<Self> {
+        let low = u64::from(a.val);
+        let high = u64::from(b.val) << 8usize;
+        mask16x16 {
+            val: ((low | high) & 65535u64) as _,
+            simd: self,
+        }
+    }
+    /// Broadcasts `val` to all four lanes (`_mm_set1_epi32`).
+    #[inline(always)]
+    fn splat_i32x4(self, val: i32) -> i32x4<Self> {
+        let broadcast = unsafe { _mm_set1_epi32(val) };
+        broadcast.simd_into(self)
+    }
+    /// Builds a vector from an owned array via `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4<Self> {
+        let val = crate::transmute::checked_transmute_copy(&val);
+        i32x4 { val, simd: self }
+    }
+    /// Builds a vector from a borrowed array via `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(val);
+        i32x4 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Copies the lanes of `a` out as an `[i32; 4]`.
+    #[inline(always)]
+    fn as_array_i32x4(self, a: i32x4<Self>) -> [i32; 4usize] {
+        let raw: &__m128i = &a.val.0;
+        crate::transmute::checked_transmute_copy::<__m128i, [i32; 4usize]>(raw)
+    }
+    /// Views the lanes of `a` in place as a shared `[i32; 4]` reference.
+    #[inline(always)]
+    fn as_array_ref_i32x4(self, a: &i32x4<Self>) -> &[i32; 4usize] {
+        let raw: &__m128i = &a.val.0;
+        crate::transmute::checked_cast_ref::<__m128i, [i32; 4usize]>(raw)
+    }
+    /// Views the lanes of `a` in place as an exclusive `[i32; 4]` reference.
+    #[inline(always)]
+    fn as_array_mut_i32x4(self, a: &mut i32x4<Self>) -> &mut [i32; 4usize] {
+        let raw: &mut __m128i = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<__m128i, [i32; 4usize]>(raw)
+    }
+    /// Writes the four lanes of `a` into `dest`.
+    #[inline(always)]
+    fn store_array_i32x4(self, a: i32x4<Self>, dest: &mut [i32; 4usize]) -> () {
+        let src = (&raw const a.val.0) as *const i32;
+        let dst = dest.as_mut_ptr();
+        // SAFETY: `src` points at the local by-value copy `a` and `dst` at an exclusively
+        // borrowed 4-element array, so the two regions cannot overlap.
+        unsafe { core::ptr::copy_nonoverlapping(src, dst, 4usize) }
+    }
+    /// Reinterprets the storage of a `u8x16` as an `i32x4` via `checked_transmute_copy`.
+    #[inline(always)]
+    fn cvt_from_bytes_i32x4(self, a: u8x16<Self>) -> i32x4<Self> {
+        let val = crate::transmute::checked_transmute_copy(&a.val);
+        i32x4 { val, simd: self }
+    }
+    /// Reinterprets the storage of an `i32x4` as a `u8x16` via `checked_transmute_copy`.
+    #[inline(always)]
+    fn cvt_to_bytes_i32x4(self, a: i32x4<Self>) -> u8x16<Self> {
+        let val = crate::transmute::checked_transmute_copy(&a.val);
+        u8x16 { val, simd: self }
+    }
+    /// Returns `b` untouched when `SHIFT >= 4`; otherwise hands the byte views of `b` and `a`
+    /// to `dyn_alignr_128` with a byte offset of `SHIFT * 4` and reinterprets the result.
+    #[inline(always)]
+    fn slide_i32x4<const SHIFT: usize>(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        if SHIFT >= 4usize {
+            return b;
+        }
+        let b_bytes = self.cvt_to_bytes_i32x4(b).val.0;
+        let a_bytes = self.cvt_to_bytes_i32x4(a).val.0;
+        let shifted = unsafe { dyn_alignr_128(b_bytes, a_bytes, SHIFT * 4usize) };
+        let bytes = u8x16 {
+            val: crate::support::Aligned128(shifted),
+            simd: self,
+        };
+        self.cvt_from_bytes_i32x4(bytes)
+    }
+    /// Forwards directly to `slide_i32x4` with the same `SHIFT`.
+    #[inline(always)]
+    fn slide_within_blocks_i32x4<const SHIFT: usize>(
+        self,
+        a: i32x4<Self>,
+        b: i32x4<Self>,
+    ) -> i32x4<Self> {
+        Self::slide_i32x4::<SHIFT>(self, a, b)
+    }
+    /// Lane-wise `a + b`, computed with `_mm_add_epi32`.
+    #[inline(always)]
+    fn add_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_add_epi32(lhs, rhs) }.simd_into(self)
+    }
+    /// Lane-wise `a - b`, computed with `_mm_sub_epi32`.
+    #[inline(always)]
+    fn sub_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_sub_epi32(lhs, rhs) }.simd_into(self)
+    }
+    /// Lane-wise product keeping the low 32 bits of each result (`_mm_mullo_epi32`).
+    #[inline(always)]
+    fn mul_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_mullo_epi32(lhs, rhs) }.simd_into(self)
+    }
+    /// Bitwise AND of the two vectors (`_mm_and_si128`).
+    #[inline(always)]
+    fn and_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_and_si128(lhs, rhs) }.simd_into(self)
+    }
+    /// Bitwise OR of the two vectors (`_mm_or_si128`).
+    #[inline(always)]
+    fn or_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_or_si128(lhs, rhs) }.simd_into(self)
+    }
+    /// Bitwise XOR of the two vectors (`_mm_xor_si128`).
+    #[inline(always)]
+    fn xor_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_xor_si128(lhs, rhs) }.simd_into(self)
+    }
+    /// Bitwise NOT of `a`, expressed as an XOR with an all-ones scalar.
+    #[inline(always)]
+    fn not_i32x4(self, a: i32x4<Self>) -> i32x4<Self> {
+        let all_ones = !0;
+        a ^ all_ones
+    }
+    /// Shifts every lane of `a` left by the scalar `shift` (`_mm_sll_epi32`).
+    #[inline(always)]
+    fn shl_i32x4(self, a: i32x4<Self>, shift: u32) -> i32x4<Self> {
+        let lanes: __m128i = a.into();
+        let shifted = unsafe {
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm_sll_epi32(lanes, count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Shifts each lane of `a` left by the matching lane of `b` (`_mm_sllv_epi32`).
+    #[inline(always)]
+    fn shlv_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let lanes: __m128i = a.into();
+        let counts: __m128i = b.into();
+        unsafe { _mm_sllv_epi32(lanes, counts) }.simd_into(self)
+    }
+    /// Arithmetic right shift of every lane by the scalar `shift` (`_mm_sra_epi32`).
+    #[inline(always)]
+    fn shr_i32x4(self, a: i32x4<Self>, shift: u32) -> i32x4<Self> {
+        let lanes: __m128i = a.into();
+        let shifted = unsafe {
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm_sra_epi32(lanes, count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Arithmetic right shift of each lane of `a` by the matching lane of `b` (`_mm_srav_epi32`).
+    #[inline(always)]
+    fn shrv_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let lanes: __m128i = a.into();
+        let counts: __m128i = b.into();
+        unsafe { _mm_srav_epi32(lanes, counts) }.simd_into(self)
+    }
+    /// Per-lane `a == b` via `_mm_cmpeq_epi32_mask`; the result is stored as the mask value.
+    #[inline(always)]
+    fn simd_eq_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let val = unsafe { _mm_cmpeq_epi32_mask(lhs, rhs) };
+        mask32x4 { val, simd: self }
+    }
+    /// Per-lane signed `a < b` via `_mm_cmplt_epi32_mask`.
+    #[inline(always)]
+    fn simd_lt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let val = unsafe { _mm_cmplt_epi32_mask(lhs, rhs) };
+        mask32x4 { val, simd: self }
+    }
+    /// Per-lane signed `a <= b` via `_mm_cmple_epi32_mask`.
+    #[inline(always)]
+    fn simd_le_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let val = unsafe { _mm_cmple_epi32_mask(lhs, rhs) };
+        mask32x4 { val, simd: self }
+    }
+    /// Per-lane signed `a >= b` via `_mm_cmpge_epi32_mask`.
+    #[inline(always)]
+    fn simd_ge_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let val = unsafe { _mm_cmpge_epi32_mask(lhs, rhs) };
+        mask32x4 { val, simd: self }
+    }
+    /// Per-lane signed `a > b` via `_mm_cmpgt_epi32_mask`.
+    #[inline(always)]
+    fn simd_gt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let val = unsafe { _mm_cmpgt_epi32_mask(lhs, rhs) };
+        mask32x4 { val, simd: self }
+    }
+    /// Interleaves the low two lanes of `a` and `b` (`_mm_unpacklo_epi32`).
+    #[inline(always)]
+    fn zip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let first: __m128i = a.into();
+        let second: __m128i = b.into();
+        unsafe { _mm_unpacklo_epi32(first, second) }.simd_into(self)
+    }
+    /// Interleaves the high two lanes of `a` and `b` (`_mm_unpackhi_epi32`).
+    #[inline(always)]
+    fn zip_high_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let first: __m128i = a.into();
+        let second: __m128i = b.into();
+        unsafe { _mm_unpackhi_epi32(first, second) }.simd_into(self)
+    }
+    /// Collects the even-indexed lanes of `a` followed by the even-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let first: __m128i = a.into();
+        let second: __m128i = b.into();
+        unsafe {
+            // 0b11_01_10_00 reorders lanes [0, 1, 2, 3] into [0, 2, 1, 3].
+            let first_split = _mm_shuffle_epi32::<0b11_01_10_00>(first);
+            let second_split = _mm_shuffle_epi32::<0b11_01_10_00>(second);
+            _mm_unpacklo_epi64(first_split, second_split)
+        }
+        .simd_into(self)
+    }
+    /// Collects the odd-indexed lanes of `a` followed by the odd-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_high_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let first: __m128i = a.into();
+        let second: __m128i = b.into();
+        unsafe {
+            // 0b11_01_10_00 reorders lanes [0, 1, 2, 3] into [0, 2, 1, 3].
+            let first_split = _mm_shuffle_epi32::<0b11_01_10_00>(first);
+            let second_split = _mm_shuffle_epi32::<0b11_01_10_00>(second);
+            _mm_unpackhi_epi64(first_split, second_split)
+        }
+        .simd_into(self)
+    }
+    /// Returns the pair `(zip_low_i32x4(a, b), zip_high_i32x4(a, b))`.
+    #[inline(always)]
+    fn interleave_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> (i32x4<Self>, i32x4<Self>) {
+        let low = self.zip_low_i32x4(a, b);
+        let high = self.zip_high_i32x4(a, b);
+        (low, high)
+    }
+    /// Returns the pair `(unzip_low_i32x4(a, b), unzip_high_i32x4(a, b))`.
+    #[inline(always)]
+    fn deinterleave_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> (i32x4<Self>, i32x4<Self>) {
+        let evens = self.unzip_low_i32x4(a, b);
+        let odds = self.unzip_high_i32x4(a, b);
+        (evens, odds)
+    }
+    /// Blends with `_mm_mask_blend_epi32`: lanes whose bit in `a` is set come from `b`,
+    /// the remaining lanes come from `c`.
+    #[inline(always)]
+    fn select_i32x4(self, a: mask32x4<Self>, b: i32x4<Self>, c: i32x4<Self>) -> i32x4<Self> {
+        let if_clear: __m128i = c.into();
+        let if_set: __m128i = b.into();
+        unsafe { _mm_mask_blend_epi32(a.val, if_clear, if_set) }.simd_into(self)
+    }
+    /// Lane-wise signed minimum (`_mm_min_epi32`).
+    #[inline(always)]
+    fn min_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_min_epi32(lhs, rhs) }.simd_into(self)
+    }
+    /// Lane-wise signed maximum (`_mm_max_epi32`).
+    #[inline(always)]
+    fn max_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_max_epi32(lhs, rhs) }.simd_into(self)
+    }
+    /// Builds a 256-bit vector with `a` as the low half and `b` as the high half.
+    #[inline(always)]
+    fn combine_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x8<Self> {
+        let low: __m128i = a.into();
+        let high: __m128i = b.into();
+        unsafe { _mm256_setr_m128i(low, high) }.simd_into(self)
+    }
+    /// Negates every lane by subtracting `a` from a zero vector.
+    #[inline(always)]
+    fn neg_i32x4(self, a: i32x4<Self>) -> i32x4<Self> {
+        let lanes: __m128i = a.into();
+        unsafe { _mm_sub_epi32(_mm_setzero_si128(), lanes) }.simd_into(self)
+    }
+    /// Re-wraps the underlying `__m128i` of `a` as a `u8x16`.
+    #[inline(always)]
+    fn reinterpret_u8_i32x4(self, a: i32x4<Self>) -> u8x16<Self> {
+        let raw: __m128i = a.into();
+        raw.simd_into(self)
+    }
+    /// Re-wraps the underlying `__m128i` of `a` as a `u32x4`.
+    #[inline(always)]
+    fn reinterpret_u32_i32x4(self, a: i32x4<Self>) -> u32x4<Self> {
+        let raw: __m128i = a.into();
+        raw.simd_into(self)
+    }
+    /// Converts each `i32` lane to `f32` with `_mm_cvtepi32_ps`.
+    #[inline(always)]
+    fn cvt_f32_i32x4(self, a: i32x4<Self>) -> f32x4<Self> {
+        let ints: __m128i = a.into();
+        unsafe { _mm_cvtepi32_ps(ints) }.simd_into(self)
+    }
+    /// Broadcasts `val` to all four lanes (`_mm_set1_epi32` on the bit-identical `i32`).
+    #[inline(always)]
+    fn splat_u32x4(self, val: u32) -> u32x4<Self> {
+        let bits = val.cast_signed();
+        unsafe { _mm_set1_epi32(bits) }.simd_into(self)
+    }
+    /// Builds a vector from an owned array via `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4<Self> {
+        let val = crate::transmute::checked_transmute_copy(&val);
+        u32x4 { val, simd: self }
+    }
+    /// Builds a vector from a borrowed array via `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(val);
+        u32x4 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Copies the lanes of `a` out as a `[u32; 4]`.
+    #[inline(always)]
+    fn as_array_u32x4(self, a: u32x4<Self>) -> [u32; 4usize] {
+        let raw: &__m128i = &a.val.0;
+        crate::transmute::checked_transmute_copy::<__m128i, [u32; 4usize]>(raw)
+    }
+    /// Views the lanes of `a` in place as a shared `[u32; 4]` reference.
+    #[inline(always)]
+    fn as_array_ref_u32x4(self, a: &u32x4<Self>) -> &[u32; 4usize] {
+        let raw: &__m128i = &a.val.0;
+        crate::transmute::checked_cast_ref::<__m128i, [u32; 4usize]>(raw)
+    }
+    /// Views the lanes of `a` in place as an exclusive `[u32; 4]` reference.
+    #[inline(always)]
+    fn as_array_mut_u32x4(self, a: &mut u32x4<Self>) -> &mut [u32; 4usize] {
+        let raw: &mut __m128i = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<__m128i, [u32; 4usize]>(raw)
+    }
+    /// Writes the four lanes of `a` into `dest`.
+    #[inline(always)]
+    fn store_array_u32x4(self, a: u32x4<Self>, dest: &mut [u32; 4usize]) -> () {
+        let src = (&raw const a.val.0) as *const u32;
+        let dst = dest.as_mut_ptr();
+        // SAFETY: `src` points at the local by-value copy `a` and `dst` at an exclusively
+        // borrowed 4-element array, so the two regions cannot overlap.
+        unsafe { core::ptr::copy_nonoverlapping(src, dst, 4usize) }
+    }
+    /// Reinterprets the storage of a `u8x16` as a `u32x4` via `checked_transmute_copy`.
+    #[inline(always)]
+    fn cvt_from_bytes_u32x4(self, a: u8x16<Self>) -> u32x4<Self> {
+        let val = crate::transmute::checked_transmute_copy(&a.val);
+        u32x4 { val, simd: self }
+    }
+    /// Reinterprets the storage of a `u32x4` as a `u8x16` via `checked_transmute_copy`.
+    #[inline(always)]
+    fn cvt_to_bytes_u32x4(self, a: u32x4<Self>) -> u8x16<Self> {
+        let val = crate::transmute::checked_transmute_copy(&a.val);
+        u8x16 { val, simd: self }
+    }
+    /// Returns `b` untouched when `SHIFT >= 4`; otherwise hands the byte views of `b` and `a`
+    /// to `dyn_alignr_128` with a byte offset of `SHIFT * 4` and reinterprets the result.
+    #[inline(always)]
+    fn slide_u32x4<const SHIFT: usize>(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        if SHIFT >= 4usize {
+            return b;
+        }
+        let b_bytes = self.cvt_to_bytes_u32x4(b).val.0;
+        let a_bytes = self.cvt_to_bytes_u32x4(a).val.0;
+        let shifted = unsafe { dyn_alignr_128(b_bytes, a_bytes, SHIFT * 4usize) };
+        let bytes = u8x16 {
+            val: crate::support::Aligned128(shifted),
+            simd: self,
+        };
+        self.cvt_from_bytes_u32x4(bytes)
+    }
+    /// Forwards directly to `slide_u32x4` with the same `SHIFT`.
+    #[inline(always)]
+    fn slide_within_blocks_u32x4<const SHIFT: usize>(
+        self,
+        a: u32x4<Self>,
+        b: u32x4<Self>,
+    ) -> u32x4<Self> {
+        Self::slide_u32x4::<SHIFT>(self, a, b)
+    }
+    /// Lane-wise `a + b`, computed with `_mm_add_epi32`.
+    #[inline(always)]
+    fn add_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_add_epi32(lhs, rhs) }.simd_into(self)
+    }
+    /// Lane-wise `a - b`, computed with `_mm_sub_epi32`.
+    #[inline(always)]
+    fn sub_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_sub_epi32(lhs, rhs) }.simd_into(self)
+    }
+    /// Lane-wise product keeping the low 32 bits of each result (`_mm_mullo_epi32`).
+    #[inline(always)]
+    fn mul_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_mullo_epi32(lhs, rhs) }.simd_into(self)
+    }
+    /// Bitwise AND of the two vectors (`_mm_and_si128`).
+    #[inline(always)]
+    fn and_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_and_si128(lhs, rhs) }.simd_into(self)
+    }
+    /// Bitwise OR of the two vectors (`_mm_or_si128`).
+    #[inline(always)]
+    fn or_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_or_si128(lhs, rhs) }.simd_into(self)
+    }
+    /// Bitwise XOR of the two vectors (`_mm_xor_si128`).
+    #[inline(always)]
+    fn xor_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        unsafe { _mm_xor_si128(lhs, rhs) }.simd_into(self)
+    }
+    /// Bitwise NOT of `a`, expressed as an XOR with an all-ones scalar.
+    #[inline(always)]
+    fn not_u32x4(self, a: u32x4<Self>) -> u32x4<Self> {
+        let all_ones = !0;
+        a ^ all_ones
+    }
+    /// Shifts every lane of `a` left by the scalar `shift` (`_mm_sll_epi32`).
+    #[inline(always)]
+    fn shl_u32x4(self, a: u32x4<Self>, shift: u32) -> u32x4<Self> {
+        let lanes: __m128i = a.into();
+        let shifted = unsafe {
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm_sll_epi32(lanes, count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Shifts each lane of `a` left by the matching lane of `b` (`_mm_sllv_epi32`).
+    #[inline(always)]
+    fn shlv_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        let lanes: __m128i = a.into();
+        let counts: __m128i = b.into();
+        unsafe { _mm_sllv_epi32(lanes, counts) }.simd_into(self)
+    }
+    /// Logical right shift of every lane by the scalar `shift` (`_mm_srl_epi32`).
+    #[inline(always)]
+    fn shr_u32x4(self, a: u32x4<Self>, shift: u32) -> u32x4<Self> {
+        let lanes: __m128i = a.into();
+        let shifted = unsafe {
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm_srl_epi32(lanes, count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Logical right shift of each lane of `a` by the matching lane of `b` (`_mm_srlv_epi32`).
+    #[inline(always)]
+    fn shrv_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        let lanes: __m128i = a.into();
+        let counts: __m128i = b.into();
+        unsafe { _mm_srlv_epi32(lanes, counts) }.simd_into(self)
+    }
+    /// Per-lane `a == b` via `_mm_cmpeq_epu32_mask`; the result is stored as the mask value.
+    #[inline(always)]
+    fn simd_eq_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let val = unsafe { _mm_cmpeq_epu32_mask(lhs, rhs) };
+        mask32x4 { val, simd: self }
+    }
+    /// Per-lane unsigned `a < b` via `_mm_cmplt_epu32_mask`.
+    #[inline(always)]
+    fn simd_lt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let val = unsafe { _mm_cmplt_epu32_mask(lhs, rhs) };
+        mask32x4 { val, simd: self }
+    }
+    /// Per-lane unsigned `a <= b` via `_mm_cmple_epu32_mask`.
+    #[inline(always)]
+    fn simd_le_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let val = unsafe { _mm_cmple_epu32_mask(lhs, rhs) };
+        mask32x4 { val, simd: self }
+    }
+    /// Per-lane unsigned `a >= b` via `_mm_cmpge_epu32_mask`.
+    #[inline(always)]
+    fn simd_ge_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let val = unsafe { _mm_cmpge_epu32_mask(lhs, rhs) };
+        mask32x4 { val, simd: self }
+    }
+    /// Per-lane unsigned `a > b` via `_mm_cmpgt_epu32_mask`.
+    #[inline(always)]
+    fn simd_gt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let val = unsafe { _mm_cmpgt_epu32_mask(lhs, rhs) };
+        mask32x4 { val, simd: self }
+    }
+    #[inline(always)]
+    fn zip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> { // interleave the low halves: [a0, b0, a1, b1]
+        unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn zip_high_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> { // interleave the high halves: [a2, b2, a3, b3]
+        unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn unzip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        unsafe { // gather the even-indexed lanes of `a` followed by those of `b`
+            let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); // reorder to [a0, a2, a1, a3]
+            let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); // reorder to [b0, b2, b1, b3]
+            _mm_unpacklo_epi64(t1, t2).simd_into(self) // low 64 bits of each: [a0, a2, b0, b2]
+        }
+    }
+    #[inline(always)]
+    fn unzip_high_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        unsafe { // gather the odd-indexed lanes of `a` followed by those of `b`
+            let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); // reorder to [a0, a2, a1, a3]
+            let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); // reorder to [b0, b2, b1, b3]
+            _mm_unpackhi_epi64(t1, t2).simd_into(self) // high 64 bits of each: [a1, a3, b1, b3]
+        }
+    }
+    #[inline(always)]
+    fn interleave_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> (u32x4<Self>, u32x4<Self>) {
+        (self.zip_low_u32x4(a, b), self.zip_high_u32x4(a, b)) // (low-half zip, high-half zip) of the same operands
+    }
+    #[inline(always)]
+    fn deinterleave_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> (u32x4<Self>, u32x4<Self>) {
+        (self.unzip_low_u32x4(a, b), self.unzip_high_u32x4(a, b)) // (even-indexed lanes, odd-indexed lanes)
+    }
+    #[inline(always)]
+    fn select_u32x4(self, a: mask32x4<Self>, b: u32x4<Self>, c: u32x4<Self>) -> u32x4<Self> {
+        unsafe { _mm_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) } // mask bit set -> lane from `b`, clear -> lane from `c`
+    }
+    #[inline(always)]
+    fn min_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> { // lane-wise unsigned minimum
+        unsafe { _mm_min_epu32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn max_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> { // lane-wise unsigned maximum
+        unsafe { _mm_max_epu32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn combine_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x8<Self> { // `a` becomes the low 128 bits, `b` the high 128 bits
+        unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn reinterpret_u8_u32x4(self, a: u32x4<Self>) -> u8x16<Self> {
+        __m128i::from(a).simd_into(self) // unwrap to the raw `__m128i`, then re-wrap it as `u8x16`; no intrinsic is involved
+    }
+    #[inline(always)]
+    fn cvt_f32_u32x4(self, a: u32x4<Self>) -> f32x4<Self> {
+        unsafe { // u32 -> f32 by converting the two 16-bit halves of each lane separately and summing them
+            let a = a.into();
+            let lo = _mm_blend_epi16::<0xAA>(a, _mm_set1_epi32(0x4B000000)); // low 16 bits under 0x4B00: reads as the float 2^23 + lo
+            let hi = _mm_blend_epi16::<0xAA>(_mm_srli_epi32::<16>(a), _mm_set1_epi32(0x53000000)); // high 16 bits under 0x5300: reads as 2^39 + hi * 2^16
+            let fhi = _mm_sub_ps(
+                _mm_castsi128_ps(hi),
+                _mm_set1_ps(f32::from_bits(0x53000080)), // 2^39 + 2^23: removes hi's bias and pre-cancels lo's
+            );
+            let result = _mm_add_ps(_mm_castsi128_ps(lo), fhi); // (2^23 + lo) + (hi * 2^16 - 2^23)
+            result.simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn splat_mask32x4(self, val: bool) -> mask32x4<Self> {
+        mask32x4 {
+            val: (if val { 15u64 } else { 0 }) as _, // all four lane bits set, or all clear
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4<Self> {
+        unsafe { // turn an array of per-lane integers into the compact bitmask form
+            let lanes = crate::transmute::checked_transmute_copy(&val); // array bytes viewed as a 128-bit integer vector
+            mask32x4 {
+                val: _mm_movepi32_mask(lanes), // one bit per lane, taken from that lane's most significant bit
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn as_array_mask32x4(self, a: mask32x4<Self>) -> [i32; 4usize] {
+        unsafe { // expand the bitmask back into one integer per lane
+            let lanes = _mm_movm_epi32(a.val); // each mask bit becomes an all-ones or all-zeros 32-bit lane
+            crate::transmute::checked_transmute_copy(&lanes)
+        }
+    }
+    #[inline(always)]
+    fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4<Self> {
+        mask32x4 {
+            val: (bits & 15u64) as _, // bits above the four lane positions are dropped
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask32x4(self, a: mask32x4<Self>) -> u64 {
+        u64::from((a).val) & 15u64 // widen the stored mask and keep only the four lane bits
+    }
+    #[inline(always)]
+    fn set_mask32x4(self, a: &mut mask32x4<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 4usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4usize
+        );
+        let lane_bit = 1u64 << index; // the single bit that stands for lane `index`
+        let cleared = u64::from(a.val) & !lane_bit; // current mask with that lane forced off
+        let updated = if value { cleared | lane_bit } else { cleared };
+        *a = mask32x4 {
+            val: updated as _,
+            simd: self,
+        };
+    }
+    #[inline(always)]
+    fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
+        mask32x4 {
+            val: ((u64::from((a).val) & u64::from((b).val)) & 15u64) as _, // AND of the widened masks, limited to four lane bits
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn or_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
+        mask32x4 {
+            val: ((u64::from((a).val) | u64::from((b).val)) & 15u64) as _, // OR of the widened masks, limited to four lane bits
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn xor_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
+        mask32x4 {
+            val: ((u64::from((a).val) ^ u64::from((b).val)) & 15u64) as _, // XOR of the widened masks, limited to four lane bits
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn not_mask32x4(self, a: mask32x4<Self>) -> mask32x4<Self> {
+        mask32x4 {
+            val: ((!u64::from((a).val)) & 15u64) as _, // invert, then clear the bits the inversion set above lane 3
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn select_mask32x4(
+        self,
+        a: mask32x4<Self>,
+        b: mask32x4<Self>,
+        c: mask32x4<Self>,
+    ) -> mask32x4<Self> {
+        let chooser = u64::from(a.val); // set bit -> take the bit from `b`, clear bit -> from `c`
+        let picked = (chooser & u64::from(b.val)) | (!chooser & u64::from(c.val));
+        mask32x4 {
+            val: (picked & 0b1111u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn simd_eq_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
+        mask32x4 {
+            val: (!u64::from(a.val ^ b.val) & 15u64) as _, // NOT of XOR: a bit is set where the two masks agree
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn any_true_mask32x4(self, a: mask32x4<Self>) -> bool {
+        const LANES: u64 = 0b1111; // the four bit positions that carry lane state
+        (u64::from(a.val) & LANES) != 0
+    }
+    #[inline(always)]
+    fn all_true_mask32x4(self, a: mask32x4<Self>) -> bool {
+        const LANES: u64 = 0b1111; // the four bit positions that carry lane state
+        (u64::from(a.val) & LANES) == LANES
+    }
+    #[inline(always)]
+    fn any_false_mask32x4(self, a: mask32x4<Self>) -> bool {
+        const LANES: u64 = 0b1111; // the four bit positions that carry lane state
+        (u64::from(a.val) & LANES) != LANES
+    }
+    #[inline(always)]
+    fn all_false_mask32x4(self, a: mask32x4<Self>) -> bool {
+        const LANES: u64 = 0b1111; // the four bit positions that carry lane state
+        (u64::from(a.val) & LANES) == 0
+    }
+    #[inline(always)]
+    fn combine_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x8<Self> {
+        let merged = ((u64::from(b.val) << 4usize) | u64::from(a.val)) & 0xFFu64; // `b` moved up by four bits
+        mask32x8 {
+            val: merged as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn splat_f64x2(self, val: f64) -> f64x2<Self> { // broadcast `val` to both lanes
+        unsafe { _mm_set1_pd(val).simd_into(self) }
+    }
+    #[inline(always)]
+    fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2<Self> {
+        f64x2 {
+            val: crate::transmute::checked_transmute_copy(&val), // array copied into the vector's storage type via the crate helper
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2<Self> {
+        f64x2 {
+            val: crate::transmute::checked_transmute_copy(val), // same as the by-value load, reading through the reference
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_f64x2(self, a: f64x2<Self>) -> [f64; 2usize] {
+        crate::transmute::checked_transmute_copy::<__m128d, [f64; 2usize]>(&a.val.0) // copy the raw register (`.0` of the wrapper) out as an array
+    }
+    #[inline(always)]
+    fn as_array_ref_f64x2(self, a: &f64x2<Self>) -> &[f64; 2usize] {
+        crate::transmute::checked_cast_ref::<__m128d, [f64; 2usize]>(&a.val.0) // reference cast: view the vector's storage as an array
+    }
+    #[inline(always)]
+    fn as_array_mut_f64x2(self, a: &mut f64x2<Self>) -> &mut [f64; 2usize] {
+        crate::transmute::checked_cast_mut::<__m128d, [f64; 2usize]>(&mut a.val.0) // mutable reference cast onto the vector's storage
+    }
+    #[inline(always)]
+    fn store_array_f64x2(self, a: f64x2<Self>, dest: &mut [f64; 2usize]) -> () {
+        unsafe { // write both lanes of `a` into `dest`
+            core::ptr::copy_nonoverlapping(
+                (&raw const a.val.0) as *const f64, // source: the by-value vector, read as f64 elements
+                dest.as_mut_ptr(),
+                2usize, // element count (f64 values), not bytes
+            );
+        }
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_f64x2(self, a: u8x16<Self>) -> f64x2<Self> {
+        f64x2 {
+            val: crate::transmute::checked_transmute_copy(&a.val), // the 16 bytes become the f64x2 storage as-is
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_f64x2(self, a: f64x2<Self>) -> u8x16<Self> {
+        u8x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val), // the f64x2 storage becomes 16 bytes as-is
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_f64x2<const SHIFT: usize>(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
+        unsafe {
+            if SHIFT >= 2usize { // sliding by the full width (or more) returns `b` unchanged
+                return b;
+            }
+            let result = dyn_alignr_128( // NOTE(review): helper defined elsewhere; presumably a byte-wise alignr over b:a — confirm
+                self.cvt_to_bytes_f64x2(b).val.0,
+                self.cvt_to_bytes_f64x2(a).val.0,
+                SHIFT * 8usize, // lane count -> byte count (8 bytes per f64)
+            );
+            self.cvt_from_bytes_f64x2(u8x16 {
+                val: crate::support::Aligned128(result),
+                simd: self,
+            })
+        }
+    }
+    #[inline(always)]
+    fn slide_within_blocks_f64x2<const SHIFT: usize>(
+        self,
+        a: f64x2<Self>,
+        b: f64x2<Self>,
+    ) -> f64x2<Self> {
+        self.slide_f64x2::<SHIFT>(a, b) // forwards to the plain slide with the same arguments
+    }
+    #[inline(always)]
+    fn abs_f64x2(self, a: f64x2<Self>) -> f64x2<Self> { // clear the sign bit: AND with the complement of -0.0's bit pattern
+        unsafe { _mm_andnot_pd(_mm_set1_pd(-0.0), a.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn neg_f64x2(self, a: f64x2<Self>) -> f64x2<Self> { // flip the sign bit by XOR with -0.0
+        unsafe { _mm_xor_pd(a.into(), _mm_set1_pd(-0.0)).simd_into(self) }
+    }
+    #[inline(always)]
+    fn sqrt_f64x2(self, a: f64x2<Self>) -> f64x2<Self> { // lane-wise square root
+        unsafe { _mm_sqrt_pd(a.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn approximate_recip_f64x2(self, a: f64x2<Self>) -> f64x2<Self> { // approximate 1/x (rcp14: relative error below 2^-14 per Intel's docs)
+        unsafe { _mm_rcp14_pd(a.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> { // lane-wise a + b
+        unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn sub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> { // lane-wise a - b
+        unsafe { _mm_sub_pd(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn mul_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> { // lane-wise a * b
+        unsafe { _mm_mul_pd(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn div_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> { // lane-wise a / b
+        unsafe { _mm_div_pd(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn copysign_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
+        unsafe { // magnitude of `a` combined with the sign of `b`
+            let mask = _mm_set1_pd(-0.0); // only the sign bit is set in -0.0
+            _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, a.into())).simd_into(self) // sign bit from `b` | all other bits from `a`
+        }
+    }
+    #[inline(always)]
+    fn simd_eq_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
+        unsafe { // lane-wise `a == b` as a bitmask
+            mask64x2 {
+                val: _mm_cmp_pd_mask::<0i32>(a.into(), b.into()), // predicate 0 = _CMP_EQ_OQ (ordered, quiet: false on NaN)
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_lt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
+        unsafe { // lane-wise `a < b` as a bitmask
+            mask64x2 {
+                val: _mm_cmp_pd_mask::<17i32>(a.into(), b.into()), // predicate 17 = _CMP_LT_OQ (ordered, quiet: false on NaN)
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_le_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
+        unsafe { // lane-wise `a <= b` as a bitmask
+            mask64x2 {
+                val: _mm_cmp_pd_mask::<18i32>(a.into(), b.into()), // predicate 18 = _CMP_LE_OQ (ordered, quiet: false on NaN)
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_ge_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
+        unsafe { // lane-wise `a >= b` as a bitmask
+            mask64x2 {
+                val: _mm_cmp_pd_mask::<29i32>(a.into(), b.into()), // predicate 29 = _CMP_GE_OQ (ordered, quiet: false on NaN)
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_gt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
+        unsafe { // lane-wise `a > b` as a bitmask
+            mask64x2 {
+                val: _mm_cmp_pd_mask::<30i32>(a.into(), b.into()), // predicate 30 = _CMP_GT_OQ (ordered, quiet: false on NaN)
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn zip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> { // [a0, b0]
+        unsafe { _mm_unpacklo_pd(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn zip_high_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> { // [a1, b1]
+        unsafe { _mm_unpackhi_pd(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn unzip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
+        unsafe { _mm_shuffle_pd::<0b00>(a.into(), b.into()).simd_into(self) } // both selector bits 0: lane 0 of `a`, then lane 0 of `b`
+    }
+    #[inline(always)]
+    fn unzip_high_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
+        unsafe { _mm_shuffle_pd::<0b11>(a.into(), b.into()).simd_into(self) } // both selector bits 1: lane 1 of `a`, then lane 1 of `b`
+    }
+    #[inline(always)]
+    fn interleave_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> (f64x2<Self>, f64x2<Self>) {
+        (self.zip_low_f64x2(a, b), self.zip_high_f64x2(a, b)) // (low-half zip, high-half zip) of the same operands
+    }
+    #[inline(always)]
+    fn deinterleave_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> (f64x2<Self>, f64x2<Self>) {
+        (self.unzip_low_f64x2(a, b), self.unzip_high_f64x2(a, b)) // (even-indexed lanes, odd-indexed lanes)
+    }
+    #[inline(always)]
+    fn max_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> { // lane-wise maximum using the plain max instruction
+        unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn min_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> { // lane-wise minimum using the plain min instruction
+        unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn max_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
+        unsafe { _mm_range_pd::<5i32>(a.into(), b.into()).simd_into(self) } // imm 0b0101: bits[1:0]=01 selects max, bits[3:2]=01 takes the sign from the compare result
+    }
+    #[inline(always)]
+    fn min_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
+        unsafe { _mm_range_pd::<4i32>(a.into(), b.into()).simd_into(self) } // imm 0b0100: bits[1:0]=00 selects min, bits[3:2]=01 takes the sign from the compare result
+    }
+    #[inline(always)]
+    fn mul_add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> { // fused a * b + c
+        unsafe { _mm_fmadd_pd(a.into(), b.into(), c.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn mul_sub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> { // fused a * b - c
+        unsafe { _mm_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn floor_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
+        unsafe { // round each lane toward negative infinity, with exceptions suppressed
+            _mm_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn ceil_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
+        unsafe { // round each lane toward positive infinity, with exceptions suppressed
+            _mm_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn round_ties_even_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
+        unsafe { // round each lane to the nearest integer (ties to even), with exceptions suppressed
+            _mm_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
+                .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn fract_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
+        a - self.trunc_f64x2(a) // what remains after removing the toward-zero integer part
+    }
+    #[inline(always)]
+    fn trunc_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
+        unsafe { // round each lane toward zero, with exceptions suppressed
+            _mm_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn select_f64x2(self, a: mask64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
+        unsafe { _mm_mask_blend_pd(a.val, c.into(), b.into()).simd_into(self) } // mask bit set -> lane from `b`, clear -> lane from `c`
+    }
+    #[inline(always)]
+    fn combine_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x4<Self> { // `a` becomes the low 128 bits, `b` the high 128 bits
+        unsafe { _mm256_setr_m128d(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn reinterpret_f32_f64x2(self, a: f64x2<Self>) -> f32x4<Self> { // bit cast only: the same 128 bits viewed as four f32 lanes
+        unsafe { _mm_castpd_ps(a.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn splat_mask64x2(self, val: bool) -> mask64x2<Self> {
+        mask64x2 {
+            val: (if val { 3u64 } else { 0 }) as _, // both lane bits set, or both clear
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
+        unsafe { // turn an array of per-lane integers into the compact bitmask form
+            let lanes = crate::transmute::checked_transmute_copy(&val); // array bytes viewed as a 128-bit integer vector
+            mask64x2 {
+                val: _mm_movepi64_mask(lanes), // one bit per lane, taken from that lane's most significant bit
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn as_array_mask64x2(self, a: mask64x2<Self>) -> [i64; 2usize] {
+        unsafe { // expand the bitmask back into one integer per lane
+            let lanes = _mm_movm_epi64(a.val); // each mask bit becomes an all-ones or all-zeros 64-bit lane
+            crate::transmute::checked_transmute_copy(&lanes)
+        }
+    }
+    #[inline(always)]
+    fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
+        mask64x2 {
+            val: (bits & 3u64) as _, // bits above the two lane positions are dropped
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
+        u64::from((a).val) & 3u64 // widen the stored mask and keep only the two lane bits
+    }
+    #[inline(always)]
+    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 2usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            2usize
+        );
+        let lane_bit = 1u64 << index; // the single bit that stands for lane `index`
+        let cleared = u64::from(a.val) & !lane_bit; // current mask with that lane forced off
+        let updated = if value { cleared | lane_bit } else { cleared };
+        *a = mask64x2 {
+            val: updated as _,
+            simd: self,
+        };
+    }
+    #[inline(always)]
+    fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        mask64x2 {
+            val: ((u64::from((a).val) & u64::from((b).val)) & 3u64) as _, // AND of the widened masks, limited to two lane bits
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn or_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        mask64x2 {
+            val: ((u64::from((a).val) | u64::from((b).val)) & 3u64) as _, // OR of the widened masks, limited to two lane bits
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn xor_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        mask64x2 {
+            val: ((u64::from((a).val) ^ u64::from((b).val)) & 3u64) as _, // XOR of the widened masks, limited to two lane bits
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn not_mask64x2(self, a: mask64x2<Self>) -> mask64x2<Self> {
+        mask64x2 {
+            val: ((!u64::from((a).val)) & 3u64) as _, // invert, then clear the bits the inversion set above lane 1
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn select_mask64x2(
+        self,
+        a: mask64x2<Self>,
+        b: mask64x2<Self>,
+        c: mask64x2<Self>,
+    ) -> mask64x2<Self> {
+        let chooser = u64::from(a.val); // set bit -> take the bit from `b`, clear bit -> from `c`
+        let picked = (chooser & u64::from(b.val)) | (!chooser & u64::from(c.val));
+        mask64x2 {
+            val: (picked & 0b11u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn simd_eq_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        mask64x2 {
+            val: (!u64::from(a.val ^ b.val) & 3u64) as _, // NOT of XOR: a bit is set where the two masks agree
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn any_true_mask64x2(self, a: mask64x2<Self>) -> bool {
+        const LANES: u64 = 0b11; // the two bit positions that carry lane state
+        (u64::from(a.val) & LANES) != 0
+    }
+    #[inline(always)]
+    fn all_true_mask64x2(self, a: mask64x2<Self>) -> bool {
+        const LANES: u64 = 0b11; // the two bit positions that carry lane state
+        (u64::from(a.val) & LANES) == LANES
+    }
+    #[inline(always)]
+    fn any_false_mask64x2(self, a: mask64x2<Self>) -> bool {
+        const LANES: u64 = 0b11; // the two bit positions that carry lane state
+        (u64::from(a.val) & LANES) != LANES
+    }
+    #[inline(always)]
+    fn all_false_mask64x2(self, a: mask64x2<Self>) -> bool {
+        const LANES: u64 = 0b11; // the two bit positions that carry lane state
+        (u64::from(a.val) & LANES) == 0
+    }
+    #[inline(always)]
+    fn combine_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x4<Self> {
+        let merged = ((u64::from(b.val) << 2usize) | u64::from(a.val)) & 0b1111u64; // `b` moved up by two bits
+        mask64x4 {
+            val: merged as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn splat_f32x8(self, val: f32) -> f32x8<Self> { // broadcast `val` to all eight lanes
+        unsafe { _mm256_set1_ps(val).simd_into(self) }
+    }
+    #[inline(always)]
+    fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
+        f32x8 {
+            val: crate::transmute::checked_transmute_copy(&val), // array copied into the vector's storage type via the crate helper
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
+        f32x8 {
+            val: crate::transmute::checked_transmute_copy(val), // same as the by-value load, reading through the reference
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_f32x8(self, a: f32x8<Self>) -> [f32; 8usize] {
+        crate::transmute::checked_transmute_copy::<__m256, [f32; 8usize]>(&a.val.0) // copy the raw register (`.0` of the wrapper) out as an array
+    }
+    #[inline(always)]
+    fn as_array_ref_f32x8(self, a: &f32x8<Self>) -> &[f32; 8usize] {
+        crate::transmute::checked_cast_ref::<__m256, [f32; 8usize]>(&a.val.0) // reference cast: view the vector's storage as an array
+    }
+    #[inline(always)]
+    fn as_array_mut_f32x8(self, a: &mut f32x8<Self>) -> &mut [f32; 8usize] {
+        crate::transmute::checked_cast_mut::<__m256, [f32; 8usize]>(&mut a.val.0) // mutable reference cast onto the vector's storage
+    }
+    #[inline(always)]
+    fn store_array_f32x8(self, a: f32x8<Self>, dest: &mut [f32; 8usize]) -> () {
+        unsafe { // write all eight lanes of `a` into `dest`
+            core::ptr::copy_nonoverlapping(
+                (&raw const a.val.0) as *const f32, // source: the by-value vector, read as f32 elements
+                dest.as_mut_ptr(),
+                8usize, // element count (f32 values), not bytes
+            );
+        }
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_f32x8(self, a: u8x32<Self>) -> f32x8<Self> {
+        f32x8 {
+            val: crate::transmute::checked_transmute_copy(&a.val), // the 32 bytes become the f32x8 storage as-is
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val), // the f32x8 storage becomes 32 bytes as-is
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_f32x8<const SHIFT: usize>(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        unsafe { // take eight consecutive lanes, starting at lane SHIFT, from `a` followed by `b`
+            if SHIFT >= 8usize { // sliding by the full width (or more) returns `b` unchanged
+                return b;
+            }
+            let idx = _mm256_add_epi8( // byte index table: identity 0..=31, each advanced by the shift in bytes
+                _mm256_setr_epi8(
+                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                    22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                ),
+                _mm256_set1_epi8((SHIFT * 4usize) as i8), // 4 bytes per f32; at most 28 here, so the cast cannot wrap
+            );
+            let result = _mm256_permutex2var_epi8( // two-source byte permute: index bit 5 selects `b`, bits 4:0 the byte
+                self.cvt_to_bytes_f32x8(a).val.0,
+                idx,
+                self.cvt_to_bytes_f32x8(b).val.0,
+            );
+            self.cvt_from_bytes_f32x8(u8x32 {
+                val: crate::support::Aligned256(result),
+                simd: self,
+            })
+        }
+    }
+    #[inline(always)]
+    fn slide_within_blocks_f32x8<const SHIFT: usize>(
+        self,
+        a: f32x8<Self>,
+        b: f32x8<Self>,
+    ) -> f32x8<Self> {
+        unsafe {
+            if SHIFT == 0 { // nothing to slide: the result is `a`
+                return a;
+            }
+            if SHIFT >= 4usize { // NOTE(review): 4 f32 = 16 bytes, presumably the 128-bit block size — confirm
+                return b;
+            }
+            let a = self.cvt_to_bytes_f32x8(a).val.0;
+            let b = self.cvt_to_bytes_f32x8(b).val.0;
+            let result = dyn_alignr_256(b, a, SHIFT * 4usize); // helper defined elsewhere; shift passed in bytes (4 per f32)
+            self.cvt_from_bytes_f32x8(u8x32 {
+                val: crate::support::Aligned256(result),
+                simd: self,
+            })
+        }
+    }
+    #[inline(always)]
+    fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> { // clear the sign bit: AND with the complement of -0.0's bit pattern
+        unsafe { _mm256_andnot_ps(_mm256_set1_ps(-0.0), a.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self> { // flip the sign bit by XOR with -0.0
+        unsafe { _mm256_xor_ps(a.into(), _mm256_set1_ps(-0.0)).simd_into(self) }
+    }
+    #[inline(always)]
+    fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self> { // lane-wise square root
+        unsafe { _mm256_sqrt_ps(a.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self> { // approximate 1/x (rcp14: relative error below 2^-14 per Intel's docs)
+        unsafe { _mm256_rcp14_ps(a.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> { // lane-wise a + b
+        unsafe { _mm256_add_ps(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> { // lane-wise a - b
+        unsafe { _mm256_sub_ps(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn mul_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> { // lane-wise a * b
+        unsafe { _mm256_mul_ps(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn div_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> { // lane-wise a / b
+        unsafe { _mm256_div_ps(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn copysign_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        unsafe { // magnitude of `a` combined with the sign of `b`
+            let mask = _mm256_set1_ps(-0.0); // only the sign bit is set in -0.0
+            _mm256_or_ps(
+                _mm256_and_ps(mask, b.into()), // sign bit of `b`
+                _mm256_andnot_ps(mask, a.into()), // everything except the sign bit of `a`
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn simd_eq_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        unsafe { // lane-wise `a == b` as a bitmask
+            mask32x8 {
+                val: _mm256_cmp_ps_mask::<0i32>(a.into(), b.into()), // predicate 0 = _CMP_EQ_OQ (ordered, quiet: false on NaN)
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_lt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        unsafe { // lane-wise `a < b` as a bitmask
+            mask32x8 {
+                val: _mm256_cmp_ps_mask::<17i32>(a.into(), b.into()), // predicate 17 = _CMP_LT_OQ (ordered, quiet: false on NaN)
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_le_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        unsafe { // lane-wise `a <= b` as a bitmask
+            mask32x8 {
+                val: _mm256_cmp_ps_mask::<18i32>(a.into(), b.into()), // predicate 18 = _CMP_LE_OQ (ordered, quiet: false on NaN)
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_ge_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        unsafe { // lane-wise `a >= b` as a bitmask
+            mask32x8 {
+                val: _mm256_cmp_ps_mask::<29i32>(a.into(), b.into()), // predicate 29 = _CMP_GE_OQ (ordered, quiet: false on NaN)
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_gt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        unsafe { // lane-wise `a > b` as a bitmask
+            mask32x8 {
+                val: _mm256_cmp_ps_mask::<30i32>(a.into(), b.into()), // predicate 30 = _CMP_GT_OQ (ordered, quiet: false on NaN)
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn zip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        unsafe { // interleave lanes 0..=3 of both inputs: [a0, b0, a1, b1, a2, b2, a3, b3]
+            _mm256_permutex2var_ps(
+                a.into(),
+                _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), // indices 0..=7 pick from `a`, 8..=15 from `b`
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn zip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        unsafe { // interleave lanes 4..=7 of both inputs: [a4, b4, a5, b5, a6, b6, a7, b7]
+            _mm256_permutex2var_ps(
+                a.into(),
+                _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), // indices 0..=7 pick from `a`, 8..=15 from `b`
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn unzip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        unsafe { // even-indexed lanes of `a`, then even-indexed lanes of `b`
+            _mm256_permutex2var_ps(
+                a.into(),
+                _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), // indices 0..=7 pick from `a`, 8..=15 from `b`
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn unzip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        unsafe { // odd-indexed lanes of `a`, then odd-indexed lanes of `b`
+            _mm256_permutex2var_ps(
+                a.into(),
+                _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), // indices 0..=7 pick from `a`, 8..=15 from `b`
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn interleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
+        unsafe { // both halves of the zip at once, using the same index tables as zip_low / zip_high
+            let a = a.into();
+            let b = b.into();
+            (
+                _mm256_permutex2var_ps(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b) // [a0, b0, .., a3, b3]
+                    .simd_into(self),
+                _mm256_permutex2var_ps(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b) // [a4, b4, .., a7, b7]
+                    .simd_into(self),
+            )
+        }
+    }
+    #[inline(always)]
+    fn deinterleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
+        unsafe { // both halves of the unzip at once, using the same index tables as unzip_low / unzip_high
+            let a = a.into();
+            let b = b.into();
+            (
+                _mm256_permutex2var_ps(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b) // even-indexed lanes of a, then b
+                    .simd_into(self),
+                _mm256_permutex2var_ps(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b) // odd-indexed lanes of a, then b
+                    .simd_into(self),
+            )
+        }
+    }
+    #[inline(always)]
+    fn max_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> { // lane-wise maximum using the plain max instruction
+        unsafe { _mm256_max_ps(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn min_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> { // lane-wise minimum using the plain min instruction
+        unsafe { _mm256_min_ps(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        unsafe { _mm256_range_ps::<5i32>(a.into(), b.into()).simd_into(self) } // imm 0b0101: bits[1:0]=01 selects max, bits[3:2]=01 takes the sign from the compare result
+    }
+    #[inline(always)]
+    fn min_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        unsafe { _mm256_range_ps::<4i32>(a.into(), b.into()).simd_into(self) } // imm 0b0100: bits[1:0]=00 selects min, bits[3:2]=01 takes the sign from the compare result
+    }
+    #[inline(always)]
+    fn mul_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> { // fused a * b + c
+        unsafe { _mm256_fmadd_ps(a.into(), b.into(), c.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn mul_sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> { // fused a * b - c
+        unsafe { _mm256_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        unsafe { // round each lane toward negative infinity, with exceptions suppressed
+            _mm256_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
+                .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn ceil_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        unsafe { // round each lane toward positive infinity, with exceptions suppressed
+            _mm256_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
+                .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn round_ties_even_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        unsafe { // round each lane to the nearest integer (ties to even), with exceptions suppressed
+            _mm256_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
+                .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn fract_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        a - self.trunc_f32x8(a) // what remains after removing the toward-zero integer part
+    }
+    #[inline(always)]
+    fn trunc_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        unsafe { // round each lane toward zero, with exceptions suppressed
+            _mm256_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
+        unsafe { _mm256_mask_blend_ps(a.val, c.into(), b.into()).simd_into(self) } // mask bit set -> lane from `b`, clear -> lane from `c`
+    }
+    #[inline(always)]
+    fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self> {
+        unsafe { // widen `a` to 512 bits (the cast leaves the upper half unspecified), then insert `b` as the upper 256 bits
+            _mm512_insertf32x8::<1>(_mm512_castps256_ps512(a.into()), b.into()).simd_into(self)
+        }
+    }
+    /// Splits `a` into its lower and upper 128-bit halves, returned as `(low, high)`.
+    #[inline(always)]
+    fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>) {
+        let whole = a.into();
+        let lower = unsafe { _mm256_extractf128_ps::<0>(whole) };
+        let upper = unsafe { _mm256_extractf128_ps::<1>(whole) };
+        (lower.simd_into(self), upper.simd_into(self))
+    }
+    /// Reinterprets the 256 bits of `a` as four `f64` lanes using the `_mm256_castps_pd` cast.
+    #[inline(always)]
+    fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
+        let as_doubles = unsafe { _mm256_castps_pd(a.into()) };
+        as_doubles.simd_into(self)
+    }
+    /// Reinterprets the 256 bits of `a` as eight `i32` lanes using `_mm256_castps_si256`.
+    #[inline(always)]
+    fn reinterpret_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+        let as_integers = unsafe { _mm256_castps_si256(a.into()) };
+        as_integers.simd_into(self)
+    }
+    /// Reinterprets the 256 bits of `a` as thirty-two `u8` lanes using `_mm256_castps_si256`.
+    #[inline(always)]
+    fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
+        let as_integers = unsafe { _mm256_castps_si256(a.into()) };
+        as_integers.simd_into(self)
+    }
+    /// Reinterprets the 256 bits of `a` as eight `u32` lanes using `_mm256_castps_si256`.
+    #[inline(always)]
+    fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+        let as_integers = unsafe { _mm256_castps_si256(a.into()) };
+        as_integers.simd_into(self)
+    }
+    /// Converts each lane of `a` to `u32` with the truncating `_mm256_cvttps_epu32` conversion.
+    ///
+    /// No range or NaN fix-up is applied here; see `cvt_u32_precise_f32x8` for that.
+    #[inline(always)]
+    fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+        let truncated = unsafe { _mm256_cvttps_epu32(a.into()) };
+        truncated.simd_into(self)
+    }
+    /// Converts each lane of `a` to `u32` with explicit clamping at both ends of the range.
+    ///
+    /// Lanes are first passed through `_mm256_max_ps` against zero, then truncated. Any lane
+    /// for which `4294967040.0 < lane` holds (comparison predicate `17`) is replaced by
+    /// `u32::MAX`.
+    #[inline(always)]
+    fn cvt_u32_precise_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+        unsafe {
+            let non_negative = _mm256_max_ps(a.into(), _mm256_setzero_ps());
+            let truncated = _mm256_cvttps_epu32(non_negative);
+            let upper_limit = _mm256_set1_ps(4294967040.0);
+            let too_large = _mm256_cmp_ps_mask::<17i32>(upper_limit, non_negative);
+            let saturated = _mm256_set1_epi32(u32::MAX.cast_signed());
+            // Mask-set lanes take the saturated value; the rest keep the truncated result.
+            _mm256_mask_blend_epi32(too_large, truncated, saturated).simd_into(self)
+        }
+    }
+    /// Converts each lane of `a` to `i32` with the truncating `_mm256_cvttps_epi32` conversion.
+    ///
+    /// No range or NaN fix-up is applied here; see `cvt_i32_precise_f32x8` for that.
+    #[inline(always)]
+    fn cvt_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+        let truncated = unsafe { _mm256_cvttps_epi32(a.into()) };
+        truncated.simd_into(self)
+    }
+    /// Converts each lane of `a` to `i32`, patching up lanes the plain truncation mishandles.
+    ///
+    /// When every lane satisfies `lane < 2147483648.0` (comparison predicate `17`) the truncated
+    /// result is returned as is. Otherwise lanes failing that test become `i32::MAX`, and lanes
+    /// that are unordered with themselves (predicate `7` false, i.e. NaN) are zeroed.
+    #[inline(always)]
+    fn cvt_i32_precise_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+        unsafe {
+            let source = a.into();
+            let truncated = _mm256_cvttps_epi32(source);
+            let below_limit = _mm256_cmp_ps::<17i32>(source, _mm256_set1_ps(2147483648.0));
+            // Fast path: all eight lanes passed the range test, nothing to fix.
+            if _mm256_movemask_ps(below_limit) == 0b11111111 {
+                return truncated.simd_into(self);
+            }
+            let clamped = _mm256_blendv_epi8(
+                _mm256_set1_epi32(i32::MAX),
+                truncated,
+                _mm256_castps_si256(below_limit),
+            );
+            let ordered_lanes = _mm256_castps_si256(_mm256_cmp_ps::<7i32>(source, source));
+            _mm256_and_si256(clamped, ordered_lanes).simd_into(self)
+        }
+    }
+    /// Broadcasts `val` into all thirty-two `i8` lanes.
+    #[inline(always)]
+    fn splat_i8x32(self, val: i8) -> i8x32<Self> {
+        let broadcast = unsafe { _mm256_set1_epi8(val) };
+        broadcast.simd_into(self)
+    }
+    /// Builds an `i8x32` from an owned array by copying its bytes via `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
+        let packed = crate::transmute::checked_transmute_copy(&val);
+        i8x32 { val: packed, simd: self }
+    }
+    /// Builds an `i8x32` from a borrowed array by copying its bytes via `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
+        let packed = crate::transmute::checked_transmute_copy(val);
+        i8x32 { val: packed, simd: self }
+    }
+    /// Copies the lanes of `a` out into a plain `[i8; 32]`.
+    #[inline(always)]
+    fn as_array_i8x32(self, a: i8x32<Self>) -> [i8; 32usize] {
+        let register: &__m256i = &a.val.0;
+        crate::transmute::checked_transmute_copy::<__m256i, [i8; 32usize]>(register)
+    }
+    /// Views the lanes of `a` as a shared reference to `[i8; 32]` without copying.
+    #[inline(always)]
+    fn as_array_ref_i8x32(self, a: &i8x32<Self>) -> &[i8; 32usize] {
+        let register: &__m256i = &a.val.0;
+        crate::transmute::checked_cast_ref::<__m256i, [i8; 32usize]>(register)
+    }
+    /// Views the lanes of `a` as a mutable reference to `[i8; 32]` without copying.
+    #[inline(always)]
+    fn as_array_mut_i8x32(self, a: &mut i8x32<Self>) -> &mut [i8; 32usize] {
+        let register: &mut __m256i = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<__m256i, [i8; 32usize]>(register)
+    }
+    /// Writes the thirty-two lanes of `a` into `dest` with a raw 32-element copy.
+    #[inline(always)]
+    fn store_array_i8x32(self, a: i8x32<Self>, dest: &mut [i8; 32usize]) -> () {
+        let source = (&raw const a.val.0) as *const i8;
+        let target = dest.as_mut_ptr();
+        // `a` is a by-value local and `dest` is an exclusive borrow, so the ranges are disjoint.
+        unsafe { core::ptr::copy_nonoverlapping(source, target, 32usize) };
+    }
+    /// Rewraps the storage of a `u8x32` as an `i8x32` via `checked_transmute_copy`.
+    #[inline(always)]
+    fn cvt_from_bytes_i8x32(self, a: u8x32<Self>) -> i8x32<Self> {
+        let storage = crate::transmute::checked_transmute_copy(&a.val);
+        i8x32 { val: storage, simd: self }
+    }
+    /// Rewraps the storage of an `i8x32` as a `u8x32` via `checked_transmute_copy`.
+    #[inline(always)]
+    fn cvt_to_bytes_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
+        let storage = crate::transmute::checked_transmute_copy(&a.val);
+        u8x32 { val: storage, simd: self }
+    }
+    /// Returns thirty-two consecutive lanes of the pair `a`, `b`, starting at lane `SHIFT`.
+    ///
+    /// `SHIFT >= 32` returns `b` unchanged. Otherwise a two-source byte permute is driven by the
+    /// index vector `SHIFT, SHIFT + 1, ..., SHIFT + 31`, where indices below 32 address `a` and
+    /// indices from 32 upward address `b`.
+    #[inline(always)]
+    fn slide_i8x32<const SHIFT: usize>(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        if SHIFT >= 32usize {
+            return b;
+        }
+        let first_bytes = self.cvt_to_bytes_i8x32(a).val.0;
+        let second_bytes = self.cvt_to_bytes_i8x32(b).val.0;
+        let shuffled = unsafe {
+            let lane_ids = _mm256_setr_epi8(
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+            );
+            let offset = _mm256_set1_epi8((SHIFT) as i8);
+            let selectors = _mm256_add_epi8(lane_ids, offset);
+            _mm256_permutex2var_epi8(first_bytes, selectors, second_bytes)
+        };
+        let as_bytes = u8x32 {
+            val: crate::support::Aligned256(shuffled),
+            simd: self,
+        };
+        self.cvt_from_bytes_i8x32(as_bytes)
+    }
+    /// Block-wise variant of the slide, delegated to `dyn_alignr_256(b, a, SHIFT)`.
+    ///
+    /// `SHIFT == 0` returns `a` and `SHIFT >= 16` returns `b` without touching the helper.
+    #[inline(always)]
+    fn slide_within_blocks_i8x32<const SHIFT: usize>(
+        self,
+        a: i8x32<Self>,
+        b: i8x32<Self>,
+    ) -> i8x32<Self> {
+        match SHIFT {
+            0 => a,
+            s if s >= 16usize => b,
+            _ => {
+                let first_bytes = self.cvt_to_bytes_i8x32(a).val.0;
+                let second_bytes = self.cvt_to_bytes_i8x32(b).val.0;
+                let aligned = unsafe { dyn_alignr_256(second_bytes, first_bytes, SHIFT) };
+                let as_bytes = u8x32 {
+                    val: crate::support::Aligned256(aligned),
+                    simd: self,
+                };
+                self.cvt_from_bytes_i8x32(as_bytes)
+            }
+        }
+    }
+    /// Adds `a` and `b` per 8-bit lane using `_mm256_add_epi8`.
+    #[inline(always)]
+    fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (lhs, rhs) = (__m256i::from(a), __m256i::from(b));
+        let sum = unsafe { _mm256_add_epi8(lhs, rhs) };
+        sum.simd_into(self)
+    }
+    /// Subtracts `b` from `a` per 8-bit lane using `_mm256_sub_epi8`.
+    #[inline(always)]
+    fn sub_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (lhs, rhs) = (__m256i::from(a), __m256i::from(b));
+        let difference = unsafe { _mm256_sub_epi8(lhs, rhs) };
+        difference.simd_into(self)
+    }
+    /// Multiplies `a` and `b` per 8-bit lane, keeping the low 8 bits of each product.
+    ///
+    /// The work is done with 16-bit multiplies: one pass on the words as they are (its low byte
+    /// is the product of the even bytes) and one pass on the words shifted right by 8 (the odd
+    /// bytes). The two low bytes are then merged back into one word.
+    #[inline(always)]
+    fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        unsafe {
+            let (lhs, rhs) = (__m256i::from(a), __m256i::from(b));
+            let even_products = _mm256_mullo_epi16(lhs, rhs);
+            let odd_products =
+                _mm256_mullo_epi16(_mm256_srli_epi16::<8>(lhs), _mm256_srli_epi16::<8>(rhs));
+            let odd_bytes = _mm256_slli_epi16(odd_products, 8);
+            let even_bytes = _mm256_and_si256(even_products, _mm256_set1_epi16(0xFF));
+            _mm256_or_si256(odd_bytes, even_bytes).simd_into(self)
+        }
+    }
+    /// Bitwise AND of `a` and `b` over the whole 256-bit register.
+    #[inline(always)]
+    fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (lhs, rhs) = (__m256i::from(a), __m256i::from(b));
+        let conjunction = unsafe { _mm256_and_si256(lhs, rhs) };
+        conjunction.simd_into(self)
+    }
+    /// Bitwise OR of `a` and `b` over the whole 256-bit register.
+    #[inline(always)]
+    fn or_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (lhs, rhs) = (__m256i::from(a), __m256i::from(b));
+        let disjunction = unsafe { _mm256_or_si256(lhs, rhs) };
+        disjunction.simd_into(self)
+    }
+    /// Bitwise XOR of `a` and `b` over the whole 256-bit register.
+    #[inline(always)]
+    fn xor_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (lhs, rhs) = (__m256i::from(a), __m256i::from(b));
+        let toggled = unsafe { _mm256_xor_si256(lhs, rhs) };
+        toggled.simd_into(self)
+    }
+    /// Flips every bit of `a` by XOR-ing it with an all-ones scalar.
+    #[inline(always)]
+    fn not_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
+        let all_ones: i8 = !0;
+        a ^ all_ones
+    }
+    /// Shifts every 8-bit lane of `a` left by `shift` bits, dropping bits shifted out of the lane.
+    ///
+    /// There is no 8-bit shift intrinsic used here, so lanes are widened to 16 bits, shifted, and
+    /// narrowed again. Each shifted word is masked to its low byte before narrowing; without the
+    /// mask the saturating pack would clamp overflowed lanes (`0x40 << 1` would yield `0x7f`
+    /// instead of `0x80`). This matches what `shlv_i8x32` does.
+    #[inline(always)]
+    fn shl_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
+        unsafe {
+            let val = a.into();
+            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
+            let zero = _mm256_setzero_si256();
+            // Zero extension is enough: only the low byte of each shifted word is kept.
+            let lo_16 = _mm256_unpacklo_epi8(val, zero);
+            let hi_16 = _mm256_unpackhi_epi8(val, zero);
+            let byte_mask = _mm256_set1_epi16(0x00ff);
+            let lo_shifted = _mm256_and_si256(_mm256_sll_epi16(lo_16, shift_count), byte_mask);
+            let hi_shifted = _mm256_and_si256(_mm256_sll_epi16(hi_16, shift_count), byte_mask);
+            // Every word is now in 0..=0xff, so the unsigned pack cannot saturate.
+            _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
+    }
+    /// Shifts each 8-bit lane of `a` left by the count held in the matching lane of `b`.
+    ///
+    /// Values and counts are zero-extended to 16 bits, shifted with `_mm256_sllv_epi16`, masked
+    /// to the low byte, and packed back down to bytes.
+    #[inline(always)]
+    fn shlv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        unsafe {
+            let (values, amounts) = (__m256i::from(a), __m256i::from(b));
+            let zero = _mm256_setzero_si256();
+            let low_byte = _mm256_set1_epi16(0x00ff);
+            let lower = _mm256_sllv_epi16(
+                _mm256_unpacklo_epi8(values, zero),
+                _mm256_unpacklo_epi8(amounts, zero),
+            );
+            let upper = _mm256_sllv_epi16(
+                _mm256_unpackhi_epi8(values, zero),
+                _mm256_unpackhi_epi8(amounts, zero),
+            );
+            let lower = _mm256_and_si256(lower, low_byte);
+            let upper = _mm256_and_si256(upper, low_byte);
+            _mm256_packus_epi16(lower, upper).simd_into(self)
+        }
+    }
+    /// Arithmetic right shift of every 8-bit lane of `a` by `shift` bits.
+    ///
+    /// Lanes are sign-extended to 16 bits, shifted with `_mm256_sra_epi16`, and packed back to
+    /// bytes with `_mm256_packs_epi16`.
+    #[inline(always)]
+    fn shr_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
+        unsafe {
+            let lanes = __m256i::from(a);
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            // All-ones bytes for negative lanes; interleaving them sign-extends to 16 bits.
+            let sign_bytes = _mm256_cmpgt_epi8(_mm256_setzero_si256(), lanes);
+            let lower = _mm256_sra_epi16(_mm256_unpacklo_epi8(lanes, sign_bytes), count);
+            let upper = _mm256_sra_epi16(_mm256_unpackhi_epi8(lanes, sign_bytes), count);
+            _mm256_packs_epi16(lower, upper).simd_into(self)
+        }
+    }
+    /// Arithmetic right shift of each 8-bit lane of `a` by the count in the matching lane of `b`.
+    ///
+    /// Values are sign-extended and counts zero-extended to 16 bits, shifted with
+    /// `_mm256_srav_epi16`, masked to the low byte, and packed back down to bytes.
+    #[inline(always)]
+    fn shrv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        unsafe {
+            let (values, amounts) = (__m256i::from(a), __m256i::from(b));
+            let zero = _mm256_setzero_si256();
+            let low_byte = _mm256_set1_epi16(0x00ff);
+            // All-ones bytes for negative lanes; interleaving them sign-extends to 16 bits.
+            let sign_bytes = _mm256_cmpgt_epi8(zero, values);
+            let lower = _mm256_srav_epi16(
+                _mm256_unpacklo_epi8(values, sign_bytes),
+                _mm256_unpacklo_epi8(amounts, zero),
+            );
+            let upper = _mm256_srav_epi16(
+                _mm256_unpackhi_epi8(values, sign_bytes),
+                _mm256_unpackhi_epi8(amounts, zero),
+            );
+            let lower = _mm256_and_si256(lower, low_byte);
+            let upper = _mm256_and_si256(upper, low_byte);
+            _mm256_packus_epi16(lower, upper).simd_into(self)
+        }
+    }
+    /// Signed per-lane `a == b`, returned as a bitmask produced by `_mm256_cmpeq_epi8_mask`.
+    #[inline(always)]
+    fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let bits = unsafe { _mm256_cmpeq_epi8_mask(a.into(), b.into()) };
+        mask8x32 { val: bits, simd: self }
+    }
+    /// Signed per-lane `a < b`, returned as a bitmask produced by `_mm256_cmplt_epi8_mask`.
+    #[inline(always)]
+    fn simd_lt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let bits = unsafe { _mm256_cmplt_epi8_mask(a.into(), b.into()) };
+        mask8x32 { val: bits, simd: self }
+    }
+    /// Signed per-lane `a <= b`, returned as a bitmask produced by `_mm256_cmple_epi8_mask`.
+    #[inline(always)]
+    fn simd_le_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let bits = unsafe { _mm256_cmple_epi8_mask(a.into(), b.into()) };
+        mask8x32 { val: bits, simd: self }
+    }
+    /// Signed per-lane `a >= b`, returned as a bitmask produced by `_mm256_cmpge_epi8_mask`.
+    #[inline(always)]
+    fn simd_ge_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let bits = unsafe { _mm256_cmpge_epi8_mask(a.into(), b.into()) };
+        mask8x32 { val: bits, simd: self }
+    }
+    /// Signed per-lane `a > b`, returned as a bitmask produced by `_mm256_cmpgt_epi8_mask`.
+    #[inline(always)]
+    fn simd_gt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let bits = unsafe { _mm256_cmpgt_epi8_mask(a.into(), b.into()) };
+        mask8x32 { val: bits, simd: self }
+    }
+    /// Interleaves the lower sixteen lanes of `a` and `b`: `a0, b0, a1, b1, ..., a15, b15`.
+    #[inline(always)]
+    fn zip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        unsafe {
+            // Selectors 0..=31 address lanes of `a`; 32..=63 address lanes of `b`.
+            let selectors = _mm256_setr_epi8(
+                0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42,
+                11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
+            );
+            let zipped = _mm256_permutex2var_epi8(a.into(), selectors, b.into());
+            zipped.simd_into(self)
+        }
+    }
+    /// Interleaves the upper sixteen lanes of `a` and `b`: `a16, b16, a17, b17, ..., a31, b31`.
+    #[inline(always)]
+    fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        unsafe {
+            // Selectors 0..=31 address lanes of `a`; 32..=63 address lanes of `b`.
+            let selectors = _mm256_setr_epi8(
+                16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57,
+                26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63,
+            );
+            let zipped = _mm256_permutex2var_epi8(a.into(), selectors, b.into());
+            zipped.simd_into(self)
+        }
+    }
+    /// Gathers the even-indexed lanes of `a` followed by the even-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        unsafe {
+            // Selectors 0..=31 address lanes of `a`; 32..=63 address lanes of `b`.
+            let selectors = _mm256_setr_epi8(
+                0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40,
+                42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+            );
+            let gathered = _mm256_permutex2var_epi8(a.into(), selectors, b.into());
+            gathered.simd_into(self)
+        }
+    }
+    /// Gathers the odd-indexed lanes of `a` followed by the odd-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        unsafe {
+            // Selectors 0..=31 address lanes of `a`; 32..=63 address lanes of `b`.
+            let selectors = _mm256_setr_epi8(
+                1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41,
+                43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+            );
+            let gathered = _mm256_permutex2var_epi8(a.into(), selectors, b.into());
+            gathered.simd_into(self)
+        }
+    }
+    /// Interleaves `a` and `b` lane by lane, returning the low and high halves of the result.
+    ///
+    /// The two outputs use the same permutation tables as `zip_low_i8x32` and `zip_high_i8x32`,
+    /// so this simply pairs those two operations.
+    #[inline(always)]
+    fn interleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
+        let low_half = self.zip_low_i8x32(a, b);
+        let high_half = self.zip_high_i8x32(a, b);
+        (low_half, high_half)
+    }
+    /// Separates the even-indexed and odd-indexed lanes of the pair `a`, `b`.
+    ///
+    /// The two outputs use the same permutation tables as `unzip_low_i8x32` and
+    /// `unzip_high_i8x32`, so this simply pairs those two operations.
+    #[inline(always)]
+    fn deinterleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
+        let even_lanes = self.unzip_low_i8x32(a, b);
+        let odd_lanes = self.unzip_high_i8x32(a, b);
+        (even_lanes, odd_lanes)
+    }
+    /// Picks lanes from `b` where the mask `a` is set and from `c` where it is clear.
+    #[inline(always)]
+    fn select_i8x32(self, a: mask8x32<Self>, b: i8x32<Self>, c: i8x32<Self>) -> i8x32<Self> {
+        let (when_set, when_clear) = (__m256i::from(b), __m256i::from(c));
+        // `_mm256_mask_blend_epi8` takes the mask-clear source first, the mask-set source second.
+        let picked = unsafe { _mm256_mask_blend_epi8(a.val, when_clear, when_set) };
+        picked.simd_into(self)
+    }
+    /// Signed per-lane minimum of `a` and `b` using `_mm256_min_epi8`.
+    #[inline(always)]
+    fn min_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (lhs, rhs) = (__m256i::from(a), __m256i::from(b));
+        let smaller = unsafe { _mm256_min_epi8(lhs, rhs) };
+        smaller.simd_into(self)
+    }
+    /// Signed per-lane maximum of `a` and `b` using `_mm256_max_epi8`.
+    #[inline(always)]
+    fn max_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (lhs, rhs) = (__m256i::from(a), __m256i::from(b));
+        let larger = unsafe { _mm256_max_epi8(lhs, rhs) };
+        larger.simd_into(self)
+    }
+    /// Builds a 64-lane vector with `a` as the lower 256 bits and `b` as the upper 256 bits.
+    #[inline(always)]
+    fn combine_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x64<Self> {
+        unsafe {
+            // The cast places `a` in the low half; the insert then overwrites the high half.
+            let low_half_set = _mm512_castsi256_si512(a.into());
+            let joined = _mm512_inserti64x4::<1>(low_half_set, b.into());
+            joined.simd_into(self)
+        }
+    }
+    /// Splits `a` into its lower and upper 128-bit halves, returned as `(low, high)`.
+    #[inline(always)]
+    fn split_i8x32(self, a: i8x32<Self>) -> (i8x16<Self>, i8x16<Self>) {
+        let whole = __m256i::from(a);
+        let lower = unsafe { _mm256_extracti128_si256::<0>(whole) };
+        let upper = unsafe { _mm256_extracti128_si256::<1>(whole) };
+        (lower.simd_into(self), upper.simd_into(self))
+    }
+    /// Negates every lane of `a` by subtracting it from zero with `_mm256_sub_epi8`.
+    #[inline(always)]
+    fn neg_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
+        let operand = __m256i::from(a);
+        let negated = unsafe { _mm256_sub_epi8(_mm256_setzero_si256(), operand) };
+        negated.simd_into(self)
+    }
+    /// Reinterprets the register behind `a` as thirty-two `u8` lanes; no bits change.
+    #[inline(always)]
+    fn reinterpret_u8_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
+        let register: __m256i = a.into();
+        register.simd_into(self)
+    }
+    /// Reinterprets the register behind `a` as eight `u32` lanes; no bits change.
+    #[inline(always)]
+    fn reinterpret_u32_i8x32(self, a: i8x32<Self>) -> u32x8<Self> {
+        let register: __m256i = a.into();
+        register.simd_into(self)
+    }
+    /// Broadcasts `val` into all thirty-two `u8` lanes.
+    #[inline(always)]
+    fn splat_u8x32(self, val: u8) -> u8x32<Self> {
+        // The intrinsic takes `i8`; `cast_signed` keeps the bit pattern.
+        let as_signed = val.cast_signed();
+        let broadcast = unsafe { _mm256_set1_epi8(as_signed) };
+        broadcast.simd_into(self)
+    }
+    /// Builds a `u8x32` from an owned array by copying its bytes via `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
+        let packed = crate::transmute::checked_transmute_copy(&val);
+        u8x32 { val: packed, simd: self }
+    }
+    /// Builds a `u8x32` from a borrowed array by copying its bytes via `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
+        let packed = crate::transmute::checked_transmute_copy(val);
+        u8x32 { val: packed, simd: self }
+    }
+    /// Copies the lanes of `a` out into a plain `[u8; 32]`.
+    #[inline(always)]
+    fn as_array_u8x32(self, a: u8x32<Self>) -> [u8; 32usize] {
+        let register: &__m256i = &a.val.0;
+        crate::transmute::checked_transmute_copy::<__m256i, [u8; 32usize]>(register)
+    }
+    /// Views the lanes of `a` as a shared reference to `[u8; 32]` without copying.
+    #[inline(always)]
+    fn as_array_ref_u8x32(self, a: &u8x32<Self>) -> &[u8; 32usize] {
+        let register: &__m256i = &a.val.0;
+        crate::transmute::checked_cast_ref::<__m256i, [u8; 32usize]>(register)
+    }
+    /// Views the lanes of `a` as a mutable reference to `[u8; 32]` without copying.
+    #[inline(always)]
+    fn as_array_mut_u8x32(self, a: &mut u8x32<Self>) -> &mut [u8; 32usize] {
+        let register: &mut __m256i = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<__m256i, [u8; 32usize]>(register)
+    }
+    /// Writes the thirty-two lanes of `a` into `dest` with a raw 32-element copy.
+    #[inline(always)]
+    fn store_array_u8x32(self, a: u8x32<Self>, dest: &mut [u8; 32usize]) -> () {
+        let source = (&raw const a.val.0) as *const u8;
+        let target = dest.as_mut_ptr();
+        // `a` is a by-value local and `dest` is an exclusive borrow, so the ranges are disjoint.
+        unsafe { core::ptr::copy_nonoverlapping(source, target, 32usize) };
+    }
+    /// Byte-vector to byte-vector conversion: rebuilds a `u8x32` from a copy of `a`'s storage.
+    #[inline(always)]
+    fn cvt_from_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+        let storage = crate::transmute::checked_transmute_copy(&a.val);
+        u8x32 { val: storage, simd: self }
+    }
+    /// Byte-vector to byte-vector conversion: rebuilds a `u8x32` from a copy of `a`'s storage.
+    #[inline(always)]
+    fn cvt_to_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+        let storage = crate::transmute::checked_transmute_copy(&a.val);
+        u8x32 { val: storage, simd: self }
+    }
+    /// Returns thirty-two consecutive lanes of the pair `a`, `b`, starting at lane `SHIFT`.
+    ///
+    /// `SHIFT >= 32` returns `b` unchanged. Otherwise a two-source byte permute is driven by the
+    /// index vector `SHIFT, SHIFT + 1, ..., SHIFT + 31`, where indices below 32 address `a` and
+    /// indices from 32 upward address `b`.
+    #[inline(always)]
+    fn slide_u8x32<const SHIFT: usize>(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        if SHIFT >= 32usize {
+            return b;
+        }
+        let first_bytes = self.cvt_to_bytes_u8x32(a).val.0;
+        let second_bytes = self.cvt_to_bytes_u8x32(b).val.0;
+        let shuffled = unsafe {
+            let lane_ids = _mm256_setr_epi8(
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+            );
+            let offset = _mm256_set1_epi8((SHIFT) as i8);
+            let selectors = _mm256_add_epi8(lane_ids, offset);
+            _mm256_permutex2var_epi8(first_bytes, selectors, second_bytes)
+        };
+        let as_bytes = u8x32 {
+            val: crate::support::Aligned256(shuffled),
+            simd: self,
+        };
+        self.cvt_from_bytes_u8x32(as_bytes)
+    }
+    /// Block-wise variant of the slide, delegated to `dyn_alignr_256(b, a, SHIFT)`.
+    ///
+    /// `SHIFT == 0` returns `a` and `SHIFT >= 16` returns `b` without touching the helper.
+    #[inline(always)]
+    fn slide_within_blocks_u8x32<const SHIFT: usize>(
+        self,
+        a: u8x32<Self>,
+        b: u8x32<Self>,
+    ) -> u8x32<Self> {
+        match SHIFT {
+            0 => a,
+            s if s >= 16usize => b,
+            _ => {
+                let first_bytes = self.cvt_to_bytes_u8x32(a).val.0;
+                let second_bytes = self.cvt_to_bytes_u8x32(b).val.0;
+                let aligned = unsafe { dyn_alignr_256(second_bytes, first_bytes, SHIFT) };
+                let as_bytes = u8x32 {
+                    val: crate::support::Aligned256(aligned),
+                    simd: self,
+                };
+                self.cvt_from_bytes_u8x32(as_bytes)
+            }
+        }
+    }
+    /// Adds `a` and `b` per 8-bit lane using `_mm256_add_epi8`.
+    #[inline(always)]
+    fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (lhs, rhs) = (__m256i::from(a), __m256i::from(b));
+        let sum = unsafe { _mm256_add_epi8(lhs, rhs) };
+        sum.simd_into(self)
+    }
+    /// Subtracts `b` from `a` per 8-bit lane using `_mm256_sub_epi8`.
+    #[inline(always)]
+    fn sub_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (lhs, rhs) = (__m256i::from(a), __m256i::from(b));
+        let difference = unsafe { _mm256_sub_epi8(lhs, rhs) };
+        difference.simd_into(self)
+    }
+    /// Multiplies `a` and `b` per 8-bit lane, keeping the low 8 bits of each product.
+    ///
+    /// The work is done with 16-bit multiplies: one pass on the words as they are (its low byte
+    /// is the product of the even bytes) and one pass on the words shifted right by 8 (the odd
+    /// bytes). The two low bytes are then merged back into one word.
+    #[inline(always)]
+    fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        unsafe {
+            let (lhs, rhs) = (__m256i::from(a), __m256i::from(b));
+            let even_products = _mm256_mullo_epi16(lhs, rhs);
+            let odd_products =
+                _mm256_mullo_epi16(_mm256_srli_epi16::<8>(lhs), _mm256_srli_epi16::<8>(rhs));
+            let odd_bytes = _mm256_slli_epi16(odd_products, 8);
+            let even_bytes = _mm256_and_si256(even_products, _mm256_set1_epi16(0xFF));
+            _mm256_or_si256(odd_bytes, even_bytes).simd_into(self)
+        }
+    }
+    /// Bitwise AND of `a` and `b` over the whole 256-bit register.
+    #[inline(always)]
+    fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (lhs, rhs) = (__m256i::from(a), __m256i::from(b));
+        let conjunction = unsafe { _mm256_and_si256(lhs, rhs) };
+        conjunction.simd_into(self)
+    }
+    /// Bitwise OR of `a` and `b` over the whole 256-bit register.
+    #[inline(always)]
+    fn or_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (lhs, rhs) = (__m256i::from(a), __m256i::from(b));
+        let disjunction = unsafe { _mm256_or_si256(lhs, rhs) };
+        disjunction.simd_into(self)
+    }
+    /// Bitwise XOR of `a` and `b` over the whole 256-bit register.
+    #[inline(always)]
+    fn xor_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (lhs, rhs) = (__m256i::from(a), __m256i::from(b));
+        let toggled = unsafe { _mm256_xor_si256(lhs, rhs) };
+        toggled.simd_into(self)
+    }
+    /// Flips every bit of `a` by XOR-ing it with an all-ones scalar.
+    #[inline(always)]
+    fn not_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+        let all_ones: u8 = !0;
+        a ^ all_ones
+    }
+    /// Shifts every 8-bit lane of `a` left by `shift` bits, dropping bits shifted out of the lane.
+    ///
+    /// There is no 8-bit shift intrinsic used here, so lanes are zero-extended to 16 bits,
+    /// shifted, and narrowed again. Each shifted word is masked to its low byte before narrowing;
+    /// without the mask the saturating pack would clamp overflowed lanes (`0x80 << 1` would yield
+    /// `0xff` instead of `0x00`). This matches what `shlv_u8x32` does.
+    #[inline(always)]
+    fn shl_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
+        unsafe {
+            let val = a.into();
+            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
+            let zero = _mm256_setzero_si256();
+            let lo_16 = _mm256_unpacklo_epi8(val, zero);
+            let hi_16 = _mm256_unpackhi_epi8(val, zero);
+            let byte_mask = _mm256_set1_epi16(0x00ff);
+            let lo_shifted = _mm256_and_si256(_mm256_sll_epi16(lo_16, shift_count), byte_mask);
+            let hi_shifted = _mm256_and_si256(_mm256_sll_epi16(hi_16, shift_count), byte_mask);
+            // Every word is now in 0..=0xff, so the unsigned pack cannot saturate.
+            _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
+    }
+    /// Shifts each 8-bit lane of `a` left by the count held in the matching lane of `b`.
+    ///
+    /// Values and counts are zero-extended to 16 bits, shifted with `_mm256_sllv_epi16`, masked
+    /// to the low byte, and packed back down to bytes.
+    #[inline(always)]
+    fn shlv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        unsafe {
+            let (values, amounts) = (__m256i::from(a), __m256i::from(b));
+            let zero = _mm256_setzero_si256();
+            let low_byte = _mm256_set1_epi16(0x00ff);
+            let lower = _mm256_sllv_epi16(
+                _mm256_unpacklo_epi8(values, zero),
+                _mm256_unpacklo_epi8(amounts, zero),
+            );
+            let upper = _mm256_sllv_epi16(
+                _mm256_unpackhi_epi8(values, zero),
+                _mm256_unpackhi_epi8(amounts, zero),
+            );
+            let lower = _mm256_and_si256(lower, low_byte);
+            let upper = _mm256_and_si256(upper, low_byte);
+            _mm256_packus_epi16(lower, upper).simd_into(self)
+        }
+    }
+    /// Logical right shift of every 8-bit lane of `a` by `shift` bits.
+    ///
+    /// Lanes are zero-extended to 16 bits, shifted with `_mm256_srl_epi16`, and packed back to
+    /// bytes with `_mm256_packus_epi16`.
+    #[inline(always)]
+    fn shr_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
+        unsafe {
+            let lanes = __m256i::from(a);
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            let zero = _mm256_setzero_si256();
+            let lower = _mm256_srl_epi16(_mm256_unpacklo_epi8(lanes, zero), count);
+            let upper = _mm256_srl_epi16(_mm256_unpackhi_epi8(lanes, zero), count);
+            _mm256_packus_epi16(lower, upper).simd_into(self)
+        }
+    }
+    /// Logical right shift of each 8-bit lane of `a` by the count in the matching lane of `b`.
+    ///
+    /// Values and counts are zero-extended to 16 bits, shifted with `_mm256_srlv_epi16`, masked
+    /// to the low byte, and packed back down to bytes.
+    #[inline(always)]
+    fn shrv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        unsafe {
+            let (values, amounts) = (__m256i::from(a), __m256i::from(b));
+            let zero = _mm256_setzero_si256();
+            let low_byte = _mm256_set1_epi16(0x00ff);
+            let lower = _mm256_srlv_epi16(
+                _mm256_unpacklo_epi8(values, zero),
+                _mm256_unpacklo_epi8(amounts, zero),
+            );
+            let upper = _mm256_srlv_epi16(
+                _mm256_unpackhi_epi8(values, zero),
+                _mm256_unpackhi_epi8(amounts, zero),
+            );
+            let lower = _mm256_and_si256(lower, low_byte);
+            let upper = _mm256_and_si256(upper, low_byte);
+            _mm256_packus_epi16(lower, upper).simd_into(self)
+        }
+    }
+    /// Unsigned per-lane `a == b`, returned as a bitmask produced by `_mm256_cmpeq_epu8_mask`.
+    #[inline(always)]
+    fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let bits = unsafe { _mm256_cmpeq_epu8_mask(a.into(), b.into()) };
+        mask8x32 { val: bits, simd: self }
+    }
+    /// Unsigned per-lane `a < b`, returned as a bitmask produced by `_mm256_cmplt_epu8_mask`.
+    #[inline(always)]
+    fn simd_lt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let bits = unsafe { _mm256_cmplt_epu8_mask(a.into(), b.into()) };
+        mask8x32 { val: bits, simd: self }
+    }
+    /// Unsigned per-lane `a <= b`, returned as a bitmask produced by `_mm256_cmple_epu8_mask`.
+    #[inline(always)]
+    fn simd_le_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let bits = unsafe { _mm256_cmple_epu8_mask(a.into(), b.into()) };
+        mask8x32 { val: bits, simd: self }
+    }
+    /// Unsigned per-lane `a >= b`, returned as a bitmask produced by `_mm256_cmpge_epu8_mask`.
+    #[inline(always)]
+    fn simd_ge_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let bits = unsafe { _mm256_cmpge_epu8_mask(a.into(), b.into()) };
+        mask8x32 { val: bits, simd: self }
+    }
+    /// Unsigned per-lane `a > b`, returned as a bitmask produced by `_mm256_cmpgt_epu8_mask`.
+    #[inline(always)]
+    fn simd_gt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let bits = unsafe { _mm256_cmpgt_epu8_mask(a.into(), b.into()) };
+        mask8x32 { val: bits, simd: self }
+    }
+    /// Interleaves the lower sixteen lanes of `a` and `b`: `a0, b0, a1, b1, ..., a15, b15`.
+    #[inline(always)]
+    fn zip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        unsafe {
+            // Selectors 0..=31 address lanes of `a`; 32..=63 address lanes of `b`.
+            let selectors = _mm256_setr_epi8(
+                0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42,
+                11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
+            );
+            let zipped = _mm256_permutex2var_epi8(a.into(), selectors, b.into());
+            zipped.simd_into(self)
+        }
+    }
+    /// Interleaves the upper sixteen lanes of `a` and `b`: `a16, b16, a17, b17, ..., a31, b31`.
+    #[inline(always)]
+    fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        unsafe {
+            // Selectors 0..=31 address lanes of `a`; 32..=63 address lanes of `b`.
+            let selectors = _mm256_setr_epi8(
+                16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57,
+                26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63,
+            );
+            let zipped = _mm256_permutex2var_epi8(a.into(), selectors, b.into());
+            zipped.simd_into(self)
+        }
+    }
+    /// Gathers the even-indexed lanes of `a` followed by the even-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        unsafe {
+            // Selectors 0..=31 address lanes of `a`; 32..=63 address lanes of `b`.
+            let selectors = _mm256_setr_epi8(
+                0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40,
+                42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+            );
+            let gathered = _mm256_permutex2var_epi8(a.into(), selectors, b.into());
+            gathered.simd_into(self)
+        }
+    }
+    /// Gathers the odd-indexed lanes of `a` followed by the odd-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        unsafe {
+            // Selectors 0..=31 address lanes of `a`; 32..=63 address lanes of `b`.
+            let selectors = _mm256_setr_epi8(
+                1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41,
+                43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+            );
+            let gathered = _mm256_permutex2var_epi8(a.into(), selectors, b.into());
+            gathered.simd_into(self)
+        }
+    }
+    /// Interleaves `a` and `b` lane by lane, returning the low and high halves of the result.
+    ///
+    /// The two outputs use the same permutation tables as `zip_low_u8x32` and `zip_high_u8x32`,
+    /// so this simply pairs those two operations.
+    #[inline(always)]
+    fn interleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
+        let low_half = self.zip_low_u8x32(a, b);
+        let high_half = self.zip_high_u8x32(a, b);
+        (low_half, high_half)
+    }
+    /// Separates the even-indexed and odd-indexed lanes of the pair `a`, `b`.
+    ///
+    /// The two outputs use the same permutation tables as `unzip_low_u8x32` and
+    /// `unzip_high_u8x32`, so this simply pairs those two operations.
+    #[inline(always)]
+    fn deinterleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
+        let even_lanes = self.unzip_low_u8x32(a, b);
+        let odd_lanes = self.unzip_high_u8x32(a, b);
+        (even_lanes, odd_lanes)
+    }
+    /// Picks lanes from `b` where the mask `a` is set and from `c` where it is clear.
+    #[inline(always)]
+    fn select_u8x32(self, a: mask8x32<Self>, b: u8x32<Self>, c: u8x32<Self>) -> u8x32<Self> {
+        let (when_set, when_clear) = (__m256i::from(b), __m256i::from(c));
+        // `_mm256_mask_blend_epi8` takes the mask-clear source first, the mask-set source second.
+        let picked = unsafe { _mm256_mask_blend_epi8(a.val, when_clear, when_set) };
+        picked.simd_into(self)
+    }
+    /// Unsigned per-lane minimum of `a` and `b` using `_mm256_min_epu8`.
+    #[inline(always)]
+    fn min_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (lhs, rhs) = (__m256i::from(a), __m256i::from(b));
+        let smaller = unsafe { _mm256_min_epu8(lhs, rhs) };
+        smaller.simd_into(self)
+    }
+    /// Unsigned per-lane maximum of `a` and `b` using `_mm256_max_epu8`.
+    #[inline(always)]
+    fn max_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (lhs, rhs) = (__m256i::from(a), __m256i::from(b));
+        let larger = unsafe { _mm256_max_epu8(lhs, rhs) };
+        larger.simd_into(self)
+    }
+    /// Builds a 64-lane vector with `a` as the lower 256 bits and `b` as the upper 256 bits.
+    #[inline(always)]
+    fn combine_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x64<Self> {
+        unsafe {
+            // The cast places `a` in the low half; the insert then overwrites the high half.
+            let low_half_set = _mm512_castsi256_si512(a.into());
+            let joined = _mm512_inserti64x4::<1>(low_half_set, b.into());
+            joined.simd_into(self)
+        }
+    }
+    /// Splits `a` into its lower and upper 128-bit halves, returned as `(low, high)`.
+    #[inline(always)]
+    fn split_u8x32(self, a: u8x32<Self>) -> (u8x16<Self>, u8x16<Self>) {
+        let whole = __m256i::from(a);
+        let lower = unsafe { _mm256_extracti128_si256::<0>(whole) };
+        let upper = unsafe { _mm256_extracti128_si256::<1>(whole) };
+        (lower.simd_into(self), upper.simd_into(self))
+    }
+    /// Zero-extends each of the 32 bytes to a 16-bit lane.
+    #[inline(always)]
+    fn widen_u8x32(self, a: u8x32<Self>) -> u16x32<Self> {
+        let bytes: __m256i = a.into();
+        let widened = unsafe { _mm512_cvtepu8_epi16(bytes) };
+        widened.simd_into(self)
+    }
+    /// Reinterprets the same 256 bits as eight `u32` lanes; no lane values are converted.
+    #[inline(always)]
+    fn reinterpret_u32_u8x32(self, a: u8x32<Self>) -> u32x8<Self> {
+        let raw: __m256i = a.into();
+        raw.simd_into(self)
+    }
+    /// Builds a mask with all 32 lane bits set (`val == true`) or all cleared.
+    #[inline(always)]
+    fn splat_mask8x32(self, val: bool) -> mask8x32<Self> {
+        let bits: u64 = if val { 0xFFFF_FFFF } else { 0 };
+        mask8x32 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Builds a mask from an `i8` array: each lane's bit is the most significant bit of its byte.
+    #[inline(always)]
+    fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
+        let lanes: __m256i = crate::transmute::checked_transmute_copy(&val);
+        let bits = unsafe { _mm256_movepi8_mask(lanes) };
+        mask8x32 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Expands the mask to an `i8` array: set bits become `-1` (all ones), clear bits become `0`.
+    #[inline(always)]
+    fn as_array_mask8x32(self, a: mask8x32<Self>) -> [i8; 32usize] {
+        let expanded = unsafe { _mm256_movm_epi8(a.val) };
+        crate::transmute::checked_transmute_copy(&expanded)
+    }
+    /// Builds a mask from the low 32 bits of `bits`; higher bits are discarded.
+    #[inline(always)]
+    fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32<Self> {
+        let lane_bits = bits & 0xFFFF_FFFFu64;
+        mask8x32 {
+            val: lane_bits as _,
+            simd: self,
+        }
+    }
+    /// Returns the mask as an integer with one bit per lane (bit `i` is lane `i`).
+    #[inline(always)]
+    fn to_bitmask_mask8x32(self, a: mask8x32<Self>) -> u64 {
+        let raw = u64::from(a.val);
+        raw & 0xFFFF_FFFFu64
+    }
+    /// Sets (`value == true`) or clears the bit for lane `index` in `a`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `index` is 32 or greater.
+    #[inline(always)]
+    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        let current = u64::from(a.val);
+        let lane_bit = 1u64 << index;
+        let updated = if value {
+            current | lane_bit
+        } else {
+            current & !lane_bit
+        };
+        *a = mask8x32 {
+            val: updated as _,
+            simd: self,
+        };
+    }
+    /// Lane-wise logical AND of two masks.
+    #[inline(always)]
+    fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        let (lhs, rhs) = (u64::from(a.val), u64::from(b.val));
+        let combined = (lhs & rhs) & 0xFFFF_FFFFu64;
+        mask8x32 {
+            val: combined as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise logical OR of two masks.
+    #[inline(always)]
+    fn or_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        let (lhs, rhs) = (u64::from(a.val), u64::from(b.val));
+        let combined = (lhs | rhs) & 0xFFFF_FFFFu64;
+        mask8x32 {
+            val: combined as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise logical XOR of two masks.
+    #[inline(always)]
+    fn xor_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        let (lhs, rhs) = (u64::from(a.val), u64::from(b.val));
+        let combined = (lhs ^ rhs) & 0xFFFF_FFFFu64;
+        mask8x32 {
+            val: combined as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise logical NOT of a mask, restricted to the 32 lane bits.
+    #[inline(always)]
+    fn not_mask8x32(self, a: mask8x32<Self>) -> mask8x32<Self> {
+        let inverted = !u64::from(a.val);
+        mask8x32 {
+            val: (inverted & 0xFFFF_FFFFu64) as _,
+            simd: self,
+        }
+    }
+    /// Per-lane select between masks: takes the bit from `b` where `a` is set, else from `c`.
+    #[inline(always)]
+    fn select_mask8x32(
+        self,
+        a: mask8x32<Self>,
+        b: mask8x32<Self>,
+        c: mask8x32<Self>,
+    ) -> mask8x32<Self> {
+        let cond = u64::from(a.val);
+        let from_b = cond & u64::from(b.val);
+        let from_c = !cond & u64::from(c.val);
+        mask8x32 {
+            val: ((from_b | from_c) & 0xFFFF_FFFFu64) as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise equality of two masks: a result bit is set where `a` and `b` agree.
+    #[inline(always)]
+    fn simd_eq_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        let differing = u64::from(a.val ^ b.val);
+        mask8x32 {
+            val: (!differing & 0xFFFF_FFFFu64) as _,
+            simd: self,
+        }
+    }
+    /// Returns `true` if at least one of the 32 lane bits is set.
+    #[inline(always)]
+    fn any_true_mask8x32(self, a: mask8x32<Self>) -> bool {
+        (u64::from(a.val) & 0xFFFF_FFFFu64) != 0
+    }
+    /// Returns `true` if all 32 lane bits are set.
+    #[inline(always)]
+    fn all_true_mask8x32(self, a: mask8x32<Self>) -> bool {
+        const ALL_LANES: u64 = 0xFFFF_FFFF;
+        (u64::from(a.val) & ALL_LANES) == ALL_LANES
+    }
+    /// Returns `true` if at least one of the 32 lane bits is clear.
+    #[inline(always)]
+    fn any_false_mask8x32(self, a: mask8x32<Self>) -> bool {
+        const ALL_LANES: u64 = 0xFFFF_FFFF;
+        (u64::from(a.val) & ALL_LANES) != ALL_LANES
+    }
+    /// Returns `true` if none of the 32 lane bits is set.
+    #[inline(always)]
+    fn all_false_mask8x32(self, a: mask8x32<Self>) -> bool {
+        (u64::from(a.val) & 0xFFFF_FFFFu64) == 0
+    }
+    /// Concatenates two 32-lane masks: `a` fills bits 0..32 and `b` fills bits 32..64.
+    #[inline(always)]
+    fn combine_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x64<Self> {
+        let low = u64::from(a.val);
+        let high = u64::from(b.val) << 32usize;
+        mask8x64 {
+            val: low | high,
+            simd: self,
+        }
+    }
+    /// Splits the mask into its low 16 lanes and its high 16 lanes, in that order.
+    #[inline(always)]
+    fn split_mask8x32(self, a: mask8x32<Self>) -> (mask8x16<Self>, mask8x16<Self>) {
+        let all = u64::from(a.val);
+        let lower = all & 0xFFFFu64;
+        let upper = (all >> 16usize) & 0xFFFFu64;
+        (
+            mask8x16 {
+                val: lower as _,
+                simd: self,
+            },
+            mask8x16 {
+                val: upper as _,
+                simd: self,
+            },
+        )
+    }
+    /// Broadcasts `val` to all 16 lanes.
+    #[inline(always)]
+    fn splat_i16x16(self, val: i16) -> i16x16<Self> {
+        let broadcast = unsafe { _mm256_set1_epi16(val) };
+        broadcast.simd_into(self)
+    }
+    /// Builds a vector from an array taken by value; element `i` becomes lane `i`.
+    #[inline(always)]
+    fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(&val);
+        i16x16 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Builds a vector by copying the referenced array; element `i` becomes lane `i`.
+    #[inline(always)]
+    fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(val);
+        i16x16 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Copies the 16 lanes out into a plain array.
+    #[inline(always)]
+    fn as_array_i16x16(self, a: i16x16<Self>) -> [i16; 16usize] {
+        let raw: &__m256i = &a.val.0;
+        crate::transmute::checked_transmute_copy::<__m256i, [i16; 16usize]>(raw)
+    }
+    /// Views the vector's storage as a shared reference to an array of lanes.
+    #[inline(always)]
+    fn as_array_ref_i16x16(self, a: &i16x16<Self>) -> &[i16; 16usize] {
+        let raw: &__m256i = &a.val.0;
+        crate::transmute::checked_cast_ref::<__m256i, [i16; 16usize]>(raw)
+    }
+    /// Views the vector's storage as a mutable reference to an array of lanes.
+    #[inline(always)]
+    fn as_array_mut_i16x16(self, a: &mut i16x16<Self>) -> &mut [i16; 16usize] {
+        let raw: &mut __m256i = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<__m256i, [i16; 16usize]>(raw)
+    }
+    /// Writes the 16 lanes of `a` into `dest`.
+    #[inline(always)]
+    fn store_array_i16x16(self, a: i16x16<Self>, dest: &mut [i16; 16usize]) -> () {
+        let src = (&raw const a.val.0).cast::<i16>();
+        let dst = dest.as_mut_ptr();
+        // `a` is a by-value local, so its storage cannot overlap `dest`.
+        unsafe {
+            core::ptr::copy_nonoverlapping(src, dst, 16usize);
+        }
+    }
+    /// Reinterprets the 32 bytes of `a` as 16 `i16` lanes without changing any bits.
+    #[inline(always)]
+    fn cvt_from_bytes_i16x16(self, a: u8x32<Self>) -> i16x16<Self> {
+        let storage = crate::transmute::checked_transmute_copy(&a.val);
+        i16x16 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Reinterprets the 16 `i16` lanes of `a` as 32 bytes without changing any bits.
+    #[inline(always)]
+    fn cvt_to_bytes_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
+        let storage = crate::transmute::checked_transmute_copy(&a.val);
+        u8x32 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Returns 16 consecutive lanes of the concatenation `a ++ b`, starting at lane `SHIFT`.
+    ///
+    /// `SHIFT == 0` yields `a`; any `SHIFT >= 16` yields `b` unchanged.
+    #[inline(always)]
+    fn slide_i16x16<const SHIFT: usize>(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        if SHIFT >= 16usize {
+            return b;
+        }
+        let lo_bytes = self.cvt_to_bytes_i16x16(a).val.0;
+        let hi_bytes = self.cvt_to_bytes_i16x16(b).val.0;
+        let shifted = unsafe {
+            let byte_ids = _mm256_setr_epi8(
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+                23, 24, 25, 26, 27, 28, 29, 30, 31,
+            );
+            // Each 16-bit lane is two bytes, so the byte offset is twice the lane shift.
+            let byte_offset = _mm256_set1_epi8((SHIFT * 2usize) as i8);
+            let idx = _mm256_add_epi8(byte_ids, byte_offset);
+            // Byte indices 0..=31 read from `lo_bytes`, 32..=63 from `hi_bytes`.
+            _mm256_permutex2var_epi8(lo_bytes, idx, hi_bytes)
+        };
+        self.cvt_from_bytes_i16x16(u8x32 {
+            val: crate::support::Aligned256(shifted),
+            simd: self,
+        })
+    }
+    /// Slide variant that defers to `dyn_alignr_256` with a byte shift of `SHIFT * 2`.
+    ///
+    /// `SHIFT == 0` returns `a` and `SHIFT >= 8` returns `b` without touching the helper.
+    /// NOTE(review): presumably the helper shifts each 128-bit block independently — confirm
+    /// against `dyn_alignr_256`.
+    #[inline(always)]
+    fn slide_within_blocks_i16x16<const SHIFT: usize>(
+        self,
+        a: i16x16<Self>,
+        b: i16x16<Self>,
+    ) -> i16x16<Self> {
+        match SHIFT {
+            0 => a,
+            8.. => b,
+            _ => {
+                let lo_bytes = self.cvt_to_bytes_i16x16(a).val.0;
+                let hi_bytes = self.cvt_to_bytes_i16x16(b).val.0;
+                let shifted = unsafe { dyn_alignr_256(hi_bytes, lo_bytes, SHIFT * 2usize) };
+                self.cvt_from_bytes_i16x16(u8x32 {
+                    val: crate::support::Aligned256(shifted),
+                    simd: self,
+                })
+            }
+        }
+    }
+    /// Lane-wise wrapping addition.
+    #[inline(always)]
+    fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let sum = unsafe { _mm256_add_epi16(lhs, rhs) };
+        sum.simd_into(self)
+    }
+    /// Lane-wise wrapping subtraction (`a - b`).
+    #[inline(always)]
+    fn sub_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let difference = unsafe { _mm256_sub_epi16(lhs, rhs) };
+        difference.simd_into(self)
+    }
+    /// Lane-wise multiplication keeping the low 16 bits of each product.
+    #[inline(always)]
+    fn mul_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let product = unsafe { _mm256_mullo_epi16(lhs, rhs) };
+        product.simd_into(self)
+    }
+    /// Bitwise AND of the two vectors.
+    #[inline(always)]
+    fn and_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_and_si256(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise OR of the two vectors.
+    #[inline(always)]
+    fn or_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_or_si256(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise XOR of the two vectors.
+    #[inline(always)]
+    fn xor_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_xor_si256(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise NOT, computed as XOR with an all-ones vector.
+    #[inline(always)]
+    fn not_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
+        // `-1i16` has every bit set, i.e. it equals `!0`.
+        let all_ones = self.splat_i16x16(-1);
+        self.xor_i16x16(a, all_ones)
+    }
+    /// Shifts every lane left by the same `shift` amount, filling with zeros.
+    #[inline(always)]
+    fn shl_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
+        let lanes: __m256i = a.into();
+        unsafe {
+            // The shift count is passed in the low bits of a 128-bit register.
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm256_sll_epi16(lanes, count).simd_into(self)
+        }
+    }
+    /// Shifts each lane of `a` left by the amount in the matching lane of `b`.
+    #[inline(always)]
+    fn shlv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (lanes, counts): (__m256i, __m256i) = (a.into(), b.into());
+        let shifted = unsafe { _mm256_sllv_epi16(lanes, counts) };
+        shifted.simd_into(self)
+    }
+    /// Arithmetic (sign-filling) right shift of every lane by the same `shift` amount.
+    #[inline(always)]
+    fn shr_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
+        let lanes: __m256i = a.into();
+        unsafe {
+            // The shift count is passed in the low bits of a 128-bit register.
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm256_sra_epi16(lanes, count).simd_into(self)
+        }
+    }
+    /// Arithmetic right shift of each lane of `a` by the amount in the matching lane of `b`.
+    #[inline(always)]
+    fn shrv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (lanes, counts): (__m256i, __m256i) = (a.into(), b.into());
+        let shifted = unsafe { _mm256_srav_epi16(lanes, counts) };
+        shifted.simd_into(self)
+    }
+    /// Lane-wise `a == b`, returned as one mask bit per lane.
+    #[inline(always)]
+    fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_cmpeq_epi16_mask(lhs, rhs) };
+        mask16x16 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise signed `a < b`, returned as one mask bit per lane.
+    #[inline(always)]
+    fn simd_lt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_cmplt_epi16_mask(lhs, rhs) };
+        mask16x16 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise signed `a <= b`, returned as one mask bit per lane.
+    #[inline(always)]
+    fn simd_le_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_cmple_epi16_mask(lhs, rhs) };
+        mask16x16 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise signed `a >= b`, returned as one mask bit per lane.
+    #[inline(always)]
+    fn simd_ge_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_cmpge_epi16_mask(lhs, rhs) };
+        mask16x16 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise signed `a > b`, returned as one mask bit per lane.
+    #[inline(always)]
+    fn simd_gt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_cmpgt_epi16_mask(lhs, rhs) };
+        mask16x16 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Interleaves the low 8 lanes of `a` and `b`: `a0, b0, a1, b1, …, a7, b7`.
+    #[inline(always)]
+    fn zip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (first, second): (__m256i, __m256i) = (a.into(), b.into());
+        unsafe {
+            // Indices 0..=15 select from `first`, 16..=31 from `second`.
+            let idx = _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+            _mm256_permutex2var_epi16(first, idx, second).simd_into(self)
+        }
+    }
+    /// Interleaves the high 8 lanes of `a` and `b`: `a8, b8, a9, b9, …, a15, b15`.
+    #[inline(always)]
+    fn zip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (first, second): (__m256i, __m256i) = (a.into(), b.into());
+        unsafe {
+            // Indices 0..=15 select from `first`, 16..=31 from `second`.
+            let idx =
+                _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+            _mm256_permutex2var_epi16(first, idx, second).simd_into(self)
+        }
+    }
+    /// Gathers the even-indexed lanes of the concatenation `a ++ b`.
+    #[inline(always)]
+    fn unzip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (first, second): (__m256i, __m256i) = (a.into(), b.into());
+        unsafe {
+            // Indices 0..=15 select from `first`, 16..=31 from `second`.
+            let idx =
+                _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+            _mm256_permutex2var_epi16(first, idx, second).simd_into(self)
+        }
+    }
+    /// Gathers the odd-indexed lanes of the concatenation `a ++ b`.
+    #[inline(always)]
+    fn unzip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (first, second): (__m256i, __m256i) = (a.into(), b.into());
+        unsafe {
+            // Indices 0..=15 select from `first`, 16..=31 from `second`.
+            let idx =
+                _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+            _mm256_permutex2var_epi16(first, idx, second).simd_into(self)
+        }
+    }
+    /// Interleaves `a` and `b` lane by lane, returning the low half of the interleaving
+    /// (`a0, b0, …, a7, b7`) and then the high half (`a8, b8, …, a15, b15`).
+    #[inline(always)]
+    fn interleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
+        // The two halves are exactly the zip_low / zip_high permutations.
+        let low_half = self.zip_low_i16x16(a, b);
+        let high_half = self.zip_high_i16x16(a, b);
+        (low_half, high_half)
+    }
+    /// Splits the concatenation `a ++ b` into its even-indexed lanes (first result) and its
+    /// odd-indexed lanes (second result).
+    #[inline(always)]
+    fn deinterleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
+        // The two results are exactly the unzip_low / unzip_high permutations.
+        let evens = self.unzip_low_i16x16(a, b);
+        let odds = self.unzip_high_i16x16(a, b);
+        (evens, odds)
+    }
+    /// Per-lane select: takes the lane from `b` where the mask bit in `a` is set, else from `c`.
+    #[inline(always)]
+    fn select_i16x16(self, a: mask16x16<Self>, b: i16x16<Self>, c: i16x16<Self>) -> i16x16<Self> {
+        let (if_set, if_clear): (__m256i, __m256i) = (b.into(), c.into());
+        let blended = unsafe { _mm256_mask_blend_epi16(a.val, if_clear, if_set) };
+        blended.simd_into(self)
+    }
+    /// Lane-wise signed minimum.
+    #[inline(always)]
+    fn min_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let smallest = unsafe { _mm256_min_epi16(lhs, rhs) };
+        smallest.simd_into(self)
+    }
+    /// Lane-wise signed maximum.
+    #[inline(always)]
+    fn max_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let largest = unsafe { _mm256_max_epi16(lhs, rhs) };
+        largest.simd_into(self)
+    }
+    /// Concatenates `a` (low 256 bits) and `b` (high 256 bits) into one `i16x32`.
+    #[inline(always)]
+    fn combine_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x32<Self> {
+        let (lower, upper): (__m256i, __m256i) = (a.into(), b.into());
+        unsafe {
+            // The cast leaves the upper half unspecified; the insert overwrites it with `upper`.
+            let widened = _mm512_castsi256_si512(lower);
+            _mm512_inserti64x4::<1>(widened, upper).simd_into(self)
+        }
+    }
+    /// Splits `a` into its low and high 128-bit halves, in that order.
+    #[inline(always)]
+    fn split_i16x16(self, a: i16x16<Self>) -> (i16x8<Self>, i16x8<Self>) {
+        let whole: __m256i = a.into();
+        unsafe {
+            let lower = _mm256_extracti128_si256::<0>(whole);
+            let upper = _mm256_extracti128_si256::<1>(whole);
+            (lower.simd_into(self), upper.simd_into(self))
+        }
+    }
+    /// Lane-wise wrapping negation, computed as `0 - a`.
+    #[inline(always)]
+    fn neg_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
+        let value: __m256i = a.into();
+        let negated = unsafe { _mm256_sub_epi16(_mm256_setzero_si256(), value) };
+        negated.simd_into(self)
+    }
+    /// Reinterprets the same 256 bits as 32 `u8` lanes; no lane values are converted.
+    #[inline(always)]
+    fn reinterpret_u8_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
+        let raw: __m256i = a.into();
+        raw.simd_into(self)
+    }
+    /// Reinterprets the same 256 bits as eight `u32` lanes; no lane values are converted.
+    #[inline(always)]
+    fn reinterpret_u32_i16x16(self, a: i16x16<Self>) -> u32x8<Self> {
+        let raw: __m256i = a.into();
+        raw.simd_into(self)
+    }
+    /// Broadcasts `val` to all 16 lanes.
+    #[inline(always)]
+    fn splat_u16x16(self, val: u16) -> u16x16<Self> {
+        // The intrinsic takes `i16`; the cast keeps the bit pattern.
+        let lane = val.cast_signed();
+        let broadcast = unsafe { _mm256_set1_epi16(lane) };
+        broadcast.simd_into(self)
+    }
+    /// Builds a vector from an array taken by value; element `i` becomes lane `i`.
+    #[inline(always)]
+    fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(&val);
+        u16x16 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Builds a vector by copying the referenced array; element `i` becomes lane `i`.
+    #[inline(always)]
+    fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(val);
+        u16x16 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Copies the 16 lanes out into a plain array.
+    #[inline(always)]
+    fn as_array_u16x16(self, a: u16x16<Self>) -> [u16; 16usize] {
+        let raw: &__m256i = &a.val.0;
+        crate::transmute::checked_transmute_copy::<__m256i, [u16; 16usize]>(raw)
+    }
+    /// Views the vector's storage as a shared reference to an array of lanes.
+    #[inline(always)]
+    fn as_array_ref_u16x16(self, a: &u16x16<Self>) -> &[u16; 16usize] {
+        let raw: &__m256i = &a.val.0;
+        crate::transmute::checked_cast_ref::<__m256i, [u16; 16usize]>(raw)
+    }
+    /// Views the vector's storage as a mutable reference to an array of lanes.
+    #[inline(always)]
+    fn as_array_mut_u16x16(self, a: &mut u16x16<Self>) -> &mut [u16; 16usize] {
+        let raw: &mut __m256i = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<__m256i, [u16; 16usize]>(raw)
+    }
+    /// Writes the 16 lanes of `a` into `dest`.
+    #[inline(always)]
+    fn store_array_u16x16(self, a: u16x16<Self>, dest: &mut [u16; 16usize]) -> () {
+        let src = (&raw const a.val.0).cast::<u16>();
+        let dst = dest.as_mut_ptr();
+        // `a` is a by-value local, so its storage cannot overlap `dest`.
+        unsafe {
+            core::ptr::copy_nonoverlapping(src, dst, 16usize);
+        }
+    }
+    /// Reinterprets the 32 bytes of `a` as 16 `u16` lanes without changing any bits.
+    #[inline(always)]
+    fn cvt_from_bytes_u16x16(self, a: u8x32<Self>) -> u16x16<Self> {
+        let storage = crate::transmute::checked_transmute_copy(&a.val);
+        u16x16 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Reinterprets the 16 `u16` lanes of `a` as 32 bytes without changing any bits.
+    #[inline(always)]
+    fn cvt_to_bytes_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
+        let storage = crate::transmute::checked_transmute_copy(&a.val);
+        u8x32 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Returns 16 consecutive lanes of the concatenation `a ++ b`, starting at lane `SHIFT`.
+    ///
+    /// `SHIFT == 0` yields `a`; any `SHIFT >= 16` yields `b` unchanged.
+    #[inline(always)]
+    fn slide_u16x16<const SHIFT: usize>(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        if SHIFT >= 16usize {
+            return b;
+        }
+        let lo_bytes = self.cvt_to_bytes_u16x16(a).val.0;
+        let hi_bytes = self.cvt_to_bytes_u16x16(b).val.0;
+        let shifted = unsafe {
+            let byte_ids = _mm256_setr_epi8(
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+                23, 24, 25, 26, 27, 28, 29, 30, 31,
+            );
+            // Each 16-bit lane is two bytes, so the byte offset is twice the lane shift.
+            let byte_offset = _mm256_set1_epi8((SHIFT * 2usize) as i8);
+            let idx = _mm256_add_epi8(byte_ids, byte_offset);
+            // Byte indices 0..=31 read from `lo_bytes`, 32..=63 from `hi_bytes`.
+            _mm256_permutex2var_epi8(lo_bytes, idx, hi_bytes)
+        };
+        self.cvt_from_bytes_u16x16(u8x32 {
+            val: crate::support::Aligned256(shifted),
+            simd: self,
+        })
+    }
+    /// Slide variant that defers to `dyn_alignr_256` with a byte shift of `SHIFT * 2`.
+    ///
+    /// `SHIFT == 0` returns `a` and `SHIFT >= 8` returns `b` without touching the helper.
+    /// NOTE(review): presumably the helper shifts each 128-bit block independently — confirm
+    /// against `dyn_alignr_256`.
+    #[inline(always)]
+    fn slide_within_blocks_u16x16<const SHIFT: usize>(
+        self,
+        a: u16x16<Self>,
+        b: u16x16<Self>,
+    ) -> u16x16<Self> {
+        match SHIFT {
+            0 => a,
+            8.. => b,
+            _ => {
+                let lo_bytes = self.cvt_to_bytes_u16x16(a).val.0;
+                let hi_bytes = self.cvt_to_bytes_u16x16(b).val.0;
+                let shifted = unsafe { dyn_alignr_256(hi_bytes, lo_bytes, SHIFT * 2usize) };
+                self.cvt_from_bytes_u16x16(u8x32 {
+                    val: crate::support::Aligned256(shifted),
+                    simd: self,
+                })
+            }
+        }
+    }
+    /// Lane-wise wrapping addition.
+    #[inline(always)]
+    fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let lhs: __m256i = a.into();
+        let rhs: __m256i = b.into();
+        let sum = unsafe { _mm256_add_epi16(lhs, rhs) };
+        sum.simd_into(self)
+    }
+    /// Lane-wise wrapping subtraction (`a - b`).
+    #[inline(always)]
+    fn sub_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let lhs: __m256i = a.into();
+        let rhs: __m256i = b.into();
+        let difference = unsafe { _mm256_sub_epi16(lhs, rhs) };
+        difference.simd_into(self)
+    }
+    /// Lane-wise multiplication keeping the low 16 bits of each product.
+    #[inline(always)]
+    fn mul_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let lhs: __m256i = a.into();
+        let rhs: __m256i = b.into();
+        let product = unsafe { _mm256_mullo_epi16(lhs, rhs) };
+        product.simd_into(self)
+    }
+    /// Bitwise AND of the two vectors.
+    #[inline(always)]
+    fn and_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let lhs: __m256i = a.into();
+        let rhs: __m256i = b.into();
+        let bits = unsafe { _mm256_and_si256(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise OR of the two vectors.
+    #[inline(always)]
+    fn or_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let lhs: __m256i = a.into();
+        let rhs: __m256i = b.into();
+        let bits = unsafe { _mm256_or_si256(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise XOR of the two vectors.
+    #[inline(always)]
+    fn xor_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let lhs: __m256i = a.into();
+        let rhs: __m256i = b.into();
+        let bits = unsafe { _mm256_xor_si256(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise NOT, computed as XOR with an all-ones vector.
+    #[inline(always)]
+    fn not_u16x16(self, a: u16x16<Self>) -> u16x16<Self> {
+        // `u16::MAX` has every bit set, i.e. it equals `!0`.
+        let all_ones = self.splat_u16x16(u16::MAX);
+        self.xor_u16x16(a, all_ones)
+    }
+    /// Shifts every lane left by the same `shift` amount, filling with zeros.
+    #[inline(always)]
+    fn shl_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
+        let lanes: __m256i = a.into();
+        unsafe {
+            // The shift count is passed in the low bits of a 128-bit register.
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm256_sll_epi16(lanes, count).simd_into(self)
+        }
+    }
+    /// Shifts each lane of `a` left by the amount in the matching lane of `b`.
+    #[inline(always)]
+    fn shlv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let lanes: __m256i = a.into();
+        let counts: __m256i = b.into();
+        let shifted = unsafe { _mm256_sllv_epi16(lanes, counts) };
+        shifted.simd_into(self)
+    }
+    /// Logical (zero-filling) right shift of every lane by the same `shift` amount.
+    #[inline(always)]
+    fn shr_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
+        let lanes: __m256i = a.into();
+        unsafe {
+            // The shift count is passed in the low bits of a 128-bit register.
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm256_srl_epi16(lanes, count).simd_into(self)
+        }
+    }
+    /// Logical right shift of each lane of `a` by the amount in the matching lane of `b`.
+    #[inline(always)]
+    fn shrv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let lanes: __m256i = a.into();
+        let counts: __m256i = b.into();
+        let shifted = unsafe { _mm256_srlv_epi16(lanes, counts) };
+        shifted.simd_into(self)
+    }
+    /// Lane-wise `a == b`, returned as one mask bit per lane.
+    #[inline(always)]
+    fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let lhs: __m256i = a.into();
+        let rhs: __m256i = b.into();
+        let bits = unsafe { _mm256_cmpeq_epu16_mask(lhs, rhs) };
+        mask16x16 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise unsigned `a < b`, returned as one mask bit per lane.
+    #[inline(always)]
+    fn simd_lt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let lhs: __m256i = a.into();
+        let rhs: __m256i = b.into();
+        let bits = unsafe { _mm256_cmplt_epu16_mask(lhs, rhs) };
+        mask16x16 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise unsigned `a <= b`, returned as one mask bit per lane.
+    #[inline(always)]
+    fn simd_le_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let lhs: __m256i = a.into();
+        let rhs: __m256i = b.into();
+        let bits = unsafe { _mm256_cmple_epu16_mask(lhs, rhs) };
+        mask16x16 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise unsigned `a >= b`, returned as one mask bit per lane.
+    #[inline(always)]
+    fn simd_ge_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let lhs: __m256i = a.into();
+        let rhs: __m256i = b.into();
+        let bits = unsafe { _mm256_cmpge_epu16_mask(lhs, rhs) };
+        mask16x16 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise unsigned `a > b`, returned as one mask bit per lane.
+    #[inline(always)]
+    fn simd_gt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let lhs: __m256i = a.into();
+        let rhs: __m256i = b.into();
+        let bits = unsafe { _mm256_cmpgt_epu16_mask(lhs, rhs) };
+        mask16x16 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Interleaves the low 8 lanes of `a` and `b`: `a0, b0, a1, b1, …, a7, b7`.
+    #[inline(always)]
+    fn zip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (first, second): (__m256i, __m256i) = (a.into(), b.into());
+        unsafe {
+            // Indices 0..=15 select from `first`, 16..=31 from `second`.
+            let idx = _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+            _mm256_permutex2var_epi16(first, idx, second).simd_into(self)
+        }
+    }
+    /// Interleaves the high 8 lanes of `a` and `b`: `a8, b8, a9, b9, …, a15, b15`.
+    #[inline(always)]
+    fn zip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (first, second): (__m256i, __m256i) = (a.into(), b.into());
+        unsafe {
+            // Indices 0..=15 select from `first`, 16..=31 from `second`.
+            let idx =
+                _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+            _mm256_permutex2var_epi16(first, idx, second).simd_into(self)
+        }
+    }
+    /// Gathers the even-indexed lanes of the concatenation `a ++ b`.
+    #[inline(always)]
+    fn unzip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (first, second): (__m256i, __m256i) = (a.into(), b.into());
+        unsafe {
+            // Indices 0..=15 select from `first`, 16..=31 from `second`.
+            let idx =
+                _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+            _mm256_permutex2var_epi16(first, idx, second).simd_into(self)
+        }
+    }
+    /// Gathers the odd-indexed lanes of the concatenation `a ++ b`.
+    #[inline(always)]
+    fn unzip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (first, second): (__m256i, __m256i) = (a.into(), b.into());
+        unsafe {
+            // Indices 0..=15 select from `first`, 16..=31 from `second`.
+            let idx =
+                _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+            _mm256_permutex2var_epi16(first, idx, second).simd_into(self)
+        }
+    }
+    /// Interleaves `a` and `b` lane by lane, returning the low half of the interleaving
+    /// (`a0, b0, …, a7, b7`) and then the high half (`a8, b8, …, a15, b15`).
+    #[inline(always)]
+    fn interleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
+        // The two halves are exactly the zip_low / zip_high permutations.
+        let low_half = self.zip_low_u16x16(a, b);
+        let high_half = self.zip_high_u16x16(a, b);
+        (low_half, high_half)
+    }
+    /// Splits the concatenation `a ++ b` into its even-indexed lanes (first result) and its
+    /// odd-indexed lanes (second result).
+    #[inline(always)]
+    fn deinterleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
+        // The two results are exactly the unzip_low / unzip_high permutations.
+        let evens = self.unzip_low_u16x16(a, b);
+        let odds = self.unzip_high_u16x16(a, b);
+        (evens, odds)
+    }
+    /// Per-lane select: takes the lane from `b` where the mask bit in `a` is set, else from `c`.
+    #[inline(always)]
+    fn select_u16x16(self, a: mask16x16<Self>, b: u16x16<Self>, c: u16x16<Self>) -> u16x16<Self> {
+        let if_set: __m256i = b.into();
+        let if_clear: __m256i = c.into();
+        let blended = unsafe { _mm256_mask_blend_epi16(a.val, if_clear, if_set) };
+        blended.simd_into(self)
+    }
+    /// Lane-wise unsigned minimum.
+    #[inline(always)]
+    fn min_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let lhs: __m256i = a.into();
+        let rhs: __m256i = b.into();
+        let smallest = unsafe { _mm256_min_epu16(lhs, rhs) };
+        smallest.simd_into(self)
+    }
+    /// Lane-wise unsigned maximum.
+    #[inline(always)]
+    fn max_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let lhs: __m256i = a.into();
+        let rhs: __m256i = b.into();
+        let largest = unsafe { _mm256_max_epu16(lhs, rhs) };
+        largest.simd_into(self)
+    }
+    /// Concatenates `a` (low 256 bits) and `b` (high 256 bits) into one `u16x32`.
+    #[inline(always)]
+    fn combine_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x32<Self> {
+        let (lower, upper): (__m256i, __m256i) = (a.into(), b.into());
+        unsafe {
+            // The cast leaves the upper half unspecified; the insert overwrites it with `upper`.
+            let widened = _mm512_castsi256_si512(lower);
+            _mm512_inserti64x4::<1>(widened, upper).simd_into(self)
+        }
+    }
+    /// Splits `a` into its low and high 128-bit halves, in that order.
+    #[inline(always)]
+    fn split_u16x16(self, a: u16x16<Self>) -> (u16x8<Self>, u16x8<Self>) {
+        let whole: __m256i = a.into();
+        unsafe {
+            let lower = _mm256_extracti128_si256::<0>(whole);
+            let upper = _mm256_extracti128_si256::<1>(whole);
+            (lower.simd_into(self), upper.simd_into(self))
+        }
+    }
+    /// Truncates each 16-bit lane to its low 8 bits, producing 16 bytes.
+    #[inline(always)]
+    fn narrow_u16x16(self, a: u16x16<Self>) -> u8x16<Self> {
+        let wide: __m256i = a.into();
+        let truncated = unsafe { _mm256_cvtepi16_epi8(wide) };
+        truncated.simd_into(self)
+    }
+    /// Reinterprets the same 256 bits as 32 `u8` lanes; no lane values are converted.
+    #[inline(always)]
+    fn reinterpret_u8_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
+        let raw: __m256i = a.into();
+        raw.simd_into(self)
+    }
+    /// Reinterprets the same 256 bits as eight `u32` lanes; no lane values are converted.
+    #[inline(always)]
+    fn reinterpret_u32_u16x16(self, a: u16x16<Self>) -> u32x8<Self> {
+        let raw: __m256i = a.into();
+        raw.simd_into(self)
+    }
+    /// Builds a mask with all 16 lane bits set (`val == true`) or all cleared.
+    #[inline(always)]
+    fn splat_mask16x16(self, val: bool) -> mask16x16<Self> {
+        let bits: u64 = if val { 0xFFFF } else { 0 };
+        mask16x16 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Builds a mask from an `i16` array: each lane's bit is the most significant bit of its
+    /// element.
+    #[inline(always)]
+    fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
+        let lanes: __m256i = crate::transmute::checked_transmute_copy(&val);
+        let bits = unsafe { _mm256_movepi16_mask(lanes) };
+        mask16x16 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Expands the mask to an `i16` array: set bits become `-1` (all ones), clear bits become `0`.
+    #[inline(always)]
+    fn as_array_mask16x16(self, a: mask16x16<Self>) -> [i16; 16usize] {
+        let expanded = unsafe { _mm256_movm_epi16(a.val) };
+        crate::transmute::checked_transmute_copy(&expanded)
+    }
+    /// Builds a mask from the low 16 bits of `bits`; higher bits are discarded.
+    #[inline(always)]
+    fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self> {
+        let lane_bits = bits & 0xFFFFu64;
+        mask16x16 {
+            val: lane_bits as _,
+            simd: self,
+        }
+    }
+    /// Returns the mask as an integer with one bit per lane (bit `i` is lane `i`).
+    #[inline(always)]
+    fn to_bitmask_mask16x16(self, a: mask16x16<Self>) -> u64 {
+        let raw = u64::from(a.val);
+        raw & 0xFFFFu64
+    }
+    /// Sets (`value == true`) or clears the bit for lane `index` in `a`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `index` is 16 or greater.
+    #[inline(always)]
+    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let current = u64::from(a.val);
+        let lane_bit = 1u64 << index;
+        let updated = if value {
+            current | lane_bit
+        } else {
+            current & !lane_bit
+        };
+        *a = mask16x16 {
+            val: updated as _,
+            simd: self,
+        };
+    }
+    /// Lane-wise logical AND of two masks.
+    #[inline(always)]
+    fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        let (lhs, rhs) = (u64::from(a.val), u64::from(b.val));
+        let combined = (lhs & rhs) & 0xFFFFu64;
+        mask16x16 {
+            val: combined as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise logical OR of two masks.
+    #[inline(always)]
+    fn or_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        let (lhs, rhs) = (u64::from(a.val), u64::from(b.val));
+        let combined = (lhs | rhs) & 0xFFFFu64;
+        mask16x16 {
+            val: combined as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise logical XOR of two masks.
+    #[inline(always)]
+    fn xor_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        let (lhs, rhs) = (u64::from(a.val), u64::from(b.val));
+        let combined = (lhs ^ rhs) & 0xFFFFu64;
+        mask16x16 {
+            val: combined as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise logical NOT of a mask, restricted to the 16 lane bits.
+    #[inline(always)]
+    fn not_mask16x16(self, a: mask16x16<Self>) -> mask16x16<Self> {
+        let inverted = !u64::from(a.val);
+        mask16x16 {
+            val: (inverted & 0xFFFFu64) as _,
+            simd: self,
+        }
+    }
+    /// Per-lane select between masks: takes the bit from `b` where `a` is set, else from `c`.
+    #[inline(always)]
+    fn select_mask16x16(
+        self,
+        a: mask16x16<Self>,
+        b: mask16x16<Self>,
+        c: mask16x16<Self>,
+    ) -> mask16x16<Self> {
+        let cond = u64::from(a.val);
+        let from_b = cond & u64::from(b.val);
+        let from_c = !cond & u64::from(c.val);
+        mask16x16 {
+            val: ((from_b | from_c) & 0xFFFFu64) as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise equality of two masks: a result bit is set where `a` and `b` agree.
+    #[inline(always)]
+    fn simd_eq_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        let differing = u64::from(a.val ^ b.val);
+        mask16x16 {
+            val: (!differing & 0xFFFFu64) as _,
+            simd: self,
+        }
+    }
+    /// Returns `true` if at least one of the 16 lane bits is set.
+    #[inline(always)]
+    fn any_true_mask16x16(self, a: mask16x16<Self>) -> bool {
+        (u64::from(a.val) & 0xFFFFu64) != 0
+    }
+    /// Returns `true` if all 16 lane bits are set.
+    #[inline(always)]
+    fn all_true_mask16x16(self, a: mask16x16<Self>) -> bool {
+        const ALL_LANES: u64 = 0xFFFF;
+        (u64::from(a.val) & ALL_LANES) == ALL_LANES
+    }
+    /// Returns `true` if at least one of the 16 lane bits is clear.
+    #[inline(always)]
+    fn any_false_mask16x16(self, a: mask16x16<Self>) -> bool {
+        const ALL_LANES: u64 = 0xFFFF;
+        (u64::from(a.val) & ALL_LANES) != ALL_LANES
+    }
+    /// Returns `true` if none of the 16 lane bits is set.
+    #[inline(always)]
+    fn all_false_mask16x16(self, a: mask16x16<Self>) -> bool {
+        (u64::from(a.val) & 0xFFFFu64) == 0
+    }
+    /// Concatenates two 16-lane masks: `a` fills bits 0..16 and `b` fills bits 16..32.
+    #[inline(always)]
+    fn combine_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x32<Self> {
+        let low = u64::from(a.val);
+        let high = u64::from(b.val) << 16usize;
+        mask16x32 {
+            val: ((low | high) & 0xFFFF_FFFFu64) as _,
+            simd: self,
+        }
+    }
+    /// Splits a 16-lane mask into two 8-lane masks: bits 0..8 first, bits 8..16 second.
+    #[inline(always)]
+    fn split_mask16x16(self, a: mask16x16<Self>) -> (mask16x8<Self>, mask16x8<Self>) {
+        let all = u64::from(a.val);
+        let low = mask16x8 {
+            val: (all & 0xFF) as _,
+            simd: self,
+        };
+        let high = mask16x8 {
+            val: ((all >> 8usize) & 0xFF) as _,
+            simd: self,
+        };
+        (low, high)
+    }
+    /// Broadcasts `val` into all eight `i32` lanes.
+    #[inline(always)]
+    fn splat_i32x8(self, val: i32) -> i32x8<Self> {
+        let broadcast = unsafe { _mm256_set1_epi32(val) };
+        broadcast.simd_into(self)
+    }
+    /// Builds an `i32x8` from an owned array by reinterpreting its bytes.
+    #[inline(always)]
+    fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
+        let raw = crate::transmute::checked_transmute_copy(&val);
+        i32x8 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Builds an `i32x8` from a borrowed array by copying and reinterpreting its bytes.
+    #[inline(always)]
+    fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
+        let raw = crate::transmute::checked_transmute_copy(val);
+        i32x8 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Copies the eight lanes out into a plain `[i32; 8]`.
+    #[inline(always)]
+    fn as_array_i32x8(self, a: i32x8<Self>) -> [i32; 8usize] {
+        let raw: &__m256i = &a.val.0;
+        crate::transmute::checked_transmute_copy::<__m256i, [i32; 8usize]>(raw)
+    }
+    /// Views the vector's storage as a shared reference to `[i32; 8]`.
+    #[inline(always)]
+    fn as_array_ref_i32x8(self, a: &i32x8<Self>) -> &[i32; 8usize] {
+        let raw: &__m256i = &a.val.0;
+        crate::transmute::checked_cast_ref::<__m256i, [i32; 8usize]>(raw)
+    }
+    /// Views the vector's storage as a mutable reference to `[i32; 8]`.
+    #[inline(always)]
+    fn as_array_mut_i32x8(self, a: &mut i32x8<Self>) -> &mut [i32; 8usize] {
+        let raw: &mut __m256i = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<__m256i, [i32; 8usize]>(raw)
+    }
+    /// Writes the eight lanes of `a` into `dest`.
+    #[inline(always)]
+    fn store_array_i32x8(self, a: i32x8<Self>, dest: &mut [i32; 8usize]) -> () {
+        let src = (&raw const a.val.0).cast::<i32>();
+        let dst = dest.as_mut_ptr();
+        // SAFETY: `src` points at the `__m256i` held by the by-value `a`, which is
+        // 8 `i32`s wide; `dst` comes from an exclusive reference to 8 `i32`s and so
+        // cannot overlap the local `a`.
+        unsafe {
+            core::ptr::copy_nonoverlapping(src, dst, 8usize);
+        }
+    }
+    /// Reinterprets the 32 bytes of a `u8x32` as an `i32x8` (bit-for-bit copy).
+    #[inline(always)]
+    fn cvt_from_bytes_i32x8(self, a: u8x32<Self>) -> i32x8<Self> {
+        let raw = crate::transmute::checked_transmute_copy(&a.val);
+        i32x8 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Reinterprets an `i32x8` as its 32 underlying bytes (bit-for-bit copy).
+    #[inline(always)]
+    fn cvt_to_bytes_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
+        let raw = crate::transmute::checked_transmute_copy(&a.val);
+        u8x32 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Takes eight consecutive lanes from the concatenation of `a` then `b`,
+    /// starting at lane `SHIFT`. `SHIFT >= 8` yields `b` unchanged.
+    #[inline(always)]
+    fn slide_i32x8<const SHIFT: usize>(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        if SHIFT >= 8usize {
+            return b;
+        }
+        let first = self.cvt_to_bytes_i32x8(a).val.0;
+        let second = self.cvt_to_bytes_i32x8(b).val.0;
+        // Each lane is 4 bytes wide, so the byte offset is SHIFT * 4.
+        let byte_offset = (SHIFT * 4usize) as i8;
+        let shuffled = unsafe {
+            let identity = _mm256_setr_epi8(
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+            );
+            // Indices of 32 and above select bytes from the second operand.
+            let idx = _mm256_add_epi8(identity, _mm256_set1_epi8(byte_offset));
+            _mm256_permutex2var_epi8(first, idx, second)
+        };
+        let bytes = u8x32 {
+            val: crate::support::Aligned256(shuffled),
+            simd: self,
+        };
+        self.cvt_from_bytes_i32x8(bytes)
+    }
+    /// Block-local variant of `slide`: `SHIFT == 0` returns `a`, `SHIFT >= 4` returns
+    /// `b`, and anything in between is delegated to `dyn_alignr_256` with a byte
+    /// offset of `SHIFT * 4`.
+    // NOTE(review): presumably `dyn_alignr_256` works per 128-bit block - confirm
+    // against its definition.
+    #[inline(always)]
+    fn slide_within_blocks_i32x8<const SHIFT: usize>(
+        self,
+        a: i32x8<Self>,
+        b: i32x8<Self>,
+    ) -> i32x8<Self> {
+        if SHIFT == 0 {
+            return a;
+        }
+        if SHIFT >= 4usize {
+            return b;
+        }
+        let first = self.cvt_to_bytes_i32x8(a).val.0;
+        let second = self.cvt_to_bytes_i32x8(b).val.0;
+        let shifted = unsafe { dyn_alignr_256(second, first, SHIFT * 4usize) };
+        let bytes = u8x32 {
+            val: crate::support::Aligned256(shifted),
+            simd: self,
+        };
+        self.cvt_from_bytes_i32x8(bytes)
+    }
+    /// Lane-wise addition of two `i32x8` vectors.
+    #[inline(always)]
+    fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let sum = unsafe { _mm256_add_epi32(lhs, rhs) };
+        sum.simd_into(self)
+    }
+    /// Lane-wise subtraction `a - b` of two `i32x8` vectors.
+    #[inline(always)]
+    fn sub_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let diff = unsafe { _mm256_sub_epi32(lhs, rhs) };
+        diff.simd_into(self)
+    }
+    /// Lane-wise multiplication, keeping the low 32 bits of each product.
+    #[inline(always)]
+    fn mul_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let product = unsafe { _mm256_mullo_epi32(lhs, rhs) };
+        product.simd_into(self)
+    }
+    /// Bitwise AND of two `i32x8` vectors.
+    #[inline(always)]
+    fn and_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_and_si256(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise OR of two `i32x8` vectors.
+    #[inline(always)]
+    fn or_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_or_si256(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise XOR of two `i32x8` vectors.
+    #[inline(always)]
+    fn xor_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_xor_si256(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise NOT, expressed as XOR with an all-ones scalar.
+    #[inline(always)]
+    fn not_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
+        let all_ones = !0;
+        a ^ all_ones
+    }
+    /// Shifts every lane left by the same `shift` count.
+    #[inline(always)]
+    fn shl_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
+        let shifted = unsafe {
+            // The shift count is passed in the low element of a 128-bit register.
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm256_sll_epi32(a.into(), count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Shifts each lane of `a` left by the count in the corresponding lane of `b`.
+    #[inline(always)]
+    fn shlv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (value, counts): (__m256i, __m256i) = (a.into(), b.into());
+        let shifted = unsafe { _mm256_sllv_epi32(value, counts) };
+        shifted.simd_into(self)
+    }
+    /// Arithmetic (sign-preserving) right shift of every lane by the same `shift` count.
+    #[inline(always)]
+    fn shr_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
+        let shifted = unsafe {
+            // The shift count is passed in the low element of a 128-bit register.
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm256_sra_epi32(a.into(), count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Arithmetic right shift of each lane of `a` by the count in the matching lane of `b`.
+    #[inline(always)]
+    fn shrv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (value, counts): (__m256i, __m256i) = (a.into(), b.into());
+        let shifted = unsafe { _mm256_srav_epi32(value, counts) };
+        shifted.simd_into(self)
+    }
+    /// Per-lane `a == b`, returned as a bitmask with one bit per lane.
+    #[inline(always)]
+    fn simd_eq_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        let bits = unsafe { _mm256_cmpeq_epi32_mask(a.into(), b.into()) };
+        mask32x8 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Per-lane signed `a < b`, returned as a bitmask with one bit per lane.
+    #[inline(always)]
+    fn simd_lt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        let bits = unsafe { _mm256_cmplt_epi32_mask(a.into(), b.into()) };
+        mask32x8 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Per-lane signed `a <= b`, returned as a bitmask with one bit per lane.
+    #[inline(always)]
+    fn simd_le_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        let bits = unsafe { _mm256_cmple_epi32_mask(a.into(), b.into()) };
+        mask32x8 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Per-lane signed `a >= b`, returned as a bitmask with one bit per lane.
+    #[inline(always)]
+    fn simd_ge_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        let bits = unsafe { _mm256_cmpge_epi32_mask(a.into(), b.into()) };
+        mask32x8 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Per-lane signed `a > b`, returned as a bitmask with one bit per lane.
+    #[inline(always)]
+    fn simd_gt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        let bits = unsafe { _mm256_cmpgt_epi32_mask(a.into(), b.into()) };
+        mask32x8 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Interleaves the low halves: `[a0, b0, a1, b1, a2, b2, a3, b3]`.
+    #[inline(always)]
+    fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let zipped = unsafe {
+            // Indices 0..8 pick from `a`, 8..16 pick from `b`.
+            let idx = _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11);
+            _mm256_permutex2var_epi32(a.into(), idx, b.into())
+        };
+        zipped.simd_into(self)
+    }
+    /// Interleaves the high halves: `[a4, b4, a5, b5, a6, b6, a7, b7]`.
+    #[inline(always)]
+    fn zip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let zipped = unsafe {
+            // Indices 0..8 pick from `a`, 8..16 pick from `b`.
+            let idx = _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15);
+            _mm256_permutex2var_epi32(a.into(), idx, b.into())
+        };
+        zipped.simd_into(self)
+    }
+    /// Gathers the even-indexed lanes of `a` followed by the even-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let evens = unsafe {
+            // Indices 0..8 pick from `a`, 8..16 pick from `b`.
+            let idx = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
+            _mm256_permutex2var_epi32(a.into(), idx, b.into())
+        };
+        evens.simd_into(self)
+    }
+    /// Gathers the odd-indexed lanes of `a` followed by the odd-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let odds = unsafe {
+            // Indices 0..8 pick from `a`, 8..16 pick from `b`.
+            let idx = _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15);
+            _mm256_permutex2var_epi32(a.into(), idx, b.into())
+        };
+        odds.simd_into(self)
+    }
+    /// Returns both interleavings of `a` and `b`: the low-half zip, then the high-half zip.
+    #[inline(always)]
+    fn interleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let (low, high) = unsafe {
+            let low_idx = _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11);
+            let high_idx = _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15);
+            (
+                _mm256_permutex2var_epi32(lhs, low_idx, rhs),
+                _mm256_permutex2var_epi32(lhs, high_idx, rhs),
+            )
+        };
+        (low.simd_into(self), high.simd_into(self))
+    }
+    /// Separates the concatenation of `a` and `b` into its even-indexed lanes and
+    /// its odd-indexed lanes, returned in that order.
+    #[inline(always)]
+    fn deinterleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let (evens, odds) = unsafe {
+            let even_idx = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
+            let odd_idx = _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15);
+            (
+                _mm256_permutex2var_epi32(lhs, even_idx, rhs),
+                _mm256_permutex2var_epi32(lhs, odd_idx, rhs),
+            )
+        };
+        (evens.simd_into(self), odds.simd_into(self))
+    }
+    /// Per-lane select: lanes where mask `a` is set come from `b`, the rest from `c`.
+    #[inline(always)]
+    fn select_i32x8(self, a: mask32x8<Self>, b: i32x8<Self>, c: i32x8<Self>) -> i32x8<Self> {
+        let (if_set, if_clear): (__m256i, __m256i) = (b.into(), c.into());
+        // The blend takes its second vector operand where the mask bit is 1.
+        let picked = unsafe { _mm256_mask_blend_epi32(a.val, if_clear, if_set) };
+        picked.simd_into(self)
+    }
+    /// Lane-wise signed minimum.
+    #[inline(always)]
+    fn min_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let smallest = unsafe { _mm256_min_epi32(lhs, rhs) };
+        smallest.simd_into(self)
+    }
+    /// Lane-wise signed maximum.
+    #[inline(always)]
+    fn max_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let largest = unsafe { _mm256_max_epi32(lhs, rhs) };
+        largest.simd_into(self)
+    }
+    /// Concatenates two 256-bit vectors into one 512-bit vector: `a` low, `b` high.
+    #[inline(always)]
+    fn combine_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x16<Self> {
+        let joined = unsafe {
+            // Widen `a` into the low half, then insert `b` as the upper 256 bits.
+            let widened = _mm512_castsi256_si512(a.into());
+            _mm512_inserti64x4::<1>(widened, b.into())
+        };
+        joined.simd_into(self)
+    }
+    /// Splits into the low and high 128-bit halves, in that order.
+    #[inline(always)]
+    fn split_i32x8(self, a: i32x8<Self>) -> (i32x4<Self>, i32x4<Self>) {
+        let whole: __m256i = a.into();
+        let (low, high) = unsafe {
+            (
+                _mm256_extracti128_si256::<0>(whole),
+                _mm256_extracti128_si256::<1>(whole),
+            )
+        };
+        (low.simd_into(self), high.simd_into(self))
+    }
+    /// Lane-wise negation, computed as `0 - a`.
+    #[inline(always)]
+    fn neg_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
+        let negated = unsafe {
+            let zero = _mm256_setzero_si256();
+            _mm256_sub_epi32(zero, a.into())
+        };
+        negated.simd_into(self)
+    }
+    /// Reinterprets the same 256 bits as a `u8x32`.
+    #[inline(always)]
+    fn reinterpret_u8_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
+        let raw: __m256i = a.into();
+        raw.simd_into(self)
+    }
+    /// Reinterprets the same 256 bits as a `u32x8`.
+    #[inline(always)]
+    fn reinterpret_u32_i32x8(self, a: i32x8<Self>) -> u32x8<Self> {
+        let raw: __m256i = a.into();
+        raw.simd_into(self)
+    }
+    /// Converts each signed 32-bit lane to `f32`.
+    #[inline(always)]
+    fn cvt_f32_i32x8(self, a: i32x8<Self>) -> f32x8<Self> {
+        let ints: __m256i = a.into();
+        let floats = unsafe { _mm256_cvtepi32_ps(ints) };
+        floats.simd_into(self)
+    }
+    /// Broadcasts `val` into all eight `u32` lanes.
+    #[inline(always)]
+    fn splat_u32x8(self, val: u32) -> u32x8<Self> {
+        // The intrinsic takes an `i32`; the bit pattern is unchanged by the cast.
+        let signed = val.cast_signed();
+        let broadcast = unsafe { _mm256_set1_epi32(signed) };
+        broadcast.simd_into(self)
+    }
+    /// Builds a `u32x8` from an owned array by reinterpreting its bytes.
+    #[inline(always)]
+    fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
+        let raw = crate::transmute::checked_transmute_copy(&val);
+        u32x8 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Builds a `u32x8` from a borrowed array by copying and reinterpreting its bytes.
+    #[inline(always)]
+    fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
+        let raw = crate::transmute::checked_transmute_copy(val);
+        u32x8 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Copies the eight lanes out into a plain `[u32; 8]`.
+    #[inline(always)]
+    fn as_array_u32x8(self, a: u32x8<Self>) -> [u32; 8usize] {
+        let raw: &__m256i = &a.val.0;
+        crate::transmute::checked_transmute_copy::<__m256i, [u32; 8usize]>(raw)
+    }
+    /// Views the vector's storage as a shared reference to `[u32; 8]`.
+    #[inline(always)]
+    fn as_array_ref_u32x8(self, a: &u32x8<Self>) -> &[u32; 8usize] {
+        let raw: &__m256i = &a.val.0;
+        crate::transmute::checked_cast_ref::<__m256i, [u32; 8usize]>(raw)
+    }
+    /// Views the vector's storage as a mutable reference to `[u32; 8]`.
+    #[inline(always)]
+    fn as_array_mut_u32x8(self, a: &mut u32x8<Self>) -> &mut [u32; 8usize] {
+        let raw: &mut __m256i = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<__m256i, [u32; 8usize]>(raw)
+    }
+    /// Writes the eight lanes of `a` into `dest`.
+    #[inline(always)]
+    fn store_array_u32x8(self, a: u32x8<Self>, dest: &mut [u32; 8usize]) -> () {
+        let src = (&raw const a.val.0).cast::<u32>();
+        let dst = dest.as_mut_ptr();
+        // SAFETY: `src` points at the `__m256i` held by the by-value `a`, which is
+        // 8 `u32`s wide; `dst` comes from an exclusive reference to 8 `u32`s and so
+        // cannot overlap the local `a`.
+        unsafe {
+            core::ptr::copy_nonoverlapping(src, dst, 8usize);
+        }
+    }
+    /// Reinterprets the 32 bytes of a `u8x32` as a `u32x8` (bit-for-bit copy).
+    #[inline(always)]
+    fn cvt_from_bytes_u32x8(self, a: u8x32<Self>) -> u32x8<Self> {
+        let raw = crate::transmute::checked_transmute_copy(&a.val);
+        u32x8 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Reinterprets a `u32x8` as its 32 underlying bytes (bit-for-bit copy).
+    #[inline(always)]
+    fn cvt_to_bytes_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
+        let raw = crate::transmute::checked_transmute_copy(&a.val);
+        u8x32 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Takes eight consecutive lanes from the concatenation of `a` then `b`,
+    /// starting at lane `SHIFT`. `SHIFT >= 8` yields `b` unchanged.
+    #[inline(always)]
+    fn slide_u32x8<const SHIFT: usize>(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        if SHIFT >= 8usize {
+            return b;
+        }
+        let first = self.cvt_to_bytes_u32x8(a).val.0;
+        let second = self.cvt_to_bytes_u32x8(b).val.0;
+        // Each lane is 4 bytes wide, so the byte offset is SHIFT * 4.
+        let byte_offset = (SHIFT * 4usize) as i8;
+        let shuffled = unsafe {
+            let identity = _mm256_setr_epi8(
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+            );
+            // Indices of 32 and above select bytes from the second operand.
+            let idx = _mm256_add_epi8(identity, _mm256_set1_epi8(byte_offset));
+            _mm256_permutex2var_epi8(first, idx, second)
+        };
+        let bytes = u8x32 {
+            val: crate::support::Aligned256(shuffled),
+            simd: self,
+        };
+        self.cvt_from_bytes_u32x8(bytes)
+    }
+    /// Block-local variant of `slide`: `SHIFT == 0` returns `a`, `SHIFT >= 4` returns
+    /// `b`, and anything in between is delegated to `dyn_alignr_256` with a byte
+    /// offset of `SHIFT * 4`.
+    // NOTE(review): presumably `dyn_alignr_256` works per 128-bit block - confirm
+    // against its definition.
+    #[inline(always)]
+    fn slide_within_blocks_u32x8<const SHIFT: usize>(
+        self,
+        a: u32x8<Self>,
+        b: u32x8<Self>,
+    ) -> u32x8<Self> {
+        if SHIFT == 0 {
+            return a;
+        }
+        if SHIFT >= 4usize {
+            return b;
+        }
+        let first = self.cvt_to_bytes_u32x8(a).val.0;
+        let second = self.cvt_to_bytes_u32x8(b).val.0;
+        let shifted = unsafe { dyn_alignr_256(second, first, SHIFT * 4usize) };
+        let bytes = u8x32 {
+            val: crate::support::Aligned256(shifted),
+            simd: self,
+        };
+        self.cvt_from_bytes_u32x8(bytes)
+    }
+    /// Lane-wise addition of two `u32x8` vectors.
+    #[inline(always)]
+    fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let sum = unsafe { _mm256_add_epi32(lhs, rhs) };
+        sum.simd_into(self)
+    }
+    /// Lane-wise subtraction `a - b` of two `u32x8` vectors.
+    #[inline(always)]
+    fn sub_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let diff = unsafe { _mm256_sub_epi32(lhs, rhs) };
+        diff.simd_into(self)
+    }
+    /// Lane-wise multiplication, keeping the low 32 bits of each product.
+    #[inline(always)]
+    fn mul_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let product = unsafe { _mm256_mullo_epi32(lhs, rhs) };
+        product.simd_into(self)
+    }
+    /// Bitwise AND of two `u32x8` vectors.
+    #[inline(always)]
+    fn and_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_and_si256(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise OR of two `u32x8` vectors.
+    #[inline(always)]
+    fn or_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_or_si256(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise XOR of two `u32x8` vectors.
+    #[inline(always)]
+    fn xor_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_xor_si256(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise NOT, expressed as XOR with an all-ones scalar.
+    #[inline(always)]
+    fn not_u32x8(self, a: u32x8<Self>) -> u32x8<Self> {
+        let all_ones = !0;
+        a ^ all_ones
+    }
+    /// Shifts every lane left by the same `shift` count.
+    #[inline(always)]
+    fn shl_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
+        let shifted = unsafe {
+            // The shift count is passed in the low element of a 128-bit register.
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm256_sll_epi32(a.into(), count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Shifts each lane of `a` left by the count in the corresponding lane of `b`.
+    #[inline(always)]
+    fn shlv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (value, counts): (__m256i, __m256i) = (a.into(), b.into());
+        let shifted = unsafe { _mm256_sllv_epi32(value, counts) };
+        shifted.simd_into(self)
+    }
+    /// Logical (zero-filling) right shift of every lane by the same `shift` count.
+    #[inline(always)]
+    fn shr_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
+        let shifted = unsafe {
+            // The shift count is passed in the low element of a 128-bit register.
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm256_srl_epi32(a.into(), count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Logical right shift of each lane of `a` by the count in the matching lane of `b`.
+    #[inline(always)]
+    fn shrv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (value, counts): (__m256i, __m256i) = (a.into(), b.into());
+        let shifted = unsafe { _mm256_srlv_epi32(value, counts) };
+        shifted.simd_into(self)
+    }
+    /// Per-lane `a == b`, returned as a bitmask with one bit per lane.
+    #[inline(always)]
+    fn simd_eq_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        let bits = unsafe { _mm256_cmpeq_epu32_mask(a.into(), b.into()) };
+        mask32x8 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Per-lane unsigned `a < b`, returned as a bitmask with one bit per lane.
+    #[inline(always)]
+    fn simd_lt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        let bits = unsafe { _mm256_cmplt_epu32_mask(a.into(), b.into()) };
+        mask32x8 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Per-lane unsigned `a <= b`, returned as a bitmask with one bit per lane.
+    #[inline(always)]
+    fn simd_le_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        let bits = unsafe { _mm256_cmple_epu32_mask(a.into(), b.into()) };
+        mask32x8 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Per-lane unsigned `a >= b`, returned as a bitmask with one bit per lane.
+    #[inline(always)]
+    fn simd_ge_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        let bits = unsafe { _mm256_cmpge_epu32_mask(a.into(), b.into()) };
+        mask32x8 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Per-lane unsigned `a > b`, returned as a bitmask with one bit per lane.
+    #[inline(always)]
+    fn simd_gt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        let bits = unsafe { _mm256_cmpgt_epu32_mask(a.into(), b.into()) };
+        mask32x8 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Interleaves the low halves: `[a0, b0, a1, b1, a2, b2, a3, b3]`.
+    #[inline(always)]
+    fn zip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let zipped = unsafe {
+            // Indices 0..8 pick from `a`, 8..16 pick from `b`.
+            let idx = _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11);
+            _mm256_permutex2var_epi32(a.into(), idx, b.into())
+        };
+        zipped.simd_into(self)
+    }
+    /// Interleaves the high halves: `[a4, b4, a5, b5, a6, b6, a7, b7]`.
+    #[inline(always)]
+    fn zip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let zipped = unsafe {
+            // Indices 0..8 pick from `a`, 8..16 pick from `b`.
+            let idx = _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15);
+            _mm256_permutex2var_epi32(a.into(), idx, b.into())
+        };
+        zipped.simd_into(self)
+    }
+    /// Gathers the even-indexed lanes of `a` followed by the even-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let evens = unsafe {
+            // Indices 0..8 pick from `a`, 8..16 pick from `b`.
+            let idx = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
+            _mm256_permutex2var_epi32(a.into(), idx, b.into())
+        };
+        evens.simd_into(self)
+    }
+    /// Gathers the odd-indexed lanes of `a` followed by the odd-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let odds = unsafe {
+            // Indices 0..8 pick from `a`, 8..16 pick from `b`.
+            let idx = _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15);
+            _mm256_permutex2var_epi32(a.into(), idx, b.into())
+        };
+        odds.simd_into(self)
+    }
+    /// Returns both interleavings of `a` and `b`: the low-half zip, then the high-half zip.
+    #[inline(always)]
+    fn interleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let (low, high) = unsafe {
+            let low_idx = _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11);
+            let high_idx = _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15);
+            (
+                _mm256_permutex2var_epi32(lhs, low_idx, rhs),
+                _mm256_permutex2var_epi32(lhs, high_idx, rhs),
+            )
+        };
+        (low.simd_into(self), high.simd_into(self))
+    }
+    /// Separates the concatenation of `a` and `b` into its even-indexed lanes and
+    /// its odd-indexed lanes, returned in that order.
+    #[inline(always)]
+    fn deinterleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let (evens, odds) = unsafe {
+            let even_idx = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
+            let odd_idx = _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15);
+            (
+                _mm256_permutex2var_epi32(lhs, even_idx, rhs),
+                _mm256_permutex2var_epi32(lhs, odd_idx, rhs),
+            )
+        };
+        (evens.simd_into(self), odds.simd_into(self))
+    }
+    /// Per-lane select: lanes where mask `a` is set come from `b`, the rest from `c`.
+    #[inline(always)]
+    fn select_u32x8(self, a: mask32x8<Self>, b: u32x8<Self>, c: u32x8<Self>) -> u32x8<Self> {
+        let (if_set, if_clear): (__m256i, __m256i) = (b.into(), c.into());
+        // The blend takes its second vector operand where the mask bit is 1.
+        let picked = unsafe { _mm256_mask_blend_epi32(a.val, if_clear, if_set) };
+        picked.simd_into(self)
+    }
+    /// Lane-wise unsigned minimum.
+    #[inline(always)]
+    fn min_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let smallest = unsafe { _mm256_min_epu32(lhs, rhs) };
+        smallest.simd_into(self)
+    }
+    /// Lane-wise unsigned maximum.
+    #[inline(always)]
+    fn max_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let largest = unsafe { _mm256_max_epu32(lhs, rhs) };
+        largest.simd_into(self)
+    }
+    /// Concatenates two 256-bit vectors into one 512-bit vector: `a` low, `b` high.
+    #[inline(always)]
+    fn combine_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x16<Self> {
+        let joined = unsafe {
+            // Widen `a` into the low half, then insert `b` as the upper 256 bits.
+            let widened = _mm512_castsi256_si512(a.into());
+            _mm512_inserti64x4::<1>(widened, b.into())
+        };
+        joined.simd_into(self)
+    }
+    /// Splits into the low and high 128-bit halves, in that order.
+    #[inline(always)]
+    fn split_u32x8(self, a: u32x8<Self>) -> (u32x4<Self>, u32x4<Self>) {
+        let whole: __m256i = a.into();
+        let (low, high) = unsafe {
+            (
+                _mm256_extracti128_si256::<0>(whole),
+                _mm256_extracti128_si256::<1>(whole),
+            )
+        };
+        (low.simd_into(self), high.simd_into(self))
+    }
+    /// Reinterprets the same 256 bits as a `u8x32`.
+    #[inline(always)]
+    fn reinterpret_u8_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
+        let raw: __m256i = a.into();
+        raw.simd_into(self)
+    }
+    /// Converts each unsigned 32-bit lane to `f32` by splitting it into 16-bit
+    /// halves, embedding each half in the mantissa of a biased float, and summing.
+    #[inline(always)]
+    fn cvt_f32_u32x8(self, a: u32x8<Self>) -> f32x8<Self> {
+        let ints: __m256i = a.into();
+        let floats = unsafe {
+            // 0x4B000000 is 2^23 as f32 bits: low 16 bits of each lane become
+            // the float `2^23 + low`.
+            let low_biased = _mm256_blend_epi16::<0xAA>(ints, _mm256_set1_epi32(0x4B000000));
+            // 0x53000000 is 2^39 as f32 bits: high 16 bits become `2^39 + high * 2^16`.
+            let high_halves = _mm256_srli_epi32::<16>(ints);
+            let high_biased =
+                _mm256_blend_epi16::<0xAA>(high_halves, _mm256_set1_epi32(0x53000000));
+            // 0x53000080 is 2^39 + 2^23, which removes both biases at once.
+            let both_biases = _mm256_set1_ps(f32::from_bits(0x53000080));
+            let high_part = _mm256_sub_ps(_mm256_castsi256_ps(high_biased), both_biases);
+            _mm256_add_ps(_mm256_castsi256_ps(low_biased), high_part)
+        };
+        floats.simd_into(self)
+    }
+    /// Builds an 8-lane mask with every lane set (`true`) or every lane clear (`false`).
+    #[inline(always)]
+    fn splat_mask32x8(self, val: bool) -> mask32x8<Self> {
+        let bits: u64 = match val {
+            true => 255u64,
+            false => 0,
+        };
+        mask32x8 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Builds a mask from eight `i32` lanes; `_mm256_movepi32_mask` sets each mask
+    /// bit from the most significant bit of the corresponding lane.
+    #[inline(always)]
+    fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
+        let lanes: __m256i = crate::transmute::checked_transmute_copy(&val);
+        let bits = unsafe { _mm256_movepi32_mask(lanes) };
+        mask32x8 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Expands the mask into eight `i32` lanes: all ones where the bit is set,
+    /// zero where it is clear.
+    #[inline(always)]
+    fn as_array_mask32x8(self, a: mask32x8<Self>) -> [i32; 8usize] {
+        let expanded = unsafe { _mm256_movm_epi32(a.val) };
+        crate::transmute::checked_transmute_copy(&expanded)
+    }
+    /// Builds an 8-lane mask from the low 8 bits of `bits`; higher bits are ignored.
+    #[inline(always)]
+    fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self> {
+        let low = bits & 0xFF;
+        mask32x8 {
+            val: low as _,
+            simd: self,
+        }
+    }
+    /// Returns the mask as an integer with one bit per lane in the low 8 bits.
+    #[inline(always)]
+    fn to_bitmask_mask32x8(self, a: mask32x8<Self>) -> u64 {
+        let raw = u64::from(a.val);
+        raw & 0xFF
+    }
+    /// Sets or clears the lane at `index` in place.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `index` is not below 8.
+    #[inline(always)]
+    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let lane_bit = 1u64 << index;
+        let current = u64::from(a.val);
+        let updated = match value {
+            true => current | lane_bit,
+            false => current & !lane_bit,
+        };
+        *a = mask32x8 {
+            val: updated as _,
+            simd: self,
+        };
+    }
+    /// Lane-wise AND of two 8-lane masks; only the low 8 bits are kept.
+    #[inline(always)]
+    fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        let bits = (lhs & rhs) & 0xFF;
+        mask32x8 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise OR of two 8-lane masks; only the low 8 bits are kept.
+    #[inline(always)]
+    fn or_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        let bits = (lhs | rhs) & 0xFF;
+        mask32x8 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise XOR of two 8-lane masks; only the low 8 bits are kept.
+    #[inline(always)]
+    fn xor_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        let bits = (lhs ^ rhs) & 0xFF;
+        mask32x8 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Inverts every lane of an 8-lane mask; only the low 8 bits are kept.
+    #[inline(always)]
+    fn not_mask32x8(self, a: mask32x8<Self>) -> mask32x8<Self> {
+        let inverted = !u64::from(a.val);
+        let bits = inverted & 0xFF;
+        mask32x8 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Per-lane select between masks: where `a` is set take the lane from `b`,
+    /// otherwise take it from `c`. Only the low 8 bits are kept.
+    #[inline(always)]
+    fn select_mask32x8(
+        self,
+        a: mask32x8<Self>,
+        b: mask32x8<Self>,
+        c: mask32x8<Self>,
+    ) -> mask32x8<Self> {
+        let cond = u64::from(a.val);
+        let when_set = cond & u64::from(b.val);
+        let when_clear = !cond & u64::from(c.val);
+        let bits = (when_set | when_clear) & 0xFF;
+        mask32x8 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Per-lane equality of two 8-lane masks (bitwise XNOR on the low 8 bits).
+    #[inline(always)]
+    fn simd_eq_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        let differing = u64::from(a.val ^ b.val);
+        let bits = !differing & 0xFF;
+        mask32x8 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Returns `true` if at least one of the 8 lanes is set.
+    #[inline(always)]
+    fn any_true_mask32x8(self, a: mask32x8<Self>) -> bool {
+        (u64::from(a.val) & 0xFF) != 0
+    }
+    /// Returns `true` if all 8 lanes are set.
+    #[inline(always)]
+    fn all_true_mask32x8(self, a: mask32x8<Self>) -> bool {
+        (u64::from(a.val) & 0xFF) == 0xFF
+    }
+    /// Returns `true` if at least one of the 8 lanes is clear.
+    #[inline(always)]
+    fn any_false_mask32x8(self, a: mask32x8<Self>) -> bool {
+        (u64::from(a.val) & 0xFF) != 0xFF
+    }
+    /// Returns `true` if none of the 8 lanes is set.
+    #[inline(always)]
+    fn all_false_mask32x8(self, a: mask32x8<Self>) -> bool {
+        (u64::from(a.val) & 0xFF) == 0
+    }
+    /// Concatenates two 8-lane masks into a 16-lane mask: `a` supplies bits 0..8
+    /// and `b` supplies bits 8..16.
+    #[inline(always)]
+    fn combine_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x16<Self> {
+        let low = u64::from(a.val);
+        let high = u64::from(b.val) << 8usize;
+        let merged = (low | high) & 0xFFFF;
+        mask32x16 {
+            val: merged as _,
+            simd: self,
+        }
+    }
+    /// Splits an 8-lane mask into two 4-lane masks: bits 0..4 first, bits 4..8 second.
+    #[inline(always)]
+    fn split_mask32x8(self, a: mask32x8<Self>) -> (mask32x4<Self>, mask32x4<Self>) {
+        let all = u64::from(a.val);
+        let low = mask32x4 {
+            val: (all & 0xF) as _,
+            simd: self,
+        };
+        let high = mask32x4 {
+            val: ((all >> 4usize) & 0xF) as _,
+            simd: self,
+        };
+        (low, high)
+    }
+    /// Broadcasts `val` into all four `f64` lanes.
+    #[inline(always)]
+    fn splat_f64x4(self, val: f64) -> f64x4<Self> {
+        let broadcast = unsafe { _mm256_set1_pd(val) };
+        broadcast.simd_into(self)
+    }
+    /// Builds an `f64x4` from an owned array by reinterpreting its bytes.
+    #[inline(always)]
+    fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
+        let raw = crate::transmute::checked_transmute_copy(&val);
+        f64x4 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Builds an `f64x4` from a borrowed array by copying and reinterpreting its bytes.
+    #[inline(always)]
+    fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
+        let raw = crate::transmute::checked_transmute_copy(val);
+        f64x4 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Copies the four lanes out into a plain `[f64; 4]`.
+    #[inline(always)]
+    fn as_array_f64x4(self, a: f64x4<Self>) -> [f64; 4usize] {
+        let raw: &__m256d = &a.val.0;
+        crate::transmute::checked_transmute_copy::<__m256d, [f64; 4usize]>(raw)
+    }
+    /// Views the vector's storage as a shared reference to `[f64; 4]`.
+    #[inline(always)]
+    fn as_array_ref_f64x4(self, a: &f64x4<Self>) -> &[f64; 4usize] {
+        let raw: &__m256d = &a.val.0;
+        crate::transmute::checked_cast_ref::<__m256d, [f64; 4usize]>(raw)
+    }
+    /// Views the vector's storage as a mutable reference to `[f64; 4]`.
+    #[inline(always)]
+    fn as_array_mut_f64x4(self, a: &mut f64x4<Self>) -> &mut [f64; 4usize] {
+        let raw: &mut __m256d = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<__m256d, [f64; 4usize]>(raw)
+    }
+    /// Writes the four lanes of `a` into `dest`.
+    #[inline(always)]
+    fn store_array_f64x4(self, a: f64x4<Self>, dest: &mut [f64; 4usize]) -> () {
+        let src = (&raw const a.val.0).cast::<f64>();
+        let dst = dest.as_mut_ptr();
+        // SAFETY: `src` points at the `__m256d` held by the by-value `a`, which is
+        // 4 `f64`s wide; `dst` comes from an exclusive reference to 4 `f64`s and so
+        // cannot overlap the local `a`.
+        unsafe {
+            core::ptr::copy_nonoverlapping(src, dst, 4usize);
+        }
+    }
+    /// Reinterprets the 32 bytes of a `u8x32` as an `f64x4` (bit-for-bit copy).
+    #[inline(always)]
+    fn cvt_from_bytes_f64x4(self, a: u8x32<Self>) -> f64x4<Self> {
+        let raw = crate::transmute::checked_transmute_copy(&a.val);
+        f64x4 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Reinterprets an `f64x4` as its 32 underlying bytes (bit-for-bit copy).
+    #[inline(always)]
+    fn cvt_to_bytes_f64x4(self, a: f64x4<Self>) -> u8x32<Self> {
+        let raw = crate::transmute::checked_transmute_copy(&a.val);
+        u8x32 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Takes four consecutive lanes from the concatenation of `a` then `b`,
+    /// starting at lane `SHIFT`. `SHIFT >= 4` yields `b` unchanged.
+    #[inline(always)]
+    fn slide_f64x4<const SHIFT: usize>(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        if SHIFT >= 4usize {
+            return b;
+        }
+        let first = self.cvt_to_bytes_f64x4(a).val.0;
+        let second = self.cvt_to_bytes_f64x4(b).val.0;
+        // Each lane is 8 bytes wide, so the byte offset is SHIFT * 8.
+        let byte_offset = (SHIFT * 8usize) as i8;
+        let shuffled = unsafe {
+            let identity = _mm256_setr_epi8(
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+            );
+            // Indices of 32 and above select bytes from the second operand.
+            let idx = _mm256_add_epi8(identity, _mm256_set1_epi8(byte_offset));
+            _mm256_permutex2var_epi8(first, idx, second)
+        };
+        let bytes = u8x32 {
+            val: crate::support::Aligned256(shuffled),
+            simd: self,
+        };
+        self.cvt_from_bytes_f64x4(bytes)
+    }
+    /// Block-local variant of `slide`: `SHIFT == 0` returns `a`, `SHIFT >= 2` returns
+    /// `b`, and anything in between is delegated to `dyn_alignr_256` with a byte
+    /// offset of `SHIFT * 8`.
+    // NOTE(review): presumably `dyn_alignr_256` works per 128-bit block - confirm
+    // against its definition.
+    #[inline(always)]
+    fn slide_within_blocks_f64x4<const SHIFT: usize>(
+        self,
+        a: f64x4<Self>,
+        b: f64x4<Self>,
+    ) -> f64x4<Self> {
+        if SHIFT == 0 {
+            return a;
+        }
+        if SHIFT >= 2usize {
+            return b;
+        }
+        let first = self.cvt_to_bytes_f64x4(a).val.0;
+        let second = self.cvt_to_bytes_f64x4(b).val.0;
+        let shifted = unsafe { dyn_alignr_256(second, first, SHIFT * 8usize) };
+        let bytes = u8x32 {
+            val: crate::support::Aligned256(shifted),
+            simd: self,
+        };
+        self.cvt_from_bytes_f64x4(bytes)
+    }
+    /// Lane-wise absolute value, computed by clearing each lane's sign bit.
+    #[inline(always)]
+    fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let magnitude = unsafe {
+            // `-0.0` has only the sign bit set; and-not masks that bit off.
+            let sign_bit = _mm256_set1_pd(-0.0);
+            _mm256_andnot_pd(sign_bit, a.into())
+        };
+        magnitude.simd_into(self)
+    }
+    /// Lane-wise negation, computed by flipping each lane's sign bit.
+    #[inline(always)]
+    fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let flipped = unsafe {
+            // `-0.0` has only the sign bit set; XOR toggles that bit.
+            let sign_bit = _mm256_set1_pd(-0.0);
+            _mm256_xor_pd(a.into(), sign_bit)
+        };
+        flipped.simd_into(self)
+    }
+    /// Lane-wise square root.
+    #[inline(always)]
+    fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let input: __m256d = a.into();
+        let roots = unsafe { _mm256_sqrt_pd(input) };
+        roots.simd_into(self)
+    }
+    /// Lane-wise approximate reciprocal, using `_mm256_rcp14_pd`.
+    #[inline(always)]
+    fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let input: __m256d = a.into();
+        let approx = unsafe { _mm256_rcp14_pd(input) };
+        approx.simd_into(self)
+    }
+    /// Lane-wise addition of two `f64x4` vectors.
+    #[inline(always)]
+    fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (lhs, rhs): (__m256d, __m256d) = (a.into(), b.into());
+        let sum = unsafe { _mm256_add_pd(lhs, rhs) };
+        sum.simd_into(self)
+    }
+    /// Lane-wise subtraction `a - b` of two `f64x4` vectors.
+    #[inline(always)]
+    fn sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (lhs, rhs): (__m256d, __m256d) = (a.into(), b.into());
+        let diff = unsafe { _mm256_sub_pd(lhs, rhs) };
+        diff.simd_into(self)
+    }
+    /// Lane-wise multiplication of two `f64x4` vectors.
+    #[inline(always)]
+    fn mul_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (lhs, rhs): (__m256d, __m256d) = (a.into(), b.into());
+        let product = unsafe { _mm256_mul_pd(lhs, rhs) };
+        product.simd_into(self)
+    }
+    /// Lane-wise division `a / b` of two `f64x4` vectors.
+    #[inline(always)]
+    fn div_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (lhs, rhs): (__m256d, __m256d) = (a.into(), b.into());
+        let quotient = unsafe { _mm256_div_pd(lhs, rhs) };
+        quotient.simd_into(self)
+    }
+    /// Lane-wise copysign: the magnitude of `a` combined with the sign bit of `b`.
+    #[inline(always)]
+    fn copysign_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (magnitude_src, sign_src): (__m256d, __m256d) = (a.into(), b.into());
+        let merged = unsafe {
+            // `-0.0` has only the sign bit set.
+            let sign_bit = _mm256_set1_pd(-0.0);
+            let sign = _mm256_and_pd(sign_bit, sign_src);
+            let magnitude = _mm256_andnot_pd(sign_bit, magnitude_src);
+            _mm256_or_pd(sign, magnitude)
+        };
+        merged.simd_into(self)
+    }
+    // Lane-wise `a == b` as a k-mask. Predicate 0 is `_CMP_EQ_OQ` (ordered, quiet), so any
+    // lane with a NaN operand compares false.
+    #[inline(always)]
+    fn simd_eq_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        let val = unsafe { _mm256_cmp_pd_mask::<0i32>(a.into(), b.into()) };
+        mask64x4 { val, simd: self }
+    }
+    // Lane-wise `a < b` as a k-mask. Predicate 17 is `_CMP_LT_OQ` (ordered, quiet), so any
+    // lane with a NaN operand compares false.
+    #[inline(always)]
+    fn simd_lt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        let val = unsafe { _mm256_cmp_pd_mask::<17i32>(a.into(), b.into()) };
+        mask64x4 { val, simd: self }
+    }
+    // Lane-wise `a <= b` as a k-mask. Predicate 18 is `_CMP_LE_OQ` (ordered, quiet), so any
+    // lane with a NaN operand compares false.
+    #[inline(always)]
+    fn simd_le_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        let val = unsafe { _mm256_cmp_pd_mask::<18i32>(a.into(), b.into()) };
+        mask64x4 { val, simd: self }
+    }
+    // Lane-wise `a >= b` as a k-mask. Predicate 29 is `_CMP_GE_OQ` (ordered, quiet), so any
+    // lane with a NaN operand compares false.
+    #[inline(always)]
+    fn simd_ge_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        let val = unsafe { _mm256_cmp_pd_mask::<29i32>(a.into(), b.into()) };
+        mask64x4 { val, simd: self }
+    }
+    // Lane-wise `a > b` as a k-mask. Predicate 30 is `_CMP_GT_OQ` (ordered, quiet), so any
+    // lane with a NaN operand compares false.
+    #[inline(always)]
+    fn simd_gt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        let val = unsafe { _mm256_cmp_pd_mask::<30i32>(a.into(), b.into()) };
+        mask64x4 { val, simd: self }
+    }
+    // Interleaves the low halves of `a` and `b`: [a0, b0, a1, b1]. In the two-source permute,
+    // indices 0..=3 pick lanes of `a` and 4..=7 pick lanes of `b`.
+    #[inline(always)]
+    fn zip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        unsafe {
+            let picks = _mm256_setr_epi64x(0, 4, 1, 5);
+            _mm256_permutex2var_pd(a.into(), picks, b.into()).simd_into(self)
+        }
+    }
+    // Interleaves the high halves of `a` and `b`: [a2, b2, a3, b3]. In the two-source permute,
+    // indices 0..=3 pick lanes of `a` and 4..=7 pick lanes of `b`.
+    #[inline(always)]
+    fn zip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        unsafe {
+            let picks = _mm256_setr_epi64x(2, 6, 3, 7);
+            _mm256_permutex2var_pd(a.into(), picks, b.into()).simd_into(self)
+        }
+    }
+    // Gathers the even-indexed lanes of `a` followed by those of `b`: [a0, a2, b0, b2].
+    #[inline(always)]
+    fn unzip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        unsafe {
+            let picks = _mm256_setr_epi64x(0, 2, 4, 6);
+            _mm256_permutex2var_pd(a.into(), picks, b.into()).simd_into(self)
+        }
+    }
+    // Gathers the odd-indexed lanes of `a` followed by those of `b`: [a1, a3, b1, b3].
+    #[inline(always)]
+    fn unzip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        unsafe {
+            let picks = _mm256_setr_epi64x(1, 3, 5, 7);
+            _mm256_permutex2var_pd(a.into(), picks, b.into()).simd_into(self)
+        }
+    }
+    // Returns ([a0, b0, a1, b1], [a2, b2, a3, b3]): the same permutes as `zip_low_f64x4` and
+    // `zip_high_f64x4`, computed from one conversion of each operand.
+    #[inline(always)]
+    fn interleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
+        unsafe {
+            let (first, second) = (a.into(), b.into());
+            let low = _mm256_permutex2var_pd(first, _mm256_setr_epi64x(0, 4, 1, 5), second);
+            let high = _mm256_permutex2var_pd(first, _mm256_setr_epi64x(2, 6, 3, 7), second);
+            (low.simd_into(self), high.simd_into(self))
+        }
+    }
+    // Returns ([a0, a2, b0, b2], [a1, a3, b1, b3]): even-indexed lanes first, then odd-indexed.
+    #[inline(always)]
+    fn deinterleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
+        unsafe {
+            let (first, second) = (a.into(), b.into());
+            let evens = _mm256_permutex2var_pd(first, _mm256_setr_epi64x(0, 2, 4, 6), second);
+            let odds = _mm256_permutex2var_pd(first, _mm256_setr_epi64x(1, 3, 5, 7), second);
+            (evens.simd_into(self), odds.simd_into(self))
+        }
+    }
+    // Lane-wise maximum via `_mm256_max_pd`; NaN handling is whatever that instruction does
+    // (contrast `max_precise_f64x4`).
+    #[inline(always)]
+    fn max_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        unsafe {
+            let (lhs, rhs) = (a.into(), b.into());
+            _mm256_max_pd(lhs, rhs).simd_into(self)
+        }
+    }
+    // Lane-wise minimum via `_mm256_min_pd`; NaN handling is whatever that instruction does
+    // (contrast `min_precise_f64x4`).
+    #[inline(always)]
+    fn min_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        unsafe {
+            let (lhs, rhs) = (a.into(), b.into());
+            _mm256_min_pd(lhs, rhs).simd_into(self)
+        }
+    }
+    // Lane-wise maximum via `_mm256_range_pd`. Immediate 5 (0b0101) selects the "max" operation
+    // with the sign taken from the comparison result, per Intel's VRANGEPD encoding.
+    #[inline(always)]
+    fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        unsafe {
+            let (lhs, rhs) = (a.into(), b.into());
+            _mm256_range_pd::<5i32>(lhs, rhs).simd_into(self)
+        }
+    }
+    // Lane-wise minimum via `_mm256_range_pd`. Immediate 4 (0b0100) selects the "min" operation
+    // with the sign taken from the comparison result, per Intel's VRANGEPD encoding.
+    #[inline(always)]
+    fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        unsafe {
+            let (lhs, rhs) = (a.into(), b.into());
+            _mm256_range_pd::<4i32>(lhs, rhs).simd_into(self)
+        }
+    }
+    // Fused multiply-add per lane: `a * b + c` via `_mm256_fmadd_pd`.
+    #[inline(always)]
+    fn mul_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
+        unsafe {
+            let (x, y, addend) = (a.into(), b.into(), c.into());
+            _mm256_fmadd_pd(x, y, addend).simd_into(self)
+        }
+    }
+    // Fused multiply-subtract per lane: `a * b - c` via `_mm256_fmsub_pd`.
+    #[inline(always)]
+    fn mul_sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
+        unsafe {
+            let (x, y, subtrahend) = (a.into(), b.into(), c.into());
+            _mm256_fmsub_pd(x, y, subtrahend).simd_into(self)
+        }
+    }
+    // Rounds each lane toward negative infinity, with floating-point exceptions suppressed.
+    #[inline(always)]
+    fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        const MODE: i32 = _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
+        unsafe { _mm256_round_pd::<MODE>(a.into()).simd_into(self) }
+    }
+    // Rounds each lane toward positive infinity, with floating-point exceptions suppressed.
+    #[inline(always)]
+    fn ceil_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        const MODE: i32 = _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
+        unsafe { _mm256_round_pd::<MODE>(a.into()).simd_into(self) }
+    }
+    // Rounds each lane to the nearest integer (`_MM_FROUND_TO_NEAREST_INT`), with
+    // floating-point exceptions suppressed.
+    #[inline(always)]
+    fn round_ties_even_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        const MODE: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+        unsafe { _mm256_round_pd::<MODE>(a.into()).simd_into(self) }
+    }
+    // Fractional part per lane, computed as `a - trunc(a)`.
+    #[inline(always)]
+    fn fract_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let whole = self.trunc_f64x4(a);
+        a - whole
+    }
+    // Rounds each lane toward zero, with floating-point exceptions suppressed.
+    #[inline(always)]
+    fn trunc_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        const MODE: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
+        unsafe { _mm256_round_pd::<MODE>(a.into()).simd_into(self) }
+    }
+    // Per-lane select: takes the lane from `b` where the mask bit in `a` is set, else from `c`.
+    // `_mm256_mask_blend_pd` picks its last operand for set bits, hence the swapped order.
+    #[inline(always)]
+    fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
+        unsafe {
+            let (if_set, if_clear) = (b.into(), c.into());
+            _mm256_mask_blend_pd(a.val, if_clear, if_set).simd_into(self)
+        }
+    }
+    // Concatenates two 4-lane vectors into one 8-lane vector: `a` becomes the low 256 bits and
+    // `b` is inserted as the high 256 bits.
+    #[inline(always)]
+    fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self> {
+        unsafe {
+            let widened = _mm512_castpd256_pd512(a.into());
+            _mm512_insertf64x4::<1>(widened, b.into()).simd_into(self)
+        }
+    }
+    // Splits the vector into its low and high 128-bit halves, returned as (low, high).
+    #[inline(always)]
+    fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>) {
+        unsafe {
+            let low = _mm256_extractf128_pd::<0>(a.into());
+            let high = _mm256_extractf128_pd::<1>(a.into());
+            (low.simd_into(self), high.simd_into(self))
+        }
+    }
+    // Reinterprets the 256 bits of four f64 lanes as eight f32 lanes (a cast, no conversion).
+    #[inline(always)]
+    fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
+        unsafe {
+            let lanes = a.into();
+            _mm256_castpd_ps(lanes).simd_into(self)
+        }
+    }
+    // Builds a mask with all four lane bits set (0b1111) when `val` is true, else all clear.
+    #[inline(always)]
+    fn splat_mask64x4(self, val: bool) -> mask64x4<Self> {
+        let bits = if val { 15u64 } else { 0 };
+        mask64x4 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    // Converts an array of four i64 lanes into a k-mask via `_mm256_movepi64_mask`, which
+    // takes one bit per 64-bit lane from that lane's most significant bit.
+    #[inline(always)]
+    fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(&val);
+        let bits = unsafe { _mm256_movepi64_mask(lanes) };
+        mask64x4 {
+            val: bits,
+            simd: self,
+        }
+    }
+    // Expands the k-mask into four i64 lanes via `_mm256_movm_epi64` (each lane all ones or
+    // all zeros according to its mask bit) and returns them as an array.
+    #[inline(always)]
+    fn as_array_mask64x4(self, a: mask64x4<Self>) -> [i64; 4usize] {
+        let lanes = unsafe { _mm256_movm_epi64(a.val) };
+        crate::transmute::checked_transmute_copy(&lanes)
+    }
+    // Builds a mask from the low four bits of `bits`; higher bits are discarded.
+    #[inline(always)]
+    fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self> {
+        let lane_bits = bits & 15u64;
+        mask64x4 {
+            val: lane_bits as _,
+            simd: self,
+        }
+    }
+    // Returns the mask as an integer with one bit per lane (only the low four bits can be set).
+    #[inline(always)]
+    fn to_bitmask_mask64x4(self, a: mask64x4<Self>) -> u64 {
+        let raw = u64::from(a.val);
+        raw & 15u64
+    }
+    // Sets or clears the bit for lane `index` in place.
+    //
+    // Panics if `index` is not below 4.
+    #[inline(always)]
+    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 4usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4usize
+        );
+        let lane_bit = 1u64 << index;
+        let current = u64::from(a.val);
+        let updated = match value {
+            true => current | lane_bit,
+            false => current & !lane_bit,
+        };
+        *a = mask64x4 {
+            val: updated as _,
+            simd: self,
+        };
+    }
+    // Lane-wise AND of two masks, restricted to the four lane bits.
+    #[inline(always)]
+    fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        let bits = (u64::from(a.val) & u64::from(b.val)) & 15u64;
+        mask64x4 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    // Lane-wise OR of two masks, restricted to the four lane bits.
+    #[inline(always)]
+    fn or_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        let bits = (u64::from(a.val) | u64::from(b.val)) & 15u64;
+        mask64x4 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    // Lane-wise XOR of two masks, restricted to the four lane bits.
+    #[inline(always)]
+    fn xor_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        let bits = (u64::from(a.val) ^ u64::from(b.val)) & 15u64;
+        mask64x4 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    // Lane-wise NOT of a mask; the `& 15` keeps the inverted value within the four lane bits.
+    #[inline(always)]
+    fn not_mask64x4(self, a: mask64x4<Self>) -> mask64x4<Self> {
+        let inverted = !u64::from(a.val);
+        mask64x4 {
+            val: (inverted & 15u64) as _,
+            simd: self,
+        }
+    }
+    // Per-lane select between masks: takes the bit from `b` where `a` is set, else from `c`.
+    #[inline(always)]
+    fn select_mask64x4(
+        self,
+        a: mask64x4<Self>,
+        b: mask64x4<Self>,
+        c: mask64x4<Self>,
+    ) -> mask64x4<Self> {
+        let cond = u64::from(a.val);
+        let if_set = u64::from(b.val);
+        let if_clear = u64::from(c.val);
+        let bits = ((cond & if_set) | (!cond & if_clear)) & 15u64;
+        mask64x4 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    // Lane-wise equality of two masks: a result bit is set where `a` and `b` agree (XNOR),
+    // restricted to the four lane bits.
+    #[inline(always)]
+    fn simd_eq_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        let differing = u64::from(a.val ^ b.val);
+        mask64x4 {
+            val: (!differing & 15u64) as _,
+            simd: self,
+        }
+    }
+    // True when at least one of the four lane bits is set.
+    #[inline(always)]
+    fn any_true_mask64x4(self, a: mask64x4<Self>) -> bool {
+        (u64::from(a.val) & 15u64) != 0
+    }
+    // True when all four lane bits are set.
+    #[inline(always)]
+    fn all_true_mask64x4(self, a: mask64x4<Self>) -> bool {
+        (u64::from(a.val) & 15u64) == 15u64
+    }
+    // True when at least one of the four lane bits is clear.
+    #[inline(always)]
+    fn any_false_mask64x4(self, a: mask64x4<Self>) -> bool {
+        (u64::from(a.val) & 15u64) != 15u64
+    }
+    // True when none of the four lane bits is set.
+    #[inline(always)]
+    fn all_false_mask64x4(self, a: mask64x4<Self>) -> bool {
+        (u64::from(a.val) & 15u64) == 0
+    }
+    // Concatenates two 4-lane masks into one 8-lane mask: `a` supplies bits 0..=3 and `b`
+    // supplies bits 4..=7.
+    //
+    // Each input is masked to its four lane bits first, as the other mask64x4 operations do,
+    // so a stray high bit in `a` cannot leak into the lanes that belong to `b`.
+    #[inline(always)]
+    fn combine_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x8<Self> {
+        let low = u64::from(a.val) & 15u64;
+        let high = u64::from(b.val) & 15u64;
+        let bits = low | (high << 4usize);
+        mask64x8 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    // Splits a 4-lane mask into two 2-lane masks: (bits 0..=1, bits 2..=3).
+    #[inline(always)]
+    fn split_mask64x4(self, a: mask64x4<Self>) -> (mask64x2<Self>, mask64x2<Self>) {
+        let bits = u64::from(a.val);
+        let low = mask64x2 {
+            val: (bits & 3u64) as _,
+            simd: self,
+        };
+        let high = mask64x2 {
+            val: ((bits >> 2usize) & 3u64) as _,
+            simd: self,
+        };
+        (low, high)
+    }
+    // Broadcasts `val` into all sixteen f32 lanes.
+    #[inline(always)]
+    fn splat_f32x16(self, val: f32) -> f32x16<Self> {
+        unsafe {
+            let broadcast = _mm512_set1_ps(val);
+            broadcast.simd_into(self)
+        }
+    }
+    // Builds a vector from an owned array by copying its bytes into the vector storage.
+    #[inline(always)]
+    fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(&val);
+        f32x16 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    // Builds a vector from a borrowed array by copying its bytes into the vector storage.
+    #[inline(always)]
+    fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(val);
+        f32x16 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    // Copies the vector's sixteen lanes out as an array.
+    #[inline(always)]
+    fn as_array_f32x16(self, a: f32x16<Self>) -> [f32; 16usize] {
+        let raw: &__m512 = &a.val.0;
+        crate::transmute::checked_transmute_copy::<__m512, [f32; 16usize]>(raw)
+    }
+    // Views the vector's storage as a shared reference to an array of sixteen f32.
+    #[inline(always)]
+    fn as_array_ref_f32x16(self, a: &f32x16<Self>) -> &[f32; 16usize] {
+        let raw: &__m512 = &a.val.0;
+        crate::transmute::checked_cast_ref::<__m512, [f32; 16usize]>(raw)
+    }
+    // Views the vector's storage as a mutable reference to an array of sixteen f32.
+    #[inline(always)]
+    fn as_array_mut_f32x16(self, a: &mut f32x16<Self>) -> &mut [f32; 16usize] {
+        let raw: &mut __m512 = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<__m512, [f32; 16usize]>(raw)
+    }
+    // Writes the vector's sixteen lanes into `dest`.
+    #[inline(always)]
+    fn store_array_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
+        let src = (&raw const a.val.0) as *const f32;
+        let dst = dest.as_mut_ptr();
+        // `a` is a by-value local and `dest` is an exclusive borrow, so the ranges cannot overlap.
+        unsafe {
+            core::ptr::copy_nonoverlapping(src, dst, 16usize);
+        }
+    }
+    // Reinterprets a 64-byte vector as sixteen f32 lanes by copying its storage.
+    #[inline(always)]
+    fn cvt_from_bytes_f32x16(self, a: u8x64<Self>) -> f32x16<Self> {
+        let reinterpreted = crate::transmute::checked_transmute_copy(&a.val);
+        f32x16 {
+            val: reinterpreted,
+            simd: self,
+        }
+    }
+    // Reinterprets sixteen f32 lanes as a 64-byte vector by copying the storage.
+    #[inline(always)]
+    fn cvt_to_bytes_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
+        let reinterpreted = crate::transmute::checked_transmute_copy(&a.val);
+        u8x64 {
+            val: reinterpreted,
+            simd: self,
+        }
+    }
+    // Slides a 16-lane window across the concatenation of `a` (low) and `b` (high): result
+    // lane i is lane `i + SHIFT` of that 32-lane sequence. `SHIFT >= 16` returns `b` unchanged.
+    //
+    // Done as a two-source byte permute: the index vector is the identity 0..=63 plus the
+    // byte offset `SHIFT * 4`, and indices of 64 and above select bytes from `b`.
+    #[inline(always)]
+    fn slide_f32x16<const SHIFT: usize>(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        if SHIFT >= 16usize {
+            return b;
+        }
+        let low_bytes = self.cvt_to_bytes_f32x16(a).val.0;
+        let high_bytes = self.cvt_to_bytes_f32x16(b).val.0;
+        let shifted = unsafe {
+            let identity = _mm512_set_epi8(
+                63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,
+                43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
+                23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
+                1, 0,
+            );
+            let offset = _mm512_set1_epi8((SHIFT * 4usize) as i8);
+            let idx = _mm512_add_epi8(identity, offset);
+            _mm512_permutex2var_epi8(low_bytes, idx, high_bytes)
+        };
+        self.cvt_from_bytes_f32x16(u8x64 {
+            val: crate::support::Aligned512(shifted),
+            simd: self,
+        })
+    }
+    // Slide variant that delegates to `dyn_alignr_512` with a byte offset of `SHIFT * 4`.
+    // `SHIFT == 0` returns `a` and `SHIFT >= 4` returns `b` without touching the helper.
+    // NOTE(review): `dyn_alignr_512` is defined elsewhere; presumably it shifts within each
+    // 128-bit block independently - confirm against its definition.
+    #[inline(always)]
+    fn slide_within_blocks_f32x16<const SHIFT: usize>(
+        self,
+        a: f32x16<Self>,
+        b: f32x16<Self>,
+    ) -> f32x16<Self> {
+        if SHIFT == 0 {
+            return a;
+        }
+        if SHIFT >= 4usize {
+            return b;
+        }
+        let first = self.cvt_to_bytes_f32x16(a).val.0;
+        let second = self.cvt_to_bytes_f32x16(b).val.0;
+        let shifted = unsafe { dyn_alignr_512(second, first, SHIFT * 4usize) };
+        self.cvt_from_bytes_f32x16(u8x64 {
+            val: crate::support::Aligned512(shifted),
+            simd: self,
+        })
+    }
+    // Lane-wise absolute value: AND-NOT against a splat of -0.0 clears each lane's sign bit.
+    #[inline(always)]
+    fn abs_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            let sign_bits = _mm512_set1_ps(-0.0);
+            _mm512_andnot_ps(sign_bits, a.into()).simd_into(self)
+        }
+    }
+    // Lane-wise negation: XOR against a splat of -0.0 flips each lane's sign bit.
+    #[inline(always)]
+    fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            let sign_bits = _mm512_set1_ps(-0.0);
+            _mm512_xor_ps(a.into(), sign_bits).simd_into(self)
+        }
+    }
+    // Lane-wise square root via `_mm512_sqrt_ps`.
+    #[inline(always)]
+    fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            let lanes = a.into();
+            _mm512_sqrt_ps(lanes).simd_into(self)
+        }
+    }
+    // Lane-wise approximate reciprocal via `_mm512_rcp14_ps` (an estimate, not an exact `1.0 / a`).
+    #[inline(always)]
+    fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            let lanes = a.into();
+            _mm512_rcp14_ps(lanes).simd_into(self)
+        }
+    }
+    // Lane-wise `a + b` via `_mm512_add_ps`.
+    #[inline(always)]
+    fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            let (lhs, rhs) = (a.into(), b.into());
+            _mm512_add_ps(lhs, rhs).simd_into(self)
+        }
+    }
+    // Lane-wise `a - b` via `_mm512_sub_ps`.
+    #[inline(always)]
+    fn sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            let (lhs, rhs) = (a.into(), b.into());
+            _mm512_sub_ps(lhs, rhs).simd_into(self)
+        }
+    }
+    // Lane-wise `a * b` via `_mm512_mul_ps`.
+    #[inline(always)]
+    fn mul_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            let (lhs, rhs) = (a.into(), b.into());
+            _mm512_mul_ps(lhs, rhs).simd_into(self)
+        }
+    }
+    // Lane-wise `a / b` via `_mm512_div_ps`.
+    #[inline(always)]
+    fn div_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            let (lhs, rhs) = (a.into(), b.into());
+            _mm512_div_ps(lhs, rhs).simd_into(self)
+        }
+    }
+    // Lane-wise copysign: the result has the magnitude bits of `a` and the sign bit of `b`.
+    #[inline(always)]
+    fn copysign_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            // -0.0 has only the sign bit set, so it serves as the sign-bit mask.
+            let sign_bits = _mm512_set1_ps(-0.0);
+            let sign_of_b = _mm512_and_ps(sign_bits, b.into());
+            let magnitude_of_a = _mm512_andnot_ps(sign_bits, a.into());
+            _mm512_or_ps(sign_of_b, magnitude_of_a).simd_into(self)
+        }
+    }
+    // Lane-wise `a == b` as a k-mask. Predicate 0 is `_CMP_EQ_OQ` (ordered, quiet), so any
+    // lane with a NaN operand compares false.
+    #[inline(always)]
+    fn simd_eq_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let val = unsafe { _mm512_cmp_ps_mask::<0i32>(a.into(), b.into()) };
+        mask32x16 { val, simd: self }
+    }
+    // Lane-wise `a < b` as a k-mask. Predicate 17 is `_CMP_LT_OQ` (ordered, quiet), so any
+    // lane with a NaN operand compares false.
+    #[inline(always)]
+    fn simd_lt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let val = unsafe { _mm512_cmp_ps_mask::<17i32>(a.into(), b.into()) };
+        mask32x16 { val, simd: self }
+    }
+    // Lane-wise `a <= b` as a k-mask. Predicate 18 is `_CMP_LE_OQ` (ordered, quiet), so any
+    // lane with a NaN operand compares false.
+    #[inline(always)]
+    fn simd_le_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let val = unsafe { _mm512_cmp_ps_mask::<18i32>(a.into(), b.into()) };
+        mask32x16 { val, simd: self }
+    }
+    // Lane-wise `a >= b` as a k-mask. Predicate 29 is `_CMP_GE_OQ` (ordered, quiet), so any
+    // lane with a NaN operand compares false.
+    #[inline(always)]
+    fn simd_ge_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let val = unsafe { _mm512_cmp_ps_mask::<29i32>(a.into(), b.into()) };
+        mask32x16 { val, simd: self }
+    }
+    // Lane-wise `a > b` as a k-mask. Predicate 30 is `_CMP_GT_OQ` (ordered, quiet), so any
+    // lane with a NaN operand compares false.
+    #[inline(always)]
+    fn simd_gt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let val = unsafe { _mm512_cmp_ps_mask::<30i32>(a.into(), b.into()) };
+        mask32x16 { val, simd: self }
+    }
+    // Interleaves the low halves of `a` and `b`: [a0, b0, a1, b1, ..., a7, b7]. In the
+    // two-source permute, indices 0..=15 pick lanes of `a` and 16..=31 pick lanes of `b`.
+    #[inline(always)]
+    fn zip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            let picks = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+            _mm512_permutex2var_ps(a.into(), picks, b.into()).simd_into(self)
+        }
+    }
+    // Interleaves the high halves of `a` and `b`: [a8, b8, a9, b9, ..., a15, b15]. In the
+    // two-source permute, indices 0..=15 pick lanes of `a` and 16..=31 pick lanes of `b`.
+    #[inline(always)]
+    fn zip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            let picks =
+                _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+            _mm512_permutex2var_ps(a.into(), picks, b.into()).simd_into(self)
+        }
+    }
+    // Gathers the even-indexed lanes of `a` followed by the even-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            let picks =
+                _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+            _mm512_permutex2var_ps(a.into(), picks, b.into()).simd_into(self)
+        }
+    }
+    // Gathers the odd-indexed lanes of `a` followed by the odd-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            let picks =
+                _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+            _mm512_permutex2var_ps(a.into(), picks, b.into()).simd_into(self)
+        }
+    }
+    // Returns (low-half interleave, high-half interleave) of `a` and `b`: the same permutes
+    // as `zip_low_f32x16` and `zip_high_f32x16`, computed from one conversion of each operand.
+    #[inline(always)]
+    fn interleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
+        unsafe {
+            let (first, second) = (a.into(), b.into());
+            let low_picks =
+                _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+            let high_picks =
+                _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+            let low = _mm512_permutex2var_ps(first, low_picks, second);
+            let high = _mm512_permutex2var_ps(first, high_picks, second);
+            (low.simd_into(self), high.simd_into(self))
+        }
+    }
+    // Returns (even-indexed lanes, odd-indexed lanes) gathered from `a` then `b`.
+    #[inline(always)]
+    fn deinterleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
+        unsafe {
+            let (first, second) = (a.into(), b.into());
+            let even_picks =
+                _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+            let odd_picks =
+                _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+            let evens = _mm512_permutex2var_ps(first, even_picks, second);
+            let odds = _mm512_permutex2var_ps(first, odd_picks, second);
+            (evens.simd_into(self), odds.simd_into(self))
+        }
+    }
+    // Lane-wise maximum via `_mm512_max_ps`; NaN handling is whatever that instruction does
+    // (contrast `max_precise_f32x16`).
+    #[inline(always)]
+    fn max_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            let (lhs, rhs) = (a.into(), b.into());
+            _mm512_max_ps(lhs, rhs).simd_into(self)
+        }
+    }
+    // Lane-wise minimum via `_mm512_min_ps`; NaN handling is whatever that instruction does
+    // (contrast `min_precise_f32x16`).
+    #[inline(always)]
+    fn min_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            let (lhs, rhs) = (a.into(), b.into());
+            _mm512_min_ps(lhs, rhs).simd_into(self)
+        }
+    }
+    // Lane-wise maximum via `_mm512_range_ps`. Immediate 5 (0b0101) selects the "max" operation
+    // with the sign taken from the comparison result, per Intel's VRANGEPS encoding.
+    #[inline(always)]
+    fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            let (lhs, rhs) = (a.into(), b.into());
+            _mm512_range_ps::<5i32>(lhs, rhs).simd_into(self)
+        }
+    }
+    // Lane-wise minimum via `_mm512_range_ps`. Immediate 4 (0b0100) selects the "min" operation
+    // with the sign taken from the comparison result, per Intel's VRANGEPS encoding.
+    #[inline(always)]
+    fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            let (lhs, rhs) = (a.into(), b.into());
+            _mm512_range_ps::<4i32>(lhs, rhs).simd_into(self)
+        }
+    }
+    // Fused multiply-add per lane: `a * b + c` via `_mm512_fmadd_ps`.
+    #[inline(always)]
+    fn mul_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            let (x, y, addend) = (a.into(), b.into(), c.into());
+            _mm512_fmadd_ps(x, y, addend).simd_into(self)
+        }
+    }
+    // Fused multiply-subtract per lane: `a * b - c` via `_mm512_fmsub_ps`.
+    #[inline(always)]
+    fn mul_sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            let (x, y, subtrahend) = (a.into(), b.into(), c.into());
+            _mm512_fmsub_ps(x, y, subtrahend).simd_into(self)
+        }
+    }
+    // Rounds each lane toward negative infinity, with floating-point exceptions suppressed.
+    #[inline(always)]
+    fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        const MODE: i32 = _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
+        unsafe { _mm512_roundscale_ps::<MODE>(a.into()).simd_into(self) }
+    }
+    // Rounds each lane toward positive infinity, with floating-point exceptions suppressed.
+    #[inline(always)]
+    fn ceil_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        const MODE: i32 = _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
+        unsafe { _mm512_roundscale_ps::<MODE>(a.into()).simd_into(self) }
+    }
+    // Rounds each lane to the nearest integer (`_MM_FROUND_TO_NEAREST_INT`), with
+    // floating-point exceptions suppressed.
+    #[inline(always)]
+    fn round_ties_even_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        const MODE: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+        unsafe { _mm512_roundscale_ps::<MODE>(a.into()).simd_into(self) }
+    }
+    // Fractional part per lane, computed as `a - trunc(a)`.
+    #[inline(always)]
+    fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let whole = self.trunc_f32x16(a);
+        a - whole
+    }
+    // Rounds each lane toward zero, with floating-point exceptions suppressed.
+    #[inline(always)]
+    fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        const MODE: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
+        unsafe { _mm512_roundscale_ps::<MODE>(a.into()).simd_into(self) }
+    }
+    // Per-lane select: takes the lane from `b` where the mask bit in `a` is set, else from `c`.
+    // `_mm512_mask_blend_ps` picks its last operand for set bits, hence the swapped order.
+    #[inline(always)]
+    fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            let (if_set, if_clear) = (b.into(), c.into());
+            _mm512_mask_blend_ps(a.val, if_clear, if_set).simd_into(self)
+        }
+    }
+    // Splits the vector into its low and high 256-bit halves, returned as (low, high).
+    #[inline(always)]
+    fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>) {
+        unsafe {
+            let low = _mm512_castps512_ps256(a.into());
+            let high = _mm512_extractf32x8_ps::<1>(a.into());
+            (low.simd_into(self), high.simd_into(self))
+        }
+    }
+    // Reinterprets the 512 bits of sixteen f32 lanes as eight f64 lanes (a cast, no conversion).
+    #[inline(always)]
+    fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
+        unsafe {
+            let lanes = a.into();
+            _mm512_castps_pd(lanes).simd_into(self)
+        }
+    }
+    // Reinterprets the bits of sixteen f32 lanes as sixteen i32 lanes (a cast, no conversion).
+    #[inline(always)]
+    fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
+        unsafe {
+            let lanes = a.into();
+            _mm512_castps_si512(lanes).simd_into(self)
+        }
+    }
+    // Loads sixteen floats and regroups them with stride 4: result lanes 0..=3 are
+    // src[0], src[4], src[8], src[12]; lanes 4..=7 are src[1], src[5], src[9], src[13]; etc.
+    #[inline(always)]
+    fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
+        let raw: __m512 = crate::transmute::checked_transmute_copy::<[f32; 16usize], __m512>(src);
+        unsafe {
+            let order = _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
+            _mm512_permutexvar_ps(order, raw).simd_into(self)
+        }
+    }
+    // Stores the vector to `dest` after applying the stride-4 regrouping permutation. The
+    // permutation is a 4x4 transpose, which is its own inverse, so this undoes
+    // `load_interleaved_128_f32x16`.
+    #[inline(always)]
+    fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
+        unsafe {
+            let order = _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
+            let shuffled = _mm512_permutexvar_ps(order, a.into());
+            _mm512_storeu_ps(dest.as_mut_ptr() as *mut _, shuffled);
+        }
+    }
+    // Reinterprets the bits of sixteen f32 lanes as sixty-four u8 lanes (a cast, no conversion).
+    #[inline(always)]
+    fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
+        unsafe {
+            let lanes = a.into();
+            _mm512_castps_si512(lanes).simd_into(self)
+        }
+    }
+    // Reinterprets the bits of sixteen f32 lanes as sixteen u32 lanes (a cast, no conversion).
+    #[inline(always)]
+    fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
+        unsafe {
+            let lanes = a.into();
+            _mm512_castps_si512(lanes).simd_into(self)
+        }
+    }
+    // Converts each f32 lane to u32 with truncation via `_mm512_cvttps_epu32`. Out-of-range
+    // and NaN lanes get whatever the instruction produces (see `cvt_u32_precise_f32x16`).
+    #[inline(always)]
+    fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
+        unsafe {
+            let lanes = a.into();
+            _mm512_cvttps_epu32(lanes).simd_into(self)
+        }
+    }
+    // Saturating f32 -> u32 conversion with truncation toward zero.
+    //
+    // Lanes are first clamped from below with `max(a, 0.0)`; lanes greater than 4294967040.0
+    // (the largest f32 below 2^32) are then forced to `u32::MAX`.
+    // NOTE(review): NaN lanes presumably become 0 because `_mm512_max_ps` returns its second
+    // operand when an input is NaN - confirm against Intel's documentation.
+    #[inline(always)]
+    fn cvt_u32_precise_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
+        unsafe {
+            let clamped = _mm512_max_ps(a.into(), _mm512_setzero_ps());
+            let truncated = _mm512_cvttps_epu32(clamped);
+            // Predicate 17 is `_CMP_LT_OQ`: set where 4294967040.0 < lane.
+            let too_large = _mm512_cmp_ps_mask::<17i32>(_mm512_set1_ps(4294967040.0), clamped);
+            let all_ones = _mm512_set1_epi32(u32::MAX.cast_signed());
+            _mm512_mask_blend_epi32(too_large, truncated, all_ones).simd_into(self)
+        }
+    }
+    // Converts each f32 lane to i32 with truncation via `_mm512_cvttps_epi32`. Out-of-range
+    // and NaN lanes get whatever the instruction produces (see `cvt_i32_precise_f32x16`).
+    #[inline(always)]
+    fn cvt_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
+        unsafe {
+            let lanes = a.into();
+            _mm512_cvttps_epi32(lanes).simd_into(self)
+        }
+    }
+    // Saturating f32 -> i32 conversion with truncation toward zero.
+    //
+    // Lanes that are not below 2147483648.0 are replaced with `i32::MAX`, and NaN lanes are
+    // then replaced with 0. Other lanes keep the result of `_mm512_cvttps_epi32`.
+    #[inline(always)]
+    fn cvt_i32_precise_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
+        unsafe {
+            let lanes = a.into();
+            let truncated = _mm512_cvttps_epi32(lanes);
+            // Predicate 17 is `_CMP_LT_OQ`: false for NaN and for lanes >= 2^31.
+            let below_upper = _mm512_cmp_ps_mask::<17i32>(lanes, _mm512_set1_ps(2147483648.0));
+            let saturated =
+                _mm512_mask_blend_epi32(below_upper, _mm512_set1_epi32(i32::MAX), truncated);
+            // Predicate 7 is `_CMP_ORD_Q`: a lane compared with itself is false only for NaN.
+            let ordered = _mm512_cmp_ps_mask::<7i32>(lanes, lanes);
+            _mm512_mask_blend_epi32(ordered, _mm512_setzero_si512(), saturated).simd_into(self)
+        }
+    }
+    // Broadcasts `val` into all sixty-four i8 lanes.
+    #[inline(always)]
+    fn splat_i8x64(self, val: i8) -> i8x64<Self> {
+        unsafe {
+            let broadcast = _mm512_set1_epi8(val);
+            broadcast.simd_into(self)
+        }
+    }
+    // Builds a vector from an owned array by copying its bytes into the vector storage.
+    #[inline(always)]
+    fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(&val);
+        i8x64 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    // Builds a vector from a borrowed array by copying its bytes into the vector storage.
+    #[inline(always)]
+    fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(val);
+        i8x64 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    // Copies the vector's sixty-four lanes out as an array.
+    #[inline(always)]
+    fn as_array_i8x64(self, a: i8x64<Self>) -> [i8; 64usize] {
+        let raw: &__m512i = &a.val.0;
+        crate::transmute::checked_transmute_copy::<__m512i, [i8; 64usize]>(raw)
+    }
+    // Views the vector's storage as a shared reference to an array of sixty-four i8.
+    #[inline(always)]
+    fn as_array_ref_i8x64(self, a: &i8x64<Self>) -> &[i8; 64usize] {
+        let raw: &__m512i = &a.val.0;
+        crate::transmute::checked_cast_ref::<__m512i, [i8; 64usize]>(raw)
+    }
+    // Views the vector's storage as a mutable reference to an array of sixty-four i8.
+    #[inline(always)]
+    fn as_array_mut_i8x64(self, a: &mut i8x64<Self>) -> &mut [i8; 64usize] {
+        let raw: &mut __m512i = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<__m512i, [i8; 64usize]>(raw)
+    }
+    // Writes the vector's sixty-four lanes into `dest`.
+    #[inline(always)]
+    fn store_array_i8x64(self, a: i8x64<Self>, dest: &mut [i8; 64usize]) -> () {
+        let src = (&raw const a.val.0) as *const i8;
+        let dst = dest.as_mut_ptr();
+        // `a` is a by-value local and `dest` is an exclusive borrow, so the ranges cannot overlap.
+        unsafe {
+            core::ptr::copy_nonoverlapping(src, dst, 64usize);
+        }
+    }
+    // Reinterprets a 64-byte vector as sixty-four i8 lanes by copying its storage.
+    #[inline(always)]
+    fn cvt_from_bytes_i8x64(self, a: u8x64<Self>) -> i8x64<Self> {
+        let reinterpreted = crate::transmute::checked_transmute_copy(&a.val);
+        i8x64 {
+            val: reinterpreted,
+            simd: self,
+        }
+    }
+    // Reinterprets sixty-four i8 lanes as a 64-byte vector by copying the storage.
+    #[inline(always)]
+    fn cvt_to_bytes_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
+        let reinterpreted = crate::transmute::checked_transmute_copy(&a.val);
+        u8x64 {
+            val: reinterpreted,
+            simd: self,
+        }
+    }
+    // Slides a 64-lane window across the concatenation of `a` (low) and `b` (high): result
+    // lane i is lane `i + SHIFT` of that 128-lane sequence. `SHIFT >= 64` returns `b` unchanged.
+    //
+    // Done as a two-source byte permute: the index vector is the identity 0..=63 plus `SHIFT`,
+    // and indices of 64 and above select bytes from `b`.
+    #[inline(always)]
+    fn slide_i8x64<const SHIFT: usize>(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        if SHIFT >= 64usize {
+            return b;
+        }
+        let low_bytes = self.cvt_to_bytes_i8x64(a).val.0;
+        let high_bytes = self.cvt_to_bytes_i8x64(b).val.0;
+        let shifted = unsafe {
+            let identity = _mm512_set_epi8(
+                63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,
+                43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
+                23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
+                1, 0,
+            );
+            let offset = _mm512_set1_epi8((SHIFT) as i8);
+            let idx = _mm512_add_epi8(identity, offset);
+            _mm512_permutex2var_epi8(low_bytes, idx, high_bytes)
+        };
+        self.cvt_from_bytes_i8x64(u8x64 {
+            val: crate::support::Aligned512(shifted),
+            simd: self,
+        })
+    }
+    // Slide variant that delegates to `dyn_alignr_512` with a byte offset of `SHIFT`.
+    // `SHIFT == 0` returns `a` and `SHIFT >= 16` returns `b` without touching the helper.
+    // NOTE(review): `dyn_alignr_512` is defined elsewhere; presumably it shifts within each
+    // 128-bit block independently - confirm against its definition.
+    #[inline(always)]
+    fn slide_within_blocks_i8x64<const SHIFT: usize>(
+        self,
+        a: i8x64<Self>,
+        b: i8x64<Self>,
+    ) -> i8x64<Self> {
+        if SHIFT == 0 {
+            return a;
+        }
+        if SHIFT >= 16usize {
+            return b;
+        }
+        let first = self.cvt_to_bytes_i8x64(a).val.0;
+        let second = self.cvt_to_bytes_i8x64(b).val.0;
+        let shifted = unsafe { dyn_alignr_512(second, first, SHIFT) };
+        self.cvt_from_bytes_i8x64(u8x64 {
+            val: crate::support::Aligned512(shifted),
+            simd: self,
+        })
+    }
+    // Lane-wise `a + b` via `_mm512_add_epi8`.
+    #[inline(always)]
+    fn add_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        unsafe {
+            let (lhs, rhs) = (a.into(), b.into());
+            _mm512_add_epi8(lhs, rhs).simd_into(self)
+        }
+    }
+    // Lane-wise `a - b` via `_mm512_sub_epi8`.
+    #[inline(always)]
+    fn sub_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        unsafe {
+            let (lhs, rhs) = (a.into(), b.into());
+            _mm512_sub_epi8(lhs, rhs).simd_into(self)
+        }
+    }
+    // Lane-wise 8-bit multiply keeping the low 8 bits of each product, built from two 16-bit
+    // multiplies because this code path has no 8-bit multiply intrinsic.
+    #[inline(always)]
+    fn mul_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        unsafe {
+            let (lhs, rhs) = (a.into(), b.into());
+            // Even bytes: the low byte of a 16-bit product depends only on the operands' low bytes.
+            let even_products = _mm512_mullo_epi16(lhs, rhs);
+            // Odd bytes: move them down into the low byte position before multiplying.
+            let odd_products =
+                _mm512_mullo_epi16(_mm512_srli_epi16::<8>(lhs), _mm512_srli_epi16::<8>(rhs));
+            let odd_bytes = _mm512_slli_epi16::<8>(odd_products);
+            let even_bytes = _mm512_and_si512(even_products, _mm512_set1_epi16(0xFF));
+            _mm512_or_si512(odd_bytes, even_bytes).simd_into(self)
+        }
+    }
+    // Bitwise AND of the two vectors.
+    #[inline(always)]
+    fn and_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        unsafe {
+            let (lhs, rhs) = (a.into(), b.into());
+            _mm512_and_si512(lhs, rhs).simd_into(self)
+        }
+    }
+    // Bitwise OR of the two vectors.
+    #[inline(always)]
+    fn or_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        unsafe {
+            let (lhs, rhs) = (a.into(), b.into());
+            _mm512_or_si512(lhs, rhs).simd_into(self)
+        }
+    }
+    // Bitwise XOR of the two vectors.
+    #[inline(always)]
+    fn xor_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        unsafe {
+            let (lhs, rhs) = (a.into(), b.into());
+            _mm512_xor_si512(lhs, rhs).simd_into(self)
+        }
+    }
+    // Bitwise NOT, expressed as XOR with `!0` (an all-ones scalar).
+    // NOTE(review): relies on a vector-by-scalar `^` operator impl defined elsewhere.
+    #[inline(always)]
+    fn not_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
+        a ^ !0
+    }
+    // Shifts every i8 lane left by `shift` bits, discarding bits shifted out of the lane.
+    //
+    // There is no 8-bit shift intrinsic, so lanes are widened to 16 bits, shifted, and
+    // narrowed again. The widened lanes are masked to their low byte before narrowing so the
+    // saturating pack cannot clamp them: a signed saturating pack would turn e.g. 0x40 << 1
+    // into 0x7F instead of 0x80. This mirrors `shlv_i8x64`.
+    #[inline(always)]
+    fn shl_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
+        unsafe {
+            let val = a.into();
+            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
+            let zero = _mm512_setzero_si512();
+            // Zero-extension is enough: only the low byte of each shifted lane is kept.
+            let lo_16 = _mm512_unpacklo_epi8(val, zero);
+            let hi_16 = _mm512_unpackhi_epi8(val, zero);
+            let byte_mask = _mm512_set1_epi16(0x00ff);
+            let lo_shifted = _mm512_and_si512(_mm512_sll_epi16(lo_16, shift_count), byte_mask);
+            let hi_shifted = _mm512_and_si512(_mm512_sll_epi16(hi_16, shift_count), byte_mask);
+            // Every lane is now in 0..=255, so the unsigned saturating pack is an exact narrowing.
+            _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
+    }
+    /// Shifts each `i8` lane of `a` left by the count held in the matching lane of `b`.
+    ///
+    /// Values and counts are zero-extended to 16 bits, shifted with `_mm512_sllv_epi16`,
+    /// masked back to their low byte and re-packed.
+    #[inline(always)]
+    fn shlv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        unsafe {
+            let zeros = _mm512_setzero_si512();
+            let low_byte = _mm512_set1_epi16(0x00ff);
+            let (values, amounts): (__m512i, __m512i) = (a.into(), b.into());
+            // Values and counts are unpacked the same way so they stay lane-aligned.
+            let lo = _mm512_sllv_epi16(
+                _mm512_unpacklo_epi8(values, zeros),
+                _mm512_unpacklo_epi8(amounts, zeros),
+            );
+            let hi = _mm512_sllv_epi16(
+                _mm512_unpackhi_epi8(values, zeros),
+                _mm512_unpackhi_epi8(amounts, zeros),
+            );
+            // Masking to the low byte keeps the saturating pack from clamping.
+            _mm512_packus_epi16(_mm512_and_si512(lo, low_byte), _mm512_and_si512(hi, low_byte))
+                .simd_into(self)
+        }
+    }
+    /// Arithmetic right shift of every `i8` lane of `a` by `shift` bits.
+    ///
+    /// Lanes are sign-extended to 16 bits, shifted with `_mm512_sra_epi16`, and packed back
+    /// to bytes with `_mm512_packs_epi16`.
+    #[inline(always)]
+    fn shr_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
+        unsafe {
+            let lanes: __m512i = a.into();
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            let zero = _mm512_setzero_si512();
+            // All-ones bytes where `0 > lane`, zero elsewhere: the sign-extension byte.
+            let sign_bytes = _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(zero, lanes));
+            let lo = _mm512_sra_epi16(_mm512_unpacklo_epi8(lanes, sign_bytes), count);
+            let hi = _mm512_sra_epi16(_mm512_unpackhi_epi8(lanes, sign_bytes), count);
+            _mm512_packs_epi16(lo, hi).simd_into(self)
+        }
+    }
+    /// Arithmetic right shift of each `i8` lane of `a` by the count in the matching lane of `b`.
+    ///
+    /// Values are sign-extended and counts zero-extended to 16 bits before
+    /// `_mm512_srav_epi16`; the low byte of each result is then re-packed.
+    #[inline(always)]
+    fn shrv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        unsafe {
+            let zeros = _mm512_setzero_si512();
+            let low_byte = _mm512_set1_epi16(0x00ff);
+            let (values, amounts): (__m512i, __m512i) = (a.into(), b.into());
+            // All-ones bytes where `0 > lane`, zero elsewhere: the sign-extension byte.
+            let sign_bytes = _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(zeros, values));
+            let lo = _mm512_srav_epi16(
+                _mm512_unpacklo_epi8(values, sign_bytes),
+                _mm512_unpacklo_epi8(amounts, zeros),
+            );
+            let hi = _mm512_srav_epi16(
+                _mm512_unpackhi_epi8(values, sign_bytes),
+                _mm512_unpackhi_epi8(amounts, zeros),
+            );
+            _mm512_packus_epi16(_mm512_and_si512(lo, low_byte), _mm512_and_si512(hi, low_byte))
+                .simd_into(self)
+        }
+    }
+    /// Compares signed byte lanes for `a == b`; the mask is the `_mm512_cmpeq_epi8_mask` result.
+    #[inline(always)]
+    fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let bits = unsafe { _mm512_cmpeq_epi8_mask(a.into(), b.into()) };
+        mask8x64 { val: bits, simd: self }
+    }
+    /// Compares signed byte lanes for `a < b`; the mask is the `_mm512_cmplt_epi8_mask` result.
+    #[inline(always)]
+    fn simd_lt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let bits = unsafe { _mm512_cmplt_epi8_mask(a.into(), b.into()) };
+        mask8x64 { val: bits, simd: self }
+    }
+    /// Compares signed byte lanes for `a <= b`; the mask is the `_mm512_cmple_epi8_mask` result.
+    #[inline(always)]
+    fn simd_le_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let bits = unsafe { _mm512_cmple_epi8_mask(a.into(), b.into()) };
+        mask8x64 { val: bits, simd: self }
+    }
+    /// Compares signed byte lanes for `a >= b`; the mask is the `_mm512_cmpge_epi8_mask` result.
+    #[inline(always)]
+    fn simd_ge_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let bits = unsafe { _mm512_cmpge_epi8_mask(a.into(), b.into()) };
+        mask8x64 { val: bits, simd: self }
+    }
+    /// Compares signed byte lanes for `a > b`; the mask is the `_mm512_cmpgt_epi8_mask` result.
+    #[inline(always)]
+    fn simd_gt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let bits = unsafe { _mm512_cmpgt_epi8_mask(a.into(), b.into()) };
+        mask8x64 { val: bits, simd: self }
+    }
+    /// Interleaves the low 32 lanes of `a` and `b`: `a0, b0, a1, b1, …, a31, b31`.
+    #[inline(always)]
+    fn zip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        unsafe {
+            // Indices 0..=63 select from `a`, 64..=127 from `b`. `_mm512_set_epi8` lists the
+            // highest lane first, so the output order reads from the end of the table.
+            let order = _mm512_set_epi8(
+                95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86, 22,
+                85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13, 76, 12,
+                75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1,
+                64, 0,
+            );
+            _mm512_permutex2var_epi8(a.into(), order, b.into()).simd_into(self)
+        }
+    }
+    /// Interleaves the high 32 lanes of `a` and `b`: `a32, b32, a33, b33, …, a63, b63`.
+    #[inline(always)]
+    fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        unsafe {
+            // Indices 0..=63 select from `a`, 64..=127 from `b`. `_mm512_set_epi8` lists the
+            // highest lane first, so the output order reads from the end of the table.
+            let order = _mm512_set_epi8(
+                127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, 119,
+                55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, 111, 47,
+                110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, 103, 39, 102,
+                38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32,
+            );
+            _mm512_permutex2var_epi8(a.into(), order, b.into()).simd_into(self)
+        }
+    }
+    /// Gathers the even-indexed lanes of `a` followed by the even-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        unsafe {
+            // Indices 0..=63 select from `a`, 64..=127 from `b`; the table (highest lane
+            // first) holds every even index of that 128-entry range.
+            let order = _mm512_set_epi8(
+                126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
+                94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60, 58, 56,
+                54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16,
+                14, 12, 10, 8, 6, 4, 2, 0,
+            );
+            _mm512_permutex2var_epi8(a.into(), order, b.into()).simd_into(self)
+        }
+    }
+    /// Gathers the odd-indexed lanes of `a` followed by the odd-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        unsafe {
+            // Indices 0..=63 select from `a`, 64..=127 from `b`; the table (highest lane
+            // first) holds every odd index of that 128-entry range.
+            let order = _mm512_set_epi8(
+                127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
+                95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, 59, 57,
+                55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, 21, 19, 17,
+                15, 13, 11, 9, 7, 5, 3, 1,
+            );
+            _mm512_permutex2var_epi8(a.into(), order, b.into()).simd_into(self)
+        }
+    }
+    /// Returns both halves of the lane-wise interleave of `a` and `b`.
+    ///
+    /// The first vector is `a0, b0, …, a31, b31` and the second `a32, b32, …, a63, b63`,
+    /// i.e. exactly the pair produced by `zip_low_i8x64` and `zip_high_i8x64`.
+    #[inline(always)]
+    fn interleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
+        let low = self.zip_low_i8x64(a, b);
+        let high = self.zip_high_i8x64(a, b);
+        (low, high)
+    }
+    /// Splits the concatenation of `a` and `b` into its even-indexed and odd-indexed lanes.
+    ///
+    /// The result is exactly the pair produced by `unzip_low_i8x64` and `unzip_high_i8x64`.
+    #[inline(always)]
+    fn deinterleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
+        let evens = self.unzip_low_i8x64(a, b);
+        let odds = self.unzip_high_i8x64(a, b);
+        (evens, odds)
+    }
+    /// Takes each lane from `b` where the matching bit of mask `a` is set, otherwise from `c`.
+    #[inline(always)]
+    fn select_i8x64(self, a: mask8x64<Self>, b: i8x64<Self>, c: i8x64<Self>) -> i8x64<Self> {
+        let (if_set, if_clear): (__m512i, __m512i) = (b.into(), c.into());
+        // `_mm512_mask_blend_epi8` takes the "mask clear" operand first.
+        let blended = unsafe { _mm512_mask_blend_epi8(a.val, if_clear, if_set) };
+        blended.simd_into(self)
+    }
+    /// Lane-wise signed minimum of `a` and `b` via `_mm512_min_epi8`.
+    #[inline(always)]
+    fn min_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let smaller = unsafe { _mm512_min_epi8(a.into(), b.into()) };
+        smaller.simd_into(self)
+    }
+    /// Lane-wise signed maximum of `a` and `b` via `_mm512_max_epi8`.
+    #[inline(always)]
+    fn max_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let larger = unsafe { _mm512_max_epi8(a.into(), b.into()) };
+        larger.simd_into(self)
+    }
+    /// Splits `a` into its low and high 256-bit halves, returned in that order.
+    #[inline(always)]
+    fn split_i8x64(self, a: i8x64<Self>) -> (i8x32<Self>, i8x32<Self>) {
+        let whole: __m512i = a.into();
+        let (low, high) = unsafe {
+            (
+                _mm512_castsi512_si256(whole),
+                _mm512_extracti64x4_epi64::<1>(whole),
+            )
+        };
+        (low.simd_into(self), high.simd_into(self))
+    }
+    /// Negates every `i8` lane by computing `0 - a` with `_mm512_sub_epi8`.
+    #[inline(always)]
+    fn neg_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
+        let negated = unsafe {
+            let zero = _mm512_setzero_si512();
+            _mm512_sub_epi8(zero, a.into())
+        };
+        negated.simd_into(self)
+    }
+    /// Reuses the raw 512-bit register of `a` as a `u8x64`; no lane values are converted.
+    #[inline(always)]
+    fn reinterpret_u8_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
+        let raw: __m512i = a.into();
+        raw.simd_into(self)
+    }
+    /// Reuses the raw 512-bit register of `a` as a `u32x16`; no lane values are converted.
+    #[inline(always)]
+    fn reinterpret_u32_i8x64(self, a: i8x64<Self>) -> u32x16<Self> {
+        let raw: __m512i = a.into();
+        raw.simd_into(self)
+    }
+    /// Broadcasts `val` to all 64 byte lanes.
+    #[inline(always)]
+    fn splat_u8x64(self, val: u8) -> u8x64<Self> {
+        // `_mm512_set1_epi8` takes an `i8`, so pass the same bit pattern reinterpreted.
+        let byte = val.cast_signed();
+        let lanes = unsafe { _mm512_set1_epi8(byte) };
+        lanes.simd_into(self)
+    }
+    /// Builds a `u8x64` from an owned 64-byte array by copying its bytes into the vector storage.
+    #[inline(always)]
+    fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(&val);
+        u8x64 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Builds a `u8x64` from a borrowed 64-byte array by copying its bytes into the vector storage.
+    #[inline(always)]
+    fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(val);
+        u8x64 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Copies the 64 lanes of `a` out into a plain `[u8; 64]`.
+    #[inline(always)]
+    fn as_array_u8x64(self, a: u8x64<Self>) -> [u8; 64usize] {
+        let raw: &__m512i = &a.val.0;
+        crate::transmute::checked_transmute_copy::<__m512i, [u8; 64usize]>(raw)
+    }
+    /// Views the lanes of `a` in place as a `&[u8; 64]` without copying.
+    #[inline(always)]
+    fn as_array_ref_u8x64(self, a: &u8x64<Self>) -> &[u8; 64usize] {
+        let raw: &__m512i = &a.val.0;
+        crate::transmute::checked_cast_ref::<__m512i, [u8; 64usize]>(raw)
+    }
+    /// Views the lanes of `a` in place as a `&mut [u8; 64]` without copying.
+    #[inline(always)]
+    fn as_array_mut_u8x64(self, a: &mut u8x64<Self>) -> &mut [u8; 64usize] {
+        let raw: &mut __m512i = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<__m512i, [u8; 64usize]>(raw)
+    }
+    /// Writes the 64 bytes of `a` into `dest`.
+    #[inline(always)]
+    fn store_array_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
+        let src = (&raw const a.val.0) as *const u8;
+        let dst = dest.as_mut_ptr();
+        // `src` points at the by-value vector `a` and `dst` at the caller's array, so the two
+        // 64-byte regions are distinct objects.
+        unsafe {
+            core::ptr::copy_nonoverlapping(src, dst, 64usize);
+        }
+    }
+    /// Converts a byte vector into a `u8x64`; for this type that is a plain copy of the storage.
+    #[inline(always)]
+    fn cvt_from_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
+        let val = crate::transmute::checked_transmute_copy(&a.val);
+        u8x64 { val, simd: self }
+    }
+    /// Converts a `u8x64` into a byte vector; for this type that is a plain copy of the storage.
+    #[inline(always)]
+    fn cvt_to_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
+        let val = crate::transmute::checked_transmute_copy(&a.val);
+        u8x64 { val, simd: self }
+    }
+    /// Returns 64 consecutive lanes of the concatenation `a ++ b`, starting at lane `SHIFT`.
+    ///
+    /// `SHIFT == 0` yields `a`; any `SHIFT >= 64` yields `b` unchanged.
+    #[inline(always)]
+    fn slide_u8x64<const SHIFT: usize>(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        if SHIFT >= 64usize {
+            return b;
+        }
+        let first = self.cvt_to_bytes_u8x64(a).val.0;
+        let second = self.cvt_to_bytes_u8x64(b).val.0;
+        let picked = unsafe {
+            // Lane numbers 0..=63 (`_mm512_set_epi8` lists the highest lane first).
+            let lane_ids = _mm512_set_epi8(
+                63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,
+                43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
+                23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
+                1, 0,
+            );
+            // Offsetting by SHIFT turns them into indices into the 128-lane pair, where
+            // 0..=63 select from `first` and 64..=127 from `second`.
+            let idx = _mm512_add_epi8(lane_ids, _mm512_set1_epi8((SHIFT) as i8));
+            _mm512_permutex2var_epi8(first, idx, second)
+        };
+        self.cvt_from_bytes_u8x64(u8x64 {
+            val: crate::support::Aligned512(picked),
+            simd: self,
+        })
+    }
+    /// Block-local variant of the slide: `SHIFT == 0` returns `a`, `SHIFT >= 16` returns `b`,
+    /// and anything in between is delegated to `dyn_alignr_512(b, a, SHIFT)`.
+    #[inline(always)]
+    fn slide_within_blocks_u8x64<const SHIFT: usize>(
+        self,
+        a: u8x64<Self>,
+        b: u8x64<Self>,
+    ) -> u8x64<Self> {
+        match SHIFT {
+            0 => a,
+            16.. => b,
+            _ => {
+                let first = self.cvt_to_bytes_u8x64(a).val.0;
+                let second = self.cvt_to_bytes_u8x64(b).val.0;
+                // NOTE(review): argument order (second, first) mirrors the original call;
+                // the helper's exact semantics are defined outside this block.
+                let shifted = unsafe { dyn_alignr_512(second, first, SHIFT) };
+                self.cvt_from_bytes_u8x64(u8x64 {
+                    val: crate::support::Aligned512(shifted),
+                    simd: self,
+                })
+            }
+        }
+    }
+    /// Adds the `u8` lanes of `a` and `b` pairwise with `_mm512_add_epi8`.
+    #[inline(always)]
+    fn add_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let sum = unsafe { _mm512_add_epi8(a.into(), b.into()) };
+        sum.simd_into(self)
+    }
+    /// Subtracts each `u8` lane of `b` from the matching lane of `a` with `_mm512_sub_epi8`.
+    #[inline(always)]
+    fn sub_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let difference = unsafe { _mm512_sub_epi8(a.into(), b.into()) };
+        difference.simd_into(self)
+    }
+    /// Multiplies the `u8` lanes of `a` and `b`, keeping the low 8 bits of each product.
+    ///
+    /// The bytes are multiplied as 16-bit lanes in two passes (even bytes, then odd bytes
+    /// shifted down) and the two low-byte results are merged back into place.
+    #[inline(always)]
+    fn mul_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        unsafe {
+            let lhs: __m512i = a.into();
+            let rhs: __m512i = b.into();
+            // The low byte of each 16-bit product only depends on the even bytes.
+            let even = _mm512_and_si512(_mm512_mullo_epi16(lhs, rhs), _mm512_set1_epi16(0xFF));
+            // Bring the odd bytes down to the low byte, multiply, then move the result back up.
+            let odd = _mm512_mullo_epi16(
+                _mm512_srli_epi16::<8>(lhs),
+                _mm512_srli_epi16::<8>(rhs),
+            );
+            _mm512_or_si512(_mm512_slli_epi16::<8>(odd), even).simd_into(self)
+        }
+    }
+    /// Bitwise AND of the full 512-bit contents of `a` and `b`.
+    #[inline(always)]
+    fn and_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let bits = unsafe { _mm512_and_si512(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise OR of the full 512-bit contents of `a` and `b`.
+    #[inline(always)]
+    fn or_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let bits = unsafe { _mm512_or_si512(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise XOR of the full 512-bit contents of `a` and `b`.
+    #[inline(always)]
+    fn xor_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let bits = unsafe { _mm512_xor_si512(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Flips every bit of `a` by XOR-ing it with an all-ones scalar.
+    #[inline(always)]
+    fn not_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
+        let all_ones = !0;
+        a ^ all_ones
+    }
+    /// Shifts every `u8` lane of `a` left by `shift` bits; bits moved past bit 7 are dropped.
+    ///
+    /// Lanes are zero-extended to 16 bits, shifted with `_mm512_sll_epi16`, and narrowed
+    /// back to bytes.
+    #[inline(always)]
+    fn shl_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
+        unsafe {
+            let val = a.into();
+            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
+            let zero = _mm512_setzero_si512();
+            let lo_16 = _mm512_unpacklo_epi8(val, zero);
+            let hi_16 = _mm512_unpackhi_epi8(val, zero);
+            // `_mm512_packus_epi16` saturates, so a shifted lane above 0xFF used to clamp to
+            // 255 (e.g. `0x80 << 1`). Masking to the low byte first makes the result wrap,
+            // matching `shlv_u8x64`.
+            let byte_mask = _mm512_set1_epi16(0x00ff);
+            let lo_shifted = _mm512_and_si512(_mm512_sll_epi16(lo_16, shift_count), byte_mask);
+            let hi_shifted = _mm512_and_si512(_mm512_sll_epi16(hi_16, shift_count), byte_mask);
+            _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
+    }
+    /// Shifts each `u8` lane of `a` left by the count held in the matching lane of `b`.
+    ///
+    /// Values and counts are zero-extended to 16 bits, shifted with `_mm512_sllv_epi16`,
+    /// masked back to their low byte and re-packed.
+    #[inline(always)]
+    fn shlv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        unsafe {
+            let zeros = _mm512_setzero_si512();
+            let low_byte = _mm512_set1_epi16(0x00ff);
+            let (values, amounts): (__m512i, __m512i) = (a.into(), b.into());
+            // Values and counts are unpacked the same way so they stay lane-aligned.
+            let lo = _mm512_sllv_epi16(
+                _mm512_unpacklo_epi8(values, zeros),
+                _mm512_unpacklo_epi8(amounts, zeros),
+            );
+            let hi = _mm512_sllv_epi16(
+                _mm512_unpackhi_epi8(values, zeros),
+                _mm512_unpackhi_epi8(amounts, zeros),
+            );
+            // Masking to the low byte keeps the saturating pack from clamping.
+            _mm512_packus_epi16(_mm512_and_si512(lo, low_byte), _mm512_and_si512(hi, low_byte))
+                .simd_into(self)
+        }
+    }
+    /// Logical right shift of every `u8` lane of `a` by `shift` bits.
+    ///
+    /// Lanes are zero-extended to 16 bits, shifted with `_mm512_srl_epi16`, and packed back
+    /// to bytes with `_mm512_packus_epi16`.
+    #[inline(always)]
+    fn shr_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
+        unsafe {
+            let lanes: __m512i = a.into();
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            let zero = _mm512_setzero_si512();
+            let lo = _mm512_srl_epi16(_mm512_unpacklo_epi8(lanes, zero), count);
+            let hi = _mm512_srl_epi16(_mm512_unpackhi_epi8(lanes, zero), count);
+            _mm512_packus_epi16(lo, hi).simd_into(self)
+        }
+    }
+    /// Logical right shift of each `u8` lane of `a` by the count in the matching lane of `b`.
+    ///
+    /// Values and counts are zero-extended to 16 bits, shifted with `_mm512_srlv_epi16`,
+    /// masked back to their low byte and re-packed.
+    #[inline(always)]
+    fn shrv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        unsafe {
+            let zeros = _mm512_setzero_si512();
+            let low_byte = _mm512_set1_epi16(0x00ff);
+            let (values, amounts): (__m512i, __m512i) = (a.into(), b.into());
+            // Values and counts are unpacked the same way so they stay lane-aligned.
+            let lo = _mm512_srlv_epi16(
+                _mm512_unpacklo_epi8(values, zeros),
+                _mm512_unpacklo_epi8(amounts, zeros),
+            );
+            let hi = _mm512_srlv_epi16(
+                _mm512_unpackhi_epi8(values, zeros),
+                _mm512_unpackhi_epi8(amounts, zeros),
+            );
+            _mm512_packus_epi16(_mm512_and_si512(lo, low_byte), _mm512_and_si512(hi, low_byte))
+                .simd_into(self)
+        }
+    }
+    /// Compares unsigned byte lanes for `a == b`; the mask is the `_mm512_cmpeq_epu8_mask` result.
+    #[inline(always)]
+    fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let bits = unsafe { _mm512_cmpeq_epu8_mask(a.into(), b.into()) };
+        mask8x64 { val: bits, simd: self }
+    }
+    /// Compares unsigned byte lanes for `a < b`; the mask is the `_mm512_cmplt_epu8_mask` result.
+    #[inline(always)]
+    fn simd_lt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let bits = unsafe { _mm512_cmplt_epu8_mask(a.into(), b.into()) };
+        mask8x64 { val: bits, simd: self }
+    }
+    /// Compares unsigned byte lanes for `a <= b`; the mask is the `_mm512_cmple_epu8_mask` result.
+    #[inline(always)]
+    fn simd_le_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let bits = unsafe { _mm512_cmple_epu8_mask(a.into(), b.into()) };
+        mask8x64 { val: bits, simd: self }
+    }
+    /// Compares unsigned byte lanes for `a >= b`; the mask is the `_mm512_cmpge_epu8_mask` result.
+    #[inline(always)]
+    fn simd_ge_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let bits = unsafe { _mm512_cmpge_epu8_mask(a.into(), b.into()) };
+        mask8x64 { val: bits, simd: self }
+    }
+    /// Compares unsigned byte lanes for `a > b`; the mask is the `_mm512_cmpgt_epu8_mask` result.
+    #[inline(always)]
+    fn simd_gt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let bits = unsafe { _mm512_cmpgt_epu8_mask(a.into(), b.into()) };
+        mask8x64 { val: bits, simd: self }
+    }
+    /// Interleaves the low 32 lanes of `a` and `b`: `a0, b0, a1, b1, …, a31, b31`.
+    #[inline(always)]
+    fn zip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        unsafe {
+            // Indices 0..=63 select from `a`, 64..=127 from `b`. `_mm512_set_epi8` lists the
+            // highest lane first, so the output order reads from the end of the table.
+            let order = _mm512_set_epi8(
+                95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86, 22,
+                85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13, 76, 12,
+                75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1,
+                64, 0,
+            );
+            _mm512_permutex2var_epi8(a.into(), order, b.into()).simd_into(self)
+        }
+    }
+    /// Interleaves the high 32 lanes of `a` and `b`: `a32, b32, a33, b33, …, a63, b63`.
+    #[inline(always)]
+    fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        unsafe {
+            // Indices 0..=63 select from `a`, 64..=127 from `b`. `_mm512_set_epi8` lists the
+            // highest lane first, so the output order reads from the end of the table.
+            let order = _mm512_set_epi8(
+                127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, 119,
+                55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, 111, 47,
+                110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, 103, 39, 102,
+                38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32,
+            );
+            _mm512_permutex2var_epi8(a.into(), order, b.into()).simd_into(self)
+        }
+    }
+    /// Gathers the even-indexed lanes of `a` followed by the even-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        unsafe {
+            // Indices 0..=63 select from `a`, 64..=127 from `b`; the table (highest lane
+            // first) holds every even index of that 128-entry range.
+            let order = _mm512_set_epi8(
+                126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
+                94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60, 58, 56,
+                54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16,
+                14, 12, 10, 8, 6, 4, 2, 0,
+            );
+            _mm512_permutex2var_epi8(a.into(), order, b.into()).simd_into(self)
+        }
+    }
+    /// Gathers the odd-indexed lanes of `a` followed by the odd-indexed lanes of `b`.
+    #[inline(always)]
+    fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        unsafe {
+            // Indices 0..=63 select from `a`, 64..=127 from `b`; the table (highest lane
+            // first) holds every odd index of that 128-entry range.
+            let order = _mm512_set_epi8(
+                127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
+                95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, 59, 57,
+                55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, 21, 19, 17,
+                15, 13, 11, 9, 7, 5, 3, 1,
+            );
+            _mm512_permutex2var_epi8(a.into(), order, b.into()).simd_into(self)
+        }
+    }
+    /// Returns both halves of the lane-wise interleave of `a` and `b`.
+    ///
+    /// The first vector is `a0, b0, …, a31, b31` and the second `a32, b32, …, a63, b63`,
+    /// i.e. exactly the pair produced by `zip_low_u8x64` and `zip_high_u8x64`.
+    #[inline(always)]
+    fn interleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
+        let low = self.zip_low_u8x64(a, b);
+        let high = self.zip_high_u8x64(a, b);
+        (low, high)
+    }
+    /// Splits the concatenation of `a` and `b` into its even-indexed and odd-indexed lanes.
+    ///
+    /// The result is exactly the pair produced by `unzip_low_u8x64` and `unzip_high_u8x64`.
+    #[inline(always)]
+    fn deinterleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
+        let evens = self.unzip_low_u8x64(a, b);
+        let odds = self.unzip_high_u8x64(a, b);
+        (evens, odds)
+    }
+    /// Takes each lane from `b` where the matching bit of mask `a` is set, otherwise from `c`.
+    #[inline(always)]
+    fn select_u8x64(self, a: mask8x64<Self>, b: u8x64<Self>, c: u8x64<Self>) -> u8x64<Self> {
+        let (if_set, if_clear): (__m512i, __m512i) = (b.into(), c.into());
+        // `_mm512_mask_blend_epi8` takes the "mask clear" operand first.
+        let blended = unsafe { _mm512_mask_blend_epi8(a.val, if_clear, if_set) };
+        blended.simd_into(self)
+    }
+    /// Lane-wise unsigned minimum of `a` and `b` via `_mm512_min_epu8`.
+    #[inline(always)]
+    fn min_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let smaller = unsafe { _mm512_min_epu8(a.into(), b.into()) };
+        smaller.simd_into(self)
+    }
+    /// Lane-wise unsigned maximum of `a` and `b` via `_mm512_max_epu8`.
+    #[inline(always)]
+    fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let larger = unsafe { _mm512_max_epu8(a.into(), b.into()) };
+        larger.simd_into(self)
+    }
+    /// Splits `a` into its low and high 256-bit halves, returned in that order.
+    #[inline(always)]
+    fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>) {
+        let whole: __m512i = a.into();
+        let (low, high) = unsafe {
+            (
+                _mm512_castsi512_si256(whole),
+                _mm512_extracti64x4_epi64::<1>(whole),
+            )
+        };
+        (low.simd_into(self), high.simd_into(self))
+    }
+    /// Loads 64 bytes stored as 16 groups of 4 and regroups them by position within the group.
+    ///
+    /// Output lanes 0..16 hold `src[0], src[4], …, src[60]`, lanes 16..32 hold
+    /// `src[1], src[5], …`, and so on for offsets 2 and 3.
+    #[inline(always)]
+    fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
+        let packed: __m512i =
+            crate::transmute::checked_transmute_copy::<[u8; 64usize], __m512i>(src);
+        let regrouped = unsafe {
+            // Source index for every output lane, highest lane first.
+            let gather = _mm512_set_epi8(
+                63, 59, 55, 51, 47, 43, 39, 35, 31, 27, 23, 19, 15, 11, 7, 3, 62, 58, 54, 50,
+                46, 42, 38, 34, 30, 26, 22, 18, 14, 10, 6, 2, 61, 57, 53, 49, 45, 41, 37, 33,
+                29, 25, 21, 17, 13, 9, 5, 1, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16,
+                12, 8, 4, 0,
+            );
+            _mm512_permutexvar_epi8(gather, packed)
+        };
+        regrouped.simd_into(self)
+    }
+    /// Stores `a` to `dest` as 16 groups of 4 bytes, taking one byte from each 16-lane quarter.
+    ///
+    /// `dest` receives `a[0], a[16], a[32], a[48], a[1], a[17], …`, the inverse of the
+    /// regrouping done by `load_interleaved_128_u8x64`.
+    #[inline(always)]
+    fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
+        let out = dest.as_mut_ptr();
+        unsafe {
+            // Source lane for every output byte, highest byte first.
+            let scatter = _mm512_set_epi8(
+                63, 47, 31, 15, 62, 46, 30, 14, 61, 45, 29, 13, 60, 44, 28, 12, 59, 43, 27, 11,
+                58, 42, 26, 10, 57, 41, 25, 9, 56, 40, 24, 8, 55, 39, 23, 7, 54, 38, 22, 6, 53,
+                37, 21, 5, 52, 36, 20, 4, 51, 35, 19, 3, 50, 34, 18, 2, 49, 33, 17, 1, 48, 32,
+                16, 0,
+            );
+            let interleaved = _mm512_permutexvar_epi8(scatter, a.into());
+            // Unaligned store: `dest` is only byte-aligned.
+            _mm512_storeu_si512(out as *mut _, interleaved);
+        }
+    }
+    /// Reuses the raw 512-bit register of `a` as a `u32x16`; no lane values are converted.
+    #[inline(always)]
+    fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
+        let raw: __m512i = a.into();
+        raw.simd_into(self)
+    }
+    /// Builds a mask with every lane set (`true`) or every lane clear (`false`).
+    #[inline(always)]
+    fn splat_mask8x64(self, val: bool) -> mask8x64<Self> {
+        let bits = match val {
+            true => u64::MAX,
+            false => 0,
+        };
+        mask8x64 { val: bits, simd: self }
+    }
+    /// Converts a 64-element `i8` array into a bit mask using `_mm512_movepi8_mask`, which
+    /// collects the most significant bit of each byte.
+    #[inline(always)]
+    fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
+        let lanes: __m512i = crate::transmute::checked_transmute_copy(&val);
+        let bits = unsafe { _mm512_movepi8_mask(lanes) };
+        mask8x64 { val: bits, simd: self }
+    }
+    /// Expands the bit mask into a 64-element `i8` array via `_mm512_movm_epi8`
+    /// (all-ones byte for a set bit, zero for a clear bit).
+    #[inline(always)]
+    fn as_array_mask8x64(self, a: mask8x64<Self>) -> [i8; 64usize] {
+        let expanded = unsafe { _mm512_movm_epi8(a.val) };
+        crate::transmute::checked_transmute_copy(&expanded)
+    }
+    /// Wraps a raw 64-bit pattern as a mask; bit `i` becomes lane `i`.
+    #[inline(always)]
+    fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> {
+        // All 64 bits are meaningful for this width, so the AND keeps every bit.
+        let kept = bits & u64::MAX;
+        mask8x64 { val: kept, simd: self }
+    }
+    /// Returns the mask as a raw 64-bit pattern; lane `i` is bit `i`.
+    #[inline(always)]
+    fn to_bitmask_mask8x64(self, a: mask8x64<Self>) -> u64 {
+        let bits = u64::from(a.val);
+        bits & u64::MAX
+    }
+    /// Sets or clears lane `index` of the mask in place.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `index` is not below 64.
+    #[inline(always)]
+    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 64usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            64usize
+        );
+        let lane_bit = 1u64 << index;
+        let current = u64::from(a.val);
+        let updated = match value {
+            true => current | lane_bit,
+            false => current & !lane_bit,
+        };
+        *a = mask8x64 {
+            val: updated,
+            simd: self,
+        };
+    }
+    /// Lane-wise AND of two masks, computed on their 64-bit patterns.
+    #[inline(always)]
+    fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        let (lhs, rhs) = (u64::from(a.val), u64::from(b.val));
+        let combined = (lhs & rhs) & u64::MAX;
+        mask8x64 { val: combined, simd: self }
+    }
+    /// Lane-wise OR of two masks, computed on their 64-bit patterns.
+    #[inline(always)]
+    fn or_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        let (lhs, rhs) = (u64::from(a.val), u64::from(b.val));
+        let combined = (lhs | rhs) & u64::MAX;
+        mask8x64 { val: combined, simd: self }
+    }
+    /// Lane-wise XOR of two masks, computed on their 64-bit patterns.
+    #[inline(always)]
+    fn xor_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        let (lhs, rhs) = (u64::from(a.val), u64::from(b.val));
+        let combined = (lhs ^ rhs) & u64::MAX;
+        mask8x64 { val: combined, simd: self }
+    }
+    /// Inverts every lane of the mask.
+    #[inline(always)]
+    fn not_mask8x64(self, a: mask8x64<Self>) -> mask8x64<Self> {
+        let inverted = !u64::from(a.val);
+        mask8x64 {
+            val: inverted & u64::MAX,
+            simd: self,
+        }
+    }
+    /// Per lane, takes the bit from `b` where `a` is set and from `c` where `a` is clear.
+    #[inline(always)]
+    fn select_mask8x64(
+        self,
+        a: mask8x64<Self>,
+        b: mask8x64<Self>,
+        c: mask8x64<Self>,
+    ) -> mask8x64<Self> {
+        let chooser = u64::from(a.val);
+        let from_b = chooser & u64::from(b.val);
+        let from_c = !chooser & u64::from(c.val);
+        mask8x64 {
+            val: (from_b | from_c) & u64::MAX,
+            simd: self,
+        }
+    }
+    /// Lane-wise equality of two masks: a result lane is set where `a` and `b` agree.
+    #[inline(always)]
+    fn simd_eq_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        // XOR marks the lanes that differ; inverting it marks the lanes that match.
+        let differing = u64::from(a.val ^ b.val);
+        mask8x64 {
+            val: !differing & u64::MAX,
+            simd: self,
+        }
+    }
+    /// Returns `true` when at least one of the 64 lanes is set.
+    #[inline(always)]
+    fn any_true_mask8x64(self, a: mask8x64<Self>) -> bool {
+        u64::from(a.val) != 0
+    }
+    /// Returns `true` when all 64 lanes are set.
+    #[inline(always)]
+    fn all_true_mask8x64(self, a: mask8x64<Self>) -> bool {
+        u64::from(a.val) == u64::MAX
+    }
+    /// Returns `true` when at least one of the 64 lanes is clear.
+    #[inline(always)]
+    fn any_false_mask8x64(self, a: mask8x64<Self>) -> bool {
+        u64::from(a.val) != u64::MAX
+    }
+    /// Returns `true` when none of the 64 lanes is set.
+    #[inline(always)]
+    fn all_false_mask8x64(self, a: mask8x64<Self>) -> bool {
+        u64::from(a.val) == 0
+    }
+    /// Splits a 64-lane mask into two 32-lane masks: bits 0..32 first, bits 32..64 second.
+    #[inline(always)]
+    fn split_mask8x64(self, a: mask8x64<Self>) -> (mask8x32<Self>, mask8x32<Self>) {
+        const HALF_LANES: u64 = 0xFFFF_FFFF;
+        let bits = u64::from(a.val);
+        let low = mask8x32 {
+            val: (bits & HALF_LANES) as _,
+            simd: self,
+        };
+        let high = mask8x32 {
+            val: ((bits >> 32usize) & HALF_LANES) as _,
+            simd: self,
+        };
+        (low, high)
+    }
+    /// Broadcasts `val` into all 32 lanes via `_mm512_set1_epi16`.
+    #[inline(always)]
+    fn splat_i16x32(self, val: i16) -> i16x32<Self> {
+        let broadcast = unsafe { _mm512_set1_epi16(val) };
+        broadcast.simd_into(self)
+    }
+    /// Builds a vector from an owned array through `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(&val);
+        i16x32 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Builds a vector from a borrowed array through `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(val);
+        i16x32 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Copies the vector's lanes out into an owned `[i16; 32]`.
+    #[inline(always)]
+    fn as_array_i16x32(self, a: i16x32<Self>) -> [i16; 32usize] {
+        let raw: &__m512i = &a.val.0;
+        crate::transmute::checked_transmute_copy::<__m512i, [i16; 32usize]>(raw)
+    }
+    /// Views the vector's storage as a shared `[i16; 32]` without copying.
+    #[inline(always)]
+    fn as_array_ref_i16x32(self, a: &i16x32<Self>) -> &[i16; 32usize] {
+        let raw: &__m512i = &a.val.0;
+        crate::transmute::checked_cast_ref::<__m512i, [i16; 32usize]>(raw)
+    }
+    /// Views the vector's storage as a mutable `[i16; 32]` without copying.
+    #[inline(always)]
+    fn as_array_mut_i16x32(self, a: &mut i16x32<Self>) -> &mut [i16; 32usize] {
+        let raw: &mut __m512i = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<__m512i, [i16; 32usize]>(raw)
+    }
+    /// Writes all 32 lanes of `a` into `dest`.
+    #[inline(always)]
+    fn store_array_i16x32(self, a: i16x32<Self>, dest: &mut [i16; 32usize]) -> () {
+        let src = (&raw const a.val.0).cast::<i16>();
+        let dst = dest.as_mut_ptr();
+        // SAFETY: `src` covers the 512-bit payload of the by-value `a` (32 `i16`s) and `dst`
+        // covers the 32 elements of `dest`; they are distinct objects, so the ranges are disjoint.
+        unsafe {
+            core::ptr::copy_nonoverlapping(src, dst, 32usize);
+        }
+    }
+    /// Reinterprets a byte vector as `i16x32` by copying its storage unchanged.
+    #[inline(always)]
+    fn cvt_from_bytes_i16x32(self, a: u8x64<Self>) -> i16x32<Self> {
+        let storage = crate::transmute::checked_transmute_copy(&a.val);
+        i16x32 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Reinterprets an `i16x32` as a byte vector by copying its storage unchanged.
+    #[inline(always)]
+    fn cvt_to_bytes_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
+        let storage = crate::transmute::checked_transmute_copy(&a.val);
+        u8x64 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Slides a 32-lane window `SHIFT` lanes along the concatenation of `a` then `b`.
+    ///
+    /// A `SHIFT` of 32 or more returns `b` unchanged.
+    #[inline(always)]
+    fn slide_i16x32<const SHIFT: usize>(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        if SHIFT >= 32usize {
+            return b;
+        }
+        let first = self.cvt_to_bytes_i16x32(a).val.0;
+        let second = self.cvt_to_bytes_i16x32(b).val.0;
+        let shifted = unsafe {
+            // Byte indices 0..=63, listed from the highest lane down to the lowest.
+            let iota = _mm512_set_epi8(
+                63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,
+                43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
+                23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
+                1, 0,
+            );
+            // Two bytes per lane, so the byte offset is twice the lane shift.
+            let offset = _mm512_set1_epi8((SHIFT * 2usize) as i8);
+            let idx = _mm512_add_epi8(iota, offset);
+            _mm512_permutex2var_epi8(first, idx, second)
+        };
+        self.cvt_from_bytes_i16x32(u8x64 {
+            val: crate::support::Aligned512(shifted),
+            simd: self,
+        })
+    }
+    /// Block-wise slide by `SHIFT` lanes, delegated to `dyn_alignr_512` on the byte views.
+    ///
+    /// `SHIFT == 0` returns `a`; `SHIFT >= 8` returns `b`.
+    #[inline(always)]
+    fn slide_within_blocks_i16x32<const SHIFT: usize>(
+        self,
+        a: i16x32<Self>,
+        b: i16x32<Self>,
+    ) -> i16x32<Self> {
+        if SHIFT == 0 {
+            return a;
+        }
+        if SHIFT >= 8usize {
+            return b;
+        }
+        let first = self.cvt_to_bytes_i16x32(a).val.0;
+        let second = self.cvt_to_bytes_i16x32(b).val.0;
+        // Two bytes per lane, so the byte shift is twice the lane shift.
+        let shifted = unsafe { dyn_alignr_512(second, first, SHIFT * 2usize) };
+        self.cvt_from_bytes_i16x32(u8x64 {
+            val: crate::support::Aligned512(shifted),
+            simd: self,
+        })
+    }
+    /// Lane-wise addition via `_mm512_add_epi16`.
+    #[inline(always)]
+    fn add_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let sum = unsafe { _mm512_add_epi16(lhs, rhs) };
+        sum.simd_into(self)
+    }
+    /// Lane-wise subtraction (`a - b`) via `_mm512_sub_epi16`.
+    #[inline(always)]
+    fn sub_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let difference = unsafe { _mm512_sub_epi16(lhs, rhs) };
+        difference.simd_into(self)
+    }
+    /// Lane-wise multiplication keeping the low 16 bits of each product (`_mm512_mullo_epi16`).
+    #[inline(always)]
+    fn mul_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let product = unsafe { _mm512_mullo_epi16(lhs, rhs) };
+        product.simd_into(self)
+    }
+    /// Bitwise AND of the two 512-bit vectors.
+    #[inline(always)]
+    fn and_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let bits = unsafe { _mm512_and_si512(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise OR of the two 512-bit vectors.
+    #[inline(always)]
+    fn or_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let bits = unsafe { _mm512_or_si512(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise XOR of the two 512-bit vectors.
+    #[inline(always)]
+    fn xor_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let bits = unsafe { _mm512_xor_si512(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise NOT, expressed as XOR with an all-ones scalar through the vector's `^` operator.
+    #[inline(always)]
+    fn not_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
+        let all_ones = !0;
+        a ^ all_ones
+    }
+    /// Shifts every lane left by the same `shift` count via `_mm512_sll_epi16`.
+    #[inline(always)]
+    fn shl_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
+        let lanes: __m512i = a.into();
+        let shifted = unsafe {
+            // The intrinsic takes the count in the low bits of a 128-bit register.
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm512_sll_epi16(lanes, count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Shifts each lane of `a` left by the count in the matching lane of `b`.
+    #[inline(always)]
+    fn shlv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (lanes, counts): (__m512i, __m512i) = (a.into(), b.into());
+        let shifted = unsafe { _mm512_sllv_epi16(lanes, counts) };
+        shifted.simd_into(self)
+    }
+    /// Arithmetic right shift of every lane by the same `shift` count (`_mm512_sra_epi16`).
+    #[inline(always)]
+    fn shr_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
+        let lanes: __m512i = a.into();
+        let shifted = unsafe {
+            // The intrinsic takes the count in the low bits of a 128-bit register.
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm512_sra_epi16(lanes, count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Arithmetic right shift of each lane of `a` by the count in the matching lane of `b`.
+    #[inline(always)]
+    fn shrv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (lanes, counts): (__m512i, __m512i) = (a.into(), b.into());
+        let shifted = unsafe { _mm512_srav_epi16(lanes, counts) };
+        shifted.simd_into(self)
+    }
+    /// Lane-wise `a == b`, returned as a 32-lane bitmask.
+    #[inline(always)]
+    fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let bits = unsafe { _mm512_cmpeq_epi16_mask(lhs, rhs) };
+        mask16x32 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise signed `a < b`, returned as a 32-lane bitmask.
+    #[inline(always)]
+    fn simd_lt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let bits = unsafe { _mm512_cmplt_epi16_mask(lhs, rhs) };
+        mask16x32 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise signed `a <= b`, returned as a 32-lane bitmask.
+    #[inline(always)]
+    fn simd_le_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let bits = unsafe { _mm512_cmple_epi16_mask(lhs, rhs) };
+        mask16x32 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise signed `a >= b`, returned as a 32-lane bitmask.
+    #[inline(always)]
+    fn simd_ge_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let bits = unsafe { _mm512_cmpge_epi16_mask(lhs, rhs) };
+        mask16x32 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise signed `a > b`, returned as a 32-lane bitmask.
+    #[inline(always)]
+    fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let bits = unsafe { _mm512_cmpgt_epi16_mask(lhs, rhs) };
+        mask16x32 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Interleaves lanes 0..16 of `a` and `b` (`a0, b0, a1, b1, ...`) with a two-source permute.
+    #[inline(always)]
+    fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (first, second): (__m512i, __m512i) = (a.into(), b.into());
+        let zipped = unsafe {
+            // Listed from lane 31 down to lane 0; indices 32 and above select from `second`.
+            let pick = _mm512_set_epi16(
+                47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6, 37, 5,
+                36, 4, 35, 3, 34, 2, 33, 1, 32, 0,
+            );
+            _mm512_permutex2var_epi16(first, pick, second)
+        };
+        zipped.simd_into(self)
+    }
+    /// Interleaves lanes 16..32 of `a` and `b` (`a16, b16, a17, ...`) with a two-source permute.
+    #[inline(always)]
+    fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (first, second): (__m512i, __m512i) = (a.into(), b.into());
+        let zipped = unsafe {
+            // Listed from lane 31 down to lane 0; indices 32 and above select from `second`.
+            let pick = _mm512_set_epi16(
+                63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54, 22,
+                53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16,
+            );
+            _mm512_permutex2var_epi16(first, pick, second)
+        };
+        zipped.simd_into(self)
+    }
+    /// Gathers the even-indexed lanes of the concatenation `a` then `b`.
+    #[inline(always)]
+    fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (first, second): (__m512i, __m512i) = (a.into(), b.into());
+        let evens = unsafe {
+            // Listed from lane 31 down to lane 0; indices 32 and above select from `second`.
+            let pick = _mm512_set_epi16(
+                62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24,
+                22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
+            );
+            _mm512_permutex2var_epi16(first, pick, second)
+        };
+        evens.simd_into(self)
+    }
+    /// Gathers the odd-indexed lanes of the concatenation `a` then `b`.
+    #[inline(always)]
+    fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (first, second): (__m512i, __m512i) = (a.into(), b.into());
+        let odds = unsafe {
+            // Listed from lane 31 down to lane 0; indices 32 and above select from `second`.
+            let pick = _mm512_set_epi16(
+                63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25,
+                23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
+            );
+            _mm512_permutex2var_epi16(first, pick, second)
+        };
+        odds.simd_into(self)
+    }
+    /// Returns both halves of the lane-wise interleave of `a` and `b`: (low half, high half).
+    #[inline(always)]
+    fn interleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
+        let (first, second): (__m512i, __m512i) = (a.into(), b.into());
+        let (low, high) = unsafe {
+            // Both tables are listed from lane 31 down to lane 0; indices 32+ select `second`.
+            let pick_low = _mm512_set_epi16(
+                47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6, 37, 5,
+                36, 4, 35, 3, 34, 2, 33, 1, 32, 0,
+            );
+            let pick_high = _mm512_set_epi16(
+                63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54, 22,
+                53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16,
+            );
+            (
+                _mm512_permutex2var_epi16(first, pick_low, second),
+                _mm512_permutex2var_epi16(first, pick_high, second),
+            )
+        };
+        (low.simd_into(self), high.simd_into(self))
+    }
+    /// Splits the concatenation `a` then `b` into (even-indexed lanes, odd-indexed lanes).
+    #[inline(always)]
+    fn deinterleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
+        let (first, second): (__m512i, __m512i) = (a.into(), b.into());
+        let (evens, odds) = unsafe {
+            // Both tables are listed from lane 31 down to lane 0; indices 32+ select `second`.
+            let pick_even = _mm512_set_epi16(
+                62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24,
+                22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
+            );
+            let pick_odd = _mm512_set_epi16(
+                63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25,
+                23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
+            );
+            (
+                _mm512_permutex2var_epi16(first, pick_even, second),
+                _mm512_permutex2var_epi16(first, pick_odd, second),
+            )
+        };
+        (evens.simd_into(self), odds.simd_into(self))
+    }
+    /// Per-lane select: takes the lane from `b` where mask `a` is set, otherwise from `c`.
+    #[inline(always)]
+    fn select_i16x32(self, a: mask16x32<Self>, b: i16x32<Self>, c: i16x32<Self>) -> i16x32<Self> {
+        let (when_set, when_clear): (__m512i, __m512i) = (b.into(), c.into());
+        // The blend intrinsic takes the "mask clear" source first, hence the swapped order.
+        let blended = unsafe { _mm512_mask_blend_epi16(a.val, when_clear, when_set) };
+        blended.simd_into(self)
+    }
+    /// Lane-wise signed minimum via `_mm512_min_epi16`.
+    #[inline(always)]
+    fn min_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let smallest = unsafe { _mm512_min_epi16(lhs, rhs) };
+        smallest.simd_into(self)
+    }
+    /// Lane-wise signed maximum via `_mm512_max_epi16`.
+    #[inline(always)]
+    fn max_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let largest = unsafe { _mm512_max_epi16(lhs, rhs) };
+        largest.simd_into(self)
+    }
+    /// Splits the 512-bit vector into its low and high 256-bit halves.
+    #[inline(always)]
+    fn split_i16x32(self, a: i16x32<Self>) -> (i16x16<Self>, i16x16<Self>) {
+        let whole: __m512i = a.into();
+        let (low, high) = unsafe {
+            (
+                _mm512_castsi512_si256(whole),
+                _mm512_extracti64x4_epi64::<1>(whole),
+            )
+        };
+        (low.simd_into(self), high.simd_into(self))
+    }
+    /// Lane-wise negation, computed as `0 - a`.
+    #[inline(always)]
+    fn neg_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
+        let lanes: __m512i = a.into();
+        let negated = unsafe { _mm512_sub_epi16(_mm512_setzero_si512(), lanes) };
+        negated.simd_into(self)
+    }
+    /// Reinterprets the same 512 bits as a `u8x64`.
+    #[inline(always)]
+    fn reinterpret_u8_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
+        let raw: __m512i = a.into();
+        raw.simd_into(self)
+    }
+    /// Reinterprets the same 512 bits as a `u32x16`.
+    #[inline(always)]
+    fn reinterpret_u32_i16x32(self, a: i16x32<Self>) -> u32x16<Self> {
+        let raw: __m512i = a.into();
+        raw.simd_into(self)
+    }
+    /// Broadcasts `val` into all 32 lanes; the intrinsic takes the bits as a signed `i16`.
+    #[inline(always)]
+    fn splat_u16x32(self, val: u16) -> u16x32<Self> {
+        let broadcast = unsafe { _mm512_set1_epi16(val.cast_signed()) };
+        broadcast.simd_into(self)
+    }
+    /// Builds a vector from an owned array through `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(&val);
+        u16x32 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Builds a vector from a borrowed array through `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(val);
+        u16x32 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Copies the vector's lanes out into an owned `[u16; 32]`.
+    #[inline(always)]
+    fn as_array_u16x32(self, a: u16x32<Self>) -> [u16; 32usize] {
+        let raw: &__m512i = &a.val.0;
+        crate::transmute::checked_transmute_copy::<__m512i, [u16; 32usize]>(raw)
+    }
+    /// Views the vector's storage as a shared `[u16; 32]` without copying.
+    #[inline(always)]
+    fn as_array_ref_u16x32(self, a: &u16x32<Self>) -> &[u16; 32usize] {
+        let raw: &__m512i = &a.val.0;
+        crate::transmute::checked_cast_ref::<__m512i, [u16; 32usize]>(raw)
+    }
+    /// Views the vector's storage as a mutable `[u16; 32]` without copying.
+    #[inline(always)]
+    fn as_array_mut_u16x32(self, a: &mut u16x32<Self>) -> &mut [u16; 32usize] {
+        let raw: &mut __m512i = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<__m512i, [u16; 32usize]>(raw)
+    }
+    /// Writes all 32 lanes of `a` into `dest`.
+    #[inline(always)]
+    fn store_array_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
+        let src = (&raw const a.val.0).cast::<u16>();
+        let dst = dest.as_mut_ptr();
+        // SAFETY: `src` covers the 512-bit payload of the by-value `a` (32 `u16`s) and `dst`
+        // covers the 32 elements of `dest`; they are distinct objects, so the ranges are disjoint.
+        unsafe {
+            core::ptr::copy_nonoverlapping(src, dst, 32usize);
+        }
+    }
+    /// Reinterprets a byte vector as `u16x32` by copying its storage unchanged.
+    #[inline(always)]
+    fn cvt_from_bytes_u16x32(self, a: u8x64<Self>) -> u16x32<Self> {
+        let storage = crate::transmute::checked_transmute_copy(&a.val);
+        u16x32 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Reinterprets a `u16x32` as a byte vector by copying its storage unchanged.
+    #[inline(always)]
+    fn cvt_to_bytes_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
+        let storage = crate::transmute::checked_transmute_copy(&a.val);
+        u8x64 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Slides a 32-lane window `SHIFT` lanes along the concatenation of `a` then `b`.
+    ///
+    /// A `SHIFT` of 32 or more returns `b` unchanged.
+    #[inline(always)]
+    fn slide_u16x32<const SHIFT: usize>(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        if SHIFT >= 32usize {
+            return b;
+        }
+        let first = self.cvt_to_bytes_u16x32(a).val.0;
+        let second = self.cvt_to_bytes_u16x32(b).val.0;
+        let shifted = unsafe {
+            // Byte indices 0..=63, listed from the highest lane down to the lowest.
+            let iota = _mm512_set_epi8(
+                63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,
+                43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
+                23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
+                1, 0,
+            );
+            // Two bytes per lane, so the byte offset is twice the lane shift.
+            let offset = _mm512_set1_epi8((SHIFT * 2usize) as i8);
+            let idx = _mm512_add_epi8(iota, offset);
+            _mm512_permutex2var_epi8(first, idx, second)
+        };
+        self.cvt_from_bytes_u16x32(u8x64 {
+            val: crate::support::Aligned512(shifted),
+            simd: self,
+        })
+    }
+    /// Block-wise slide by `SHIFT` lanes, delegated to `dyn_alignr_512` on the byte views.
+    ///
+    /// `SHIFT == 0` returns `a`; `SHIFT >= 8` returns `b`.
+    #[inline(always)]
+    fn slide_within_blocks_u16x32<const SHIFT: usize>(
+        self,
+        a: u16x32<Self>,
+        b: u16x32<Self>,
+    ) -> u16x32<Self> {
+        if SHIFT == 0 {
+            return a;
+        }
+        if SHIFT >= 8usize {
+            return b;
+        }
+        let first = self.cvt_to_bytes_u16x32(a).val.0;
+        let second = self.cvt_to_bytes_u16x32(b).val.0;
+        // Two bytes per lane, so the byte shift is twice the lane shift.
+        let shifted = unsafe { dyn_alignr_512(second, first, SHIFT * 2usize) };
+        self.cvt_from_bytes_u16x32(u8x64 {
+            val: crate::support::Aligned512(shifted),
+            simd: self,
+        })
+    }
+    /// Lane-wise addition via `_mm512_add_epi16`.
+    #[inline(always)]
+    fn add_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let sum = unsafe { _mm512_add_epi16(lhs, rhs) };
+        sum.simd_into(self)
+    }
+    /// Lane-wise subtraction (`a - b`) via `_mm512_sub_epi16`.
+    #[inline(always)]
+    fn sub_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let difference = unsafe { _mm512_sub_epi16(lhs, rhs) };
+        difference.simd_into(self)
+    }
+    /// Lane-wise multiplication keeping the low 16 bits of each product (`_mm512_mullo_epi16`).
+    #[inline(always)]
+    fn mul_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let product = unsafe { _mm512_mullo_epi16(lhs, rhs) };
+        product.simd_into(self)
+    }
+    /// Bitwise AND of the two 512-bit vectors.
+    #[inline(always)]
+    fn and_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let bits = unsafe { _mm512_and_si512(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise OR of the two 512-bit vectors.
+    #[inline(always)]
+    fn or_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let bits = unsafe { _mm512_or_si512(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise XOR of the two 512-bit vectors.
+    #[inline(always)]
+    fn xor_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let bits = unsafe { _mm512_xor_si512(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise NOT, expressed as XOR with an all-ones scalar through the vector's `^` operator.
+    #[inline(always)]
+    fn not_u16x32(self, a: u16x32<Self>) -> u16x32<Self> {
+        let all_ones = !0;
+        a ^ all_ones
+    }
+    /// Shifts every lane left by the same `shift` count via `_mm512_sll_epi16`.
+    #[inline(always)]
+    fn shl_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
+        let lanes: __m512i = a.into();
+        let shifted = unsafe {
+            // The intrinsic takes the count in the low bits of a 128-bit register.
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm512_sll_epi16(lanes, count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Shifts each lane of `a` left by the count in the matching lane of `b`.
+    #[inline(always)]
+    fn shlv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (lanes, counts): (__m512i, __m512i) = (a.into(), b.into());
+        let shifted = unsafe { _mm512_sllv_epi16(lanes, counts) };
+        shifted.simd_into(self)
+    }
+    /// Logical right shift of every lane by the same `shift` count (`_mm512_srl_epi16`).
+    #[inline(always)]
+    fn shr_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
+        let lanes: __m512i = a.into();
+        let shifted = unsafe {
+            // The intrinsic takes the count in the low bits of a 128-bit register.
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm512_srl_epi16(lanes, count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Logical right shift of each lane of `a` by the count in the matching lane of `b`.
+    #[inline(always)]
+    fn shrv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (lanes, counts): (__m512i, __m512i) = (a.into(), b.into());
+        let shifted = unsafe { _mm512_srlv_epi16(lanes, counts) };
+        shifted.simd_into(self)
+    }
+    /// Lane-wise `a == b`, returned as a 32-lane bitmask.
+    #[inline(always)]
+    fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let bits = unsafe { _mm512_cmpeq_epu16_mask(lhs, rhs) };
+        mask16x32 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise unsigned `a < b`, returned as a 32-lane bitmask.
+    #[inline(always)]
+    fn simd_lt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let bits = unsafe { _mm512_cmplt_epu16_mask(lhs, rhs) };
+        mask16x32 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise unsigned `a <= b`, returned as a 32-lane bitmask.
+    #[inline(always)]
+    fn simd_le_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let bits = unsafe { _mm512_cmple_epu16_mask(lhs, rhs) };
+        mask16x32 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise unsigned `a >= b`, returned as a 32-lane bitmask.
+    #[inline(always)]
+    fn simd_ge_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let bits = unsafe { _mm512_cmpge_epu16_mask(lhs, rhs) };
+        mask16x32 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise unsigned `a > b`, returned as a 32-lane bitmask.
+    #[inline(always)]
+    fn simd_gt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let bits = unsafe { _mm512_cmpgt_epu16_mask(lhs, rhs) };
+        mask16x32 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Interleaves lanes 0..16 of `a` and `b` (`a0, b0, a1, b1, ...`) with a two-source permute.
+    #[inline(always)]
+    fn zip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (first, second): (__m512i, __m512i) = (a.into(), b.into());
+        let zipped = unsafe {
+            // Listed from lane 31 down to lane 0; indices 32 and above select from `second`.
+            let pick = _mm512_set_epi16(
+                47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6, 37, 5,
+                36, 4, 35, 3, 34, 2, 33, 1, 32, 0,
+            );
+            _mm512_permutex2var_epi16(first, pick, second)
+        };
+        zipped.simd_into(self)
+    }
+    /// Interleaves lanes 16..32 of `a` and `b` (`a16, b16, a17, ...`) with a two-source permute.
+    #[inline(always)]
+    fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (first, second): (__m512i, __m512i) = (a.into(), b.into());
+        let zipped = unsafe {
+            // Listed from lane 31 down to lane 0; indices 32 and above select from `second`.
+            let pick = _mm512_set_epi16(
+                63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54, 22,
+                53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16,
+            );
+            _mm512_permutex2var_epi16(first, pick, second)
+        };
+        zipped.simd_into(self)
+    }
+    /// Gathers the even-indexed lanes of the concatenation `a` then `b`.
+    #[inline(always)]
+    fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (first, second): (__m512i, __m512i) = (a.into(), b.into());
+        let evens = unsafe {
+            // Listed from lane 31 down to lane 0; indices 32 and above select from `second`.
+            let pick = _mm512_set_epi16(
+                62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24,
+                22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
+            );
+            _mm512_permutex2var_epi16(first, pick, second)
+        };
+        evens.simd_into(self)
+    }
+    /// Gathers the odd-indexed lanes of the concatenation `a` then `b`.
+    #[inline(always)]
+    fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (first, second): (__m512i, __m512i) = (a.into(), b.into());
+        let odds = unsafe {
+            // Listed from lane 31 down to lane 0; indices 32 and above select from `second`.
+            let pick = _mm512_set_epi16(
+                63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25,
+                23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
+            );
+            _mm512_permutex2var_epi16(first, pick, second)
+        };
+        odds.simd_into(self)
+    }
+    /// Returns both halves of the lane-wise interleave of `a` and `b`: (low half, high half).
+    #[inline(always)]
+    fn interleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
+        let (first, second): (__m512i, __m512i) = (a.into(), b.into());
+        let (low, high) = unsafe {
+            // Both tables are listed from lane 31 down to lane 0; indices 32+ select `second`.
+            let pick_low = _mm512_set_epi16(
+                47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6, 37, 5,
+                36, 4, 35, 3, 34, 2, 33, 1, 32, 0,
+            );
+            let pick_high = _mm512_set_epi16(
+                63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54, 22,
+                53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16,
+            );
+            (
+                _mm512_permutex2var_epi16(first, pick_low, second),
+                _mm512_permutex2var_epi16(first, pick_high, second),
+            )
+        };
+        (low.simd_into(self), high.simd_into(self))
+    }
+    /// Splits the concatenation `a` then `b` into (even-indexed lanes, odd-indexed lanes).
+    #[inline(always)]
+    fn deinterleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
+        let (first, second): (__m512i, __m512i) = (a.into(), b.into());
+        let (evens, odds) = unsafe {
+            // Both tables are listed from lane 31 down to lane 0; indices 32+ select `second`.
+            let pick_even = _mm512_set_epi16(
+                62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24,
+                22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
+            );
+            let pick_odd = _mm512_set_epi16(
+                63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25,
+                23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
+            );
+            (
+                _mm512_permutex2var_epi16(first, pick_even, second),
+                _mm512_permutex2var_epi16(first, pick_odd, second),
+            )
+        };
+        (evens.simd_into(self), odds.simd_into(self))
+    }
+    /// Per-lane select: takes the lane from `b` where mask `a` is set, otherwise from `c`.
+    #[inline(always)]
+    fn select_u16x32(self, a: mask16x32<Self>, b: u16x32<Self>, c: u16x32<Self>) -> u16x32<Self> {
+        let (when_set, when_clear): (__m512i, __m512i) = (b.into(), c.into());
+        // The blend intrinsic takes the "mask clear" source first, hence the swapped order.
+        let blended = unsafe { _mm512_mask_blend_epi16(a.val, when_clear, when_set) };
+        blended.simd_into(self)
+    }
+    /// Lane-wise unsigned minimum via `_mm512_min_epu16`.
+    #[inline(always)]
+    fn min_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let smallest = unsafe { _mm512_min_epu16(lhs, rhs) };
+        smallest.simd_into(self)
+    }
+    /// Lane-wise unsigned maximum via `_mm512_max_epu16`.
+    #[inline(always)]
+    fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let largest = unsafe { _mm512_max_epu16(lhs, rhs) };
+        largest.simd_into(self)
+    }
+    /// Splits the 512-bit vector into its low and high 256-bit halves.
+    #[inline(always)]
+    fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>) {
+        let whole: __m512i = a.into();
+        let (low, high) = unsafe {
+            (
+                _mm512_castsi512_si256(whole),
+                _mm512_extracti64x4_epi64::<1>(whole),
+            )
+        };
+        (low.simd_into(self), high.simd_into(self))
+    }
+    /// Loads 32 values stored as 4-way interleaved groups and de-interleaves them.
+    ///
+    /// Output lanes 0..8 receive `src[0], src[4], ...`, lanes 8..16 receive `src[1], src[5], ...`,
+    /// and so on for the remaining two groups.
+    #[inline(always)]
+    fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
+        let packed = crate::transmute::checked_transmute_copy::<[u16; 32usize], __m512i>(src);
+        let gathered = unsafe {
+            // Source index for each output lane, listed from lane 31 down to lane 0.
+            let pick = _mm512_set_epi16(
+                31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2, 29, 25, 21, 17, 13, 9,
+                5, 1, 28, 24, 20, 16, 12, 8, 4, 0,
+            );
+            _mm512_permutexvar_epi16(pick, packed)
+        };
+        gathered.simd_into(self)
+    }
+    /// Re-interleaves the four 8-lane groups of `a` and writes the result to `dest`.
+    ///
+    /// `dest[0..4]` receives lanes 0, 8, 16, 24; `dest[4..8]` receives lanes 1, 9, 17, 25; etc.
+    #[inline(always)]
+    fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
+        let lanes: __m512i = a.into();
+        unsafe {
+            // Source index for each output lane, listed from lane 31 down to lane 0.
+            let pick = _mm512_set_epi16(
+                31, 23, 15, 7, 30, 22, 14, 6, 29, 21, 13, 5, 28, 20, 12, 4, 27, 19, 11, 3, 26, 18,
+                10, 2, 25, 17, 9, 1, 24, 16, 8, 0,
+            );
+            let interleaved = _mm512_permutexvar_epi16(pick, lanes);
+            // `dest` is 32 `u16`s, i.e. exactly the 64 bytes one unaligned 512-bit store writes.
+            _mm512_storeu_si512(dest.as_mut_ptr().cast(), interleaved);
+        }
+    }
+    /// Narrows each 16-bit lane to 8 bits via `_mm512_cvtepi16_epi8`.
+    #[inline(always)]
+    fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
+        let wide: __m512i = a.into();
+        let narrowed = unsafe { _mm512_cvtepi16_epi8(wide) };
+        narrowed.simd_into(self)
+    }
+    /// Reinterprets the same 512 bits as a `u8x64`.
+    #[inline(always)]
+    fn reinterpret_u8_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
+        let raw: __m512i = a.into();
+        raw.simd_into(self)
+    }
+    /// Reinterprets the same 512 bits as a `u32x16`.
+    #[inline(always)]
+    fn reinterpret_u32_u16x32(self, a: u16x32<Self>) -> u32x16<Self> {
+        let raw: __m512i = a.into();
+        raw.simd_into(self)
+    }
+    /// Builds a 32-lane mask with every lane set (`true`) or every lane clear (`false`).
+    #[inline(always)]
+    fn splat_mask16x32(self, val: bool) -> mask16x32<Self> {
+        let bits = if val { 0xFFFF_FFFF_u64 } else { 0 };
+        mask16x32 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Builds a mask from an `i16` array by passing its lanes through `_mm512_movepi16_mask`.
+    #[inline(always)]
+    fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
+        let lanes: __m512i = crate::transmute::checked_transmute_copy(&val);
+        let bits = unsafe { _mm512_movepi16_mask(lanes) };
+        mask16x32 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Expands the bitmask into an `i16` array via `_mm512_movm_epi16`.
+    #[inline(always)]
+    fn as_array_mask16x32(self, a: mask16x32<Self>) -> [i16; 32usize] {
+        let expanded = unsafe { _mm512_movm_epi16(a.val) };
+        crate::transmute::checked_transmute_copy(&expanded)
+    }
+    /// Builds a mask from the low 32 bits of `bits`; higher bits are discarded.
+    #[inline(always)]
+    fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32<Self> {
+        const LANE_BITS: u64 = 0xFFFF_FFFF;
+        let kept = bits & LANE_BITS;
+        mask16x32 {
+            val: kept as _,
+            simd: self,
+        }
+    }
+    /// Returns the mask as a `u64` whose low 32 bits are the lanes.
+    #[inline(always)]
+    fn to_bitmask_mask16x32(self, a: mask16x32<Self>) -> u64 {
+        const LANE_BITS: u64 = 0xFFFF_FFFF;
+        let widened = u64::from(a.val);
+        widened & LANE_BITS
+    }
+    /// Sets or clears lane `index` of a 32-lane mask; bit `index` of the backing word is the lane.
+    ///
+    /// # Panics
+    /// Panics when `index` is 32 or larger.
+    #[inline(always)]
+    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        let lane_bit = 1u64 << index;
+        let current = u64::from(a.val);
+        let updated = if value {
+            current | lane_bit
+        } else {
+            current & !lane_bit
+        };
+        *a = mask16x32 {
+            val: updated as _,
+            simd: self,
+        };
+    }
+    /// Lane-wise AND of two 32-lane masks, computed on their widened bit patterns.
+    #[inline(always)]
+    fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        const LANE_BITS: u64 = 0xFFFF_FFFF;
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        mask16x32 {
+            val: ((lhs & rhs) & LANE_BITS) as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise OR of two 32-lane masks, computed on their widened bit patterns.
+    #[inline(always)]
+    fn or_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        const LANE_BITS: u64 = 0xFFFF_FFFF;
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        mask16x32 {
+            val: ((lhs | rhs) & LANE_BITS) as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise XOR of two 32-lane masks, computed on their widened bit patterns.
+    #[inline(always)]
+    fn xor_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        const LANE_BITS: u64 = 0xFFFF_FFFF;
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        mask16x32 {
+            val: ((lhs ^ rhs) & LANE_BITS) as _,
+            simd: self,
+        }
+    }
+    /// Inverts every lane of a 32-lane mask.
+    #[inline(always)]
+    fn not_mask16x32(self, a: mask16x32<Self>) -> mask16x32<Self> {
+        const LANE_BITS: u64 = 0xFFFF_FFFF;
+        // Inverting the widened word sets bits 32..64 too; the lane mask strips them again.
+        let inverted = !u64::from(a.val);
+        mask16x32 {
+            val: (inverted & LANE_BITS) as _,
+            simd: self,
+        }
+    }
+    /// Per-lane select between two masks: takes the lane from `b` where `a` is set, else from `c`.
+    #[inline(always)]
+    fn select_mask16x32(
+        self,
+        a: mask16x32<Self>,
+        b: mask16x32<Self>,
+        c: mask16x32<Self>,
+    ) -> mask16x32<Self> {
+        const LANE_BITS: u64 = 0xFFFF_FFFF;
+        let cond = u64::from(a.val);
+        let when_set = u64::from(b.val);
+        let when_clear = u64::from(c.val);
+        let chosen = (cond & when_set) | (!cond & when_clear);
+        mask16x32 {
+            val: (chosen & LANE_BITS) as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise equality of two masks: a result lane is set where `a` and `b` agree.
+    #[inline(always)]
+    fn simd_eq_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        const LANE_BITS: u64 = 0xFFFF_FFFF;
+        // XOR marks the lanes that differ; inverting it marks the lanes that match.
+        let differing = u64::from(a.val ^ b.val);
+        mask16x32 {
+            val: (!differing & LANE_BITS) as _,
+            simd: self,
+        }
+    }
+    /// Returns `true` when at least one of the 32 lanes is set.
+    #[inline(always)]
+    fn any_true_mask16x32(self, a: mask16x32<Self>) -> bool {
+        const LANE_BITS: u64 = 0xFFFF_FFFF;
+        (u64::from(a.val) & LANE_BITS) != 0
+    }
+    /// Returns `true` when all 32 lanes are set.
+    #[inline(always)]
+    fn all_true_mask16x32(self, a: mask16x32<Self>) -> bool {
+        const LANE_BITS: u64 = 0xFFFF_FFFF;
+        (u64::from(a.val) & LANE_BITS) == LANE_BITS
+    }
+    /// Returns `true` when at least one of the 32 lanes is clear.
+    #[inline(always)]
+    fn any_false_mask16x32(self, a: mask16x32<Self>) -> bool {
+        const LANE_BITS: u64 = 0xFFFF_FFFF;
+        (u64::from(a.val) & LANE_BITS) != LANE_BITS
+    }
+    /// Returns `true` when none of the 32 lanes is set.
+    #[inline(always)]
+    fn all_false_mask16x32(self, a: mask16x32<Self>) -> bool {
+        const LANE_BITS: u64 = 0xFFFF_FFFF;
+        (u64::from(a.val) & LANE_BITS) == 0
+    }
+    /// Splits a 32-lane mask into two 16-lane masks: bits 0..16 first, bits 16..32 second.
+    #[inline(always)]
+    fn split_mask16x32(self, a: mask16x32<Self>) -> (mask16x16<Self>, mask16x16<Self>) {
+        const HALF_LANES: u64 = 0xFFFF;
+        let bits = u64::from(a.val);
+        let low = mask16x16 {
+            val: (bits & HALF_LANES) as _,
+            simd: self,
+        };
+        let high = mask16x16 {
+            val: ((bits >> 16usize) & HALF_LANES) as _,
+            simd: self,
+        };
+        (low, high)
+    }
+    /// Broadcasts `val` into all 16 lanes via `_mm512_set1_epi32`.
+    #[inline(always)]
+    fn splat_i32x16(self, val: i32) -> i32x16<Self> {
+        let broadcast = unsafe { _mm512_set1_epi32(val) };
+        broadcast.simd_into(self)
+    }
+    /// Builds a vector from an owned array through `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(&val);
+        i32x16 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Builds a vector from a borrowed array through `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(val);
+        i32x16 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    #[inline(always)] // Copies the vector's `__m512i` storage out as a plain `[i32; 16]`.
+    fn as_array_i32x16(self, a: i32x16<Self>) -> [i32; 16usize] {
+        crate::transmute::checked_transmute_copy::<__m512i, [i32; 16usize]>(&a.val.0)
+    }
+    #[inline(always)] // Borrows the vector's `__m512i` storage as a shared `[i32; 16]`.
+    fn as_array_ref_i32x16(self, a: &i32x16<Self>) -> &[i32; 16usize] {
+        crate::transmute::checked_cast_ref::<__m512i, [i32; 16usize]>(&a.val.0)
+    }
+    #[inline(always)] // Borrows the vector's `__m512i` storage as a mutable `[i32; 16]`.
+    fn as_array_mut_i32x16(self, a: &mut i32x16<Self>) -> &mut [i32; 16usize] {
+        crate::transmute::checked_cast_mut::<__m512i, [i32; 16usize]>(&mut a.val.0)
+    }
+    #[inline(always)] // Writes all 16 lanes of `a` into `dest`.
+    fn store_array_i32x16(self, a: i32x16<Self>, dest: &mut [i32; 16usize]) -> () {
+        unsafe {
+            core::ptr::copy_nonoverlapping(
+                (&raw const a.val.0) as *const i32, // source: the by-value vector, read as i32 elements
+                dest.as_mut_ptr(),
+                16usize, // count is in elements, not bytes
+            );
+        }
+    }
+    #[inline(always)] // Rewraps the storage of a `u8x64` as an `i32x16`.
+    fn cvt_from_bytes_i32x16(self, a: u8x64<Self>) -> i32x16<Self> {
+        i32x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val), // presumably a size-checked bit copy
+            simd: self,
+        }
+    }
+    #[inline(always)] // Rewraps the storage of an `i32x16` as a `u8x64`.
+    fn cvt_to_bytes_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val), // presumably a size-checked bit copy
+            simd: self,
+        }
+    }
+    #[inline(always)] // Returns the 16-lane window of `a` followed by `b` that starts at lane `SHIFT`.
+    fn slide_i32x16<const SHIFT: usize>(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe {
+            if SHIFT >= 16usize { // a shift of a whole vector or more yields `b` unchanged
+                return b;
+            }
+            let idx = _mm512_add_epi8( // per-byte source index: byte position + shift in bytes
+                _mm512_set_epi8( // arguments run from byte 63 down to byte 0, so byte i holds i
+                    63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,
+                    43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
+                    23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
+                    1, 0,
+                ),
+                _mm512_set1_epi8((SHIFT * 4usize) as i8), // 4 bytes per lane; at most 60 here, so the cast is lossless
+            );
+            let result = _mm512_permutex2var_epi8( // indices 0..=63 pick bytes of `a`, 64..=127 bytes of `b`
+                self.cvt_to_bytes_i32x16(a).val.0,
+                idx,
+                self.cvt_to_bytes_i32x16(b).val.0,
+            );
+            self.cvt_from_bytes_i32x16(u8x64 { // rewrap the permuted bytes as i32 lanes
+                val: crate::support::Aligned512(result),
+                simd: self,
+            })
+        }
+    }
+    #[inline(always)] // Slide variant delegated to `dyn_alignr_512` on the byte views of `a` and `b`.
+    fn slide_within_blocks_i32x16<const SHIFT: usize>(
+        self,
+        a: i32x16<Self>,
+        b: i32x16<Self>,
+    ) -> i32x16<Self> {
+        unsafe {
+            if SHIFT == 0 { // no shift: result is `a`
+                return a;
+            }
+            if SHIFT >= 4usize { // 4 lanes or more: result is `b`
+                return b;
+            }
+            let a = self.cvt_to_bytes_i32x16(a).val.0;
+            let b = self.cvt_to_bytes_i32x16(b).val.0;
+            let result = dyn_alignr_512(b, a, SHIFT * 4usize); // byte shift; presumably a per-128-bit-block align-right — confirm in helper
+            self.cvt_from_bytes_i32x16(u8x64 {
+                val: crate::support::Aligned512(result),
+                simd: self,
+            })
+        }
+    }
+    #[inline(always)] // Lane-wise 32-bit integer addition.
+    fn add_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_add_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise 32-bit integer subtraction (`a - b`).
+    fn sub_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_sub_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise multiply keeping the low 32 bits of each product.
+    fn mul_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_mullo_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Bitwise AND over the full 512 bits.
+    fn and_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_and_si512(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Bitwise OR over the full 512 bits.
+    fn or_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_or_si512(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Bitwise XOR over the full 512 bits.
+    fn xor_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_xor_si512(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Bitwise NOT of every lane.
+    fn not_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
+        a ^ !0 // XOR with all-ones; relies on a vector-by-scalar `^` impl defined elsewhere
+    }
+    #[inline(always)] // Shifts every lane left by the same `shift` count.
+    fn shl_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
+        unsafe { // the count is handed to the intrinsic in a 128-bit register
+            _mm512_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self)
+        }
+    }
+    #[inline(always)] // Shifts each lane of `a` left by the count in the matching lane of `b`.
+    fn shlv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_sllv_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Arithmetic (sign-filling) right shift of every lane by `shift`.
+    fn shr_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
+        unsafe { // the count is handed to the intrinsic in a 128-bit register
+            _mm512_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self)
+        }
+    }
+    #[inline(always)] // Arithmetic right shift of each lane of `a` by the matching lane of `b`.
+    fn shrv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_srav_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise `a == b`, returned as a per-lane bitmask.
+    fn simd_eq_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        unsafe {
+            mask32x16 {
+                val: _mm512_cmpeq_epi32_mask(a.into(), b.into()), // one result bit per lane
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise signed `a < b`, returned as a per-lane bitmask.
+    fn simd_lt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        unsafe {
+            mask32x16 {
+                val: _mm512_cmplt_epi32_mask(a.into(), b.into()), // one result bit per lane
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise signed `a <= b`, returned as a per-lane bitmask.
+    fn simd_le_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        unsafe {
+            mask32x16 {
+                val: _mm512_cmple_epi32_mask(a.into(), b.into()), // one result bit per lane
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise signed `a >= b`, returned as a per-lane bitmask.
+    fn simd_ge_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        unsafe {
+            mask32x16 {
+                val: _mm512_cmpge_epi32_mask(a.into(), b.into()), // one result bit per lane
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise signed `a > b`, returned as a per-lane bitmask.
+    fn simd_gt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        unsafe {
+            mask32x16 {
+                val: _mm512_cmpgt_epi32_mask(a.into(), b.into()), // one result bit per lane
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Interleaves the low 8 lanes of `a` and `b`: a0, b0, a1, b1, ...
+    fn zip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe {
+            _mm512_permutex2var_epi32( // indices 0..=15 select from `a`, 16..=31 from `b`
+                a.into(),
+                _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Interleaves the high 8 lanes of `a` and `b`: a8, b8, a9, b9, ...
+    fn zip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe {
+            _mm512_permutex2var_epi32( // indices 0..=15 select from `a`, 16..=31 from `b`
+                a.into(),
+                _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Gathers the even-indexed lanes of `a`, then the even-indexed lanes of `b`.
+    fn unzip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe {
+            _mm512_permutex2var_epi32( // indices 0..=15 select from `a`, 16..=31 from `b`
+                a.into(),
+                _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Gathers the odd-indexed lanes of `a`, then the odd-indexed lanes of `b`.
+    fn unzip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe {
+            _mm512_permutex2var_epi32( // indices 0..=15 select from `a`, 16..=31 from `b`
+                a.into(),
+                _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Returns both interleavings of `a` and `b`: (low halves zipped, high halves zipped).
+    fn interleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
+        unsafe {
+            let a = a.into();
+            let b = b.into();
+            (
+                _mm512_permutex2var_epi32( // a0, b0, a1, b1, ... a7, b7
+                    a,
+                    _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                    b,
+                )
+                .simd_into(self),
+                _mm512_permutex2var_epi32( // a8, b8, a9, b9, ... a15, b15
+                    a,
+                    _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
+                    b,
+                )
+                .simd_into(self),
+            )
+        }
+    }
+    #[inline(always)] // Splits the 32 lanes of `a` then `b` into (even-indexed lanes, odd-indexed lanes).
+    fn deinterleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
+        unsafe {
+            let a = a.into();
+            let b = b.into();
+            (
+                _mm512_permutex2var_epi32( // even positions: indices 0..=15 are `a`, 16..=31 are `b`
+                    a,
+                    _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
+                    b,
+                )
+                .simd_into(self),
+                _mm512_permutex2var_epi32( // odd positions
+                    a,
+                    _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
+                    b,
+                )
+                .simd_into(self),
+            )
+        }
+    }
+    #[inline(always)] // Per lane: takes `b` where the mask bit in `a` is set, otherwise `c`.
+    fn select_i32x16(self, a: mask32x16<Self>, b: i32x16<Self>, c: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise signed minimum.
+    fn min_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_min_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise signed maximum.
+    fn max_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_max_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Splits the vector into its (low, high) 8-lane halves.
+    fn split_i32x16(self, a: i32x16<Self>) -> (i32x8<Self>, i32x8<Self>) {
+        unsafe {
+            (
+                _mm512_castsi512_si256(a.into()).simd_into(self), // low 256 bits
+                _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), // high 256 bits
+            )
+        }
+    }
+    #[inline(always)] // Lane-wise negation, computed as `0 - a`.
+    fn neg_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_sub_epi32(_mm512_setzero_si512(), a.into()).simd_into(self) }
+    }
+    #[inline(always)] // Rewraps the same `__m512i` value as a `u8x64`.
+    fn reinterpret_u8_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
+        __m512i::from(a).simd_into(self)
+    }
+    #[inline(always)] // Rewraps the same `__m512i` value as a `u32x16`.
+    fn reinterpret_u32_i32x16(self, a: i32x16<Self>) -> u32x16<Self> {
+        __m512i::from(a).simd_into(self)
+    }
+    #[inline(always)] // Converts each signed 32-bit lane to an `f32` lane.
+    fn cvt_f32_i32x16(self, a: i32x16<Self>) -> f32x16<Self> {
+        unsafe { _mm512_cvtepi32_ps(a.into()).simd_into(self) }
+    }
+    #[inline(always)] // Broadcasts `val` to all 16 u32 lanes.
+    fn splat_u32x16(self, val: u32) -> u32x16<Self> {
+        unsafe { _mm512_set1_epi32(val.cast_signed()).simd_into(self) } // the intrinsic takes an i32
+    }
+    #[inline(always)] // Builds a `u32x16` from a by-value array of 16 elements.
+    fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
+        u32x16 {
+            val: crate::transmute::checked_transmute_copy(&val), // presumably a size-checked bit copy
+            simd: self,
+        }
+    }
+    #[inline(always)] // Builds a `u32x16` from a borrowed array of 16 elements.
+    fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
+        u32x16 {
+            val: crate::transmute::checked_transmute_copy(val), // presumably a size-checked bit copy
+            simd: self,
+        }
+    }
+    #[inline(always)] // Copies the vector's `__m512i` storage out as a plain `[u32; 16]`.
+    fn as_array_u32x16(self, a: u32x16<Self>) -> [u32; 16usize] {
+        crate::transmute::checked_transmute_copy::<__m512i, [u32; 16usize]>(&a.val.0)
+    }
+    #[inline(always)] // Borrows the vector's `__m512i` storage as a shared `[u32; 16]`.
+    fn as_array_ref_u32x16(self, a: &u32x16<Self>) -> &[u32; 16usize] {
+        crate::transmute::checked_cast_ref::<__m512i, [u32; 16usize]>(&a.val.0)
+    }
+    #[inline(always)] // Borrows the vector's `__m512i` storage as a mutable `[u32; 16]`.
+    fn as_array_mut_u32x16(self, a: &mut u32x16<Self>) -> &mut [u32; 16usize] {
+        crate::transmute::checked_cast_mut::<__m512i, [u32; 16usize]>(&mut a.val.0)
+    }
+    #[inline(always)] // Writes all 16 lanes of `a` into `dest`.
+    fn store_array_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
+        unsafe {
+            core::ptr::copy_nonoverlapping(
+                (&raw const a.val.0) as *const u32, // source: the by-value vector, read as u32 elements
+                dest.as_mut_ptr(),
+                16usize, // count is in elements, not bytes
+            );
+        }
+    }
+    #[inline(always)] // Rewraps the storage of a `u8x64` as a `u32x16`.
+    fn cvt_from_bytes_u32x16(self, a: u8x64<Self>) -> u32x16<Self> {
+        u32x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val), // presumably a size-checked bit copy
+            simd: self,
+        }
+    }
+    #[inline(always)] // Rewraps the storage of a `u32x16` as a `u8x64`.
+    fn cvt_to_bytes_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val), // presumably a size-checked bit copy
+            simd: self,
+        }
+    }
+    #[inline(always)] // Returns the 16-lane window of `a` followed by `b` that starts at lane `SHIFT`.
+    fn slide_u32x16<const SHIFT: usize>(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe {
+            if SHIFT >= 16usize { // a shift of a whole vector or more yields `b` unchanged
+                return b;
+            }
+            let idx = _mm512_add_epi8( // per-byte source index: byte position + shift in bytes
+                _mm512_set_epi8( // arguments run from byte 63 down to byte 0, so byte i holds i
+                    63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,
+                    43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
+                    23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
+                    1, 0,
+                ),
+                _mm512_set1_epi8((SHIFT * 4usize) as i8), // 4 bytes per lane; at most 60 here, so the cast is lossless
+            );
+            let result = _mm512_permutex2var_epi8( // indices 0..=63 pick bytes of `a`, 64..=127 bytes of `b`
+                self.cvt_to_bytes_u32x16(a).val.0,
+                idx,
+                self.cvt_to_bytes_u32x16(b).val.0,
+            );
+            self.cvt_from_bytes_u32x16(u8x64 { // rewrap the permuted bytes as u32 lanes
+                val: crate::support::Aligned512(result),
+                simd: self,
+            })
+        }
+    }
+    #[inline(always)] // Slide variant delegated to `dyn_alignr_512` on the byte views of `a` and `b`.
+    fn slide_within_blocks_u32x16<const SHIFT: usize>(
+        self,
+        a: u32x16<Self>,
+        b: u32x16<Self>,
+    ) -> u32x16<Self> {
+        unsafe {
+            if SHIFT == 0 { // no shift: result is `a`
+                return a;
+            }
+            if SHIFT >= 4usize { // 4 lanes or more: result is `b`
+                return b;
+            }
+            let a = self.cvt_to_bytes_u32x16(a).val.0;
+            let b = self.cvt_to_bytes_u32x16(b).val.0;
+            let result = dyn_alignr_512(b, a, SHIFT * 4usize); // byte shift; presumably a per-128-bit-block align-right — confirm in helper
+            self.cvt_from_bytes_u32x16(u8x64 {
+                val: crate::support::Aligned512(result),
+                simd: self,
+            })
+        }
+    }
+    #[inline(always)] // Lane-wise 32-bit integer addition.
+    fn add_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe { _mm512_add_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise 32-bit integer subtraction (`a - b`).
+    fn sub_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe { _mm512_sub_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise multiply keeping the low 32 bits of each product.
+    fn mul_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe { _mm512_mullo_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Bitwise AND over the full 512 bits.
+    fn and_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe { _mm512_and_si512(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Bitwise OR over the full 512 bits.
+    fn or_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe { _mm512_or_si512(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Bitwise XOR over the full 512 bits.
+    fn xor_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe { _mm512_xor_si512(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Bitwise NOT of every lane.
+    fn not_u32x16(self, a: u32x16<Self>) -> u32x16<Self> {
+        a ^ !0 // XOR with all-ones; relies on a vector-by-scalar `^` impl defined elsewhere
+    }
+    #[inline(always)] // Shifts every lane left by the same `shift` count.
+    fn shl_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
+        unsafe { // the count is handed to the intrinsic in a 128-bit register
+            _mm512_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self)
+        }
+    }
+    #[inline(always)] // Shifts each lane of `a` left by the count in the matching lane of `b`.
+    fn shlv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe { _mm512_sllv_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Logical (zero-filling) right shift of every lane by `shift`.
+    fn shr_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
+        unsafe { // the count is handed to the intrinsic in a 128-bit register
+            _mm512_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self)
+        }
+    }
+    #[inline(always)] // Logical right shift of each lane of `a` by the matching lane of `b`.
+    fn shrv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe { _mm512_srlv_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise `a == b`, returned as a per-lane bitmask.
+    fn simd_eq_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        unsafe {
+            mask32x16 {
+                val: _mm512_cmpeq_epu32_mask(a.into(), b.into()), // one result bit per lane
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise unsigned `a < b`, returned as a per-lane bitmask.
+    fn simd_lt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        unsafe {
+            mask32x16 {
+                val: _mm512_cmplt_epu32_mask(a.into(), b.into()), // one result bit per lane
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise unsigned `a <= b`, returned as a per-lane bitmask.
+    fn simd_le_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        unsafe {
+            mask32x16 {
+                val: _mm512_cmple_epu32_mask(a.into(), b.into()), // one result bit per lane
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise unsigned `a >= b`, returned as a per-lane bitmask.
+    fn simd_ge_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        unsafe {
+            mask32x16 {
+                val: _mm512_cmpge_epu32_mask(a.into(), b.into()), // one result bit per lane
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise unsigned `a > b`, returned as a per-lane bitmask.
+    fn simd_gt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        unsafe {
+            mask32x16 {
+                val: _mm512_cmpgt_epu32_mask(a.into(), b.into()), // one result bit per lane
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Interleaves the low 8 lanes of `a` and `b`: a0, b0, a1, b1, ...
+    fn zip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe {
+            _mm512_permutex2var_epi32( // indices 0..=15 select from `a`, 16..=31 from `b`
+                a.into(),
+                _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Interleaves the high 8 lanes of `a` and `b`: a8, b8, a9, b9, ...
+    fn zip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe {
+            _mm512_permutex2var_epi32( // indices 0..=15 select from `a`, 16..=31 from `b`
+                a.into(),
+                _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Gathers the even-indexed lanes of `a`, then the even-indexed lanes of `b`.
+    fn unzip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe {
+            _mm512_permutex2var_epi32( // indices 0..=15 select from `a`, 16..=31 from `b`
+                a.into(),
+                _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Gathers the odd-indexed lanes of `a`, then the odd-indexed lanes of `b`.
+    fn unzip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe {
+            _mm512_permutex2var_epi32( // indices 0..=15 select from `a`, 16..=31 from `b`
+                a.into(),
+                _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Returns both interleavings of `a` and `b`: (low halves zipped, high halves zipped).
+    fn interleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
+        unsafe {
+            let a = a.into();
+            let b = b.into();
+            (
+                _mm512_permutex2var_epi32( // a0, b0, a1, b1, ... a7, b7
+                    a,
+                    _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                    b,
+                )
+                .simd_into(self),
+                _mm512_permutex2var_epi32( // a8, b8, a9, b9, ... a15, b15
+                    a,
+                    _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
+                    b,
+                )
+                .simd_into(self),
+            )
+        }
+    }
+    #[inline(always)] // Splits the 32 lanes of `a` then `b` into (even-indexed lanes, odd-indexed lanes).
+    fn deinterleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
+        unsafe {
+            let a = a.into();
+            let b = b.into();
+            (
+                _mm512_permutex2var_epi32( // even positions: indices 0..=15 are `a`, 16..=31 are `b`
+                    a,
+                    _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
+                    b,
+                )
+                .simd_into(self),
+                _mm512_permutex2var_epi32( // odd positions
+                    a,
+                    _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
+                    b,
+                )
+                .simd_into(self),
+            )
+        }
+    }
+    #[inline(always)] // Per lane: takes `b` where the mask bit in `a` is set, otherwise `c`.
+    fn select_u32x16(self, a: mask32x16<Self>, b: u32x16<Self>, c: u32x16<Self>) -> u32x16<Self> {
+        unsafe { _mm512_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise unsigned minimum.
+    fn min_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe { _mm512_min_epu32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise unsigned maximum.
+    fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe { _mm512_max_epu32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Splits the vector into its (low, high) 8-lane halves.
+    fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>) {
+        unsafe {
+            (
+                _mm512_castsi512_si256(a.into()).simd_into(self), // low 256 bits
+                _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), // high 256 bits
+            )
+        }
+    }
+    #[inline(always)] // Loads 16 values and regroups them so every 4th source element becomes contiguous.
+    fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
+        let lanes: __m512i =
+            crate::transmute::checked_transmute_copy::<[u32; 16usize], __m512i>(src);
+        unsafe {
+            _mm512_permutexvar_epi32( // result lane i takes source lane idx[i]: a 4x4 transpose of the lanes
+                _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15),
+                lanes,
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Permutes the lanes of `a` (4x4 transpose) and writes all 16 to `dest`.
+    fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
+        unsafe {
+            let lanes = _mm512_permutexvar_epi32( // result lane i takes source lane idx[i]
+                _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15),
+                a.into(),
+            );
+            _mm512_storeu_si512(dest.as_mut_ptr() as *mut _, lanes); // unaligned 64-byte store
+        }
+    }
+    #[inline(always)] // Rewraps the same `__m512i` value as a `u8x64`.
+    fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
+        __m512i::from(a).simd_into(self)
+    }
+    #[inline(always)] // Converts each unsigned 32-bit lane to an `f32` lane.
+    fn cvt_f32_u32x16(self, a: u32x16<Self>) -> f32x16<Self> {
+        unsafe { _mm512_cvtepu32_ps(a.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn splat_mask32x16(self, val: bool) -> mask32x16<Self> {
+        // `true` sets all 16 lane bits; `false` clears every bit.
+        let all_lanes = 0xFFFFu64;
+        let bits = if val { all_lanes } else { 0 };
+        mask32x16 { val: bits as _, simd: self }
+    }
+    #[inline(always)] // Packs an array of 16 i32 lane values into a 16-bit lane mask.
+    fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
+        unsafe {
+            let lanes = crate::transmute::checked_transmute_copy(&val); // presumably a size-checked bit copy
+            mask32x16 {
+                val: _mm512_movepi32_mask(lanes), // each mask bit comes from the top bit of its lane
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Expands the 16-bit lane mask into an array of 16 i32 lane values.
+    fn as_array_mask32x16(self, a: mask32x16<Self>) -> [i32; 16usize] {
+        unsafe {
+            let lanes = _mm512_movm_epi32(a.val); // a set bit becomes an all-ones lane, a clear bit zero
+            crate::transmute::checked_transmute_copy(&lanes)
+        }
+    }
+    #[inline(always)]
+    fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self> {
+        // Keep only the low 16 bits of `bits`, one per lane; anything above
+        // them is discarded.
+        let lane_bits = bits & 0xFFFFu64;
+        mask32x16 { val: lane_bits as _, simd: self }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask32x16(self, a: mask32x16<Self>) -> u64 {
+        0xFFFFu64 & u64::from(a.val) // widen the mask, keeping its 16 lane bits
+    }
+    #[inline(always)]
+    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> () {
+        // Panics when `index` does not name one of the 16 lanes.
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        // Lane `index` is stored in bit `index` of the mask.
+        let lane_bit = 1u64 << index;
+        let current = u64::from(a.val);
+        // Set or clear just that bit, leaving every other lane untouched.
+        let updated = if value { current | lane_bit } else { current & !lane_bit };
+        *a = mask32x16 { val: updated as _, simd: self };
+    }
+    #[inline(always)]
+    fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        // Lane-wise AND of the two bitmasks, limited to the 16 lane bits.
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        mask32x16 { val: ((lhs & rhs) & 0xFFFFu64) as _, simd: self }
+    }
+    #[inline(always)]
+    fn or_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        // Lane-wise OR of the two bitmasks, limited to the 16 lane bits.
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        mask32x16 { val: ((lhs | rhs) & 0xFFFFu64) as _, simd: self }
+    }
+    #[inline(always)]
+    fn xor_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        // Lane-wise XOR of the two bitmasks, limited to the 16 lane bits.
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        mask32x16 { val: ((lhs ^ rhs) & 0xFFFFu64) as _, simd: self }
+    }
+    #[inline(always)]
+    fn not_mask32x16(self, a: mask32x16<Self>) -> mask32x16<Self> {
+        // Flip every lane bit; the inverted high bits of the widened value
+        // are dropped again by the 16-lane mask.
+        let flipped = !u64::from(a.val);
+        mask32x16 { val: (flipped & 0xFFFFu64) as _, simd: self }
+    }
+    #[inline(always)]
+    fn select_mask32x16(
+        self,
+        a: mask32x16<Self>,
+        b: mask32x16<Self>,
+        c: mask32x16<Self>,
+    ) -> mask32x16<Self> {
+        // Bitwise blend: lanes set in `a` take their bit from `b`, the rest from `c`.
+        let chooser = u64::from(a.val);
+        let from_b = chooser & u64::from(b.val);
+        let from_c = !chooser & u64::from(c.val);
+        let blended = (from_b | from_c) & 0xFFFFu64;
+        mask32x16 { val: blended as _, simd: self }
+    }
+    #[inline(always)]
+    fn simd_eq_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        // Bitwise XNOR: a result bit is set exactly where `a` and `b` agree,
+        // restricted to the 16 lane bits.
+        let differing = u64::from(a.val ^ b.val);
+        mask32x16 { val: (!differing & 0xFFFFu64) as _, simd: self }
+    }
+    #[inline(always)]
+    fn any_true_mask32x16(self, a: mask32x16<Self>) -> bool {
+        // True when at least one of the 16 lane bits is set.
+        (u64::from(a.val) & 0xFFFFu64) != 0
+    }
+    #[inline(always)]
+    fn all_true_mask32x16(self, a: mask32x16<Self>) -> bool {
+        // True only when every one of the 16 lane bits is set.
+        (u64::from(a.val) & 0xFFFFu64) == 0xFFFFu64
+    }
+    #[inline(always)]
+    fn any_false_mask32x16(self, a: mask32x16<Self>) -> bool {
+        // True when at least one of the 16 lane bits is clear.
+        (u64::from(a.val) & 0xFFFFu64) != 0xFFFFu64
+    }
+    #[inline(always)]
+    fn all_false_mask32x16(self, a: mask32x16<Self>) -> bool {
+        // True only when none of the 16 lane bits is set.
+        (u64::from(a.val) & 0xFFFFu64) == 0
+    }
+    #[inline(always)]
+    fn split_mask32x16(self, a: mask32x16<Self>) -> (mask32x8<Self>, mask32x8<Self>) {
+        // Each half owns 8 lane bits: the low half takes bits 0..8 and the
+        // high half takes bits 8..16 of the combined bitmask.
+        let combined = u64::from(a.val);
+        let lower_bits = combined & 0xFFu64;
+        let upper_bits = (combined >> 8usize) & 0xFFu64;
+        let lower = mask32x8 {
+            val: lower_bits as _,
+            simd: self,
+        };
+        let upper = mask32x8 { val: upper_bits as _, simd: self };
+        (lower, upper)
+    }
+    #[inline(always)] // Broadcasts `val` to all 8 f64 lanes.
+    fn splat_f64x8(self, val: f64) -> f64x8<Self> {
+        unsafe { _mm512_set1_pd(val).simd_into(self) }
+    }
+    #[inline(always)] // Builds an `f64x8` from a by-value array of 8 elements.
+    fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
+        f64x8 {
+            val: crate::transmute::checked_transmute_copy(&val), // presumably a size-checked bit copy
+            simd: self,
+        }
+    }
+    #[inline(always)] // Builds an `f64x8` from a borrowed array of 8 elements.
+    fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
+        f64x8 {
+            val: crate::transmute::checked_transmute_copy(val), // presumably a size-checked bit copy
+            simd: self,
+        }
+    }
+    #[inline(always)] // Copies the vector's `__m512d` storage out as a plain `[f64; 8]`.
+    fn as_array_f64x8(self, a: f64x8<Self>) -> [f64; 8usize] {
+        crate::transmute::checked_transmute_copy::<__m512d, [f64; 8usize]>(&a.val.0)
+    }
+    #[inline(always)] // Borrows the vector's `__m512d` storage as a shared `[f64; 8]`.
+    fn as_array_ref_f64x8(self, a: &f64x8<Self>) -> &[f64; 8usize] {
+        crate::transmute::checked_cast_ref::<__m512d, [f64; 8usize]>(&a.val.0)
+    }
+    #[inline(always)] // Borrows the vector's `__m512d` storage as a mutable `[f64; 8]`.
+    fn as_array_mut_f64x8(self, a: &mut f64x8<Self>) -> &mut [f64; 8usize] {
+        crate::transmute::checked_cast_mut::<__m512d, [f64; 8usize]>(&mut a.val.0)
+    }
+    #[inline(always)] // Writes all 8 lanes of `a` into `dest`.
+    fn store_array_f64x8(self, a: f64x8<Self>, dest: &mut [f64; 8usize]) -> () {
+        unsafe {
+            core::ptr::copy_nonoverlapping(
+                (&raw const a.val.0) as *const f64, // source: the by-value vector, read as f64 elements
+                dest.as_mut_ptr(),
+                8usize, // count is in elements, not bytes
+            );
+        }
+    }
+    #[inline(always)] // Rewraps the storage of a `u8x64` as an `f64x8`.
+    fn cvt_from_bytes_f64x8(self, a: u8x64<Self>) -> f64x8<Self> {
+        f64x8 {
+            val: crate::transmute::checked_transmute_copy(&a.val), // presumably a size-checked bit copy
+            simd: self,
+        }
+    }
+    #[inline(always)] // Rewraps the storage of an `f64x8` as a `u8x64`.
+    fn cvt_to_bytes_f64x8(self, a: f64x8<Self>) -> u8x64<Self> {
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val), // presumably a size-checked bit copy
+            simd: self,
+        }
+    }
+    #[inline(always)] // Returns the 8-lane window of `a` followed by `b` that starts at lane `SHIFT`.
+    fn slide_f64x8<const SHIFT: usize>(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe {
+            if SHIFT >= 8usize { // a shift of a whole vector or more yields `b` unchanged
+                return b;
+            }
+            let idx = _mm512_add_epi8( // per-byte source index: byte position + shift in bytes
+                _mm512_set_epi8( // arguments run from byte 63 down to byte 0, so byte i holds i
+                    63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,
+                    43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
+                    23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
+                    1, 0,
+                ),
+                _mm512_set1_epi8((SHIFT * 8usize) as i8), // 8 bytes per lane; at most 56 here, so the cast is lossless
+            );
+            let result = _mm512_permutex2var_epi8( // indices 0..=63 pick bytes of `a`, 64..=127 bytes of `b`
+                self.cvt_to_bytes_f64x8(a).val.0,
+                idx,
+                self.cvt_to_bytes_f64x8(b).val.0,
+            );
+            self.cvt_from_bytes_f64x8(u8x64 { // rewrap the permuted bytes as f64 lanes
+                val: crate::support::Aligned512(result),
+                simd: self,
+            })
+        }
+    }
+    #[inline(always)] // Slide variant delegated to `dyn_alignr_512` on the byte views of `a` and `b`.
+    fn slide_within_blocks_f64x8<const SHIFT: usize>(
+        self,
+        a: f64x8<Self>,
+        b: f64x8<Self>,
+    ) -> f64x8<Self> {
+        unsafe {
+            if SHIFT == 0 { // no shift: result is `a`
+                return a;
+            }
+            if SHIFT >= 2usize { // 2 lanes or more: result is `b`
+                return b;
+            }
+            let a = self.cvt_to_bytes_f64x8(a).val.0;
+            let b = self.cvt_to_bytes_f64x8(b).val.0;
+            let result = dyn_alignr_512(b, a, SHIFT * 8usize); // byte shift; presumably a per-128-bit-block align-right — confirm in helper
+            self.cvt_from_bytes_f64x8(u8x64 {
+                val: crate::support::Aligned512(result),
+                simd: self,
+            })
+        }
+    }
+    #[inline(always)] // Lane-wise absolute value: clears the sign bit (`-0.0` has only that bit set).
+    fn abs_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_andnot_pd(_mm512_set1_pd(-0.0), a.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise negation: flips the sign bit by XOR with `-0.0`.
+    fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_xor_pd(a.into(), _mm512_set1_pd(-0.0)).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise square root.
+    fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_sqrt_pd(a.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise approximate reciprocal via the `rcp14` intrinsic (not an exact `1.0 / a`).
+    fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_rcp14_pd(a.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise f64 addition.
+    fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_add_pd(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise a - b.
+    fn sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_sub_pd(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise a * b.
+    fn mul_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_mul_pd(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise a / b.
+    fn div_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_div_pd(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Per lane: the magnitude of `a` combined with the sign of `b`.
+    fn copysign_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe {
+            let mask = _mm512_set1_pd(-0.0); // -0.0 has only the sign bit set
+            _mm512_or_pd(
+                _mm512_and_pd(mask, b.into()), // sign bit of b
+                _mm512_andnot_pd(mask, a.into()), // every bit of a except the sign
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Lane-wise `a == b`, returned as a lane bitmask.
+    fn simd_eq_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        unsafe {
+            mask64x8 {
+                val: _mm512_cmp_pd_mask::<0i32>(a.into(), b.into()), // 0 = _CMP_EQ_OQ (false on NaN)
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise `a < b`, returned as a lane bitmask.
+    fn simd_lt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        unsafe {
+            mask64x8 {
+                val: _mm512_cmp_pd_mask::<17i32>(a.into(), b.into()), // 17 = _CMP_LT_OQ (false on NaN)
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise `a <= b`, returned as a lane bitmask.
+    fn simd_le_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        unsafe {
+            mask64x8 {
+                val: _mm512_cmp_pd_mask::<18i32>(a.into(), b.into()), // 18 = _CMP_LE_OQ (false on NaN)
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise `a >= b`, returned as a lane bitmask.
+    fn simd_ge_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        unsafe {
+            mask64x8 {
+                val: _mm512_cmp_pd_mask::<29i32>(a.into(), b.into()), // 29 = _CMP_GE_OQ (false on NaN)
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise `a > b`, returned as a lane bitmask.
+    fn simd_gt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        unsafe {
+            mask64x8 {
+                val: _mm512_cmp_pd_mask::<30i32>(a.into(), b.into()), // 30 = _CMP_GT_OQ (false on NaN)
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Interleaves the low halves: [a0, b0, a1, b1, a2, b2, a3, b3].
+    fn zip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe {
+            _mm512_permutex2var_pd(
+                a.into(),
+                _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), // 0..=7 pick from a, 8..=15 from b
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Interleaves the high halves: [a4, b4, a5, b5, a6, b6, a7, b7].
+    fn zip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe {
+            _mm512_permutex2var_pd(
+                a.into(),
+                _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), // 0..=7 pick from a, 8..=15 from b
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Even-indexed lanes of `a` then `b`: [a0, a2, a4, a6, b0, b2, b4, b6].
+    fn unzip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe {
+            _mm512_permutex2var_pd(
+                a.into(),
+                _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), // 0..=7 pick from a, 8..=15 from b
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Odd-indexed lanes of `a` then `b`: [a1, a3, a5, a7, b1, b3, b5, b7].
+    fn unzip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe {
+            _mm512_permutex2var_pd(
+                a.into(),
+                _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), // 0..=7 pick from a, 8..=15 from b
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Returns both zips at once, using the same index tables as zip_low/zip_high.
+    fn interleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
+        unsafe {
+            let a = a.into();
+            let b = b.into();
+            (
+                _mm512_permutex2var_pd(a, _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), b)
+                    .simd_into(self), // [a0, b0, a1, b1, a2, b2, a3, b3]
+                _mm512_permutex2var_pd(a, _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), b)
+                    .simd_into(self), // [a4, b4, a5, b5, a6, b6, a7, b7]
+            )
+        }
+    }
+    #[inline(always)] // Returns (even lanes, odd lanes) of the 16-lane concatenation of `a` and `b`.
+    fn deinterleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
+        unsafe {
+            let a = a.into();
+            let b = b.into();
+            (
+                _mm512_permutex2var_pd(a, _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), b)
+                    .simd_into(self), // [a0, a2, a4, a6, b0, b2, b4, b6]
+                _mm512_permutex2var_pd(a, _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), b)
+                    .simd_into(self), // [a1, a3, a5, a7, b1, b3, b5, b7]
+            )
+        }
+    }
+    #[inline(always)] // Lane-wise maximum; NaN handling is whatever _mm512_max_pd does.
+    fn max_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_max_pd(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise minimum; NaN handling is whatever _mm512_min_pd does.
+    fn min_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_min_pd(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Max via VRANGE; imm8 5 = 0b0101: op bits 01 (max), sign bits 01 (from compare).
+    fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_range_pd::<5i32>(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Min via VRANGE; imm8 4 = 0b0100: op bits 00 (min), sign bits 01 (from compare).
+    fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_range_pd::<4i32>(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Fused multiply-add per lane: a * b + c.
+    fn mul_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_fmadd_pd(a.into(), b.into(), c.into()).simd_into(self) }
+    }
+    #[inline(always)] // Fused multiply-subtract per lane: a * b - c.
+    fn mul_sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) }
+    }
+    #[inline(always)] // Rounds each lane to an integer toward -infinity; FP exceptions suppressed.
+    fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        unsafe {
+            _mm512_roundscale_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
+                .simd_into(self)
+        }
+    }
+    #[inline(always)] // Rounds each lane to an integer toward +infinity; FP exceptions suppressed.
+    fn ceil_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        unsafe {
+            _mm512_roundscale_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
+                .simd_into(self)
+        }
+    }
+    #[inline(always)] // Rounds each lane to the nearest integer, ties to even; FP exceptions suppressed.
+    fn round_ties_even_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        unsafe {
+            _mm512_roundscale_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
+                .simd_into(self)
+        }
+    }
+    #[inline(always)] // Fractional part per lane, computed as a - trunc(a).
+    fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        a - self.trunc_f64x8(a)
+    }
+    #[inline(always)] // Rounds each lane to an integer toward zero; FP exceptions suppressed.
+    fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        unsafe {
+            _mm512_roundscale_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into())
+                .simd_into(self)
+        }
+    }
+    #[inline(always)] // Per lane: `b` where mask `a` is set, else `c` (blend takes the "else" operand first).
+    fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_mask_blend_pd(a.val, c.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Splits into (lanes 0..4, lanes 4..8).
+    fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>) {
+        unsafe {
+            (
+                _mm512_castpd512_pd256(a.into()).simd_into(self), // low 256 bits
+                _mm512_extractf64x4_pd::<1>(a.into()).simd_into(self), // high 256 bits
+            )
+        }
+    }
+    #[inline(always)] // Bit-for-bit reinterpretation as f32x16; no numeric conversion.
+    fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
+        unsafe { _mm512_castpd_ps(a.into()).simd_into(self) }
+    }
+    #[inline(always)] // Mask with every lane equal to `val`.
+    fn splat_mask64x8(self, val: bool) -> mask64x8<Self> {
+        mask64x8 {
+            val: (if val { 255u64 } else { 0 }) as _, // 255 = one set bit for each of the 8 lanes
+            simd: self,
+        }
+    }
+    #[inline(always)] // Builds the bitmask from 8 i64 lanes.
+    fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8<Self> {
+        unsafe {
+            let lanes = crate::transmute::checked_transmute_copy(&val); // array -> vector register
+            mask64x8 {
+                val: _mm512_movepi64_mask(lanes), // a lane counts as true when its top bit is set
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Expands the bitmask into 8 i64 lanes.
+    fn as_array_mask64x8(self, a: mask64x8<Self>) -> [i64; 8usize] {
+        unsafe {
+            let lanes = _mm512_movm_epi64(a.val); // set bit -> all-ones lane, clear bit -> 0
+            crate::transmute::checked_transmute_copy(&lanes)
+        }
+    }
+    #[inline(always)] // Builds the mask from an integer with one bit per lane.
+    fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8<Self> {
+        mask64x8 {
+            val: (bits & 255u64) as _, // only the low 8 bits map to lanes; the rest are dropped
+            simd: self,
+        }
+    }
+    #[inline(always)] // Returns the mask as a u64, restricted to its low 8 bits.
+    fn to_bitmask_mask64x8(self, a: mask64x8<Self>) -> u64 {
+        u64::from((a).val) & 255u64
+    }
+    #[inline(always)] // Sets lane `index` to `value` in place; panics if `index >= 8`.
+    fn set_mask64x8(self, a: &mut mask64x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let bit = 1u64 << index; // single-bit selector for the lane
+        let bits = u64::from((a).val);
+        let bits = if value { bits | bit } else { bits & !bit }; // set or clear only that bit
+        *a = mask64x8 {
+            val: (bits) as _,
+            simd: self,
+        };
+    }
+    #[inline(always)] // Lane-wise AND of two masks, done on their integer bit patterns.
+    fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
+        mask64x8 {
+            val: ((u64::from((a).val) & u64::from((b).val)) & 255u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)] // Lane-wise OR of two masks, done on their integer bit patterns.
+    fn or_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
+        mask64x8 {
+            val: ((u64::from((a).val) | u64::from((b).val)) & 255u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)] // Lane-wise XOR of two masks, done on their integer bit patterns.
+    fn xor_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
+        mask64x8 {
+            val: ((u64::from((a).val) ^ u64::from((b).val)) & 255u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)] // Lane-wise NOT of a mask.
+    fn not_mask64x8(self, a: mask64x8<Self>) -> mask64x8<Self> {
+        mask64x8 {
+            val: ((!u64::from((a).val)) & 255u64) as _, // `& 255` clears the bits above lane 7 set by `!`
+            simd: self,
+        }
+    }
+    #[inline(always)] // Per lane: takes `b` where `a` is set and `c` where `a` is clear.
+    fn select_mask64x8(
+        self,
+        a: mask64x8<Self>,
+        b: mask64x8<Self>,
+        c: mask64x8<Self>,
+    ) -> mask64x8<Self> {
+        mask64x8 {
+            val: (((u64::from((a).val) & u64::from((b).val)) // lanes of b selected by a
+                | ((!u64::from((a).val)) & u64::from((c).val))) // lanes of c selected by !a
+                & 255u64) as _, // keep only the 8 lane bits
+            simd: self,
+        }
+    }
+    #[inline(always)] // Lane is true where `a` and `b` hold the same value (XNOR).
+    fn simd_eq_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
+        mask64x8 {
+            val: (!u64::from(a.val ^ b.val) & 255u64) as _, // XOR marks differences; NOT inverts them
+            simd: self,
+        }
+    }
+    #[inline(always)] // True if at least one of the 8 lane bits is set.
+    fn any_true_mask64x8(self, a: mask64x8<Self>) -> bool {
+        let bits = u64::from((a).val) & 255u64;
+        bits != 0
+    }
+    #[inline(always)] // True if all 8 lane bits are set.
+    fn all_true_mask64x8(self, a: mask64x8<Self>) -> bool {
+        let bits = u64::from((a).val) & 255u64;
+        bits == 255u64
+    }
+    #[inline(always)] // True if at least one of the 8 lane bits is clear.
+    fn any_false_mask64x8(self, a: mask64x8<Self>) -> bool {
+        let bits = u64::from((a).val) & 255u64;
+        bits != 255u64
+    }
+    #[inline(always)] // True if none of the 8 lane bits is set.
+    fn all_false_mask64x8(self, a: mask64x8<Self>) -> bool {
+        let bits = u64::from((a).val) & 255u64;
+        bits == 0
+    }
+    #[inline(always)] // Splits into two 4-lane masks: (lanes 0..4, lanes 4..8).
+    fn split_mask64x8(self, a: mask64x8<Self>) -> (mask64x4<Self>, mask64x4<Self>) {
+        let bits = u64::from(a.val);
+        (
+            mask64x4 {
+                val: (bits & 15u64) as _, // low nibble = lanes 0..4
+                simd: self,
+            },
+            mask64x4 {
+                val: ((bits >> 4usize) & 15u64) as _, // high nibble = lanes 4..8
+                simd: self,
+            },
+        )
+    }
+}
+impl<S: Simd> SimdFrom<__mmask16, S> for mask8x16<S> {
+    #[inline(always)] // Wraps a raw __mmask16: widened to u64, then passed to from_bitmask.
+    fn simd_from(simd: S, arch: __mmask16) -> Self {
+        Self::from_bitmask(simd, u64::from(arch))
+    }
+}
+impl<S: Simd> From<mask8x16<S>> for __mmask16 {
+    #[inline(always)] // Unwraps the mask into a raw __mmask16.
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask8x16<S>) -> Self {
+        value.to_bitmask() as __mmask16 // cast the bitmask to the register-mask width
+    }
+}
+impl<S: Simd> SimdFrom<__mmask8, S> for mask16x8<S> {
+    #[inline(always)] // Wraps a raw __mmask8: widened to u64, then passed to from_bitmask.
+    fn simd_from(simd: S, arch: __mmask8) -> Self {
+        Self::from_bitmask(simd, u64::from(arch))
+    }
+}
+impl<S: Simd> From<mask16x8<S>> for __mmask8 {
+    #[inline(always)] // Unwraps the mask into a raw __mmask8.
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask16x8<S>) -> Self {
+        value.to_bitmask() as __mmask8 // cast the bitmask to the register-mask width
+    }
+}
+impl<S: Simd> SimdFrom<__mmask8, S> for mask32x4<S> {
+    #[inline(always)] // Wraps a raw __mmask8: widened to u64, then passed to from_bitmask.
+    fn simd_from(simd: S, arch: __mmask8) -> Self {
+        Self::from_bitmask(simd, u64::from(arch))
+    }
+}
+impl<S: Simd> From<mask32x4<S>> for __mmask8 {
+    #[inline(always)] // Unwraps the mask into a raw __mmask8.
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask32x4<S>) -> Self {
+        value.to_bitmask() as __mmask8 // cast the bitmask to the register-mask width
+    }
+}
+impl<S: Simd> SimdFrom<__mmask8, S> for mask64x2<S> {
+    #[inline(always)] // Wraps a raw __mmask8: widened to u64, then passed to from_bitmask.
+    fn simd_from(simd: S, arch: __mmask8) -> Self {
+        Self::from_bitmask(simd, u64::from(arch))
+    }
+}
+impl<S: Simd> From<mask64x2<S>> for __mmask8 {
+    #[inline(always)] // Unwraps the mask into a raw __mmask8.
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask64x2<S>) -> Self {
+        value.to_bitmask() as __mmask8 // cast the bitmask to the register-mask width
+    }
+}
+impl<S: Simd> SimdFrom<__mmask32, S> for mask8x32<S> {
+    #[inline(always)] // Wraps a raw __mmask32: widened to u64, then passed to from_bitmask.
+    fn simd_from(simd: S, arch: __mmask32) -> Self {
+        Self::from_bitmask(simd, u64::from(arch))
+    }
+}
+impl<S: Simd> From<mask8x32<S>> for __mmask32 {
+    #[inline(always)] // Unwraps the mask into a raw __mmask32.
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask8x32<S>) -> Self {
+        value.to_bitmask() as __mmask32 // cast the bitmask to the register-mask width
+    }
+}
+impl<S: Simd> SimdFrom<__mmask16, S> for mask16x16<S> {
+    #[inline(always)] // Wraps a raw __mmask16: widened to u64, then passed to from_bitmask.
+    fn simd_from(simd: S, arch: __mmask16) -> Self {
+        Self::from_bitmask(simd, u64::from(arch))
+    }
+}
+impl<S: Simd> From<mask16x16<S>> for __mmask16 {
+    #[inline(always)] // Unwraps the mask into a raw __mmask16.
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask16x16<S>) -> Self {
+        value.to_bitmask() as __mmask16 // cast the bitmask to the register-mask width
+    }
+}
+impl<S: Simd> SimdFrom<__mmask8, S> for mask32x8<S> {
+    #[inline(always)] // Wraps a raw __mmask8: widened to u64, then passed to from_bitmask.
+    fn simd_from(simd: S, arch: __mmask8) -> Self {
+        Self::from_bitmask(simd, u64::from(arch))
+    }
+}
+impl<S: Simd> From<mask32x8<S>> for __mmask8 {
+    #[inline(always)] // Unwraps the mask into a raw __mmask8.
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask32x8<S>) -> Self {
+        value.to_bitmask() as __mmask8 // cast the bitmask to the register-mask width
+    }
+}
+impl<S: Simd> SimdFrom<__mmask8, S> for mask64x4<S> {
+    #[inline(always)] // Wraps a raw __mmask8: widened to u64, then passed to from_bitmask.
+    fn simd_from(simd: S, arch: __mmask8) -> Self {
+        Self::from_bitmask(simd, u64::from(arch))
+    }
+}
+impl<S: Simd> From<mask64x4<S>> for __mmask8 {
+    #[inline(always)] // Unwraps the mask into a raw __mmask8.
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask64x4<S>) -> Self {
+        value.to_bitmask() as __mmask8 // cast the bitmask to the register-mask width
+    }
+}
+impl<S: Simd> SimdFrom<__m512, S> for f32x16<S> {
+    #[inline(always)] // Wraps a raw __m512 register as f32x16, tagged with the `simd` token.
+    fn simd_from(simd: S, arch: __m512) -> Self {
+        Self {
+            val: crate::transmute::checked_transmute_copy(&arch), // crate helper; presumably a size-checked bit copy
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<f32x16<S>> for __m512 {
+    #[inline(always)] // Unwraps f32x16 into a raw __m512 by copying its storage via the crate helper.
+    fn from(value: f32x16<S>) -> Self {
+        crate::transmute::checked_transmute_copy(&value.val)
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for i8x64<S> {
+    #[inline(always)] // Wraps a raw __m512i register as i8x64, tagged with the `simd` token.
+    fn simd_from(simd: S, arch: __m512i) -> Self {
+        Self {
+            val: crate::transmute::checked_transmute_copy(&arch), // crate helper; presumably a size-checked bit copy
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<i8x64<S>> for __m512i {
+    #[inline(always)] // Unwraps i8x64 into a raw __m512i by copying its storage via the crate helper.
+    fn from(value: i8x64<S>) -> Self {
+        crate::transmute::checked_transmute_copy(&value.val)
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for u8x64<S> {
+    #[inline(always)] // Wraps a raw __m512i register as u8x64, tagged with the `simd` token.
+    fn simd_from(simd: S, arch: __m512i) -> Self {
+        Self {
+            val: crate::transmute::checked_transmute_copy(&arch), // crate helper; presumably a size-checked bit copy
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<u8x64<S>> for __m512i {
+    #[inline(always)] // Unwraps u8x64 into a raw __m512i by copying its storage via the crate helper.
+    fn from(value: u8x64<S>) -> Self {
+        crate::transmute::checked_transmute_copy(&value.val)
+    }
+}
+impl<S: Simd> SimdFrom<__mmask64, S> for mask8x64<S> {
+    #[inline(always)] // Wraps a raw __mmask64: converted to u64, then passed to from_bitmask.
+    fn simd_from(simd: S, arch: __mmask64) -> Self {
+        Self::from_bitmask(simd, u64::from(arch))
+    }
+}
+impl<S: Simd> From<mask8x64<S>> for __mmask64 {
+    #[inline(always)] // Unwraps the mask into a raw __mmask64.
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask8x64<S>) -> Self {
+        value.to_bitmask() as __mmask64 // presumably the trivial cast the allow above is for
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for i16x32<S> {
+    #[inline(always)] // Wraps a raw __m512i register as i16x32, tagged with the `simd` token.
+    fn simd_from(simd: S, arch: __m512i) -> Self {
+        Self {
+            val: crate::transmute::checked_transmute_copy(&arch), // crate helper; presumably a size-checked bit copy
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<i16x32<S>> for __m512i {
+    #[inline(always)] // Unwraps i16x32 into a raw __m512i by copying its storage via the crate helper.
+    fn from(value: i16x32<S>) -> Self {
+        crate::transmute::checked_transmute_copy(&value.val)
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for u16x32<S> {
+    #[inline(always)] // Wraps a raw __m512i register as u16x32, tagged with the `simd` token.
+    fn simd_from(simd: S, arch: __m512i) -> Self {
+        Self {
+            val: crate::transmute::checked_transmute_copy(&arch), // crate helper; presumably a size-checked bit copy
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<u16x32<S>> for __m512i {
+    #[inline(always)] // Unwraps u16x32 into a raw __m512i by copying its storage via the crate helper.
+    fn from(value: u16x32<S>) -> Self {
+        crate::transmute::checked_transmute_copy(&value.val)
+    }
+}
+impl<S: Simd> SimdFrom<__mmask32, S> for mask16x32<S> {
+    #[inline(always)] // Wraps a raw __mmask32: widened to u64, then passed to from_bitmask.
+    fn simd_from(simd: S, arch: __mmask32) -> Self {
+        Self::from_bitmask(simd, u64::from(arch))
+    }
+}
+impl<S: Simd> From<mask16x32<S>> for __mmask32 {
+    #[inline(always)] // Unwraps the mask into a raw __mmask32.
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask16x32<S>) -> Self {
+        value.to_bitmask() as __mmask32 // cast the bitmask to the register-mask width
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for i32x16<S> {
+    #[inline(always)] // Wraps a raw __m512i register as i32x16, tagged with the `simd` token.
+    fn simd_from(simd: S, arch: __m512i) -> Self {
+        Self {
+            val: crate::transmute::checked_transmute_copy(&arch), // crate helper; presumably a size-checked bit copy
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<i32x16<S>> for __m512i {
+    #[inline(always)] // Unwraps i32x16 into a raw __m512i by copying its storage via the crate helper.
+    fn from(value: i32x16<S>) -> Self {
+        crate::transmute::checked_transmute_copy(&value.val)
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for u32x16<S> {
+    #[inline(always)] // Wraps a raw __m512i register as u32x16, tagged with the `simd` token.
+    fn simd_from(simd: S, arch: __m512i) -> Self {
+        Self {
+            val: crate::transmute::checked_transmute_copy(&arch), // crate helper; presumably a size-checked bit copy
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<u32x16<S>> for __m512i {
+    #[inline(always)] // Unwraps u32x16 into a raw __m512i by copying its storage via the crate helper.
+    fn from(value: u32x16<S>) -> Self {
+        crate::transmute::checked_transmute_copy(&value.val)
+    }
+}
+impl<S: Simd> SimdFrom<__mmask16, S> for mask32x16<S> {
+    #[inline(always)] // Wraps a raw __mmask16: widened to u64, then passed to from_bitmask.
+    fn simd_from(simd: S, arch: __mmask16) -> Self {
+        Self::from_bitmask(simd, u64::from(arch))
+    }
+}
+impl<S: Simd> From<mask32x16<S>> for __mmask16 {
+    #[inline(always)] // Unwraps the mask into a raw __mmask16.
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask32x16<S>) -> Self {
+        value.to_bitmask() as __mmask16 // cast the bitmask to the register-mask width
+    }
+}
+impl<S: Simd> SimdFrom<__m512d, S> for f64x8<S> {
+    #[inline(always)] // Wraps a raw __m512d register as f64x8, tagged with the `simd` token.
+    fn simd_from(simd: S, arch: __m512d) -> Self {
+        Self {
+            val: crate::transmute::checked_transmute_copy(&arch), // crate helper; presumably a size-checked bit copy
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<f64x8<S>> for __m512d {
+    #[inline(always)] // Unwraps f64x8 into a raw __m512d by copying its storage via the crate helper.
+    fn from(value: f64x8<S>) -> Self {
+        crate::transmute::checked_transmute_copy(&value.val)
+    }
+}
+impl<S: Simd> SimdFrom<__mmask8, S> for mask64x8<S> {
+    #[inline(always)] // Wraps a raw __mmask8: widened to u64, then passed to from_bitmask.
+    fn simd_from(simd: S, arch: __mmask8) -> Self {
+        Self::from_bitmask(simd, u64::from(arch))
+    }
+}
+impl<S: Simd> From<mask64x8<S>> for __mmask8 {
+    #[inline(always)] // Unwraps the mask into a raw __mmask8.
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask64x8<S>) -> Self {
+        value.to_bitmask() as __mmask8 // cast the bitmask to the register-mask width
+    }
+}
+#[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
+#[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"]
+#[doc = r" Rust doesn't currently let you do math on const generics."]
+#[inline(always)]
+unsafe fn dyn_alignr_128(a: __m128i, b: __m128i, shift: usize) -> __m128i {
+    unsafe {
+        match shift { // one arm per byte shift 0..=15, each forwarding to the const-generic intrinsic
+            0usize => _mm_alignr_epi8::<0i32>(a, b),
+            1usize => _mm_alignr_epi8::<1i32>(a, b),
+            2usize => _mm_alignr_epi8::<2i32>(a, b),
+            3usize => _mm_alignr_epi8::<3i32>(a, b),
+            4usize => _mm_alignr_epi8::<4i32>(a, b),
+            5usize => _mm_alignr_epi8::<5i32>(a, b),
+            6usize => _mm_alignr_epi8::<6i32>(a, b),
+            7usize => _mm_alignr_epi8::<7i32>(a, b),
+            8usize => _mm_alignr_epi8::<8i32>(a, b),
+            9usize => _mm_alignr_epi8::<9i32>(a, b),
+            10usize => _mm_alignr_epi8::<10i32>(a, b),
+            11usize => _mm_alignr_epi8::<11i32>(a, b),
+            12usize => _mm_alignr_epi8::<12i32>(a, b),
+            13usize => _mm_alignr_epi8::<13i32>(a, b),
+            14usize => _mm_alignr_epi8::<14i32>(a, b),
+            15usize => _mm_alignr_epi8::<15i32>(a, b),
+            _ => unreachable!(), // panics if a caller ever passes shift > 15
+        }
+    }
+}
+#[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
+#[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"]
+#[doc = r" Rust doesn't currently let you do math on const generics."]
+#[inline(always)]
+unsafe fn dyn_alignr_256(a: __m256i, b: __m256i, shift: usize) -> __m256i {
+    unsafe {
+        match shift { // one arm per byte shift 0..=15, each forwarding to the const-generic intrinsic
+            0usize => _mm256_alignr_epi8::<0i32>(a, b),
+            1usize => _mm256_alignr_epi8::<1i32>(a, b),
+            2usize => _mm256_alignr_epi8::<2i32>(a, b),
+            3usize => _mm256_alignr_epi8::<3i32>(a, b),
+            4usize => _mm256_alignr_epi8::<4i32>(a, b),
+            5usize => _mm256_alignr_epi8::<5i32>(a, b),
+            6usize => _mm256_alignr_epi8::<6i32>(a, b),
+            7usize => _mm256_alignr_epi8::<7i32>(a, b),
+            8usize => _mm256_alignr_epi8::<8i32>(a, b),
+            9usize => _mm256_alignr_epi8::<9i32>(a, b),
+            10usize => _mm256_alignr_epi8::<10i32>(a, b),
+            11usize => _mm256_alignr_epi8::<11i32>(a, b),
+            12usize => _mm256_alignr_epi8::<12i32>(a, b),
+            13usize => _mm256_alignr_epi8::<13i32>(a, b),
+            14usize => _mm256_alignr_epi8::<14i32>(a, b),
+            15usize => _mm256_alignr_epi8::<15i32>(a, b),
+            _ => unreachable!(), // panics if a caller ever passes shift > 15
+        }
+    }
+}
+#[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
+#[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"]
+#[doc = r" Rust doesn't currently let you do math on const generics."]
+#[inline(always)]
+unsafe fn dyn_alignr_512(a: __m512i, b: __m512i, shift: usize) -> __m512i {
+    unsafe {
+        match shift { // one arm per byte shift 0..=15, each forwarding to the const-generic intrinsic
+            0usize => _mm512_alignr_epi8::<0i32>(a, b),
+            1usize => _mm512_alignr_epi8::<1i32>(a, b),
+            2usize => _mm512_alignr_epi8::<2i32>(a, b),
+            3usize => _mm512_alignr_epi8::<3i32>(a, b),
+            4usize => _mm512_alignr_epi8::<4i32>(a, b),
+            5usize => _mm512_alignr_epi8::<5i32>(a, b),
+            6usize => _mm512_alignr_epi8::<6i32>(a, b),
+            7usize => _mm512_alignr_epi8::<7i32>(a, b),
+            8usize => _mm512_alignr_epi8::<8i32>(a, b),
+            9usize => _mm512_alignr_epi8::<9i32>(a, b),
+            10usize => _mm512_alignr_epi8::<10i32>(a, b),
+            11usize => _mm512_alignr_epi8::<11i32>(a, b),
+            12usize => _mm512_alignr_epi8::<12i32>(a, b),
+            13usize => _mm512_alignr_epi8::<13i32>(a, b),
+            14usize => _mm512_alignr_epi8::<14i32>(a, b),
+            15usize => _mm512_alignr_epi8::<15i32>(a, b),
+            _ => unreachable!(), // panics if a caller ever passes shift > 15
+        }
+    }
+}
diff --git a/fearless_simd/src/generated/fallback.rs b/fearless_simd/src/generated/fallback.rs
index 6ec6d23b9..1024b172a 100644
--- a/fearless_simd/src/generated/fallback.rs
+++ b/fearless_simd/src/generated/fallback.rs
@@ -1829,6 +1829,17 @@ impl Simd for Fallback {
         bits
     }
     #[inline(always)]
+    fn set_mask8x16(self, a: &mut mask8x16<Self>, index: usize, value: bool) -> () {
+        assert!( // Sets one lane of `a` in place; an index outside the 16 lanes panics here.
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask8x16(*a); // copy the lanes out, patch one, store back
+        lanes[index] = if value { !0 } else { 0 }; // true = all bits set, false = zero
+        *a = self.load_array_mask8x16(lanes);
+    }
+    #[inline(always)]
     fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
         [
             i8::bitand(a.val.0[0usize], &b.val.0[0usize]),
@@ -2986,6 +2997,17 @@ impl Simd for Fallback {
         bits
     }
     #[inline(always)]
+    fn set_mask16x8(self, a: &mut mask16x8<Self>, index: usize, value: bool) -> () {
+        assert!( // Sets one lane of `a` in place; an index outside the 8 lanes panics here.
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask16x8(*a); // copy the lanes out, patch one, store back
+        lanes[index] = if value { !0 } else { 0 }; // true = all bits set, false = zero
+        *a = self.load_array_mask16x8(lanes);
+    }
+    #[inline(always)]
     fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
         [
             i16::bitand(a.val.0[0usize], &b.val.0[0usize]),
@@ -3835,6 +3857,17 @@ impl Simd for Fallback {
         bits
     }
     #[inline(always)]
+    fn set_mask32x4(self, a: &mut mask32x4<Self>, index: usize, value: bool) -> () {
+        assert!( // Sets one lane of `a` in place; an index outside the 4 lanes panics here.
+            index < 4usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4usize
+        );
+        let mut lanes = self.as_array_mask32x4(*a); // copy the lanes out, patch one, store back
+        lanes[index] = if value { !0 } else { 0 }; // true = all bits set, false = zero
+        *a = self.load_array_mask32x4(lanes);
+    }
+    #[inline(always)]
     fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
         [
             i32::bitand(a.val.0[0usize], &b.val.0[0usize]),
@@ -4248,6 +4281,17 @@ impl Simd for Fallback {
         bits
     }
     #[inline(always)]
+    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> () {
+        assert!( // Sets one lane of `a` in place; an index outside the 2 lanes panics here.
+            index < 2usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            2usize
+        );
+        let mut lanes = self.as_array_mask64x2(*a); // copy the lanes out, patch one, store back
+        lanes[index] = if value { !0 } else { 0 }; // true = all bits set, false = zero
+        *a = self.load_array_mask64x2(lanes);
+    }
+    #[inline(always)]
     fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
         [
             i64::bitand(a.val.0[0usize], &b.val.0[0usize]),
@@ -5237,6 +5281,17 @@ impl Simd for Fallback {
         lo | (hi << 16usize)
     }
     #[inline(always)]
+    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> () {
+        assert!( // Sets one lane of `a` in place; an index outside the 32 lanes panics here.
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        let mut lanes = self.as_array_mask8x32(*a); // copy the lanes out, patch one, store back
+        lanes[index] = if value { !0 } else { 0 }; // true = all bits set, false = zero
+        *a = self.load_array_mask8x32(lanes);
+    }
+    #[inline(always)]
     fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
         let (a0, a1) = self.split_mask8x32(a);
         let (b0, b1) = self.split_mask8x32(b);
@@ -5890,6 +5945,17 @@ impl Simd for Fallback {
         lo | (hi << 8usize)
     }
     #[inline(always)]
+    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> () {
+        assert!( // Sets one lane of `a` in place; an index outside the 16 lanes panics here.
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask16x16(*a); // copy the lanes out, patch one, store back
+        lanes[index] = if value { !0 } else { 0 }; // true = all bits set, false = zero
+        *a = self.load_array_mask16x16(lanes);
+    }
+    #[inline(always)]
     fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
         let (a0, a1) = self.split_mask16x16(a);
         let (b0, b1) = self.split_mask16x16(b);
@@ -6523,6 +6589,17 @@ impl Simd for Fallback {
         lo | (hi << 4usize)
     }
     #[inline(always)]
+    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> () {
+        assert!( // Sets one lane of `a` in place; an index outside the 8 lanes panics here.
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask32x8(*a); // copy the lanes out, patch one, store back
+        lanes[index] = if value { !0 } else { 0 }; // true = all bits set, false = zero
+        *a = self.load_array_mask32x8(lanes);
+    }
+    #[inline(always)]
     fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
         let (a0, a1) = self.split_mask32x8(a);
         let (b0, b1) = self.split_mask32x8(b);
@@ -6941,6 +7018,17 @@ impl Simd for Fallback {
         lo | (hi << 2usize)
     }
     #[inline(always)]
+    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> () {
+        assert!( // Sets one lane of `a` in place; an index outside the 4 lanes panics here.
+            index < 4usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4usize
+        );
+        let mut lanes = self.as_array_mask64x4(*a); // copy the lanes out, patch one, store back
+        lanes[index] = if value { !0 } else { 0 }; // true = all bits set, false = zero
+        *a = self.load_array_mask64x4(lanes);
+    }
+    #[inline(always)]
     fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
         let (a0, a1) = self.split_mask64x4(a);
         let (b0, b1) = self.split_mask64x4(b);
@@ -8018,6 +8106,17 @@ impl Simd for Fallback {
         lo | (hi << 32usize)
     }
     #[inline(always)]
+    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> () {
+        assert!( // Sets one lane of `a` in place; an index outside the 64 lanes panics here.
+            index < 64usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            64usize
+        );
+        let mut lanes = self.as_array_mask8x64(*a); // copy the lanes out, patch one, store back
+        lanes[index] = if value { !0 } else { 0 }; // true = all bits set, false = zero
+        *a = self.load_array_mask8x64(lanes);
+    }
+    #[inline(always)]
     fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
         let (a0, a1) = self.split_mask8x64(a);
         let (b0, b1) = self.split_mask8x64(b);
@@ -8699,6 +8798,17 @@ impl Simd for Fallback {
         lo | (hi << 16usize)
     }
     #[inline(always)]
+    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> () {
+        assert!( // Sets one lane of `a` in place; an index outside the 32 lanes panics here.
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        let mut lanes = self.as_array_mask16x32(*a); // copy the lanes out, patch one, store back
+        lanes[index] = if value { !0 } else { 0 }; // true = all bits set, false = zero
+        *a = self.load_array_mask16x32(lanes);
+    }
+    #[inline(always)]
     fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
         let (a0, a1) = self.split_mask16x32(a);
         let (b0, b1) = self.split_mask16x32(b);
@@ -9344,6 +9454,17 @@ impl Simd for Fallback {
         lo | (hi << 8usize)
     }
     #[inline(always)]
+    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> () { // Writes `value` into lane `index` of the mask behind `a`.
+        assert!( // Panics with the message below unless `index` is below the lane count (16).
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask32x16(*a); // Copy the current lanes into a local, indexable value.
+        lanes[index] = if value { !0 } else { 0 }; // `true` is stored as `!0` (every bit set), `false` as zero.
+        *a = self.load_array_mask32x16(lanes); // Convert the edited lanes back and overwrite `*a`.
+    }
+    #[inline(always)]
     fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
         let (a0, a1) = self.split_mask32x16(a);
         let (b0, b1) = self.split_mask32x16(b);
@@ -9748,6 +9869,17 @@ impl Simd for Fallback {
         lo | (hi << 4usize)
     }
     #[inline(always)]
+    fn set_mask64x8(self, a: &mut mask64x8<Self>, index: usize, value: bool) -> () { // Writes `value` into lane `index` of the mask behind `a`.
+        assert!( // Panics with the message below unless `index` is below the lane count (8).
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask64x8(*a); // Copy the current lanes into a local, indexable value.
+        lanes[index] = if value { !0 } else { 0 }; // `true` is stored as `!0` (every bit set), `false` as zero.
+        *a = self.load_array_mask64x8(lanes); // Convert the edited lanes back and overwrite `*a`.
+    }
+    #[inline(always)]
     fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
         let (a0, a1) = self.split_mask64x8(a);
         let (b0, b1) = self.split_mask64x8(b);
diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs
index f89f760f7..368197c97 100644
--- a/fearless_simd/src/generated/neon.rs
+++ b/fearless_simd/src/generated/neon.rs
@@ -807,6 +807,17 @@ impl Simd for Neon {
         }
     }
     #[inline(always)]
+    fn set_mask8x16(self, a: &mut mask8x16<Self>, index: usize, value: bool) -> () { // Replaces a single lane of `*a`; the other lanes are carried over unchanged.
+        assert!( // Bounds check: fails with the formatted message when `index` is 16 or more.
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask8x16(*a); // Read the mask out as an indexable set of lanes.
+        lanes[index] = if value { !0 } else { 0 }; // Lane encoding used here: `!0` for `true`, `0` for `false`.
+        *a = self.load_array_mask8x16(lanes); // Build a mask from the modified lanes and store it through `a`.
+    }
+    #[inline(always)]
     fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
         unsafe { vandq_s8(a.into(), b.into()).simd_into(self) }
     }
@@ -1306,6 +1317,17 @@ impl Simd for Neon {
         }
     }
     #[inline(always)]
+    fn set_mask16x8(self, a: &mut mask16x8<Self>, index: usize, value: bool) -> () { // Replaces a single lane of `*a`; the other lanes are carried over unchanged.
+        assert!( // Bounds check: fails with the formatted message when `index` is 8 or more.
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask16x8(*a); // Read the mask out as an indexable set of lanes.
+        lanes[index] = if value { !0 } else { 0 }; // Lane encoding used here: `!0` for `true`, `0` for `false`.
+        *a = self.load_array_mask16x8(lanes); // Build a mask from the modified lanes and store it through `a`.
+    }
+    #[inline(always)]
     fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
         unsafe { vandq_s16(a.into(), b.into()).simd_into(self) }
     }
@@ -1807,6 +1829,17 @@ impl Simd for Neon {
         }
     }
     #[inline(always)]
+    fn set_mask32x4(self, a: &mut mask32x4<Self>, index: usize, value: bool) -> () { // Replaces a single lane of `*a`; the other lanes are carried over unchanged.
+        assert!( // Bounds check: fails with the formatted message when `index` is 4 or more.
+            index < 4usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4usize
+        );
+        let mut lanes = self.as_array_mask32x4(*a); // Read the mask out as an indexable set of lanes.
+        lanes[index] = if value { !0 } else { 0 }; // Lane encoding used here: `!0` for `true`, `0` for `false`.
+        *a = self.load_array_mask32x4(lanes); // Build a mask from the modified lanes and store it through `a`.
+    }
+    #[inline(always)]
     fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
         unsafe { vandq_s32(a.into(), b.into()).simd_into(self) }
     }
@@ -2127,6 +2160,17 @@ impl Simd for Neon {
         }
     }
     #[inline(always)]
+    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> () { // Replaces a single lane of `*a`; the other lanes are carried over unchanged.
+        assert!( // Bounds check: fails with the formatted message when `index` is 2 or more.
+            index < 2usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            2usize
+        );
+        let mut lanes = self.as_array_mask64x2(*a); // Read the mask out as an indexable set of lanes.
+        lanes[index] = if value { !0 } else { 0 }; // Lane encoding used here: `!0` for `true`, `0` for `false`.
+        *a = self.load_array_mask64x2(lanes); // Build a mask from the modified lanes and store it through `a`.
+    }
+    #[inline(always)]
     fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
         unsafe { vandq_s64(a.into(), b.into()).simd_into(self) }
     }
@@ -3217,6 +3261,17 @@ impl Simd for Neon {
         lo | (hi << 16usize)
     }
     #[inline(always)]
+    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> () { // Replaces a single lane of `*a`; the other lanes are carried over unchanged.
+        assert!( // Bounds check: fails with the formatted message when `index` is 32 or more.
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        let mut lanes = self.as_array_mask8x32(*a); // Read the mask out as an indexable set of lanes.
+        lanes[index] = if value { !0 } else { 0 }; // Lane encoding used here: `!0` for `true`, `0` for `false`.
+        *a = self.load_array_mask8x32(lanes); // Build a mask from the modified lanes and store it through `a`.
+    }
+    #[inline(always)]
     fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
         let (a0, a1) = self.split_mask8x32(a);
         let (b0, b1) = self.split_mask8x32(b);
@@ -3950,6 +4005,17 @@ impl Simd for Neon {
         lo | (hi << 8usize)
     }
     #[inline(always)]
+    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> () { // Replaces a single lane of `*a`; the other lanes are carried over unchanged.
+        assert!( // Bounds check: fails with the formatted message when `index` is 16 or more.
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask16x16(*a); // Read the mask out as an indexable set of lanes.
+        lanes[index] = if value { !0 } else { 0 }; // Lane encoding used here: `!0` for `true`, `0` for `false`.
+        *a = self.load_array_mask16x16(lanes); // Build a mask from the modified lanes and store it through `a`.
+    }
+    #[inline(always)]
     fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
         let (a0, a1) = self.split_mask16x16(a);
         let (b0, b1) = self.split_mask16x16(b);
@@ -4676,6 +4742,17 @@ impl Simd for Neon {
         lo | (hi << 4usize)
     }
     #[inline(always)]
+    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> () { // Replaces a single lane of `*a`; the other lanes are carried over unchanged.
+        assert!( // Bounds check: fails with the formatted message when `index` is 8 or more.
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask32x8(*a); // Read the mask out as an indexable set of lanes.
+        lanes[index] = if value { !0 } else { 0 }; // Lane encoding used here: `!0` for `true`, `0` for `false`.
+        *a = self.load_array_mask32x8(lanes); // Build a mask from the modified lanes and store it through `a`.
+    }
+    #[inline(always)]
     fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
         let (a0, a1) = self.split_mask32x8(a);
         let (b0, b1) = self.split_mask32x8(b);
@@ -5144,6 +5221,17 @@ impl Simd for Neon {
         lo | (hi << 2usize)
     }
     #[inline(always)]
+    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> () { // Replaces a single lane of `*a`; the other lanes are carried over unchanged.
+        assert!( // Bounds check: fails with the formatted message when `index` is 4 or more.
+            index < 4usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4usize
+        );
+        let mut lanes = self.as_array_mask64x4(*a); // Read the mask out as an indexable set of lanes.
+        lanes[index] = if value { !0 } else { 0 }; // Lane encoding used here: `!0` for `true`, `0` for `false`.
+        *a = self.load_array_mask64x4(lanes); // Build a mask from the modified lanes and store it through `a`.
+    }
+    #[inline(always)]
     fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
         let (a0, a1) = self.split_mask64x4(a);
         let (b0, b1) = self.split_mask64x4(b);
@@ -6306,6 +6394,17 @@ impl Simd for Neon {
         lo | (hi << 32usize)
     }
     #[inline(always)]
+    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> () { // Replaces a single lane of `*a`; the other lanes are carried over unchanged.
+        assert!( // Bounds check: fails with the formatted message when `index` is 64 or more.
+            index < 64usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            64usize
+        );
+        let mut lanes = self.as_array_mask8x64(*a); // Read the mask out as an indexable set of lanes.
+        lanes[index] = if value { !0 } else { 0 }; // Lane encoding used here: `!0` for `true`, `0` for `false`.
+        *a = self.load_array_mask8x64(lanes); // Build a mask from the modified lanes and store it through `a`.
+    }
+    #[inline(always)]
     fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
         let (a0, a1) = self.split_mask8x64(a);
         let (b0, b1) = self.split_mask8x64(b);
@@ -7070,6 +7169,17 @@ impl Simd for Neon {
         lo | (hi << 16usize)
     }
     #[inline(always)]
+    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> () { // Replaces a single lane of `*a`; the other lanes are carried over unchanged.
+        assert!( // Bounds check: fails with the formatted message when `index` is 32 or more.
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        let mut lanes = self.as_array_mask16x32(*a); // Read the mask out as an indexable set of lanes.
+        lanes[index] = if value { !0 } else { 0 }; // Lane encoding used here: `!0` for `true`, `0` for `false`.
+        *a = self.load_array_mask16x32(lanes); // Build a mask from the modified lanes and store it through `a`.
+    }
+    #[inline(always)]
     fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
         let (a0, a1) = self.split_mask16x32(a);
         let (b0, b1) = self.split_mask16x32(b);
@@ -7816,6 +7926,17 @@ impl Simd for Neon {
         lo | (hi << 8usize)
     }
     #[inline(always)]
+    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> () { // Replaces a single lane of `*a`; the other lanes are carried over unchanged.
+        assert!( // Bounds check: fails with the formatted message when `index` is 16 or more.
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask32x16(*a); // Read the mask out as an indexable set of lanes.
+        lanes[index] = if value { !0 } else { 0 }; // Lane encoding used here: `!0` for `true`, `0` for `false`.
+        *a = self.load_array_mask32x16(lanes); // Build a mask from the modified lanes and store it through `a`.
+    }
+    #[inline(always)]
     fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
         let (a0, a1) = self.split_mask32x16(a);
         let (b0, b1) = self.split_mask32x16(b);
@@ -8284,6 +8405,17 @@ impl Simd for Neon {
         lo | (hi << 4usize)
     }
     #[inline(always)]
+    fn set_mask64x8(self, a: &mut mask64x8<Self>, index: usize, value: bool) -> () { // Replaces a single lane of `*a`; the other lanes are carried over unchanged.
+        assert!( // Bounds check: fails with the formatted message when `index` is 8 or more.
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask64x8(*a); // Read the mask out as an indexable set of lanes.
+        lanes[index] = if value { !0 } else { 0 }; // Lane encoding used here: `!0` for `true`, `0` for `false`.
+        *a = self.load_array_mask64x8(lanes); // Build a mask from the modified lanes and store it through `a`.
+    }
+    #[inline(always)]
     fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
         let (a0, a1) = self.split_mask64x8(a);
         let (b0, b1) = self.split_mask64x8(b);
diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs
index 1510e2748..91b270d5c 100644
--- a/fearless_simd/src/generated/simd_trait.rs
+++ b/fearless_simd/src/generated/simd_trait.rs
@@ -27,8 +27,8 @@ use crate::{
 #[doc = r" # Associated Types"]
 #[doc = r""]
 #[doc = r#" The trait defines associated types for the highest "native" vector width of each scalar type (e.g. `f32s`,"#]
-#[doc = r" `u32s`). These are always at least 128 bits, but may be larger. Currently, they are 128 bits everywhere but"]
-#[doc = r" AVX2, where they are 256 bits."]
+#[doc = r" `u32s`). These are always at least 128 bits, but may be larger. Currently, they are 128 bits on the"]
+#[doc = r" fallback, NEON, WASM, and SSE4.2 backends, 256 bits on AVX2, and 512 bits on AVX-512."]
 #[doc = r""]
 #[doc = r" # Example"]
 #[doc = r""]
@@ -150,7 +150,7 @@ pub trait Simd:
     fn neg_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self>;
@@ -218,7 +218,7 @@ pub trait Simd:
     fn reinterpret_u8_f32x4(self, a: f32x4<Self>) -> u8x16<Self>;
     #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
     fn reinterpret_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self>;
-    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32` (at least until AVX-512, which is currently not supported).\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
+    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms below AVX-512, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32`.\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
     fn cvt_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self>;
     #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values are saturated to the closest in-range value. NaN becomes 0."]
     fn cvt_u32_precise_f32x4(self, a: f32x4<Self>) -> u32x4<Self>;
@@ -402,6 +402,8 @@ pub trait Simd:
     fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask8x16(self, a: mask8x16<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask8x16(self, a: &mut mask8x16<Self>, index: usize, value: bool) -> (); // Declared without a default body; the mask is changed through `a` and nothing is returned.
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -605,6 +607,8 @@ pub trait Simd:
     fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask16x8(self, a: mask16x8<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask16x8(self, a: &mut mask16x8<Self>, index: usize, value: bool) -> (); // Declared without a default body; the mask is changed through `a` and nothing is returned.
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -810,6 +814,8 @@ pub trait Simd:
     fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask32x4(self, a: mask32x4<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask32x4(self, a: &mut mask32x4<Self>, index: usize, value: bool) -> (); // Declared without a default body; the mask is changed through `a` and nothing is returned.
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -869,7 +875,7 @@ pub trait Simd:
     fn neg_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self>;
@@ -941,6 +947,8 @@ pub trait Simd:
     fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> (); // Declared without a default body; the mask is changed through `a` and nothing is returned.
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -1000,7 +1008,7 @@ pub trait Simd:
     fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self>;
@@ -1070,7 +1078,7 @@ pub trait Simd:
     fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self>;
     #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
     fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self>;
-    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32` (at least until AVX-512, which is currently not supported).\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
+    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms below AVX-512, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32`.\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
     fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self>;
     #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values are saturated to the closest in-range value. NaN becomes 0."]
     fn cvt_u32_precise_f32x8(self, a: f32x8<Self>) -> u32x8<Self>;
@@ -1258,6 +1266,8 @@ pub trait Simd:
     fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask8x32(self, a: mask8x32<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> (); // Declared without a default body; the mask is changed through `a` and nothing is returned.
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -1469,6 +1479,8 @@ pub trait Simd:
     fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask16x16(self, a: mask16x16<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> (); // Declared without a default body; the mask is changed through `a` and nothing is returned.
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -1680,6 +1692,8 @@ pub trait Simd:
     fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask32x8(self, a: mask32x8<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> (); // Declared without a default body; the mask is changed through `a` and nothing is returned.
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -1741,7 +1755,7 @@ pub trait Simd:
     fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self>;
@@ -1815,6 +1829,8 @@ pub trait Simd:
     fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask64x4(self, a: mask64x4<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> (); // Declared without a default body; the mask is changed through `a` and nothing is returned.
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -1876,7 +1892,7 @@ pub trait Simd:
     fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self>;
@@ -1948,7 +1964,7 @@ pub trait Simd:
     fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self>;
     #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
     fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self>;
-    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32` (at least until AVX-512, which is currently not supported).\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
+    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms below AVX-512, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32`.\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
     fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self>;
     #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values are saturated to the closest in-range value. NaN becomes 0."]
     fn cvt_u32_precise_f32x16(self, a: f32x16<Self>) -> u32x16<Self>;
@@ -2134,6 +2150,8 @@ pub trait Simd:
     fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask8x64(self, a: mask8x64<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> (); // Declared without a default body; the mask is changed through `a` and nothing is returned.
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -2343,6 +2361,8 @@ pub trait Simd:
     fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask16x32(self, a: mask16x32<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> (); // Declared without a default body; the mask is changed through `a` and nothing is returned.
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -2552,6 +2572,8 @@ pub trait Simd:
     fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask32x16(self, a: mask32x16<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> (); // Declared without a default body; the mask is changed through `a` and nothing is returned.
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -2611,7 +2633,7 @@ pub trait Simd:
     fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self>;
@@ -2683,6 +2705,8 @@ pub trait Simd:
     fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask64x8(self, a: mask64x8<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask64x8(self, a: &mut mask64x8<Self>, index: usize, value: bool) -> (); // Declared without a default body; the mask is changed through `a` and nothing is returned.
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -2862,7 +2886,7 @@ pub trait SimdFloat<S: Simd>:
     fn abs(self) -> Self;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt(self) -> Self;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip(self) -> Self;
     #[doc = "Return a vector with the magnitude of `self` and the sign of `rhs` for each element.\n\nThis operation copies the sign bit, so if an input element is NaN, the output element will be a NaN with the same payload and a copied sign bit."]
     fn copysign(self, rhs: impl SimdInto<Self, S>) -> Self;
diff --git a/fearless_simd/src/generated/simd_types.rs b/fearless_simd/src/generated/simd_types.rs
index 416defc26..c05fa1b73 100644
--- a/fearless_simd/src/generated/simd_types.rs
+++ b/fearless_simd/src/generated/simd_types.rs
@@ -688,14 +688,7 @@ impl<S: Simd> SimdMask<S> for mask8x16<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 16,
-            "mask lane index {index} is out of bounds for {} lanes",
-            16
-        );
-        let mut lanes = self.simd.as_array_mask8x16(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask8x16(lanes);
+        self.simd.set_mask8x16(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i8]) -> Self {
@@ -1156,14 +1149,7 @@ impl<S: Simd> SimdMask<S> for mask16x8<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 8,
-            "mask lane index {index} is out of bounds for {} lanes",
-            8
-        );
-        let mut lanes = self.simd.as_array_mask16x8(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask16x8(lanes);
+        self.simd.set_mask16x8(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i16]) -> Self {
@@ -1572,7 +1558,7 @@ impl<S: Simd> crate::SimdInt<S> for u32x4<S> {
     }
 }
 impl<S: Simd> SimdCvtTruncate<f32x4<S>> for u32x4<S> {
-    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32` (at least until AVX-512, which is currently not supported).\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
+    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms below AVX-512, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32`.\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
     #[inline(always)]
     fn truncate_from(x: f32x4<S>) -> Self {
         x.simd.cvt_u32_f32x4(x)
@@ -1648,14 +1634,7 @@ impl<S: Simd> SimdMask<S> for mask32x4<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 4,
-            "mask lane index {index} is out of bounds for {} lanes",
-            4
-        );
-        let mut lanes = self.simd.as_array_mask32x4(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask32x4(lanes);
+        self.simd.set_mask32x4(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i32]) -> Self {
@@ -1985,14 +1964,7 @@ impl<S: Simd> SimdMask<S> for mask64x2<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 2,
-            "mask lane index {index} is out of bounds for {} lanes",
-            2
-        );
-        let mut lanes = self.simd.as_array_mask64x2(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask64x2(lanes);
+        self.simd.set_mask64x2(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i64]) -> Self {
@@ -2727,14 +2699,7 @@ impl<S: Simd> SimdMask<S> for mask8x32<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 32,
-            "mask lane index {index} is out of bounds for {} lanes",
-            32
-        );
-        let mut lanes = self.simd.as_array_mask8x32(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask8x32(lanes);
+        self.simd.set_mask8x32(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i8]) -> Self {
@@ -3221,14 +3186,7 @@ impl<S: Simd> SimdMask<S> for mask16x16<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 16,
-            "mask lane index {index} is out of bounds for {} lanes",
-            16
-        );
-        let mut lanes = self.simd.as_array_mask16x16(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask16x16(lanes);
+        self.simd.set_mask16x16(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i16]) -> Self {
@@ -3644,7 +3602,7 @@ impl<S: Simd> crate::SimdInt<S> for u32x8<S> {
     }
 }
 impl<S: Simd> SimdCvtTruncate<f32x8<S>> for u32x8<S> {
-    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32` (at least until AVX-512, which is currently not supported).\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
+    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms below AVX-512, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32`.\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
     #[inline(always)]
     fn truncate_from(x: f32x8<S>) -> Self {
         x.simd.cvt_u32_f32x8(x)
@@ -3727,14 +3685,7 @@ impl<S: Simd> SimdMask<S> for mask32x8<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 8,
-            "mask lane index {index} is out of bounds for {} lanes",
-            8
-        );
-        let mut lanes = self.simd.as_array_mask32x8(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask32x8(lanes);
+        self.simd.set_mask32x8(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i32]) -> Self {
@@ -4071,14 +4022,7 @@ impl<S: Simd> SimdMask<S> for mask64x4<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 4,
-            "mask lane index {index} is out of bounds for {} lanes",
-            4
-        );
-        let mut lanes = self.simd.as_array_mask64x4(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask64x4(lanes);
+        self.simd.set_mask64x4(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i64]) -> Self {
@@ -4801,14 +4745,7 @@ impl<S: Simd> SimdMask<S> for mask8x64<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 64,
-            "mask lane index {index} is out of bounds for {} lanes",
-            64
-        );
-        let mut lanes = self.simd.as_array_mask8x64(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask8x64(lanes);
+        self.simd.set_mask8x64(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i8]) -> Self {
@@ -5283,14 +5220,7 @@ impl<S: Simd> SimdMask<S> for mask16x32<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 32,
-            "mask lane index {index} is out of bounds for {} lanes",
-            32
-        );
-        let mut lanes = self.simd.as_array_mask16x32(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask16x32(lanes);
+        self.simd.set_mask16x32(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i16]) -> Self {
@@ -5713,7 +5643,7 @@ impl<S: Simd> crate::SimdInt<S> for u32x16<S> {
     }
 }
 impl<S: Simd> SimdCvtTruncate<f32x16<S>> for u32x16<S> {
-    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32` (at least until AVX-512, which is currently not supported).\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
+    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms below AVX-512, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32`.\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
     #[inline(always)]
     fn truncate_from(x: f32x16<S>) -> Self {
         x.simd.cvt_u32_f32x16(x)
@@ -5789,14 +5719,7 @@ impl<S: Simd> SimdMask<S> for mask32x16<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 16,
-            "mask lane index {index} is out of bounds for {} lanes",
-            16
-        );
-        let mut lanes = self.simd.as_array_mask32x16(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask32x16(lanes);
+        self.simd.set_mask32x16(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i32]) -> Self {
@@ -6127,14 +6050,7 @@ impl<S: Simd> SimdMask<S> for mask64x8<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 8,
-            "mask lane index {index} is out of bounds for {} lanes",
-            8
-        );
-        let mut lanes = self.simd.as_array_mask64x8(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask64x8(lanes);
+        self.simd.set_mask64x8(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i64]) -> Self {
diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs
index 1da1a6d00..d70c8f985 100644
--- a/fearless_simd/src/generated/sse4_2.rs
+++ b/fearless_simd/src/generated/sse4_2.rs
@@ -947,6 +947,17 @@ impl Simd for Sse4_2 {
         unsafe { _mm_movemask_epi8(a.into()) as u32 as u64 }
     }
     #[inline(always)]
+    fn set_mask8x16(self, a: &mut mask8x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask8x16(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x16(lanes);
+    }
+    #[inline(always)]
     fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -1459,6 +1470,17 @@ impl Simd for Sse4_2 {
         }
     }
     #[inline(always)]
+    fn set_mask16x8(self, a: &mut mask16x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask16x8(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask16x8(lanes);
+    }
+    #[inline(always)]
     fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -1976,6 +1998,17 @@ impl Simd for Sse4_2 {
         unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 as u64 }
     }
     #[inline(always)]
+    fn set_mask32x4(self, a: &mut mask32x4<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 4usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4usize
+        );
+        let mut lanes = self.as_array_mask32x4(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask32x4(lanes);
+    }
+    #[inline(always)]
     fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -2299,6 +2332,17 @@ impl Simd for Sse4_2 {
         unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64 }
     }
     #[inline(always)]
+    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 2usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            2usize
+        );
+        let mut lanes = self.as_array_mask64x2(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x2(lanes);
+    }
+    #[inline(always)]
     fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -3323,6 +3367,17 @@ impl Simd for Sse4_2 {
         lo | (hi << 16usize)
     }
     #[inline(always)]
+    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        let mut lanes = self.as_array_mask8x32(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x32(lanes);
+    }
+    #[inline(always)]
     fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
         let (a0, a1) = self.split_mask8x32(a);
         let (b0, b1) = self.split_mask8x32(b);
@@ -4014,6 +4069,17 @@ impl Simd for Sse4_2 {
         }
     }
     #[inline(always)]
+    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask16x16(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask16x16(lanes);
+    }
+    #[inline(always)]
     fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
         let (a0, a1) = self.split_mask16x16(a);
         let (b0, b1) = self.split_mask16x16(b);
@@ -4694,6 +4760,17 @@ impl Simd for Sse4_2 {
         lo | (hi << 4usize)
     }
     #[inline(always)]
+    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask32x8(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask32x8(lanes);
+    }
+    #[inline(always)]
     fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
         let (a0, a1) = self.split_mask32x8(a);
         let (b0, b1) = self.split_mask32x8(b);
@@ -5138,6 +5215,17 @@ impl Simd for Sse4_2 {
         lo | (hi << 2usize)
     }
     #[inline(always)]
+    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 4usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4usize
+        );
+        let mut lanes = self.as_array_mask64x4(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x4(lanes);
+    }
+    #[inline(always)]
     fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
         let (a0, a1) = self.split_mask64x4(a);
         let (b0, b1) = self.split_mask64x4(b);
@@ -6319,6 +6407,17 @@ impl Simd for Sse4_2 {
         lo | (hi << 32usize)
     }
     #[inline(always)]
+    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 64usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            64usize
+        );
+        let mut lanes = self.as_array_mask8x64(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x64(lanes);
+    }
+    #[inline(always)]
     fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
         let (a0, a1) = self.split_mask8x64(a);
         let (b0, b1) = self.split_mask8x64(b);
@@ -7066,6 +7165,17 @@ impl Simd for Sse4_2 {
         }
     }
     #[inline(always)]
+    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        let mut lanes = self.as_array_mask16x32(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask16x32(lanes);
+    }
+    #[inline(always)]
     fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
         let (a0, a1) = self.split_mask16x32(a);
         let (b0, b1) = self.split_mask16x32(b);
@@ -7780,6 +7890,17 @@ impl Simd for Sse4_2 {
         lo | (hi << 8usize)
     }
     #[inline(always)]
+    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask32x16(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask32x16(lanes);
+    }
+    #[inline(always)]
     fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
         let (a0, a1) = self.split_mask32x16(a);
         let (b0, b1) = self.split_mask32x16(b);
@@ -8210,6 +8331,17 @@ impl Simd for Sse4_2 {
         lo | (hi << 4usize)
     }
     #[inline(always)]
+    fn set_mask64x8(self, a: &mut mask64x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask64x8(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x8(lanes);
+    }
+    #[inline(always)]
     fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
         let (a0, a1) = self.split_mask64x8(a);
         let (b0, b1) = self.split_mask64x8(b);
@@ -8335,16 +8467,15 @@ impl<S: Simd> From<u8x16<S>> for __m128i {
 impl<S: Simd> SimdFrom<__m128i, S> for mask8x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m128i) -> Self {
-        Self {
-            val: crate::transmute::checked_transmute_copy(&arch),
-            simd,
-        }
+        let lanes: [i8; 16usize] = crate::transmute::checked_transmute_copy(&arch);
+        lanes.simd_into(simd)
     }
 }
 impl<S: Simd> From<mask8x16<S>> for __m128i {
     #[inline(always)]
     fn from(value: mask8x16<S>) -> Self {
-        crate::transmute::checked_transmute_copy(&value.val)
+        let lanes: [i8; 16usize] = value.into();
+        crate::transmute::checked_transmute_copy(&lanes)
     }
 }
 impl<S: Simd> SimdFrom<__m128i, S> for i16x8<S> {
@@ -8380,16 +8511,15 @@ impl<S: Simd> From<u16x8<S>> for __m128i {
 impl<S: Simd> SimdFrom<__m128i, S> for mask16x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m128i) -> Self {
-        Self {
-            val: crate::transmute::checked_transmute_copy(&arch),
-            simd,
-        }
+        let lanes: [i16; 8usize] = crate::transmute::checked_transmute_copy(&arch);
+        lanes.simd_into(simd)
     }
 }
 impl<S: Simd> From<mask16x8<S>> for __m128i {
     #[inline(always)]
     fn from(value: mask16x8<S>) -> Self {
-        crate::transmute::checked_transmute_copy(&value.val)
+        let lanes: [i16; 8usize] = value.into();
+        crate::transmute::checked_transmute_copy(&lanes)
     }
 }
 impl<S: Simd> SimdFrom<__m128i, S> for i32x4<S> {
@@ -8425,16 +8555,15 @@ impl<S: Simd> From<u32x4<S>> for __m128i {
 impl<S: Simd> SimdFrom<__m128i, S> for mask32x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m128i) -> Self {
-        Self {
-            val: crate::transmute::checked_transmute_copy(&arch),
-            simd,
-        }
+        let lanes: [i32; 4usize] = crate::transmute::checked_transmute_copy(&arch);
+        lanes.simd_into(simd)
     }
 }
 impl<S: Simd> From<mask32x4<S>> for __m128i {
     #[inline(always)]
     fn from(value: mask32x4<S>) -> Self {
-        crate::transmute::checked_transmute_copy(&value.val)
+        let lanes: [i32; 4usize] = value.into();
+        crate::transmute::checked_transmute_copy(&lanes)
     }
 }
 impl<S: Simd> SimdFrom<__m128d, S> for f64x2<S> {
@@ -8455,16 +8584,15 @@ impl<S: Simd> From<f64x2<S>> for __m128d {
 impl<S: Simd> SimdFrom<__m128i, S> for mask64x2<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m128i) -> Self {
-        Self {
-            val: crate::transmute::checked_transmute_copy(&arch),
-            simd,
-        }
+        let lanes: [i64; 2usize] = crate::transmute::checked_transmute_copy(&arch);
+        lanes.simd_into(simd)
     }
 }
 impl<S: Simd> From<mask64x2<S>> for __m128i {
     #[inline(always)]
     fn from(value: mask64x2<S>) -> Self {
-        crate::transmute::checked_transmute_copy(&value.val)
+        let lanes: [i64; 2usize] = value.into();
+        crate::transmute::checked_transmute_copy(&lanes)
     }
 }
 #[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs
index d939bf95f..3a96d6620 100644
--- a/fearless_simd/src/generated/wasm.rs
+++ b/fearless_simd/src/generated/wasm.rs
@@ -857,6 +857,17 @@ impl Simd for WasmSimd128 {
         i8x16_bitmask(a.into()) as u64
     }
     #[inline(always)]
+    fn set_mask8x16(self, a: &mut mask8x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask8x16(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x16(lanes);
+    }
+    #[inline(always)]
     fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
         v128_and(a.into(), b.into()).simd_into(self)
     }
@@ -1349,6 +1360,17 @@ impl Simd for WasmSimd128 {
         i16x8_bitmask(a.into()) as u64
     }
     #[inline(always)]
+    fn set_mask16x8(self, a: &mut mask16x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask16x8(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask16x8(lanes);
+    }
+    #[inline(always)]
     fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
         v128_and(a.into(), b.into()).simd_into(self)
     }
@@ -1845,6 +1867,17 @@ impl Simd for WasmSimd128 {
         i32x4_bitmask(a.into()) as u64
     }
     #[inline(always)]
+    fn set_mask32x4(self, a: &mut mask32x4<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 4usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4usize
+        );
+        let mut lanes = self.as_array_mask32x4(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask32x4(lanes);
+    }
+    #[inline(always)]
     fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
         v128_and(a.into(), b.into()).simd_into(self)
     }
@@ -2191,6 +2224,17 @@ impl Simd for WasmSimd128 {
         i64x2_bitmask(a.into()) as u64
     }
     #[inline(always)]
+    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 2usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            2usize
+        );
+        let mut lanes = self.as_array_mask64x2(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x2(lanes);
+    }
+    #[inline(always)]
     fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
         v128_and(a.into(), b.into()).simd_into(self)
     }
@@ -3222,6 +3266,17 @@ impl Simd for WasmSimd128 {
         lo | (hi << 16usize)
     }
     #[inline(always)]
+    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        let mut lanes = self.as_array_mask8x32(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x32(lanes);
+    }
+    #[inline(always)]
     fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
         let (a0, a1) = self.split_mask8x32(a);
         let (b0, b1) = self.split_mask8x32(b);
@@ -3909,6 +3964,17 @@ impl Simd for WasmSimd128 {
         lo | (hi << 8usize)
     }
     #[inline(always)]
+    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask16x16(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask16x16(lanes);
+    }
+    #[inline(always)]
     fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
         let (a0, a1) = self.split_mask16x16(a);
         let (b0, b1) = self.split_mask16x16(b);
@@ -4589,6 +4655,17 @@ impl Simd for WasmSimd128 {
         lo | (hi << 4usize)
     }
     #[inline(always)]
+    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask32x8(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask32x8(lanes);
+    }
+    #[inline(always)]
     fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
         let (a0, a1) = self.split_mask32x8(a);
         let (b0, b1) = self.split_mask32x8(b);
@@ -5033,6 +5110,17 @@ impl Simd for WasmSimd128 {
         lo | (hi << 2usize)
     }
     #[inline(always)]
+    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 4usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4usize
+        );
+        let mut lanes = self.as_array_mask64x4(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x4(lanes);
+    }
+    #[inline(always)]
     fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
         let (a0, a1) = self.split_mask64x4(a);
         let (b0, b1) = self.split_mask64x4(b);
@@ -6177,6 +6265,17 @@ impl Simd for WasmSimd128 {
         lo | (hi << 32usize)
     }
     #[inline(always)]
+    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 64usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            64usize
+        );
+        let mut lanes = self.as_array_mask8x64(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x64(lanes);
+    }
+    #[inline(always)]
     fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
         let (a0, a1) = self.split_mask8x64(a);
         let (b0, b1) = self.split_mask8x64(b);
@@ -6902,6 +7001,17 @@ impl Simd for WasmSimd128 {
         lo | (hi << 16usize)
     }
     #[inline(always)]
+    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        let mut lanes = self.as_array_mask16x32(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask16x32(lanes);
+    }
+    #[inline(always)]
     fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
         let (a0, a1) = self.split_mask16x32(a);
         let (b0, b1) = self.split_mask16x32(b);
@@ -7609,6 +7719,17 @@ impl Simd for WasmSimd128 {
         lo | (hi << 8usize)
     }
     #[inline(always)]
+    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask32x16(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask32x16(lanes);
+    }
+    #[inline(always)]
     fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
         let (a0, a1) = self.split_mask32x16(a);
         let (b0, b1) = self.split_mask32x16(b);
@@ -8039,6 +8160,17 @@ impl Simd for WasmSimd128 {
         lo | (hi << 4usize)
     }
     #[inline(always)]
+    fn set_mask64x8(self, a: &mut mask64x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask64x8(*a); // copy the mask out as indexable lanes
+        lanes[index] = if value { !0 } else { 0 }; // all bits set when `value`, else zero
+        *a = self.load_array_mask64x8(lanes); // rebuild the mask and store it through `a`
+    }
+    #[inline(always)]
     fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
         let (a0, a1) = self.split_mask64x8(a);
         let (b0, b1) = self.split_mask64x8(b);
diff --git a/fearless_simd/src/kernel_macros.rs b/fearless_simd/src/kernel_macros.rs
index c713657b9..f6695c258 100644
--- a/fearless_simd/src/kernel_macros.rs
+++ b/fearless_simd/src/kernel_macros.rs
@@ -8,7 +8,7 @@
 /// use platform-specific intrinsics for parts of the computation.
 ///
 /// The first argument must be a SIMD token written as `token: Neon`,
-/// `token: WasmSimd128`, `token: Sse4_2`, or `token: Avx2`.
+/// `token: WasmSimd128`, `token: Sse4_2`, `token: Avx2`, or `token: Avx512`.
 ///
 /// For levels with runtime-detected target features, the macro runs your body
 /// inside an inner function annotated with the appropriate `#[target_feature]`
@@ -54,7 +54,7 @@
 /// However, the body of the function can be as complex as you like.
 ///
 /// The SIMD token type must be written as a bare supported name:
-/// literally `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`. No paths or aliases.
+/// literally `Neon`, `WasmSimd128`, `Sse4_2`, `Avx2`, or `Avx512`. No paths or aliases.
 ///
 /// For soundness, this macro only accepts safe functions.
 ///
@@ -93,7 +93,7 @@ macro_rules! kernel {
     ) => {
         compile_error!(concat!(
             "fearless_simd::kernel! expects its SIMD token argument type to be written as ",
-            "one of `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`; got `",
+            "one of `Neon`, `WasmSimd128`, `Sse4_2`, `Avx2`, or `Avx512`; got `",
             stringify!($token_ty),
             "`",
         ));
@@ -153,13 +153,27 @@ macro_rules! __fearless_simd_kernel_dispatch {
         }
     };
 
+    (
+        Avx512,
+        $($body:tt)*
+    ) => {
+        $crate::__fearless_simd_kernel_impl! {
+            @cfg any(target_arch = "x86", target_arch = "x86_64");
+            @token_ty $crate::Avx512;
+            @kernel_attrs #[target_feature(
+                enable = "adx,aes,avx512bitalg,avx512bw,avx512cd,avx512dq,avx512f,avx512ifma,avx512vbmi,avx512vbmi2,avx512vl,avx512vnni,avx512vpopcntdq,bmi1,bmi2,cmpxchg16b,fma,gfni,lzcnt,movbe,pclmulqdq,popcnt,rdrand,rdseed,sha,vaes,vpclmulqdq,xsave,xsavec,xsaveopt,xsaves"
+            )];
+            $($body)*
+        }
+    };
+
     (
         $token_ty:ident,
         $($body:tt)*
     ) => {
         compile_error!(concat!(
             "fearless_simd::kernel! expects its SIMD token argument type to be written as ",
-            "one of `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`; got `",
+            "one of `Neon`, `WasmSimd128`, `Sse4_2`, `Avx2`, or `Avx512`; got `",
             stringify!($token_ty),
             "`",
         ));
@@ -216,9 +230,9 @@ mod tests {
     #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
     use core::arch::wasm32::{f32x4_add, v128};
     #[cfg(target_arch = "x86")]
-    use core::arch::x86::{__m256i, _mm256_add_epi32};
+    use core::arch::x86::{__m256i, __m512i, _mm256_add_epi32, _mm512_add_epi32};
     #[cfg(target_arch = "x86_64")]
-    use core::arch::x86_64::{__m256i, _mm256_add_epi32};
+    use core::arch::x86_64::{__m256i, __m512i, _mm256_add_epi32, _mm512_add_epi32};
 
     crate::kernel! {
         fn add_f32x4_neon(neon: Neon, a: float32x4_t, b: float32x4_t) -> float32x4_t {
@@ -238,6 +252,12 @@ mod tests {
         }
     }
 
+    crate::kernel! {
+        fn add_i32x16_avx512(avx512: Avx512, lhs: __m512i, rhs: __m512i) -> __m512i {
+            _mm512_add_epi32(lhs, rhs)
+        }
+    }
+
     #[cfg(target_arch = "aarch64")]
     #[test]
     fn kernel_instantiates_for_neon() {
@@ -291,4 +311,28 @@ mod tests {
             "`kernel!` should instantiate a working AVX2 kernel"
         );
     }
+
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    #[test]
+    fn kernel_instantiates_for_avx512() {
+        // Without runtime AVX-512 support there is no token, so the test is a no-op.
+        let Some(token) = crate::Level::new().as_avx512() else {
+            return;
+        };
+        let lhs_lanes: [i32; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+        let rhs_lanes: [i32; 16] = [
+            10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160,
+        ];
+        let expected: [i32; 16] = [
+            11, 22, 33, 44, 55, 66, 77, 88, 99, 110, 121, 132, 143, 154, 165, 176,
+        ];
+
+        let lhs: crate::i32x16<_> = lhs_lanes.simd_into(token);
+        let rhs: crate::i32x16<_> = rhs_lanes.simd_into(token);
+        let raw_sum = add_i32x16_avx512(token, lhs.into(), rhs.into());
+        let sum: crate::i32x16<_> = raw_sum.simd_into(token);
+        let lanes = <[i32; 16]>::from(sum);
+        let msg = "`kernel!` should instantiate a working AVX-512 kernel";
+        assert_eq!(lanes, expected, "{msg}");
+    }
 }
diff --git a/fearless_simd/src/lib.rs b/fearless_simd/src/lib.rs
index 43609893c..035995aea 100644
--- a/fearless_simd/src/lib.rs
+++ b/fearless_simd/src/lib.rs
@@ -184,9 +184,46 @@ pub mod wasm32 {
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 pub mod x86 {
     pub use crate::generated::Avx2;
+    pub use crate::generated::Avx512;
     pub use crate::generated::Sse4_2;
 }
 
+#[cfg(all(feature = "std", any(target_arch = "x86", target_arch = "x86_64")))]
+#[inline]
+fn x86_detects_icelake_avx512() -> bool {
+    std::arch::is_x86_feature_detected!("adx") // true only if every probe in this chain passes
+        && std::arch::is_x86_feature_detected!("aes") // keep in sync with kernel!'s Avx512 list
+        && std::arch::is_x86_feature_detected!("avx512bitalg")
+        && std::arch::is_x86_feature_detected!("avx512bw")
+        && std::arch::is_x86_feature_detected!("avx512cd")
+        && std::arch::is_x86_feature_detected!("avx512dq")
+        && std::arch::is_x86_feature_detected!("avx512f")
+        && std::arch::is_x86_feature_detected!("avx512ifma")
+        && std::arch::is_x86_feature_detected!("avx512vbmi")
+        && std::arch::is_x86_feature_detected!("avx512vbmi2")
+        && std::arch::is_x86_feature_detected!("avx512vl")
+        && std::arch::is_x86_feature_detected!("avx512vnni")
+        && std::arch::is_x86_feature_detected!("avx512vpopcntdq")
+        && std::arch::is_x86_feature_detected!("bmi1")
+        && std::arch::is_x86_feature_detected!("bmi2")
+        && std::arch::is_x86_feature_detected!("cmpxchg16b")
+        && std::arch::is_x86_feature_detected!("fma")
+        && std::arch::is_x86_feature_detected!("gfni")
+        && std::arch::is_x86_feature_detected!("lzcnt")
+        && std::arch::is_x86_feature_detected!("movbe")
+        && std::arch::is_x86_feature_detected!("pclmulqdq")
+        && std::arch::is_x86_feature_detected!("popcnt")
+        && std::arch::is_x86_feature_detected!("rdrand")
+        && std::arch::is_x86_feature_detected!("rdseed")
+        && std::arch::is_x86_feature_detected!("sha")
+        && std::arch::is_x86_feature_detected!("vaes")
+        && std::arch::is_x86_feature_detected!("vpclmulqdq")
+        && std::arch::is_x86_feature_detected!("xsave")
+        && std::arch::is_x86_feature_detected!("xsavec")
+        && std::arch::is_x86_feature_detected!("xsaveopt")
+        && std::arch::is_x86_feature_detected!("xsaves") // last of the 31 required features
+}
+
 /// The level enum with the specific SIMD capabilities available.
 ///
 /// The contained values serve as a proof that the associated target
@@ -248,6 +285,9 @@ pub enum Level {
         ))
     ))]
     Sse4_2(Sse4_2),
+    /// Ice Lake-class AVX-512 on (32 and 64 bit) x86.
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    Avx512(Avx512),
     /// The x86-64-v3 instruction set on (32 and 64 bit) x86, including AVX2 and FMA.
     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
     Avx2(Avx2),
@@ -299,6 +339,10 @@ impl Level {
         }
         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
         {
+            if x86_detects_icelake_avx512() {
+                return unsafe { Self::Avx512(Avx512::new_unchecked()) };
+            }
+
             // Feature list sourced from `rustc --print=cfg --target x86_64-unknown-linux-gnu -C target-cpu=x86-64-v3`
             // However, the following features are implied by avx2 and do not need to be spelled out:
             // avx,fxsr,sse,sse2,sse3,sse4.1,sse4.2,ssse3
@@ -472,6 +516,9 @@ impl Level {
     #[inline]
     pub fn as_sse4_2(self) -> Option<Sse4_2> {
         match self {
+            // Safety: The Avx512 struct represents an Ice Lake feature set, which includes the
+            // `sse4.2`, `cmpxchg16b`, and `popcnt` features required by Sse4_2.
+            Self::Avx512(_avx512) => unsafe { Some(Sse4_2::new_unchecked()) },
             // Safety: The Avx2 struct represents the x86-64-v3 feature set being enabled, which
             // includes the `sse4.2`, `cmpxchg16b`, and `popcnt` features required by Sse4_2.
             Self::Avx2(_avx) => unsafe { Some(Sse4_2::new_unchecked()) },
@@ -515,11 +562,29 @@ impl Level {
             reason = "On machines which statically support `avx2`, there is only one variant."
         )]
         match self {
+            // Safety: The Ice Lake AVX-512 feature set includes the x86-64-v3 features required by Avx2.
+            Self::Avx512(_avx512) => unsafe { Some(Avx2::new_unchecked()) },
             Self::Avx2(avx2) => Some(avx2),
             _ => None,
         }
     }
 
+    /// Extract the [`Avx512`] token when this level proves the Ice Lake AVX-512 feature set.
+    ///
+    /// Returns `None` for every other level. The exact CPU features the token enables are
+    /// listed on [`Avx512::new_unchecked`].
+    ///
+    /// Combine the returned token with the [kernel] macro to safely call level-specific SIMD
+    /// intrinsics.
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    #[inline]
+    pub fn as_avx512(self) -> Option<Avx512> {
+        let Self::Avx512(token) = self else {
+            return None;
+        };
+        Some(token)
+    }
+
     /// Get the strongest statically supported SIMD level.
     ///
     /// That is, if your compilation run ambiently declares that a target feature is enabled,
@@ -562,6 +627,40 @@ impl Level {
         }
         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
         {
+            #[cfg(all(
+                target_feature = "adx",
+                target_feature = "aes",
+                target_feature = "avx512bitalg",
+                target_feature = "avx512bw",
+                target_feature = "avx512cd",
+                target_feature = "avx512dq",
+                target_feature = "avx512f",
+                target_feature = "avx512ifma",
+                target_feature = "avx512vbmi",
+                target_feature = "avx512vbmi2",
+                target_feature = "avx512vl",
+                target_feature = "avx512vnni",
+                target_feature = "avx512vpopcntdq",
+                target_feature = "bmi1",
+                target_feature = "bmi2",
+                target_feature = "cmpxchg16b",
+                target_feature = "fma",
+                target_feature = "gfni",
+                target_feature = "lzcnt",
+                target_feature = "movbe",
+                target_feature = "pclmulqdq",
+                target_feature = "popcnt",
+                target_feature = "rdrand",
+                target_feature = "rdseed",
+                target_feature = "sha",
+                target_feature = "vaes",
+                target_feature = "vpclmulqdq",
+                target_feature = "xsave",
+                target_feature = "xsavec",
+                target_feature = "xsaveopt",
+                target_feature = "xsaves"
+            ))]
+            return unsafe { Self::Avx512(Avx512::new_unchecked()) };
             #[cfg(all(
                 target_feature = "avx2",
                 target_feature = "bmi1",
@@ -572,7 +671,40 @@ impl Level {
                 target_feature = "lzcnt",
                 target_feature = "movbe",
                 target_feature = "popcnt",
-                target_feature = "xsave"
+                target_feature = "xsave",
+                not(all(
+                    target_feature = "adx",
+                    target_feature = "aes",
+                    target_feature = "avx512bitalg",
+                    target_feature = "avx512bw",
+                    target_feature = "avx512cd",
+                    target_feature = "avx512dq",
+                    target_feature = "avx512f",
+                    target_feature = "avx512ifma",
+                    target_feature = "avx512vbmi",
+                    target_feature = "avx512vbmi2",
+                    target_feature = "avx512vl",
+                    target_feature = "avx512vnni",
+                    target_feature = "avx512vpopcntdq",
+                    target_feature = "bmi1",
+                    target_feature = "bmi2",
+                    target_feature = "cmpxchg16b",
+                    target_feature = "fma",
+                    target_feature = "gfni",
+                    target_feature = "lzcnt",
+                    target_feature = "movbe",
+                    target_feature = "pclmulqdq",
+                    target_feature = "popcnt",
+                    target_feature = "rdrand",
+                    target_feature = "rdseed",
+                    target_feature = "sha",
+                    target_feature = "vaes",
+                    target_feature = "vpclmulqdq",
+                    target_feature = "xsave",
+                    target_feature = "xsavec",
+                    target_feature = "xsaveopt",
+                    target_feature = "xsaves"
+                ))
             ))]
             return unsafe { Self::Avx2(Avx2::new_unchecked()) };
             #[cfg(all(
diff --git a/fearless_simd/src/macros.rs b/fearless_simd/src/macros.rs
index 346913862..be73bd6d1 100644
--- a/fearless_simd/src/macros.rs
+++ b/fearless_simd/src/macros.rs
@@ -103,6 +103,15 @@ macro_rules! dispatch {
                 )
             }
             #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            $crate::Level::Avx512(avx512) => {
+                let $simd = launder(avx512);
+                $crate::Simd::vectorize(
+                    avx512,
+                    #[inline(always)]
+                    || $op,
+                )
+            }
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
             $crate::Level::Avx2(avx2) => {
                 let $simd = launder(avx2);
                 $crate::Simd::vectorize(
diff --git a/fearless_simd/src/transmute.rs b/fearless_simd/src/transmute.rs
index 894d672d7..d02fcb0ba 100644
--- a/fearless_simd/src/transmute.rs
+++ b/fearless_simd/src/transmute.rs
@@ -21,9 +21,13 @@ use core::arch::aarch64::{
 #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
 use core::arch::wasm32::v128;
 #[cfg(target_arch = "x86")]
-use core::arch::x86::{__m128, __m128d, __m128i, __m256, __m256d, __m256i};
+use core::arch::x86::{
+    __m128, __m128d, __m128i, __m256, __m256d, __m256i, __m512, __m512d, __m512i,
+};
 #[cfg(target_arch = "x86_64")]
-use core::arch::x86_64::{__m128, __m128d, __m128i, __m256, __m256d, __m256i};
+use core::arch::x86_64::{
+    __m128, __m128d, __m128i, __m256, __m256d, __m256i, __m512, __m512d, __m512i,
+};
 
 /// Types that can be safely copied through an arbitrary same-sized bit representation.
 ///
@@ -124,6 +128,9 @@ const _: () = {
     unsafe impl SimdPod for __m256 {}
     unsafe impl SimdPod for __m256d {}
     unsafe impl SimdPod for __m256i {}
+    unsafe impl SimdPod for __m512 {}
+    unsafe impl SimdPod for __m512d {}
+    unsafe impl SimdPod for __m512i {}
 };
 
 #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
@@ -181,6 +188,9 @@ impl_aligned_simd_pod!(
     Aligned512<[__m256; 2]>,
     Aligned512<[__m256d; 2]>,
     Aligned512<[__m256i; 2]>,
+    Aligned512<__m512>,
+    Aligned512<__m512d>,
+    Aligned512<__m512i>,
 );
 
 #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
diff --git a/fearless_simd_dev_macros/src/lib.rs b/fearless_simd_dev_macros/src/lib.rs
index 438632cb9..78b301110 100644
--- a/fearless_simd_dev_macros/src/lib.rs
+++ b/fearless_simd_dev_macros/src/lib.rs
@@ -21,6 +21,7 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream {
     let neon_name = get_ident("neon");
     let sse4_name = get_ident("sse4");
     let avx2_name = get_ident("avx2");
+    let avx512_name = get_ident("avx512");
     let wasm_name = get_ident("wasm");
 
     let ignore_attr = |f: fn(&str) -> bool| {
@@ -40,6 +41,7 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream {
     let ignore_neon = ignore_attr(exclude_neon);
     let ignore_sse4 = ignore_attr(exclude_sse4);
     let ignore_avx2 = ignore_attr(exclude_avx2);
+    let ignore_avx512 = ignore_attr(exclude_avx512);
     let ignore_wasm = ignore_attr(exclude_wasm);
 
     let fallback_snippet = quote! {
@@ -116,6 +118,52 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream {
         }
     };
 
+    let avx512_snippet = quote! {
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        #[test]
+        #ignore_avx512
+        fn #avx512_name() {
+            if std::arch::is_x86_feature_detected!("adx")
+                && std::arch::is_x86_feature_detected!("aes")
+                && std::arch::is_x86_feature_detected!("avx512bitalg")
+                && std::arch::is_x86_feature_detected!("avx512bw")
+                && std::arch::is_x86_feature_detected!("avx512cd")
+                && std::arch::is_x86_feature_detected!("avx512dq")
+                && std::arch::is_x86_feature_detected!("avx512f")
+                && std::arch::is_x86_feature_detected!("avx512ifma")
+                && std::arch::is_x86_feature_detected!("avx512vbmi")
+                && std::arch::is_x86_feature_detected!("avx512vbmi2")
+                && std::arch::is_x86_feature_detected!("avx512vl")
+                && std::arch::is_x86_feature_detected!("avx512vnni")
+                && std::arch::is_x86_feature_detected!("avx512vpopcntdq")
+                && std::arch::is_x86_feature_detected!("bmi1")
+                && std::arch::is_x86_feature_detected!("bmi2")
+                && std::arch::is_x86_feature_detected!("cmpxchg16b")
+                && std::arch::is_x86_feature_detected!("fma")
+                && std::arch::is_x86_feature_detected!("gfni")
+                && std::arch::is_x86_feature_detected!("lzcnt")
+                && std::arch::is_x86_feature_detected!("movbe")
+                && std::arch::is_x86_feature_detected!("pclmulqdq")
+                && std::arch::is_x86_feature_detected!("popcnt")
+                && std::arch::is_x86_feature_detected!("rdrand")
+                && std::arch::is_x86_feature_detected!("rdseed")
+                && std::arch::is_x86_feature_detected!("sha")
+                && std::arch::is_x86_feature_detected!("vaes")
+                && std::arch::is_x86_feature_detected!("vpclmulqdq")
+                && std::arch::is_x86_feature_detected!("xsave")
+                && std::arch::is_x86_feature_detected!("xsavec")
+                && std::arch::is_x86_feature_detected!("xsaveopt")
+                && std::arch::is_x86_feature_detected!("xsaves")
+            {
+                let avx512 = unsafe { fearless_simd::x86::Avx512::new_unchecked() };
+                avx512.vectorize(
+                    #[inline(always)]
+                    || #input_fn_name(avx512)
+                );
+            }
+        }
+    };
+
     let wasm_snippet = quote! {
         #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
         #[test]
@@ -135,6 +183,7 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream {
         #wasm_snippet
         #sse4_snippet
         #avx2_snippet
+        #avx512_snippet
     }
     .into()
 }
@@ -158,6 +207,10 @@ fn exclude_avx2(_test_name: &str) -> bool {
     false
 }
 
+fn exclude_avx512(_test_name: &str) -> bool {
+    false // the test name is ignored: no test is excluded from the AVX-512 variant
+}
+
 fn exclude_wasm(_test_name: &str) -> bool {
     false
 }
diff --git a/fearless_simd_gen/src/generic.rs b/fearless_simd_gen/src/generic.rs
index 7e0fb8d0b..1b202e970 100644
--- a/fearless_simd_gen/src/generic.rs
+++ b/fearless_simd_gen/src/generic.rs
@@ -208,6 +208,9 @@ pub(crate) fn generic_op(op: &Op, ty: &VecType) -> TokenStream {
                 }
             }
         }
+        OpSig::MaskSet => {
+            panic!("Mask set must operate on the full mask vector")
+        }
         OpSig::LoadInterleaved {
             block_size,
             block_count,
@@ -375,10 +378,23 @@ pub(crate) fn generic_from_array(
     } else {
         quote! { val }
     };
-
     // There are architecture-specific "load" intrinsics, but they can actually be *worse* for performance. If they
     // lower to LLVM intrinsics, they will likely not be optimized until much later in the pipeline (if at all),
     // resulting in substantially worse codegen. See https://github.com/linebender/fearless_simd/pull/185.
+    //
+    // Safety: The native vector type backing any implementation will be:
+    // - A `#[repr(simd)]` type, which has the same layout as an array of scalars
+    // - An array of `#[repr(simd)]` types
+    // - For AArch64 specifically, a `#[repr(C)]` tuple of `#[repr(simd)]` types
+    //
+    // These all have the same layout as a flat array of the corresponding scalars. `checked_transmute_copy`
+    // statically verifies that the source and destination sizes match. The native vector types probably have
+    // greater alignment requirements than the source array type we're copying from, but that's explicitly allowed by
+    // transmute_copy:
+    //
+    // > This function will unsafely assume the pointer src is valid for size_of::<Dst> bytes by transmuting &Src to
+    // > &Dst and then reading the &Dst **(except that this is done in a way that is correct even when &Dst has
+    // > stricter alignment requirements than &Src).**
     let expr = quote! {
         crate::transmute::checked_transmute_copy(#inner_ref)
     };
@@ -497,3 +513,22 @@ pub(crate) fn generic_mask_to_bitmask(method_sig: TokenStream, vec_ty: &VecType)
         }
     }
 }
+
+/// Emit a mask `set` method that rewrites one lane by round-tripping through a lane array.
+pub(crate) fn generic_mask_set(method_sig: TokenStream, vec_ty: &VecType) -> TokenStream {
+    let lane_count = vec_ty.len;
+    let to_lanes = generic_op_name("as_array", vec_ty);
+    let from_lanes = generic_op_name("load_array", vec_ty);
+    quote! {
+        #method_sig {
+            assert!(
+                index < #lane_count,
+                "mask lane index {index} is out of bounds for {} lanes",
+                #lane_count
+            );
+            let mut lanes = self.#to_lanes(*a);
+            lanes[index] = if value { !0 } else { 0 };
+            *a = self.#from_lanes(lanes);
+        }
+    }
+}
diff --git a/fearless_simd_gen/src/level.rs b/fearless_simd_gen/src/level.rs
index 3312870af..357a45fc4 100644
--- a/fearless_simd_gen/src/level.rs
+++ b/fearless_simd_gen/src/level.rs
@@ -34,11 +34,22 @@ pub(crate) trait Level {
     /// type *larger* than [`Level::max_block_size`], since [`VecType::aligned_wrapper_ty`] will split those up into
     /// smaller blocks.
     fn arch_ty(&self, vec_ty: &VecType) -> TokenStream;
+    /// Storage type backing a public SIMD vector of `vec_ty` at this level.
+    ///
+    /// By default the native storage is wrapped in an `Aligned*` newtype; levels with compact
+    /// scalar-like representations (for example AVX-512 masks) can store the native type directly.
+    fn arch_storage_ty(&self, vec_ty: &VecType) -> TokenStream {
+        vec_ty.aligned_wrapper_ty(|block_ty| self.arch_ty(block_ty), self.max_block_size())
+    }
     /// The docstring for this SIMD level token.
     fn token_doc(&self) -> &'static str;
     /// Any additional imports or supporting code necessary for the module (for instance, importing
     /// implementation-specific functions from `core::arch`).
     fn make_module_prelude(&self) -> TokenStream;
+    /// Inner attributes emitted at the very top of the generated module; none by default.
+    fn make_module_attrs(&self) -> TokenStream {
+        quote! {}
+    }
     /// The body of the SIMD token's inherent `impl` block. By convention, this contains an unsafe `new_unchecked`
     /// method for constructing a SIMD token that may not be supported on current hardware, or a safe `new` method for
     /// constructing a SIMD token that is statically known to be supported.
@@ -59,8 +70,7 @@ pub(crate) trait Level {
         let mut assoc_types = vec![];
         for vec_ty in SIMD_TYPES {
             let ty_ident = vec_ty.rust();
-            let wrapper_ty =
-                vec_ty.aligned_wrapper_ty(|vec_ty| self.arch_ty(vec_ty), self.max_block_size());
+            let wrapper_ty = self.arch_storage_ty(vec_ty);
             assoc_types.push(quote! {
                 type #ty_ident = #wrapper_ty;
             });
@@ -90,6 +100,19 @@ pub(crate) trait Level {
         }
     }
 
+    /// Whether `ty` is between this level's native width and its maximum block size (inclusive).
+    fn should_impl_arch_type_conversion(&self, ty: &VecType) -> bool {
+        (self.native_width()..=self.max_block_size()).contains(&ty.n_bits())
+    }
+    /// Whether `ty` converts to and from its arch type through a bitmask; `false` by default.
+    fn should_use_bitmask_arch_type_conversion(&self, _ty: &VecType) -> bool {
+        false
+    }
+    /// Replacement for the default arch-type conversion impls of `ty`; `None` by default.
+    fn custom_arch_type_conversion(&self, _ty: &VecType) -> Option<TokenStream> {
+        None
+    }
+
     fn make_simd_impl(&self) -> TokenStream {
         let level_tok = self.token();
         let native_width = self.native_width();
@@ -180,19 +203,40 @@ pub(crate) trait Level {
     }
 
     fn make_type_impl(&self) -> TokenStream {
-        let native_width = self.native_width();
-        let max_block_size = self.max_block_size();
         let mut result = vec![];
         for ty in SIMD_TYPES {
-            let n_bits = ty.n_bits();
             // If n_bits is below our native width (e.g. 128 bits for AVX2), another module will have already
             // implemented the conversion.
-            if n_bits > max_block_size || n_bits < native_width {
+            if !self.should_impl_arch_type_conversion(ty) {
                 continue;
             }
             let simd = ty.rust();
             let arch = self.arch_ty(ty);
-            result.push(quote! {
+            let type_impl = if let Some(type_impl) = self.custom_arch_type_conversion(ty) {
+                type_impl
+            } else if self.should_use_bitmask_arch_type_conversion(ty) {
+                assert_eq!(
+                    ty.scalar,
+                    ScalarType::Mask,
+                    "bitmask arch type conversions are only valid for mask types"
+                );
+                quote! {
+                    impl<S: Simd> SimdFrom<#arch, S> for #simd<S> {
+                        #[inline(always)]
+                        fn simd_from(simd: S, arch: #arch) -> Self {
+                            Self::from_bitmask(simd, u64::from(arch))
+                        }
+                    }
+                    impl<S: Simd> From<#simd<S>> for #arch {
+                        #[inline(always)]
+                        #[allow(trivial_numeric_casts, reason = "generated uniformly for all __mmask widths")]
+                        fn from(value: #simd<S>) -> Self {
+                            value.to_bitmask() as #arch
+                        }
+                    }
+                }
+            } else {
+                quote! {
                 impl<S: Simd> SimdFrom<#arch, S> for #simd<S> {
                     #[inline(always)]
                     fn simd_from(simd: S, arch: #arch) -> Self {
@@ -208,7 +252,9 @@ pub(crate) trait Level {
                         crate::transmute::checked_transmute_copy(&value.val)
                     }
                 }
-            });
+                }
+            };
+            result.push(type_impl);
         }
         quote! {
             #( #result )*
@@ -219,6 +265,7 @@ pub(crate) trait Level {
         let level_tok = self.token();
         let token_doc = self.token_doc();
         let imports = type_imports();
+        let module_attrs = self.make_module_attrs();
         let module_prelude = self.make_module_prelude();
         let impl_body = self.make_impl_body();
         let arch_types_impl = self.impl_arch_types();
@@ -227,6 +274,8 @@ pub(crate) trait Level {
         let footer = self.make_module_footer();
 
         quote! {
+            #module_attrs
+
             use crate::{prelude::*, seal::Seal, arch_types::ArchTypes, Level};
 
             #imports
diff --git a/fearless_simd_gen/src/main.rs b/fearless_simd_gen/src/main.rs
index 10efdfd99..57df1ba3a 100644
--- a/fearless_simd_gen/src/main.rs
+++ b/fearless_simd_gen/src/main.rs
@@ -36,6 +36,7 @@ enum Module {
     Fallback,
     Sse4_2,
     Avx2,
+    Avx512,
 }
 
 #[derive(Parser)]
@@ -66,6 +67,7 @@ impl Module {
             Self::Fallback => mk_fallback::Fallback.make_module(),
             Self::Sse4_2 => mk_x86::X86::Sse4_2.make_module(),
             Self::Avx2 => mk_x86::X86::Avx2.make_module(),
+            Self::Avx512 => mk_x86::X86::Avx512.make_module(),
         }
     }
 
@@ -105,6 +107,7 @@ impl Module {
             Self::Wasm => "wasm",
             Self::Sse4_2 => "sse4_2",
             Self::Avx2 => "avx2",
+            Self::Avx512 => "avx512",
         }
     }
 }
@@ -118,6 +121,7 @@ const MODULES: &[Module] = &[
     Module::Wasm,
     Module::Sse4_2,
     Module::Avx2,
+    Module::Avx512,
 ];
 
 const FILE_BASE: &str = "./fearless_simd/src/generated";
diff --git a/fearless_simd_gen/src/mk_fallback.rs b/fearless_simd_gen/src/mk_fallback.rs
index 70122a9e7..92099258a 100644
--- a/fearless_simd_gen/src/mk_fallback.rs
+++ b/fearless_simd_gen/src/mk_fallback.rs
@@ -3,8 +3,8 @@
 
 use crate::arch::fallback;
 use crate::generic::{
-    generic_from_bytes, generic_mask_from_bitmask, generic_mask_to_bitmask, generic_op_name,
-    generic_to_bytes, integer_lane_mask_splat_arg,
+    generic_from_bytes, generic_mask_from_bitmask, generic_mask_set, generic_mask_to_bitmask,
+    generic_op_name, generic_to_bytes, integer_lane_mask_splat_arg,
 };
 use crate::level::Level;
 use crate::ops::{Op, OpSig, RefKind, valid_reinterpret};
@@ -466,6 +466,7 @@ impl Level for Fallback {
             }
             OpSig::MaskFromBitmask => generic_mask_from_bitmask(method_sig, vec_ty),
             OpSig::MaskToBitmask => generic_mask_to_bitmask(method_sig, vec_ty),
+            OpSig::MaskSet => generic_mask_set(method_sig, vec_ty),
             OpSig::LoadInterleaved {
                 block_size,
                 block_count,
diff --git a/fearless_simd_gen/src/mk_neon.rs b/fearless_simd_gen/src/mk_neon.rs
index 129d4052f..73c17b2cb 100644
--- a/fearless_simd_gen/src/mk_neon.rs
+++ b/fearless_simd_gen/src/mk_neon.rs
@@ -5,8 +5,8 @@ use proc_macro2::{Ident, Literal, Span, TokenStream};
 use quote::{ToTokens as _, format_ident, quote};
 
 use crate::generic::{
-    generic_as_array, generic_from_array, generic_from_bytes, generic_op_name, generic_store_array,
-    generic_to_bytes, integer_lane_mask_splat_arg,
+    generic_as_array, generic_from_array, generic_from_bytes, generic_mask_set, generic_op_name,
+    generic_store_array, generic_to_bytes, integer_lane_mask_splat_arg,
 };
 use crate::level::Level;
 use crate::ops::{Op, SlideGranularity, valid_reinterpret};
@@ -532,6 +532,7 @@ impl Level for Neon {
             }
             OpSig::MaskFromBitmask => self.handle_mask_from_bitmask(method_sig, vec_ty),
             OpSig::MaskToBitmask => self.handle_mask_to_bitmask(method_sig, vec_ty),
+            OpSig::MaskSet => generic_mask_set(method_sig, vec_ty),
             OpSig::FromArray { kind } => generic_from_array(method_sig, vec_ty, kind),
             OpSig::AsArray { kind } => {
                 generic_as_array(method_sig, vec_ty, kind, self.max_block_size(), |vec_ty| {
diff --git a/fearless_simd_gen/src/mk_simd_trait.rs b/fearless_simd_gen/src/mk_simd_trait.rs
index 6b205022c..99fb91a76 100644
--- a/fearless_simd_gen/src/mk_simd_trait.rs
+++ b/fearless_simd_gen/src/mk_simd_trait.rs
@@ -43,8 +43,8 @@ pub(crate) fn mk_simd_trait() -> TokenStream {
         /// # Associated Types
         ///
         /// The trait defines associated types for the highest "native" vector width of each scalar type (e.g. `f32s`,
-        /// `u32s`). These are always at least 128 bits, but may be larger. Currently, they are 128 bits everywhere but
-        /// AVX2, where they are 256 bits.
+        /// `u32s`). These are always at least 128 bits, but may be larger. Currently, they are 128 bits on the
+        /// fallback, NEON, WASM, and SSE4.2 backends, 256 bits on AVX2, and 512 bits on AVX-512.
         ///
         /// # Example
         ///
diff --git a/fearless_simd_gen/src/mk_simd_types.rs b/fearless_simd_gen/src/mk_simd_types.rs
index 3960e3281..b6f2aafce 100644
--- a/fearless_simd_gen/src/mk_simd_types.rs
+++ b/fearless_simd_gen/src/mk_simd_types.rs
@@ -298,6 +298,7 @@ fn simd_mask_impl(ty: &VecType) -> TokenStream {
     let splat = generic_op_name("splat", ty);
     let from_bitmask_op = generic_op_name("from_bitmask", ty);
     let to_bitmask_op = generic_op_name("to_bitmask", ty);
+    let set_op = generic_op_name("set", ty);
     let from_array_op = generic_op_name("load_array", ty);
     let as_array_op = generic_op_name("as_array", ty);
     let mut methods = vec![];
@@ -322,9 +323,6 @@ fn simd_mask_impl(ty: &VecType) -> TokenStream {
         }
     }
 
-    // Current backends store masks as signed integer lanes, so `set` uses a generic
-    // spill/update/reload path. Future compact predicate backends such as AVX-512 can
-    // switch this implementation to `to_bitmask`/`from_bitmask`.
     quote! {
         impl<S: Simd> SimdMask<S> for #name<S> {
             type Element = #scalar;
@@ -352,14 +350,7 @@ fn simd_mask_impl(ty: &VecType) -> TokenStream {
 
             #[inline(always)]
             fn set(&mut self, index: usize, value: bool) {
-                assert!(
-                    index < #len,
-                    "mask lane index {index} is out of bounds for {} lanes",
-                    #len
-                );
-                let mut lanes = self.simd.#as_array_op(*self);
-                lanes[index] = if value { !0 } else { 0 };
-                *self = self.simd.#from_array_op(lanes);
+                self.simd.#set_op(self, index, value);
             }
 
             #[inline(always)]
diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs
index 1a9d35bfd..a955af48a 100644
--- a/fearless_simd_gen/src/mk_wasm.rs
+++ b/fearless_simd_gen/src/mk_wasm.rs
@@ -7,7 +7,7 @@ use quote::{format_ident, quote};
 use crate::arch::wasm::{arch_prefix, v128_intrinsic};
 use crate::generic::{
     generic_as_array, generic_block_combine, generic_block_split, generic_from_array,
-    generic_from_bytes, generic_op_name, generic_store_array, generic_to_bytes,
+    generic_from_bytes, generic_mask_set, generic_op_name, generic_store_array, generic_to_bytes,
     integer_lane_mask_splat_arg, scalar_binary,
 };
 use crate::level::Level;
@@ -594,6 +594,7 @@ impl Level for WasmSimd128 {
             }
             OpSig::MaskFromBitmask => mask_from_bitmask(method_sig, vec_ty),
             OpSig::MaskToBitmask => mask_to_bitmask(method_sig, vec_ty),
+            OpSig::MaskSet => generic_mask_set(method_sig, vec_ty),
             OpSig::LoadInterleaved {
                 block_size,
                 block_count,
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index eea35754c..7482dd759 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -8,7 +8,7 @@ use crate::arch::x86::{
 };
 use crate::generic::{
     generic_as_array, generic_block_combine, generic_block_split, generic_from_array,
-    generic_from_bytes, generic_op_name, generic_store_array, generic_to_bytes,
+    generic_from_bytes, generic_mask_set, generic_op_name, generic_store_array, generic_to_bytes,
     integer_lane_mask_splat_arg, scalar_binary,
 };
 use crate::level::Level;
@@ -21,13 +21,17 @@ use quote::{ToTokens as _, format_ident, quote};
 pub(crate) enum X86 {
     Sse4_2,
     Avx2,
+    Avx512,
 }
 
+pub(crate) const AVX512_FEATURES: &str = "adx,aes,avx512bitalg,avx512bw,avx512cd,avx512dq,avx512f,avx512ifma,avx512vbmi,avx512vbmi2,avx512vl,avx512vnni,avx512vpopcntdq,bmi1,bmi2,cmpxchg16b,fma,gfni,lzcnt,movbe,pclmulqdq,popcnt,rdrand,rdseed,sha,vaes,vpclmulqdq,xsave,xsavec,xsaveopt,xsaves";
+
 impl Level for X86 {
     fn name(&self) -> &'static str {
         match self {
             Self::Sse4_2 => "Sse4_2",
             Self::Avx2 => "Avx2",
+            Self::Avx512 => "Avx512",
         }
     }
 
@@ -35,6 +39,7 @@ impl Level for X86 {
         match self {
             Self::Sse4_2 => 128,
             Self::Avx2 => 256,
+            Self::Avx512 => 512,
         }
     }
 
@@ -46,16 +51,18 @@ impl Level for X86 {
         Some(match self {
             Self::Sse4_2 => "sse4.2,cmpxchg16b,popcnt",
             Self::Avx2 => "avx2,bmi1,bmi2,cmpxchg16b,f16c,fma,lzcnt,movbe,popcnt,xsave",
+            Self::Avx512 => AVX512_FEATURES,
         })
     }
 
     fn arch_ty(&self, vec_ty: &VecType) -> TokenStream {
-        // Future AVX-512 backends should be able to keep mask types opaque by storing them as
-        // `__mmask*` predicate registers instead of `__m*i` vectors: for example, `mask8x64`
-        // maps naturally to `__mmask64`, `mask16x32` to `__mmask32`, and `mask32x16`/`mask64x8`
-        // to `__mmask16`/`__mmask8`. Comparisons would return `_mm512_cmp*_mask`, selects would
-        // use `_mm512_mask_blend_*`, and legacy integer-lane interop could materialize vectors
-        // with `_mm512_movm_epi*` only at the API boundary.
+        // AVX-512 masks are compact predicate registers, not vector registers.
+        if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask {
+            let bits = avx512_mask_register_bits(vec_ty);
+            let name = format!("__mmask{bits}");
+            return Ident::new(&name, Span::call_site()).into_token_stream();
+        }
+
         let suffix = match (vec_ty.scalar, vec_ty.scalar_bits) {
             (ScalarType::Float, 32) => "",
             (ScalarType::Float, 64) => "d",
@@ -66,6 +73,14 @@ impl Level for X86 {
         Ident::new(&name, Span::call_site()).into_token_stream()
     }
 
+    fn arch_storage_ty(&self, vec_ty: &VecType) -> TokenStream {
+        if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask {
+            self.arch_ty(vec_ty)
+        } else {
+            vec_ty.aligned_wrapper_ty(|vec_ty| self.arch_ty(vec_ty), self.max_block_size())
+        }
+    }
+
     fn token_doc(&self) -> &'static str {
         match self {
             Self::Sse4_2 => {
@@ -74,6 +89,9 @@ impl Level for X86 {
             Self::Avx2 => {
                 "A token for AVX2 intrinsics on `x86` and `x86_64`, representing the x86-64-v3 level."
             }
+            Self::Avx512 => {
+                "A token for AVX-512 intrinsics on `x86` and `x86_64`, representing an Ice Lake feature level."
+            }
         }
     }
 
@@ -86,11 +104,29 @@ impl Level for X86 {
         }
     }
 
+    fn make_module_attrs(&self) -> TokenStream {
+        if *self != Self::Avx512 {
+            return TokenStream::new();
+        }
+
+        quote! {
+            #![allow(
+                clippy::identity_op,
+                reason = "AVX-512 mask code is generated uniformly for all __mmask widths"
+            )]
+            #![allow(
+                clippy::useless_conversion,
+                reason = "AVX-512 mask code is generated uniformly for all __mmask widths"
+            )]
+        }
+    }
+
     fn make_module_footer(&self) -> TokenStream {
         let alignr_helpers = self.dyn_alignr_helpers();
         let slide_helpers = match self {
             Self::Sse4_2 => Self::sse42_slide_helpers(),
             Self::Avx2 => Self::avx2_slide_helpers(),
+            Self::Avx512 => TokenStream::new(),
         };
 
         quote! {
@@ -135,7 +171,51 @@ impl Level for X86 {
             Self::Avx2 => quote! {
                 Level::#level_tok(self)
             },
+            Self::Avx512 => quote! {
+                Level::#level_tok(self)
+            },
+        }
+    }
+
+    fn should_impl_arch_type_conversion(&self, ty: &VecType) -> bool {
+        let n_bits = ty.n_bits();
+        if *self == Self::Avx512 && ty.scalar == ScalarType::Mask {
+            return n_bits <= self.max_block_size();
         }
+        n_bits <= self.max_block_size() && n_bits >= self.native_width()
+    }
+
+    fn should_use_bitmask_arch_type_conversion(&self, ty: &VecType) -> bool {
+        *self == Self::Avx512 && ty.scalar == ScalarType::Mask
+    }
+
+    fn custom_arch_type_conversion(&self, ty: &VecType) -> Option<TokenStream> {
+        if *self == Self::Avx512 || ty.scalar != ScalarType::Mask {
+            return None;
+        }
+
+        let simd = ty.rust();
+        let arch = self.arch_ty(ty);
+        let lane_ty = ScalarType::Int.rust(ty.scalar_bits);
+        let len = ty.len;
+
+        Some(quote! {
+            impl<S: Simd> SimdFrom<#arch, S> for #simd<S> {
+                #[inline(always)]
+                fn simd_from(simd: S, arch: #arch) -> Self {
+                    let lanes: [#lane_ty; #len] =
+                        crate::transmute::checked_transmute_copy(&arch);
+                    lanes.simd_into(simd)
+                }
+            }
+            impl<S: Simd> From<#simd<S>> for #arch {
+                #[inline(always)]
+                fn from(value: #simd<S>) -> Self {
+                    let lanes: [#lane_ty; #len] = value.into();
+                    crate::transmute::checked_transmute_copy(&lanes)
+                }
+            }
+        })
     }
 
     fn make_impl_body(&self) -> TokenStream {
@@ -165,10 +245,35 @@ impl Level for X86 {
                     Self { _private: () }
                 }
             },
+            Self::Avx512 => quote! {
+                /// Create a SIMD token.
+                ///
+                /// # Safety
+                ///
+                /// The Ice Lake AVX-512 CPU feature set must be available.
+                #[inline]
+                pub const unsafe fn new_unchecked() -> Self {
+                    Self { _private: () }
+                }
+            },
         }
     }
 
     fn should_use_generic_op(&self, op: &Op, vec_ty: &VecType) -> bool {
+        if *self == Self::Avx512
+            && matches!(
+                op.sig,
+                OpSig::Slide {
+                    granularity: SlideGranularity::WithinBlocks,
+                    ..
+                }
+            )
+            && vec_ty.scalar == ScalarType::Mask
+            && vec_ty.n_bits() > 128
+        {
+            return true;
+        }
+
         let should_use_generic = op.sig.should_use_generic_op(vec_ty, self.native_width());
         if !should_use_generic {
             return false;
@@ -216,6 +321,10 @@ impl Level for X86 {
             } => self.handle_mask_reduce(method_sig, vec_ty, quantifier, condition),
             OpSig::MaskFromBitmask => self.handle_mask_from_bitmask(method_sig, vec_ty),
             OpSig::MaskToBitmask => self.handle_mask_to_bitmask(method_sig, vec_ty),
+            OpSig::MaskSet if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask => {
+                self.handle_avx512_mask_set(method_sig, vec_ty)
+            }
+            OpSig::MaskSet => generic_mask_set(method_sig, vec_ty),
             OpSig::LoadInterleaved {
                 block_size,
                 block_count,
@@ -224,7 +333,17 @@ impl Level for X86 {
                 block_size,
                 block_count,
             } => self.handle_store_interleaved(method_sig, vec_ty, block_size, block_count),
+            OpSig::FromArray { kind }
+                if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask =>
+            {
+                self.handle_avx512_mask_from_array(method_sig, vec_ty, kind)
+            }
             OpSig::FromArray { kind } => generic_from_array(method_sig, vec_ty, kind),
+            OpSig::AsArray { kind }
+                if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask =>
+            {
+                self.handle_avx512_mask_as_array(method_sig, vec_ty, kind)
+            }
             OpSig::AsArray { kind } => {
                 generic_as_array(method_sig, vec_ty, kind, self.max_block_size(), |vec_ty| {
                     self.arch_ty(vec_ty)
@@ -593,8 +712,199 @@ fn signed_literal(value: u64, bits: u32) -> TokenStream {
     }
 }
 
+fn avx512_mask_register_bits(vec_ty: &VecType) -> usize {
+    match vec_ty.len {
+        0..=8 => 8,
+        9..=16 => 16,
+        17..=32 => 32,
+        33..=64 => 64,
+        _ => unreachable!("SIMD masks never have more than 64 lanes"),
+    }
+}
+
+fn avx512_mask_lane_bits(vec_ty: &VecType) -> TokenStream {
+    if vec_ty.len == 64 {
+        quote! { u64::MAX }
+    } else {
+        let bits = (1_u64 << vec_ty.len) - 1;
+        quote! { #bits }
+    }
+}
+
+fn avx512_mask_value(vec_ty: &VecType, bits: TokenStream) -> TokenStream {
+    let ty = vec_ty.rust();
+    let bits = if avx512_mask_register_bits(vec_ty) == 64 {
+        bits
+    } else {
+        quote! { (#bits) as _ }
+    };
+    quote! {
+        #ty {
+            val: #bits,
+            simd: self,
+        }
+    }
+}
+
+fn avx512_mask_register_value(vec_ty: &VecType, bits: TokenStream) -> TokenStream {
+    let ty = vec_ty.rust();
+    quote! {
+        #ty {
+            val: #bits,
+            simd: self,
+        }
+    }
+}
+
+fn avx512_mask_bits_expr(expr: TokenStream) -> TokenStream {
+    quote! { u64::from((#expr).val) }
+}
+
+fn avx512_compare_op(method: &str) -> &'static str {
+    match method {
+        "simd_eq" => "cmpeq",
+        "simd_lt" => "cmplt",
+        "simd_le" => "cmple",
+        "simd_ge" => "cmpge",
+        "simd_gt" => "cmpgt",
+        _ => unreachable!(),
+    }
+}
+
+fn avx512_float_compare_predicate(method: &str) -> i32 {
+    match method {
+        "simd_eq" => 0x00,
+        "simd_lt" => 0x11,
+        "simd_le" => 0x12,
+        "simd_ge" => 0x1D,
+        "simd_gt" => 0x1E,
+        "ord" => 0x07,
+        "unord" => 0x03,
+        _ => unreachable!(),
+    }
+}
+
+fn avx512_mask_compare_expr(method: &str, vec_ty: &VecType) -> TokenStream {
+    let lane_mask = avx512_mask_lane_bits(vec_ty);
+    match method {
+        "simd_eq" => quote! { !u64::from(a.val ^ b.val) & #lane_mask },
+        _ => unreachable!("masks only support equality comparison"),
+    }
+}
+
+fn avx512_permutex2var_intrinsic(vec_ty: &VecType) -> Ident {
+    let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false);
+    intrinsic_ident("permutex2var", suffix, vec_ty.n_bits())
+}
+
+fn avx512_permutexvar_intrinsic(vec_ty: &VecType) -> Ident {
+    let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false);
+    intrinsic_ident("permutexvar", suffix, vec_ty.n_bits())
+}
+
+fn avx512_mask_blend_intrinsic(vec_ty: &VecType) -> Ident {
+    let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false);
+    intrinsic_ident("mask_blend", suffix, vec_ty.n_bits())
+}
+
+fn avx512_index_vector(vec_ty: &VecType, indices: impl IntoIterator<Item = usize>) -> TokenStream {
+    let indices: Vec<usize> = indices.into_iter().collect();
+    let n_bits = vec_ty.n_bits();
+    let scalar_bits = vec_ty.scalar_bits;
+    match (n_bits, scalar_bits) {
+        (128, 8) => {
+            let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 8));
+            quote! { _mm_setr_epi8(#(#lanes),*) }
+        }
+        (256, 8) => {
+            let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 8));
+            quote! { _mm256_setr_epi8(#(#lanes),*) }
+        }
+        (512, 8) => {
+            let lanes = indices
+                .into_iter()
+                .rev()
+                .map(|i| signed_literal(i as u64, 8));
+            quote! { _mm512_set_epi8(#(#lanes),*) }
+        }
+        (128, 16) => {
+            let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 16));
+            quote! { _mm_setr_epi16(#(#lanes),*) }
+        }
+        (256, 16) => {
+            let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 16));
+            quote! { _mm256_setr_epi16(#(#lanes),*) }
+        }
+        (512, 16) => {
+            let lanes = indices
+                .into_iter()
+                .rev()
+                .map(|i| signed_literal(i as u64, 16));
+            quote! { _mm512_set_epi16(#(#lanes),*) }
+        }
+        (128, 32) => {
+            let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 32));
+            quote! { _mm_setr_epi32(#(#lanes),*) }
+        }
+        (256, 32) => {
+            let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 32));
+            quote! { _mm256_setr_epi32(#(#lanes),*) }
+        }
+        (512, 32) => {
+            let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 32));
+            quote! { _mm512_setr_epi32(#(#lanes),*) }
+        }
+        (128, 64) => {
+            let mut lanes = indices
+                .into_iter()
+                .map(|i| signed_literal(i as u64, 64))
+                .collect::<Vec<_>>();
+            lanes.reverse();
+            quote! { _mm_set_epi64x(#(#lanes),*) }
+        }
+        (256, 64) => {
+            let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 64));
+            quote! { _mm256_setr_epi64x(#(#lanes),*) }
+        }
+        (512, 64) => {
+            let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 64));
+            quote! { _mm512_setr_epi64(#(#lanes),*) }
+        }
+        _ => unreachable!(),
+    }
+}
+
+fn interleaved_load_indices(len: usize, block_count: usize) -> Vec<usize> {
+    let stream_len = len / block_count;
+    (0..block_count)
+        .flat_map(|stream| (0..stream_len).map(move |i| i * block_count + stream))
+        .collect()
+}
+
+fn interleaved_store_indices(len: usize, block_count: usize) -> Vec<usize> {
+    let stream_len = len / block_count;
+    (0..stream_len)
+        .flat_map(|i| (0..block_count).map(move |stream| stream * stream_len + i))
+        .collect()
+}
+
 impl X86 {
     pub(crate) fn handle_splat(&self, method_sig: TokenStream, vec_ty: &VecType) -> TokenStream {
+        if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask {
+            let lane_mask = avx512_mask_lane_bits(vec_ty);
+            let result = avx512_mask_value(
+                vec_ty,
+                quote! {
+                    if val { #lane_mask } else { 0 }
+                },
+            );
+            return quote! {
+                #method_sig {
+                    #result
+                }
+            };
+        }
+
         let intrinsic = set1_intrinsic(vec_ty);
         let cast = match vec_ty.scalar {
             ScalarType::Unsigned => quote!(.cast_signed()),
@@ -612,6 +922,9 @@ impl X86 {
     }
 
     fn has_specialized_mask_from_bitmask(&self, vec_ty: &VecType) -> bool {
+        if *self == Self::Avx512 {
+            return true;
+        }
         self.has_wide_byte_mask_from_bitmask(vec_ty) || self.has_wide_avx2_mask_from_bitmask(vec_ty)
     }
 
@@ -631,9 +944,105 @@ impl X86 {
     }
 
     fn has_specialized_mask_to_bitmask(&self, vec_ty: &VecType) -> bool {
+        if *self == Self::Avx512 {
+            return true;
+        }
         vec_ty.scalar == ScalarType::Mask && vec_ty.scalar_bits == 16
     }
 
+    pub(crate) fn handle_avx512_mask_from_array(
+        &self,
+        method_sig: TokenStream,
+        vec_ty: &VecType,
+        kind: crate::ops::RefKind,
+    ) -> TokenStream {
+        assert_eq!(
+            vec_ty.scalar,
+            ScalarType::Mask,
+            "AVX-512 mask array loads only operate on mask types"
+        );
+        let movepi_mask = intrinsic_ident(
+            &format!("movepi{}", vec_ty.scalar_bits),
+            "mask",
+            vec_ty.n_bits(),
+        );
+        let transmute_src = if kind == crate::ops::RefKind::Value {
+            quote! { &val }
+        } else {
+            quote! { val }
+        };
+        // `movepi*_mask` copies each lane's sign bit into the predicate. Mask arrays hold 0
+        // or -1 per lane, so that bit is the truth value; other values give unspecified results.
+        let result = avx512_mask_register_value(vec_ty, quote! { #movepi_mask(lanes) });
+        quote! {
+            #method_sig {
+                unsafe {
+                    let lanes = crate::transmute::checked_transmute_copy(#transmute_src);
+                    #result
+                }
+            }
+        }
+    }
+
+    pub(crate) fn handle_avx512_mask_as_array(
+        &self,
+        method_sig: TokenStream,
+        vec_ty: &VecType,
+        kind: crate::ops::RefKind,
+    ) -> TokenStream {
+        assert_eq!(
+            vec_ty.scalar,
+            ScalarType::Mask,
+            "AVX-512 mask array stores only operate on mask types"
+        );
+        assert!(
+            kind == crate::ops::RefKind::Value,
+            "mask array references are not exposed"
+        );
+        let movm = intrinsic_ident(
+            "movm",
+            op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true),
+            vec_ty.n_bits(),
+        );
+        quote! {
+            #method_sig {
+                unsafe {
+                    let lanes = #movm(a.val);
+                    crate::transmute::checked_transmute_copy(&lanes)
+                }
+            }
+        }
+    }
+
+    pub(crate) fn handle_avx512_mask_set(
+        &self,
+        method_sig: TokenStream,
+        vec_ty: &VecType,
+    ) -> TokenStream {
+        assert_eq!(
+            vec_ty.scalar,
+            ScalarType::Mask,
+            "AVX-512 mask set only operates on mask types"
+        );
+        let len = vec_ty.len;
+        let bits = avx512_mask_bits_expr(quote! { a });
+        let result = avx512_mask_value(vec_ty, quote! { bits });
+
+        quote! {
+            #method_sig {
+                assert!(
+                    index < #len,
+                    "mask lane index {index} is out of bounds for {} lanes",
+                    #len
+                );
+                let bit = 1u64 << index;
+                let bits = #bits;
+                let bits = if value { bits | bit } else { bits & !bit };
+                *a = #result;
+            }
+        }
+    }
+
     pub(crate) fn handle_mask_from_bitmask(
         &self,
         method_sig: TokenStream,
@@ -645,6 +1054,16 @@ impl X86 {
             "mask bitmask conversion only operates on masks"
         );
 
+        if *self == Self::Avx512 {
+            let lane_mask = avx512_mask_lane_bits(vec_ty);
+            let result = avx512_mask_value(vec_ty, quote! { bits & #lane_mask });
+            return quote! {
+                #method_sig {
+                    #result
+                }
+            };
+        }
+
         if self.has_wide_byte_mask_from_bitmask(vec_ty) {
             let expr = mask_from_bitmask_wide_bytes(self.native_width(), vec_ty);
             return quote! {
@@ -703,6 +1122,16 @@ impl X86 {
             "mask bitmask conversion only operates on masks"
         );
 
+        if *self == Self::Avx512 {
+            let lane_mask = avx512_mask_lane_bits(vec_ty);
+            let bits = avx512_mask_bits_expr(quote! { a });
+            return quote! {
+                #method_sig {
+                    #bits & #lane_mask
+                }
+            };
+        }
+
         match vec_ty.scalar_bits {
             8 => {
                 let bits_ty = vec_ty.reinterpret(ScalarType::Int, 8);
@@ -749,6 +1178,39 @@ impl X86 {
         method: &str,
         vec_ty: &VecType,
     ) -> TokenStream {
+        if *self == Self::Avx512 {
+            if vec_ty.scalar == ScalarType::Mask {
+                let expr = avx512_mask_compare_expr(method, vec_ty);
+                let result = avx512_mask_value(vec_ty, expr);
+                return quote! {
+                    #method_sig {
+                        #result
+                    }
+                };
+            }
+
+            let mask_ty = vec_ty.mask_ty();
+            let result = if vec_ty.scalar == ScalarType::Float {
+                let predicate = avx512_float_compare_predicate(method);
+                let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false);
+                let intrinsic = intrinsic_ident("cmp", &format!("{suffix}_mask"), vec_ty.n_bits());
+                avx512_mask_register_value(
+                    &mask_ty,
+                    quote! { #intrinsic::<#predicate>(a.into(), b.into()) },
+                )
+            } else {
+                let cmp = avx512_compare_op(method);
+                let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true);
+                let intrinsic = intrinsic_ident(cmp, &format!("{suffix}_mask"), vec_ty.n_bits());
+                avx512_mask_register_value(&mask_ty, quote! { #intrinsic(a.into(), b.into()) })
+            };
+            return quote! {
+                #method_sig {
+                    unsafe { #result }
+                }
+            };
+        }
+
         let args = [quote! { a.into() }, quote! { b.into() }];
 
         let expr = if vec_ty.scalar != ScalarType::Float {
@@ -830,6 +1292,68 @@ impl X86 {
         method: &str,
         vec_ty: &VecType,
     ) -> TokenStream {
+        if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask {
+            let body = match method {
+                "not" => {
+                    let lane_mask = avx512_mask_lane_bits(vec_ty);
+                    let bits = avx512_mask_bits_expr(quote! { a });
+                    let result = avx512_mask_value(vec_ty, quote! { (!#bits) & #lane_mask });
+                    quote! { #result }
+                }
+                _ => unreachable!(),
+            };
+            return quote! {
+                #method_sig {
+                    #body
+                }
+            };
+        }
+
+        if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Float {
+            let body = match method {
+                "floor" | "ceil" | "round_ties_even" | "trunc" if vec_ty.n_bits() == 512 => {
+                    let intrinsic = intrinsic_ident(
+                        "roundscale",
+                        op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true),
+                        vec_ty.n_bits(),
+                    );
+                    let rounding_mode = match method {
+                        "floor" => quote! { _MM_FROUND_TO_NEG_INF },
+                        "ceil" => quote! { _MM_FROUND_TO_POS_INF },
+                        "round_ties_even" => quote! { _MM_FROUND_TO_NEAREST_INT },
+                        "trunc" => quote! { _MM_FROUND_TO_ZERO },
+                        _ => unreachable!(),
+                    };
+                    quote! {
+                        unsafe {
+                            #intrinsic::<{ #rounding_mode | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
+                        }
+                    }
+                }
+                "approximate_recip" => {
+                    let intrinsic = intrinsic_ident(
+                        "rcp14",
+                        op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true),
+                        vec_ty.n_bits(),
+                    );
+                    quote! {
+                        unsafe {
+                            #intrinsic(a.into()).simd_into(self)
+                        }
+                    }
+                }
+                _ => TokenStream::new(),
+            };
+
+            if !body.is_empty() {
+                return quote! {
+                    #method_sig {
+                        #body
+                    }
+                };
+            }
+        }
+
         match method {
             "fract" => {
                 let trunc_op = generic_op_name("trunc", vec_ty);
@@ -885,7 +1409,20 @@ impl X86 {
         let expr = match method {
             "widen" => {
                 match (self, dst_width, vec_ty.n_bits()) {
-                    (Self::Avx2, 256, 128) => {
+                    (Self::Avx2 | Self::Avx512, 256, 128) => {
+                        let extend = extend_intrinsic(
+                            vec_ty.scalar,
+                            vec_ty.scalar_bits,
+                            target_ty.scalar_bits,
+                            dst_width,
+                        );
+                        quote! {
+                            unsafe {
+                                #extend(a.into()).simd_into(self)
+                            }
+                        }
+                    }
+                    (Self::Avx512, 512, 256) => {
                         let extend = extend_intrinsic(
                             vec_ty.scalar,
                             vec_ty.scalar_bits,
@@ -946,6 +1483,14 @@ impl X86 {
             }
             "narrow" => {
                 match (self, dst_width, vec_ty.n_bits()) {
+                    (Self::Avx512, 128, 256) | (Self::Avx512, 256, 512) => {
+                        let narrow = intrinsic_ident("cvtepi16", "epi8", vec_ty.n_bits());
+                        quote! {
+                            unsafe {
+                                #narrow(a.into()).simd_into(self)
+                            }
+                        }
+                    }
                     (Self::Avx2, 128, 256) => {
                         let mask = match target_ty.scalar_bits {
                             8 => {
@@ -1034,6 +1579,44 @@ impl X86 {
         method: &str,
         vec_ty: &VecType,
     ) -> TokenStream {
+        if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask {
+            let lane_mask = avx512_mask_lane_bits(vec_ty);
+            let a_bits = avx512_mask_bits_expr(quote! { a });
+            let b_bits = avx512_mask_bits_expr(quote! { b });
+            let expr = match method {
+                "and" => quote! { (#a_bits & #b_bits) & #lane_mask },
+                "or" => quote! { (#a_bits | #b_bits) & #lane_mask },
+                "xor" => quote! { (#a_bits ^ #b_bits) & #lane_mask },
+                _ => unreachable!(),
+            };
+            let result = avx512_mask_value(vec_ty, expr);
+            return quote! {
+                #method_sig {
+                    #result
+                }
+            };
+        }
+
+        if *self == Self::Avx512
+            && vec_ty.scalar == ScalarType::Float
+            && matches!(method, "min_precise" | "max_precise")
+        {
+            let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true);
+            let range = intrinsic_ident("range", suffix, vec_ty.n_bits());
+            let imm = if method == "max_precise" {
+                0b0101
+            } else {
+                0b0100
+            };
+            return quote! {
+                #method_sig {
+                    unsafe {
+                        #range::<#imm>(a.into(), b.into()).simd_into(self)
+                    }
+                }
+            };
+        }
+
         let body = match method {
             "mul" if vec_ty.scalar_bits == 8 => {
                 // https://stackoverflow.com/questions/8193601/sse-multiplication-16-x-uint8-t
@@ -1052,7 +1635,16 @@ impl X86 {
                     }
                 }
             }
-            "shlv" | "shrv" if *self == Self::Avx2 && vec_ty.scalar_bits >= 32 => {
+            "shlv" | "shrv"
+                if *self == Self::Avx512
+                    && matches!(vec_ty.scalar, ScalarType::Int | ScalarType::Unsigned)
+                    && matches!(vec_ty.scalar_bits, 8 | 16) =>
+            {
+                self.handle_avx512_narrow_variable_shift(method, vec_ty)
+            }
+            "shlv" | "shrv"
+                if matches!(self, Self::Avx2 | Self::Avx512) && vec_ty.scalar_bits >= 32 =>
+            {
                 let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false);
                 let name = match (method, vec_ty.scalar) {
                     ("shrv", ScalarType::Int) => "srav",
@@ -1084,6 +1676,66 @@ impl X86 {
         }
     }
 
+    fn handle_avx512_narrow_variable_shift(&self, method: &str, vec_ty: &VecType) -> TokenStream { // Builds the AVX-512 `shlv`/`shrv` body for 8/16-bit lanes.
+        assert!( // Generator-time guard: only the AVX-512 level may use this helper.
+            *self == Self::Avx512,
+            "narrow variable shifts are specialized for AVX-512"
+        );
+        assert!( // Generator-time guard: only 8-bit and 16-bit lanes are accepted.
+            matches!(vec_ty.scalar_bits, 8 | 16),
+            "narrow variable shifts only handle 8-bit and 16-bit lanes"
+        );
+        let name = match (method, vec_ty.scalar) { // Base name of the `epi16` shift intrinsic.
+            ("shrv", ScalarType::Int) => "srav", // Right shift of signed lanes.
+            ("shrv", _) => "srlv", // Right shift of every other scalar kind.
+            ("shlv", _) => "sllv", // Left shift, independent of signedness.
+            _ => unreachable!(), // Any other method name panics during generation.
+        };
+        let shift_intrinsic = intrinsic_ident(name, "epi16", vec_ty.n_bits());
+        // 16-bit lanes: pass both operands straight to the `epi16` shift intrinsic.
+        if vec_ty.scalar_bits == 16 {
+            return quote! {
+                unsafe { #shift_intrinsic(a.into(), b.into()).simd_into(self) }
+            };
+        }
+        // 8-bit lanes: unpack values and counts, shift as `epi16`, mask with 0x00ff, then pack.
+        let ty_bits = vec_ty.n_bits();
+        let unpack_hi = unpack_intrinsic(ScalarType::Int, 8, false, ty_bits);
+        let unpack_lo = unpack_intrinsic(ScalarType::Int, 8, true, ty_bits);
+        let set0 = intrinsic_ident("setzero", coarse_type(vec_ty), ty_bits);
+        let and = intrinsic_ident("and", coarse_type(vec_ty), ty_bits);
+        let set1_epi16 = intrinsic_ident("set1", "epi16", ty_bits);
+        let pack = pack_intrinsic(16, false, ty_bits); // NOTE(review): presumably the unsigned-saturating pack — confirm.
+        let value_extend = match (method, vec_ty.scalar) { // Second unpack operand for the values.
+            ("shlv", _) | (_, ScalarType::Unsigned) => quote! { zero }, // Pair values with zero.
+            ("shrv", ScalarType::Int) if ty_bits == 512 => { // 512-bit compare returns a mask register,
+                quote! { _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(zero, val)) } // so expand it back to bytes.
+            }
+            ("shrv", ScalarType::Int) => { // Other widths use a compare that returns a vector.
+                let cmpgt = intrinsic_ident("cmpgt", "epi8", ty_bits);
+                quote! { #cmpgt(zero, val) } // Compares `zero > val` per byte lane.
+            }
+            _ => unreachable!(), // All (method, scalar) pairs allowed above are already covered.
+        };
+        // Emitted body for the 8-bit path; `a` holds the values and `b` the per-lane counts.
+        quote! {
+            unsafe {
+                let val = a.into();
+                let counts = b.into();
+                let zero = #set0();
+                let value_extend = #value_extend;
+                let lo_values = #unpack_lo(val, value_extend);
+                let hi_values = #unpack_hi(val, value_extend);
+                let lo_counts = #unpack_lo(counts, zero); // Counts are always paired with zero.
+                let hi_counts = #unpack_hi(counts, zero);
+                let byte_mask = #set1_epi16(0x00ff); // Keeps only the low byte of each 16-bit lane.
+                let lo_shifted = #and(#shift_intrinsic(lo_values, lo_counts), byte_mask);
+                let hi_shifted = #and(#shift_intrinsic(hi_values, hi_counts), byte_mask);
+                #pack(lo_shifted, hi_shifted).simd_into(self)
+            }
+        }
+    }
+
     pub(crate) fn handle_shift(
         &self,
         method_sig: TokenStream,
@@ -1112,9 +1764,16 @@ impl X86 {
                     #expr(val, #set0())
                 },
                 ScalarType::Int => {
-                    let cmp_intrinsic = intrinsic_ident("cmpgt", "epi8", ty_bits);
+                    let sign_bits = if *self == Self::Avx512 && ty_bits == 512 {
+                        quote! {
+                            _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(#set0(), val))
+                        }
+                    } else {
+                        let cmp_intrinsic = intrinsic_ident("cmpgt", "epi8", ty_bits);
+                        quote! { #cmp_intrinsic(#set0(), val) }
+                    };
                     quote! {
-                        #expr(val, #cmp_intrinsic(#set0(), val))
+                        #expr(val, #sign_bits)
                     }
                 }
                 _ => unimplemented!(),
@@ -1156,7 +1815,7 @@ impl X86 {
         vec_ty: &VecType,
     ) -> TokenStream {
         match method {
-            "mul_add" if *self == Self::Avx2 => {
+            "mul_add" if matches!(self, Self::Avx2 | Self::Avx512) => {
                 let intrinsic = simple_intrinsic("fmadd", vec_ty);
                 quote! {
                     #method_sig {
@@ -1164,7 +1823,7 @@ impl X86 {
                     }
                 }
             }
-            "mul_sub" if *self == Self::Avx2 => {
+            "mul_sub" if matches!(self, Self::Avx2 | Self::Avx512) => {
                 let intrinsic = simple_intrinsic("fmsub", vec_ty);
                 quote! {
                     #method_sig {
@@ -1204,6 +1863,33 @@ impl X86 {
     }
 
     pub(crate) fn handle_select(&self, method_sig: TokenStream, vec_ty: &VecType) -> TokenStream {
+        if *self == Self::Avx512 {
+            if vec_ty.scalar == ScalarType::Mask {
+                let lane_mask = avx512_mask_lane_bits(vec_ty);
+                let a_bits = avx512_mask_bits_expr(quote! { a });
+                let b_bits = avx512_mask_bits_expr(quote! { b });
+                let c_bits = avx512_mask_bits_expr(quote! { c });
+                let result = avx512_mask_value(
+                    vec_ty,
+                    quote! { ((#a_bits & #b_bits) | ((!#a_bits) & #c_bits)) & #lane_mask },
+                );
+                return quote! {
+                    #method_sig {
+                        #result
+                    }
+                };
+            }
+
+            let blend = avx512_mask_blend_intrinsic(vec_ty);
+            return quote! {
+                #method_sig {
+                    unsafe {
+                        #blend(a.val, c.into(), b.into()).simd_into(self)
+                    }
+                }
+            };
+        }
+
         // Our select ops' argument order is mask, a, b; Intel's intrinsics are b, a, mask
         let args = [
             quote! { c.into() },
@@ -1237,7 +1923,49 @@ impl X86 {
         vec_ty: &VecType,
         half_ty: &VecType,
     ) -> TokenStream {
-        if *self == Self::Avx2 && half_ty.n_bits() == 128 {
+        if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask {
+            let half_rust = half_ty.rust();
+            let half_len = half_ty.len;
+            let half_mask = avx512_mask_lane_bits(half_ty);
+            return quote! {
+                #method_sig {
+                    let bits = u64::from(a.val);
+                    (
+                        #half_rust { val: (bits & #half_mask) as _, simd: self },
+                        #half_rust { val: ((bits >> #half_len) & #half_mask) as _, simd: self },
+                    )
+                }
+            };
+        }
+
+        if *self == Self::Avx512 && half_ty.n_bits() == 256 {
+            let (lo, hi) = match vec_ty.scalar {
+                ScalarType::Float if vec_ty.scalar_bits == 32 => (
+                    quote! { _mm512_castps512_ps256(a.into()) },
+                    quote! { _mm512_extractf32x8_ps::<1>(a.into()) },
+                ),
+                ScalarType::Float if vec_ty.scalar_bits == 64 => (
+                    quote! { _mm512_castpd512_pd256(a.into()) },
+                    quote! { _mm512_extractf64x4_pd::<1>(a.into()) },
+                ),
+                _ => (
+                    quote! { _mm512_castsi512_si256(a.into()) },
+                    quote! { _mm512_extracti64x4_epi64::<1>(a.into()) },
+                ),
+            };
+            return quote! {
+                #method_sig {
+                    unsafe {
+                        (
+                            #lo.simd_into(self),
+                            #hi.simd_into(self),
+                        )
+                    }
+                }
+            };
+        }
+
+        if matches!(self, Self::Avx2 | Self::Avx512) && half_ty.n_bits() == 128 {
             let extract_op = match vec_ty.scalar {
                 ScalarType::Float => "extractf128",
                 _ => "extracti128",
@@ -1264,7 +1992,45 @@ impl X86 {
         vec_ty: &VecType,
         combined_ty: &VecType,
     ) -> TokenStream {
-        if *self == Self::Avx2 && combined_ty.n_bits() == 256 {
+        if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask {
+            let combined_rust = combined_ty.rust();
+            let shift = vec_ty.len;
+            let lane_mask = avx512_mask_lane_bits(combined_ty);
+            let bits = if avx512_mask_register_bits(combined_ty) == 64 {
+                quote! { bits }
+            } else {
+                quote! { bits as _ }
+            };
+            return quote! {
+                #method_sig {
+                    let bits = (u64::from(a.val) | (u64::from(b.val) << #shift)) & #lane_mask;
+                    #combined_rust { val: #bits, simd: self }
+                }
+            };
+        }
+
+        if *self == Self::Avx512 && combined_ty.n_bits() == 512 {
+            let expr = match vec_ty.scalar {
+                ScalarType::Float if vec_ty.scalar_bits == 32 => quote! {
+                    _mm512_insertf32x8::<1>(_mm512_castps256_ps512(a.into()), b.into())
+                },
+                ScalarType::Float if vec_ty.scalar_bits == 64 => quote! {
+                    _mm512_insertf64x4::<1>(_mm512_castpd256_pd512(a.into()), b.into())
+                },
+                _ => quote! {
+                    _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into())
+                },
+            };
+            return quote! {
+                #method_sig {
+                    unsafe {
+                        #expr.simd_into(self)
+                    }
+                }
+            };
+        }
+
+        if matches!(self, Self::Avx2 | Self::Avx512) && combined_ty.n_bits() == 256 {
             let suffix = match (vec_ty.scalar, vec_ty.scalar_bits) {
                 (ScalarType::Float, 32) => "m128",
                 (ScalarType::Float, 64) => "m128d",
@@ -1289,6 +2055,27 @@ impl X86 {
         vec_ty: &VecType,
         select_low: bool,
     ) -> TokenStream {
+        if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 {
+            let offset = if select_low { 0 } else { vec_ty.len / 2 };
+            let indices = (0..vec_ty.len).map(|i| {
+                let source_lane = offset + (i / 2);
+                if i % 2 == 0 {
+                    source_lane
+                } else {
+                    vec_ty.len + source_lane
+                }
+            });
+            let idx = avx512_index_vector(vec_ty, indices);
+            let permute = avx512_permutex2var_intrinsic(vec_ty);
+            return quote! {
+                #method_sig {
+                    unsafe {
+                        #permute(a.into(), #idx, b.into()).simd_into(self)
+                    }
+                }
+            };
+        }
+
         let expr = match vec_ty.n_bits() {
             128 => {
                 let op = if select_low { "unpacklo" } else { "unpackhi" };
@@ -1342,6 +2129,40 @@ impl X86 {
         method_sig: TokenStream,
         vec_ty: &VecType,
     ) -> TokenStream {
+        if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 {
+            let lo_indices = (0..vec_ty.len).map(|i| {
+                let source_lane = i / 2;
+                if i % 2 == 0 {
+                    source_lane
+                } else {
+                    vec_ty.len + source_lane
+                }
+            });
+            let hi_indices = (0..vec_ty.len).map(|i| {
+                let source_lane = (vec_ty.len / 2) + (i / 2);
+                if i % 2 == 0 {
+                    source_lane
+                } else {
+                    vec_ty.len + source_lane
+                }
+            });
+            let lo_idx = avx512_index_vector(vec_ty, lo_indices);
+            let hi_idx = avx512_index_vector(vec_ty, hi_indices);
+            let permute = avx512_permutex2var_intrinsic(vec_ty);
+            return quote! {
+                #method_sig {
+                    unsafe {
+                        let a = a.into();
+                        let b = b.into();
+                        (
+                            #permute(a, #lo_idx, b).simd_into(self),
+                            #permute(a, #hi_idx, b).simd_into(self),
+                        )
+                    }
+                }
+            };
+        }
+
         match vec_ty.n_bits() {
             256 => {
                 // Optimized path: compute unpacklo and unpackhi once, then use permute2f128 to
@@ -1390,6 +2211,38 @@ impl X86 {
         method_sig: TokenStream,
         vec_ty: &VecType,
     ) -> TokenStream {
+        if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 {
+            let even_indices = (0..vec_ty.len).map(|i| {
+                if i < vec_ty.len / 2 {
+                    i * 2
+                } else {
+                    vec_ty.len + ((i - vec_ty.len / 2) * 2)
+                }
+            });
+            let odd_indices = (0..vec_ty.len).map(|i| {
+                if i < vec_ty.len / 2 {
+                    i * 2 + 1
+                } else {
+                    vec_ty.len + ((i - vec_ty.len / 2) * 2 + 1)
+                }
+            });
+            let even_idx = avx512_index_vector(vec_ty, even_indices);
+            let odd_idx = avx512_index_vector(vec_ty, odd_indices);
+            let permute = avx512_permutex2var_intrinsic(vec_ty);
+            return quote! {
+                #method_sig {
+                    unsafe {
+                        let a = a.into();
+                        let b = b.into();
+                        (
+                            #permute(a, #even_idx, b).simd_into(self),
+                            #permute(a, #odd_idx, b).simd_into(self),
+                        )
+                    }
+                }
+            };
+        }
+
         match vec_ty.n_bits() {
             256 => {
                 // Optimized path: compute the per-input shuffles once, then use permute2f128 /
@@ -1482,6 +2335,26 @@ impl X86 {
         vec_ty: &VecType,
         select_even: bool,
     ) -> TokenStream {
+        if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 {
+            let lane_offset = if select_even { 0 } else { 1 };
+            let indices = (0..vec_ty.len).map(|i| {
+                if i < vec_ty.len / 2 {
+                    i * 2 + lane_offset
+                } else {
+                    vec_ty.len + ((i - vec_ty.len / 2) * 2 + lane_offset)
+                }
+            });
+            let idx = avx512_index_vector(vec_ty, indices);
+            let permute = avx512_permutex2var_intrinsic(vec_ty);
+            return quote! {
+                #method_sig {
+                    unsafe {
+                        #permute(a.into(), #idx, b.into()).simd_into(self)
+                    }
+                }
+            };
+        }
+
         let expr = match (vec_ty.scalar, vec_ty.n_bits(), vec_ty.scalar_bits) {
             (ScalarType::Float, 128, _) => {
                 // 128-bit shuffle of floats or doubles; there are built-in SSE intrinsics for this
@@ -1588,6 +2461,68 @@ impl X86 {
         let to_bytes = generic_op_name("cvt_to_bytes", vec_ty);
         let from_bytes = generic_op_name("cvt_from_bytes", vec_ty);
 
+        if *self == Self::Avx512
+            && granularity == WithinBlocks
+            && vec_ty.scalar != ScalarType::Mask
+            && vec_ty.n_bits() >= 256
+        {
+            let alignr = format_ident!("dyn_alignr_{}", vec_ty.n_bits());
+            let byte_shift = if scalar_bytes == 1 {
+                quote! { SHIFT }
+            } else {
+                quote! { SHIFT * #scalar_bytes }
+            };
+
+            return quote! {
+                #method_sig {
+                    unsafe {
+                        if SHIFT == 0 {
+                            return a;
+                        }
+                        if SHIFT >= #max_shift {
+                            return b;
+                        }
+
+                        let a = self.#to_bytes(a).val.0;
+                        let b = self.#to_bytes(b).val.0;
+                        let result = #alignr(b, a, #byte_shift);
+                        self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self })
+                    }
+                }
+            };
+        }
+
+        if *self == Self::Avx512 && granularity == AcrossBlocks && vec_ty.n_bits() >= 256 {
+            let byte_ty = vec_ty.reinterpret(ScalarType::Unsigned, 8);
+            let base_idx = avx512_index_vector(&byte_ty, 0..byte_ty.len);
+            let set_shift = set1_intrinsic(&byte_ty);
+            let add = simple_sign_unaware_intrinsic("add", &byte_ty);
+            let permute = avx512_permutex2var_intrinsic(&byte_ty);
+            let byte_shift = if scalar_bytes == 1 {
+                quote! { SHIFT }
+            } else {
+                quote! { SHIFT * #scalar_bytes }
+            };
+
+            return quote! {
+                #method_sig {
+                    unsafe {
+                        if SHIFT >= #max_shift {
+                            return b;
+                        }
+
+                        let idx = #add(#base_idx, #set_shift((#byte_shift) as i8));
+                        let result = #permute(
+                            self.#to_bytes(a).val.0,
+                            idx,
+                            self.#to_bytes(b).val.0,
+                        );
+                        self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self })
+                    }
+                }
+            };
+        }
+
         let alignr_op = match (granularity, vec_ty.n_bits(), self) {
             (WithinBlocks, 128, _) => {
                 panic!("This should have been handled by generic_op");
@@ -1641,6 +2576,107 @@ impl X86 {
             vec_ty.scalar_bits, target_scalar_bits,
             "we currently only support converting between types of the same width"
         );
+        if *self == Self::Avx512
+            && vec_ty.scalar == ScalarType::Float
+            && target_scalar == ScalarType::Unsigned
+        {
+            let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits);
+            let convert = intrinsic_ident("cvttps", "epu32", vec_ty.n_bits());
+            let expr = if precise {
+                let max = simple_intrinsic("max", vec_ty);
+                let cmp = intrinsic_ident("cmp", "ps_mask", vec_ty.n_bits());
+                let blend = avx512_mask_blend_intrinsic(&target_ty);
+                let set1_float = set1_intrinsic(vec_ty);
+                let set1_int = set1_intrinsic(&target_ty);
+                let set0_float = intrinsic_ident("setzero", coarse_type(vec_ty), vec_ty.n_bits());
+                let lt = avx512_float_compare_predicate("simd_lt");
+                quote! {
+                    unsafe {
+                        let a = #max(a.into(), #set0_float());
+                        let mut converted = #convert(a);
+                        let exceeds_unsigned_range = #cmp::<#lt>(#set1_float(4294967040.0), a);
+                        converted = #blend(
+                            exceeds_unsigned_range,
+                            converted,
+                            #set1_int(u32::MAX.cast_signed()),
+                        );
+                        converted.simd_into(self)
+                    }
+                }
+            } else {
+                quote! {
+                    unsafe {
+                        #convert(a.into()).simd_into(self)
+                    }
+                }
+            };
+
+            return quote! {
+                #method_sig {
+                    #expr
+                }
+            };
+        }
+
+        if *self == Self::Avx512 && vec_ty.n_bits() == 512 {
+            let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits);
+            let expr = match (vec_ty.scalar, target_scalar) {
+                (ScalarType::Float, ScalarType::Int) => {
+                    let convert = intrinsic_ident("cvttps", "epi32", vec_ty.n_bits());
+                    if precise {
+                        let cmp = intrinsic_ident("cmp", "ps_mask", vec_ty.n_bits());
+                        let blend = avx512_mask_blend_intrinsic(&target_ty);
+                        let set1_float = set1_intrinsic(vec_ty);
+                        let set1_int = set1_intrinsic(&target_ty);
+                        let set0_int =
+                            intrinsic_ident("setzero", coarse_type(&target_ty), target_ty.n_bits());
+                        let lt = avx512_float_compare_predicate("simd_lt");
+                        let ord = avx512_float_compare_predicate("ord");
+                        quote! {
+                            unsafe {
+                                let a = a.into();
+                                let mut converted = #convert(a);
+                                let in_range = #cmp::<#lt>(a, #set1_float(2147483648.0));
+                                converted = #blend(in_range, #set1_int(i32::MAX), converted);
+                                let is_not_nan = #cmp::<#ord>(a, a);
+                                converted = #blend(is_not_nan, #set0_int(), converted);
+                                converted.simd_into(self)
+                            }
+                        }
+                    } else {
+                        quote! {
+                            unsafe {
+                                #convert(a.into()).simd_into(self)
+                            }
+                        }
+                    }
+                }
+                (ScalarType::Int, ScalarType::Float) => {
+                    let intrinsic = simple_intrinsic("cvtepi32", &target_ty);
+                    quote! {
+                        unsafe {
+                            #intrinsic(a.into()).simd_into(self)
+                        }
+                    }
+                }
+                (ScalarType::Unsigned, ScalarType::Float) => {
+                    let intrinsic = simple_intrinsic("cvtepu32", &target_ty);
+                    quote! {
+                        unsafe {
+                            #intrinsic(a.into()).simd_into(self)
+                        }
+                    }
+                }
+                _ => unimplemented!(),
+            };
+
+            return quote! {
+                #method_sig {
+                    #expr
+                }
+            };
+        }
+
         let expr = match (vec_ty.scalar, target_scalar) {
             (ScalarType::Float, ScalarType::Int | ScalarType::Unsigned) => {
                 let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits);
@@ -1865,6 +2901,23 @@ impl X86 {
             "mask reduce ops only operate on masks"
         );
 
+        if *self == Self::Avx512 {
+            let lane_mask = avx512_mask_lane_bits(vec_ty);
+            let bits = avx512_mask_bits_expr(quote! { a });
+            let expr = match (quantifier, condition) {
+                (Quantifier::Any, true) => quote! { bits != 0 },
+                (Quantifier::Any, false) => quote! { bits != #lane_mask },
+                (Quantifier::All, true) => quote! { bits == #lane_mask },
+                (Quantifier::All, false) => quote! { bits == 0 },
+            };
+            return quote! {
+                #method_sig {
+                    let bits = #bits & #lane_mask;
+                    #expr
+                }
+            };
+        }
+
         let (movemask, all_ones) = match vec_ty.scalar_bits {
             32 | 64 => {
                 let float_ty = vec_ty.cast(ScalarType::Float);
@@ -1929,6 +2982,14 @@ impl X86 {
             "only 128-bit blocks are currently supported"
         );
         assert_eq!(block_count, 4, "only count of 4 is currently supported");
+        if *self == Self::Avx512 && vec_ty.n_bits() == 512 {
+            return self.handle_avx512_load_interleaved(
+                method_sig,
+                vec_ty,
+                block_size,
+                block_count,
+            );
+        }
         let expr = match vec_ty.scalar_bits {
             32 | 16 | 8 => {
                 let block_ty =
@@ -2058,6 +3119,45 @@ impl X86 {
         }
     }
 
+    pub(crate) fn handle_avx512_load_interleaved( // Builds the 512-bit interleaved-load method for AVX-512.
+        &self,
+        method_sig: TokenStream, // Signature tokens placed in front of the generated body.
+        vec_ty: &VecType, // Vector type being produced; must be 512 bits wide.
+        block_size: u16, // Must be 128 (asserted below).
+        block_count: u16, // Must be 4 (asserted below).
+    ) -> TokenStream {
+        assert_eq!(
+            block_size, 128,
+            "only 128-bit blocks are currently supported"
+        );
+        assert_eq!(block_count, 4, "only count of 4 is currently supported");
+        assert_eq!(
+            vec_ty.n_bits(),
+            512,
+            "AVX-512 interleaved loads only specialize 512-bit vectors"
+        );
+        let scalar_ty = vec_ty.scalar.rust(vec_ty.scalar_bits);
+        let native_ty = self.arch_ty(vec_ty);
+        let len = vec_ty.len;
+        let permute = avx512_permutexvar_intrinsic(vec_ty);
+        let indices = avx512_index_vector( // Lane order is supplied by `interleaved_load_indices`.
+            vec_ty,
+            interleaved_load_indices(vec_ty.len, block_count as usize),
+        );
+        // Emitted body: copy `src` into the native vector type, then apply a single permute.
+        quote! {
+            #method_sig {
+                let lanes: #native_ty =
+                    crate::transmute::checked_transmute_copy::<[#scalar_ty; #len], #native_ty>(
+                        src,
+                    );
+                unsafe {
+                    #permute(#indices, lanes).simd_into(self) // Index vector is the first argument.
+                }
+            }
+        }
+    }
+
     pub(crate) fn handle_store_interleaved(
         &self,
         method_sig: TokenStream,
@@ -2070,6 +3170,14 @@ impl X86 {
             "only 128-bit blocks are currently supported"
         );
         assert_eq!(block_count, 4, "only count of 4 is currently supported");
+        if *self == Self::Avx512 && vec_ty.n_bits() == 512 {
+            return self.handle_avx512_store_interleaved(
+                method_sig,
+                vec_ty,
+                block_size,
+                block_count,
+            );
+        }
         let expr = match vec_ty.scalar_bits {
             32 | 16 | 8 => {
                 let block_ty =
@@ -2189,6 +3297,40 @@ impl X86 {
         }
     }
 
+    pub(crate) fn handle_avx512_store_interleaved( // Builds the 512-bit interleaved-store method for AVX-512.
+        &self,
+        method_sig: TokenStream, // Signature tokens placed in front of the generated body.
+        vec_ty: &VecType, // Vector type being stored; must be 512 bits wide.
+        block_size: u16, // Must be 128 (asserted below).
+        block_count: u16, // Must be 4 (asserted below).
+    ) -> TokenStream {
+        assert_eq!(
+            block_size, 128,
+            "only 128-bit blocks are currently supported"
+        );
+        assert_eq!(block_count, 4, "only count of 4 is currently supported");
+        assert_eq!(
+            vec_ty.n_bits(),
+            512,
+            "AVX-512 interleaved stores only specialize 512-bit vectors"
+        );
+        let store_unaligned = intrinsic_ident("storeu", coarse_type(vec_ty), vec_ty.n_bits());
+        let permute = avx512_permutexvar_intrinsic(vec_ty);
+        let indices = avx512_index_vector( // Lane order is supplied by `interleaved_store_indices`.
+            vec_ty,
+            interleaved_store_indices(vec_ty.len, block_count as usize),
+        );
+        // Emitted body: permute the lanes of `a` once, then write them to `dest` with `storeu`.
+        quote! {
+            #method_sig {
+                unsafe {
+                    let lanes = #permute(#indices, a.into()); // Index vector is the first argument.
+                    #store_unaligned(dest.as_mut_ptr() as *mut _, lanes);
+                }
+            }
+        }
+    }
+
     /// Generates versions of the "alignr" intrinsics that take the shift amount as a regular argument instead of a
     /// const generic argument, to make them easier to use in higher-level operations. These are low-level helpers that
     /// inherit the semantics of the underlying `alignr` intrinsics, so the argument order is backwards from ARM's
@@ -2199,6 +3341,7 @@ impl X86 {
         let vec_widths: &[usize] = match self {
             Self::Sse4_2 => &[128],
             Self::Avx2 => &[128, 256],
+            Self::Avx512 => &[128, 256, 512],
         };
 
         for vec_ty in vec_widths
diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs
index c1129e6be..860a38382 100644
--- a/fearless_simd_gen/src/ops.rs
+++ b/fearless_simd_gen/src/ops.rs
@@ -110,6 +110,8 @@ pub(crate) enum OpSig {
     MaskFromBitmask,
     /// Takes a mask vector type and returns its compact bitmask representation.
     MaskToBitmask,
+    /// Takes a mutable mask vector, a lane index, and a boolean, and updates the lane in place.
+    MaskSet,
     /// Takes an argument of an array of a certain scalar type, with the length (`block_size` * `block_count`) / [scalar
     /// type's byte size]. Returns a vector type of that scalar type and length.
     ///
@@ -277,6 +279,12 @@ impl Op {
                 let arg0 = &arg_names[0];
                 quote! { (self, #arg0: #ty<Self>) -> u64 }
             }
+            OpSig::MaskSet => {
+                let arg0 = &arg_names[0];
+                let arg1 = &arg_names[1];
+                let arg2 = &arg_names[2];
+                quote! { (self, #arg0: &mut #ty<Self>, #arg1: usize, #arg2: bool) -> () }
+            }
             OpSig::Shift => {
                 let arg0 = &arg_names[0];
                 let arg1 = &arg_names[1];
@@ -353,7 +361,7 @@ impl Op {
             OpSig::LoadInterleaved { .. } | OpSig::StoreInterleaved { .. } | OpSig::StoreArray => {
                 return None;
             }
-            OpSig::MaskFromBitmask | OpSig::MaskToBitmask => return None,
+            OpSig::MaskFromBitmask | OpSig::MaskToBitmask | OpSig::MaskSet => return None,
             OpSig::Unary
             | OpSig::Cvt { .. }
             | OpSig::Reinterpret { .. }
@@ -583,6 +591,12 @@ const MASK_REPRESENTATION_OPS: &[Op] = &[
         OpSig::MaskToBitmask,
         "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared.",
     ),
+    Op::new(
+        "set",
+        OpKind::AssociatedOnly,
+        OpSig::MaskSet,
+        "Set one logical lane of a SIMD mask.",
+    ),
 ];
 
 const FLOAT_OPS: &[Op] = &[
@@ -612,7 +626,7 @@ const FLOAT_OPS: &[Op] = &[
         "Compute an approximate reciprocal (`1. / x`) for each element.\n\n\
          This uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\n\
          On x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. \
-         On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. \
+         On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. \
          The precision of this operation may change as new platform support is added.",
     ),
     Op::new(
@@ -1176,7 +1190,7 @@ pub(crate) const F32_TO_U32: Op = Op::new(
     },
     "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\n\
     Out-of-range values or NaN will produce implementation-defined results.\n\n\
-    On x86 platforms, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32` (at least until AVX-512, which is currently not supported).\n\
+    On x86 platforms below AVX-512, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32`.\n\
     If you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards.",
 );
 pub(crate) const F32_TO_U32_PRECISE: Op = Op::new(
@@ -1511,6 +1525,7 @@ impl OpSig {
                 | Self::FromArray { .. }
                 | Self::AsArray { .. }
                 | Self::StoreArray
+                | Self::MaskSet
                 | Self::Slide {
                     granularity: SlideGranularity::AcrossBlocks,
                     ..
@@ -1540,6 +1555,7 @@ impl OpSig {
         match self {
             Self::Splat | Self::FromArray { .. } => &["val"],
             Self::MaskFromBitmask => &["bits"],
+            Self::MaskSet => &["a", "index", "value"],
             Self::Unary
             | Self::Split { .. }
             | Self::Cvt { .. }
@@ -1572,6 +1588,7 @@ impl OpSig {
             | Self::FromArray { .. }
             | Self::MaskFromBitmask
             | Self::MaskToBitmask
+            | Self::MaskSet
             | Self::FromBytes { .. }
             | Self::StoreArray => &[],
             Self::Unary
@@ -1634,6 +1651,7 @@ impl OpSig {
             | Self::Shift
             | Self::MaskFromBitmask
             | Self::MaskToBitmask
+            | Self::MaskSet
             | Self::LoadInterleaved { .. }
             | Self::StoreInterleaved { .. }
             | Self::FromArray { .. }
diff --git a/fearless_simd_tests/tests/harness/lm_generated.rs b/fearless_simd_tests/tests/harness/lm_generated.rs
index 789a8eb99..34de5b16e 100644
--- a/fearless_simd_tests/tests/harness/lm_generated.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated.rs
@@ -2,6 +2,10 @@
 // SPDX-License-Identifier: Apache-2.0 OR MIT
 
 mod extended_512;
-mod mask_methods;
+#[cfg(not(miri))] // too slow
+mod mask_roundtrip;
+#[cfg(not(miri))] // too slow
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+mod mask_roundtrip_x86;
 mod mod_256;
 mod mod_512;
diff --git a/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs b/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs
index 2de317d3e..f1e03a25b 100644
--- a/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs
@@ -512,6 +512,61 @@ fn fract_f32x16<S: Simd>(simd: S) {
     );
 }
 
+#[simd_test]
+fn fract_f64x8<S: Simd>(simd: S) {
+    let a = f64x8::from_slice(simd, &[1.7, -2.3, 3.9, -4.1, 5.5, -6.6, 7.2, -8.8]);
+    let result = simd.fract_f64x8(a);
+    assert_eq!(
+        *result,
+        [
+            0.7,
+            -0.2999999999999998,
+            0.8999999999999999,
+            -0.09999999999999964,
+            0.5,
+            -0.5999999999999996,
+            0.20000000000000018,
+            -0.8000000000000007
+        ]
+    );
+}
+
+#[simd_test]
+fn approximate_recip_f32x16<S: Simd>(simd: S) {
+    let a = f32x16::from_slice(
+        simd,
+        &[
+            1.0, -2.0, 23.0, 9.0, 0.5, -0.25, 128.0, -1024.0, 3.0, -7.0, 11.0, -13.0, 19.0, -29.0,
+            37.0, -41.0,
+        ],
+    );
+    let result = a.approximate_recip();
+    for i in 0..16 {
+        let expected = 1.0 / a[i];
+        let rel_error = ((result[i] - expected) / expected).abs();
+        assert!(
+            rel_error < 0.005,
+            "approximate_recip({}) rel_error = {rel_error}",
+            a[i]
+        );
+    }
+}
+
+#[simd_test]
+fn approximate_recip_f64x8<S: Simd>(simd: S) {
+    let a = f64x8::from_slice(simd, &[1.0, -2.0, 23.0, 9.0, 0.5, -0.25, 128.0, -1024.0]);
+    let result = a.approximate_recip();
+    for i in 0..8 {
+        let expected = 1.0 / a[i];
+        let rel_error = ((result[i] - expected) / expected).abs();
+        assert!(
+            rel_error < 0.005,
+            "approximate_recip({}) rel_error = {rel_error}",
+            a[i]
+        );
+    }
+}
+
 // =============================================================================
 // max_precise and min_precise tests (512-bit floats)
 // =============================================================================
@@ -688,6 +743,82 @@ fn min_precise_f32x16_with_nan<S: Simd>(simd: S) {
     assert_eq!(result[15], 5.0);
 }
 
+#[simd_test]
+fn max_precise_f64x8<S: Simd>(simd: S) {
+    let a = f64x8::from_slice(simd, &[2.0, -3.0, 0.0, 0.5, 1.0, 5.0, 3.0, 7.0]);
+    let b = f64x8::from_slice(simd, &[1.0, -2.0, 7.0, 3.0, 2.0, 4.0, 6.0, 5.0]);
+    assert_eq!(*a.max_precise(b), [2.0, -2.0, 7.0, 3.0, 2.0, 5.0, 6.0, 7.0]);
+}
+
+#[simd_test]
+fn min_precise_f64x8<S: Simd>(simd: S) {
+    let a = f64x8::from_slice(simd, &[2.0, -3.0, 0.0, 0.5, 1.0, 5.0, 3.0, 7.0]);
+    let b = f64x8::from_slice(simd, &[1.0, -2.0, 7.0, 3.0, 2.0, 4.0, 6.0, 5.0]);
+    assert_eq!(*a.min_precise(b), [1.0, -3.0, 0.0, 0.5, 1.0, 4.0, 3.0, 5.0]);
+}
+
+#[simd_test]
+fn max_precise_f64x8_with_nan<S: Simd>(simd: S) {
+    let a = f64x8::from_slice(
+        simd,
+        &[f64::NAN, -3.0, f64::INFINITY, 0.5, 1.0, f64::NAN, 3.0, 7.0],
+    );
+    let b = f64x8::from_slice(
+        simd,
+        &[
+            1.0,
+            f64::NAN,
+            7.0,
+            f64::NEG_INFINITY,
+            f64::NAN,
+            4.0,
+            6.0,
+            5.0,
+        ],
+    );
+    let result = a.max_precise(b);
+
+    assert_eq!(result[0], 1.0);
+    assert_eq!(result[1], -3.0);
+    assert_eq!(result[2], f64::INFINITY);
+    assert_eq!(result[3], 0.5);
+    assert_eq!(result[4], 1.0);
+    assert_eq!(result[5], 4.0);
+    assert_eq!(result[6], 6.0);
+    assert_eq!(result[7], 7.0);
+}
+
+#[simd_test]
+fn min_precise_f64x8_with_nan<S: Simd>(simd: S) {
+    let a = f64x8::from_slice(
+        simd,
+        &[f64::NAN, -3.0, f64::INFINITY, 0.5, 1.0, f64::NAN, 3.0, 7.0],
+    );
+    let b = f64x8::from_slice(
+        simd,
+        &[
+            1.0,
+            f64::NAN,
+            7.0,
+            f64::NEG_INFINITY,
+            f64::NAN,
+            4.0,
+            6.0,
+            5.0,
+        ],
+    );
+    let result = a.min_precise(b);
+
+    assert_eq!(result[0], 1.0);
+    assert_eq!(result[1], -3.0);
+    assert_eq!(result[2], 7.0);
+    assert_eq!(result[3], f64::NEG_INFINITY);
+    assert_eq!(result[4], 1.0);
+    assert_eq!(result[5], 4.0);
+    assert_eq!(result[6], 3.0);
+    assert_eq!(result[7], 5.0);
+}
+
 // =============================================================================
 // Shift operations tests (512-bit)
 // =============================================================================
@@ -941,6 +1072,114 @@ fn shl_u32x16<S: Simd>(simd: S) {
 }
 
 // Vector shift tests (shlv/shrv)
+#[simd_test]
+fn shlv_i8x64<S: Simd>(simd: S) {
+    const A: [i8; 16] = [64, 65, -64, -65, 1, 2, 3, 4, -1, -2, -3, -4, 15, 16, 31, 32];
+    const SHIFTS: [i8; 16] = [1, 2, 1, 2, 0, 1, 2, 3, 1, 2, 3, 4, 3, 2, 1, 0];
+    const EXPECTED: [i8; 16] = [
+        -128, 4, -128, -4, 1, 4, 12, 32, -2, -8, -24, -64, 120, 64, 62, 32,
+    ];
+    let a_vals: [i8; 64] = core::array::from_fn(|i| A[i % 16]);
+    let shift_vals: [i8; 64] = core::array::from_fn(|i| SHIFTS[i % 16]);
+    let expected: [i8; 64] = core::array::from_fn(|i| EXPECTED[i % 16]);
+    let a = i8x64::from_slice(simd, &a_vals);
+    let shifts = i8x64::from_slice(simd, &shift_vals);
+    assert_eq!(*(a << shifts), expected);
+}
+
+#[simd_test]
+fn shrv_i8x64<S: Simd>(simd: S) {
+    const A: [i8; 16] = [
+        -128, -64, -33, -1, 127, 64, 33, 1, -2, -4, -8, -16, 0, 2, 4, 8,
+    ];
+    const SHIFTS: [i8; 16] = [1, 2, 3, 7, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 2, 3];
+    const EXPECTED: [i8; 16] = [-64, -16, -5, -1, 63, 16, 4, 1, -1, -1, -1, -1, 0, 1, 1, 1];
+    let a_vals: [i8; 64] = core::array::from_fn(|i| A[i % 16]);
+    let shift_vals: [i8; 64] = core::array::from_fn(|i| SHIFTS[i % 16]);
+    let expected: [i8; 64] = core::array::from_fn(|i| EXPECTED[i % 16]);
+    let a = i8x64::from_slice(simd, &a_vals);
+    let shifts = i8x64::from_slice(simd, &shift_vals);
+    assert_eq!(*(a >> shifts), expected);
+}
+
+#[simd_test]
+fn shlv_u8x64<S: Simd>(simd: S) {
+    const A: [u8; 16] = [255, 128, 64, 32, 16, 8, 4, 2, 1, 3, 5, 7, 15, 31, 63, 127];
+    const SHIFTS: [u8; 16] = [4, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 3, 2, 1];
+    const EXPECTED: [u8; 16] = [240, 0, 0, 0, 0, 0, 0, 0, 1, 6, 20, 56, 240, 248, 252, 254];
+    let a_vals: [u8; 64] = core::array::from_fn(|i| A[i % 16]);
+    let shift_vals: [u8; 64] = core::array::from_fn(|i| SHIFTS[i % 16]);
+    let expected: [u8; 64] = core::array::from_fn(|i| EXPECTED[i % 16]);
+    let a = u8x64::from_slice(simd, &a_vals);
+    let shifts = u8x64::from_slice(simd, &shift_vals);
+    assert_eq!(*(a << shifts), expected);
+}
+
+#[simd_test]
+fn shrv_u8x64<S: Simd>(simd: S) {
+    const A: [u8; 16] = [255, 128, 64, 32, 16, 8, 4, 2, 1, 3, 5, 7, 15, 31, 63, 127];
+    const SHIFTS: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 1, 0, 1, 2, 3, 4, 3, 2, 1];
+    const EXPECTED: [u8; 16] = [127, 32, 8, 2, 0, 0, 0, 1, 1, 1, 1, 0, 0, 3, 15, 63];
+    let a_vals: [u8; 64] = core::array::from_fn(|i| A[i % 16]);
+    let shift_vals: [u8; 64] = core::array::from_fn(|i| SHIFTS[i % 16]);
+    let expected: [u8; 64] = core::array::from_fn(|i| EXPECTED[i % 16]);
+    let a = u8x64::from_slice(simd, &a_vals);
+    let shifts = u8x64::from_slice(simd, &shift_vals);
+    assert_eq!(*(a >> shifts), expected);
+}
+
+#[simd_test]
+fn shlv_i16x32<S: Simd>(simd: S) {
+    const A: [i16; 8] = [16384, 8192, -16384, -8192, 1, -1, 255, -256];
+    const SHIFTS: [i16; 8] = [1, 2, 1, 2, 15, 1, 4, 3];
+    const EXPECTED: [i16; 8] = [-32768, -32768, -32768, -32768, -32768, -2, 4080, -2048];
+    let a_vals: [i16; 32] = core::array::from_fn(|i| A[i % 8]);
+    let shift_vals: [i16; 32] = core::array::from_fn(|i| SHIFTS[i % 8]);
+    let expected: [i16; 32] = core::array::from_fn(|i| EXPECTED[i % 8]);
+    let a = i16x32::from_slice(simd, &a_vals);
+    let shifts = i16x32::from_slice(simd, &shift_vals);
+    assert_eq!(*(a << shifts), expected);
+}
+
+#[simd_test]
+fn shrv_i16x32<S: Simd>(simd: S) {
+    const A: [i16; 8] = [-32768, -16384, -1025, -1, 32767, 16384, 1025, 1];
+    const SHIFTS: [i16; 8] = [1, 2, 3, 15, 1, 2, 3, 0];
+    const EXPECTED: [i16; 8] = [-16384, -4096, -129, -1, 16383, 4096, 128, 1];
+    let a_vals: [i16; 32] = core::array::from_fn(|i| A[i % 8]);
+    let shift_vals: [i16; 32] = core::array::from_fn(|i| SHIFTS[i % 8]);
+    let expected: [i16; 32] = core::array::from_fn(|i| EXPECTED[i % 8]);
+    let a = i16x32::from_slice(simd, &a_vals);
+    let shifts = i16x32::from_slice(simd, &shift_vals);
+    assert_eq!(*(a >> shifts), expected);
+}
+
+#[simd_test]
+fn shlv_u16x32<S: Simd>(simd: S) {
+    const A: [u16; 8] = [65535, 32768, 16384, 8192, 1, 255, 1024, 4096];
+    const SHIFTS: [u16; 8] = [4, 1, 2, 3, 15, 4, 5, 0];
+    const EXPECTED: [u16; 8] = [65520, 0, 0, 0, 32768, 4080, 32768, 4096];
+    let a_vals: [u16; 32] = core::array::from_fn(|i| A[i % 8]);
+    let shift_vals: [u16; 32] = core::array::from_fn(|i| SHIFTS[i % 8]);
+    let expected: [u16; 32] = core::array::from_fn(|i| EXPECTED[i % 8]);
+    let a = u16x32::from_slice(simd, &a_vals);
+    let shifts = u16x32::from_slice(simd, &shift_vals);
+    assert_eq!(*(a << shifts), expected);
+}
+
+#[simd_test]
+fn shrv_u16x32<S: Simd>(simd: S) {
+    const A: [u16; 8] = [65535, 32768, 16384, 8192, 1, 255, 1024, 4096];
+    const SHIFTS: [u16; 8] = [1, 2, 3, 4, 0, 4, 5, 12];
+    const EXPECTED: [u16; 8] = [32767, 8192, 2048, 512, 1, 15, 32, 1];
+    let a_vals: [u16; 32] = core::array::from_fn(|i| A[i % 8]);
+    let shift_vals: [u16; 32] = core::array::from_fn(|i| SHIFTS[i % 8]);
+    let expected: [u16; 32] = core::array::from_fn(|i| EXPECTED[i % 8]);
+    let a = u16x32::from_slice(simd, &a_vals);
+    let shifts = u16x32::from_slice(simd, &shift_vals);
+    assert_eq!(*(a >> shifts), expected);
+}
+
 #[simd_test]
 fn shrv_i32x16<S: Simd>(simd: S) {
     let a = i32x16::from_slice(
@@ -1359,6 +1598,166 @@ fn unzip_high_u32x16<S: Simd>(simd: S) {
     );
 }
 
+#[simd_test]
+fn zip_unzip_i16x32<S: Simd>(simd: S) {
+    let a = i16x32::from_slice(
+        simd,
+        &[
+            -16, -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4,
+            5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+        ],
+    );
+    let b = i16x32::from_slice(
+        simd,
+        &[
+            1000, 999, 998, 997, 996, 995, 994, 993, 992, 991, 990, 989, 988, 987, 986, 985, 984,
+            983, 982, 981, 980, 979, 978, 977, 976, 975, 974, 973, 972, 971, 970, 969,
+        ],
+    );
+
+    assert_eq!(
+        *simd.zip_low_i16x32(a, b),
+        [
+            -16, 1000, -15, 999, -14, 998, -13, 997, -12, 996, -11, 995, -10, 994, -9, 993, -8,
+            992, -7, 991, -6, 990, -5, 989, -4, 988, -3, 987, -2, 986, -1, 985
+        ]
+    );
+    assert_eq!(
+        *simd.zip_high_i16x32(a, b),
+        [
+            0, 984, 1, 983, 2, 982, 3, 981, 4, 980, 5, 979, 6, 978, 7, 977, 8, 976, 9, 975, 10,
+            974, 11, 973, 12, 972, 13, 971, 14, 970, 15, 969
+        ]
+    );
+    assert_eq!(
+        *simd.unzip_low_i16x32(a, b),
+        [
+            -16, -14, -12, -10, -8, -6, -4, -2, 0, 2, 4, 6, 8, 10, 12, 14, 1000, 998, 996, 994,
+            992, 990, 988, 986, 984, 982, 980, 978, 976, 974, 972, 970
+        ]
+    );
+    assert_eq!(
+        *simd.unzip_high_i16x32(a, b),
+        [
+            -15, -13, -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11, 13, 15, 999, 997, 995, 993, 991,
+            989, 987, 985, 983, 981, 979, 977, 975, 973, 971, 969
+        ]
+    );
+
+    let (interleaved_low, interleaved_high) = simd.interleave_i16x32(a, b);
+    assert_eq!(
+        *interleaved_low,
+        [
+            -16, 1000, -15, 999, -14, 998, -13, 997, -12, 996, -11, 995, -10, 994, -9, 993, -8,
+            992, -7, 991, -6, 990, -5, 989, -4, 988, -3, 987, -2, 986, -1, 985
+        ]
+    );
+    assert_eq!(
+        *interleaved_high,
+        [
+            0, 984, 1, 983, 2, 982, 3, 981, 4, 980, 5, 979, 6, 978, 7, 977, 8, 976, 9, 975, 10,
+            974, 11, 973, 12, 972, 13, 971, 14, 970, 15, 969
+        ]
+    );
+
+    let (roundtrip_a, roundtrip_b) = simd.deinterleave_i16x32(interleaved_low, interleaved_high);
+    assert_eq!(
+        *roundtrip_a,
+        [
+            -16, -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4,
+            5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+        ]
+    );
+    assert_eq!(
+        *roundtrip_b,
+        [
+            1000, 999, 998, 997, 996, 995, 994, 993, 992, 991, 990, 989, 988, 987, 986, 985, 984,
+            983, 982, 981, 980, 979, 978, 977, 976, 975, 974, 973, 972, 971, 970, 969
+        ]
+    );
+}
+
+#[simd_test]
+fn zip_unzip_u16x32<S: Simd>(simd: S) {
+    let a = u16x32::from_slice(
+        simd,
+        &[
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+            24, 25, 26, 27, 28, 29, 30, 31,
+        ],
+    );
+    let b = u16x32::from_slice(
+        simd,
+        &[
+            1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013,
+            1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027,
+            1028, 1029, 1030, 1031,
+        ],
+    );
+
+    assert_eq!(
+        *simd.zip_low_u16x32(a, b),
+        [
+            0, 1000, 1, 1001, 2, 1002, 3, 1003, 4, 1004, 5, 1005, 6, 1006, 7, 1007, 8, 1008, 9,
+            1009, 10, 1010, 11, 1011, 12, 1012, 13, 1013, 14, 1014, 15, 1015
+        ]
+    );
+    assert_eq!(
+        *simd.zip_high_u16x32(a, b),
+        [
+            16, 1016, 17, 1017, 18, 1018, 19, 1019, 20, 1020, 21, 1021, 22, 1022, 23, 1023, 24,
+            1024, 25, 1025, 26, 1026, 27, 1027, 28, 1028, 29, 1029, 30, 1030, 31, 1031
+        ]
+    );
+    assert_eq!(
+        *simd.unzip_low_u16x32(a, b),
+        [
+            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 1000, 1002, 1004, 1006,
+            1008, 1010, 1012, 1014, 1016, 1018, 1020, 1022, 1024, 1026, 1028, 1030
+        ]
+    );
+    assert_eq!(
+        *simd.unzip_high_u16x32(a, b),
+        [
+            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 1001, 1003, 1005, 1007,
+            1009, 1011, 1013, 1015, 1017, 1019, 1021, 1023, 1025, 1027, 1029, 1031
+        ]
+    );
+
+    let (interleaved_low, interleaved_high) = simd.interleave_u16x32(a, b);
+    assert_eq!(
+        *interleaved_low,
+        [
+            0, 1000, 1, 1001, 2, 1002, 3, 1003, 4, 1004, 5, 1005, 6, 1006, 7, 1007, 8, 1008, 9,
+            1009, 10, 1010, 11, 1011, 12, 1012, 13, 1013, 14, 1014, 15, 1015
+        ]
+    );
+    assert_eq!(
+        *interleaved_high,
+        [
+            16, 1016, 17, 1017, 18, 1018, 19, 1019, 20, 1020, 21, 1021, 22, 1022, 23, 1023, 24,
+            1024, 25, 1025, 26, 1026, 27, 1027, 28, 1028, 29, 1029, 30, 1030, 31, 1031
+        ]
+    );
+
+    let (roundtrip_a, roundtrip_b) = simd.deinterleave_u16x32(interleaved_low, interleaved_high);
+    assert_eq!(
+        *roundtrip_a,
+        [
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+            24, 25, 26, 27, 28, 29, 30, 31
+        ]
+    );
+    assert_eq!(
+        *roundtrip_b,
+        [
+            1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013,
+            1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027,
+            1028, 1029, 1030, 1031
+        ]
+    );
+}
+
 // =============================================================================
 // interleave tests (512-bit)
 // =============================================================================
diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_methods.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs
similarity index 76%
rename from fearless_simd_tests/tests/harness/lm_generated/mask_methods.rs
rename to fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs
index 15963b2a3..5433ce2a6 100644
--- a/fearless_simd_tests/tests/harness/lm_generated/mask_methods.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs
@@ -4,6 +4,86 @@
 use fearless_simd::*;
 use fearless_simd_dev_macros::simd_test;
 
+/// Drives every lane of a mask through `SimdMask::set` (set pass, then clear pass), checking
+/// after each step that `to_bitmask` and `test` agree with a scalar `u64` model of the mask.
+fn assert_mask_set_roundtrip<S: Simd, M: SimdMask<S>>(simd: S) {
+    let mut model = 0_u64;
+    let mut mask = M::from_bitmask(simd, model);
+    for lane in 0..M::N {
+        model |= 1_u64 << lane;
+        mask.set(lane, true);
+        assert_eq!(mask.to_bitmask(), model);
+        assert!(mask.test(lane));
+    }
+    // Second pass: clear the lanes again, in the same order, starting from the all-set mask.
+    for lane in 0..M::N {
+        model &= !(1_u64 << lane);
+        mask.set(lane, false);
+        assert_eq!(mask.to_bitmask(), model);
+        assert!(!mask.test(lane));
+    }
+}
+
+#[simd_test]
+fn mask8x16_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask8x16<S>>(simd);
+}
+
+#[simd_test]
+fn mask16x8_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask16x8<S>>(simd);
+}
+
+#[simd_test]
+fn mask32x4_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask32x4<S>>(simd);
+}
+
+#[simd_test]
+fn mask64x2_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask64x2<S>>(simd);
+}
+
+#[simd_test]
+fn mask8x32_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask8x32<S>>(simd);
+}
+
+#[simd_test]
+fn mask16x16_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask16x16<S>>(simd);
+}
+
+#[simd_test]
+fn mask32x8_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask32x8<S>>(simd);
+}
+
+#[simd_test]
+fn mask64x4_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask64x4<S>>(simd);
+}
+
+#[simd_test]
+fn mask8x64_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask8x64<S>>(simd);
+}
+
+#[simd_test]
+fn mask16x32_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask16x32<S>>(simd);
+}
+
+#[simd_test]
+fn mask32x16_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask32x16<S>>(simd);
+}
+
+#[simd_test]
+fn mask64x8_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask64x8<S>>(simd);
+}
+
 #[simd_test]
 fn mask8x16_bitmask_roundtrip<S: Simd>(simd: S) {
     for bits in 0..=0xffff_u64 {
diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs
new file mode 100644
index 000000000..cade583d3
--- /dev/null
+++ b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs
@@ -0,0 +1,254 @@
+// Copyright 2026 the Fearless_SIMD Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+use core::convert::TryFrom;
+use core::mem::size_of;
+
+use fearless_simd::*;
+use fearless_simd_dev_macros::simd_test;
+
+/// Low `lanes` bits set; 64 lanes is special-cased because `1_u64 << 64` would overflow.
+fn lane_mask(lanes: usize) -> u64 {
+    if lanes != u64::BITS as usize {
+        return (1_u64 << lanes) - 1;
+    }
+    u64::MAX
+}
+
+/// Expands the low `LANES` bits of `bits` into lanes: `-1` for a set bit, `0` for a clear bit.
+fn lanes_from_bits<L, const LANES: usize>(bits: u64) -> [L; LANES]
+where
+    L: Copy + From<i8>,
+{
+    let masked = bits & lane_mask(LANES);
+    core::array::from_fn(|lane| {
+        // Bit `lane` of the bitmask selects lane `lane` (lane 0 is the least significant bit).
+        let is_set = ((masked >> lane) & 1) != 0;
+        let fill: i8 = if is_set { -1 } else { 0 };
+        L::from(fill)
+    })
+}
+
+fn assert_native_vector_roundtrip<S, M, A, L, const LANES: usize>(simd: S, bits: u64)
+where
+    S: Simd,
+    M: SimdMask<S> + SimdFrom<A, S> + Into<A>,
+    A: Copy,
+    L: Copy + Eq + core::fmt::Debug + From<i8>,
+{
+    let expected_bits = bits & lane_mask(LANES);
+    let expected_lanes = lanes_from_bits::<L, LANES>(bits);
+    // Both `transmute_copy` calls below depend on `A` and `[L; LANES]` having the same size.
+    assert_eq!(size_of::<A>(), size_of::<[L; LANES]>());
+    // Bitmask -> mask -> native vector: each lane must be `-1` (bit set) or `0` (bit clear).
+    let mask = M::from_bitmask(simd, bits);
+    let arch: A = mask.into();
+    // SAFETY: the size assertion above guarantees `A` and `[L; LANES]` have equal size, so the
+    // copy stays in bounds; the callers here pass integer lanes, valid for any bit pattern.
+    let lanes = unsafe { core::mem::transmute_copy::<A, [L; LANES]>(&arch) };
+    assert_eq!(lanes, expected_lanes);
+
+    // SAFETY: sizes are equal (asserted above), so the copy stays in bounds; the callers here
+    // pass `__m128i`/`__m256i` as `A`, for which any bit pattern is a valid value.
+    let arch = unsafe { core::mem::transmute_copy::<[L; LANES], A>(&expected_lanes) };
+    let mask = M::simd_from(simd, arch);
+    assert_eq!(mask.to_bitmask(), expected_bits);
+}
+
+fn assert_native_mask_roundtrip<S, M, A, const LANES: usize>(simd: S, bits: u64)
+where
+    S: Simd,
+    M: SimdMask<S> + SimdFrom<A, S> + Into<A>,
+    A: Copy + Eq + core::fmt::Debug + TryFrom<u64>,
+    A::Error: core::fmt::Debug,
+{
+    let expected_bits = bits & lane_mask(LANES);
+    let expected_arch = A::try_from(expected_bits).expect("masked bits fit in native mask type");
+    // Bitmask -> mask -> native mask: bits at or above `LANES` must not survive the conversion.
+    let mask = M::from_bitmask(simd, bits);
+    let arch: A = mask.into();
+    assert_eq!(arch, expected_arch);
+    // Native mask -> mask -> bitmask: the compact bitmask must equal the masked input bits.
+    let mask = M::simd_from(simd, expected_arch);
+    assert_eq!(mask.to_bitmask(), expected_bits);
+    // Converting that mask back must reproduce the same native mask value.
+    let arch: A = mask.into();
+    assert_eq!(arch, expected_arch);
+}
+
+#[simd_test]
+fn mask8x16_m128i_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_vector_roundtrip::<S, mask8x16<S>, __m128i, i8, 16>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask16x8_m128i_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_vector_roundtrip::<S, mask16x8<S>, __m128i, i16, 8>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask32x4_m128i_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_vector_roundtrip::<S, mask32x4<S>, __m128i, i32, 4>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask64x2_m128i_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_vector_roundtrip::<S, mask64x2<S>, __m128i, i64, 2>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask8x32_m256i_roundtrip<S: Simd>(simd: S) {
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0x0000_0000);
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0x0000_0001);
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0x8000_0000);
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0x0000_ffff);
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0xffff_0000);
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0x5555_5555);
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0xaaaa_aaaa);
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0x8000_aa55);
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0xffff_ffff);
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0xffff_ffff_0000_0000);
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0xffff_ffff_8000_aa55);
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0xffff_ffff_ffff_ffff);
+}
+
+#[simd_test]
+fn mask16x16_m256i_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_vector_roundtrip::<S, mask16x16<S>, __m256i, i16, 16>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask32x8_m256i_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_vector_roundtrip::<S, mask32x8<S>, __m256i, i32, 8>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask64x4_m256i_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_vector_roundtrip::<S, mask64x4<S>, __m256i, i64, 4>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask8x16_mmask16_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_mask_roundtrip::<S, mask8x16<S>, __mmask16, 16>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask16x8_mmask8_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_mask_roundtrip::<S, mask16x8<S>, __mmask8, 8>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask32x4_mmask8_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_mask_roundtrip::<S, mask32x4<S>, __mmask8, 4>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask64x2_mmask8_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_mask_roundtrip::<S, mask64x2<S>, __mmask8, 2>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask8x32_mmask32_roundtrip<S: Simd>(simd: S) {
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0x0000_0000);
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0x0000_0001);
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0x8000_0000);
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0x0000_ffff);
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0xffff_0000);
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0x5555_5555);
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0xaaaa_aaaa);
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0x8000_aa55);
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0xffff_ffff);
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0xffff_ffff_0000_0000);
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0xffff_ffff_8000_aa55);
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0xffff_ffff_ffff_ffff);
+}
+
+#[simd_test]
+fn mask16x16_mmask16_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_mask_roundtrip::<S, mask16x16<S>, __mmask16, 16>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask32x8_mmask8_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_mask_roundtrip::<S, mask32x8<S>, __mmask8, 8>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask64x4_mmask8_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_mask_roundtrip::<S, mask64x4<S>, __mmask8, 4>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask8x64_mmask64_roundtrip<S: Simd>(simd: S) {
+    assert_native_mask_roundtrip::<S, mask8x64<S>, __mmask64, 64>(simd, 0x0000_0000_0000_0000);
+    assert_native_mask_roundtrip::<S, mask8x64<S>, __mmask64, 64>(simd, 0x0000_0000_0000_0001);
+    assert_native_mask_roundtrip::<S, mask8x64<S>, __mmask64, 64>(simd, 0x8000_0000_0000_0000);
+    assert_native_mask_roundtrip::<S, mask8x64<S>, __mmask64, 64>(simd, 0x0000_0000_ffff_ffff);
+    assert_native_mask_roundtrip::<S, mask8x64<S>, __mmask64, 64>(simd, 0xffff_ffff_0000_0000);
+    assert_native_mask_roundtrip::<S, mask8x64<S>, __mmask64, 64>(simd, 0x5555_5555_5555_5555);
+    assert_native_mask_roundtrip::<S, mask8x64<S>, __mmask64, 64>(simd, 0xaaaa_aaaa_aaaa_aaaa);
+    assert_native_mask_roundtrip::<S, mask8x64<S>, __mmask64, 64>(simd, 0x8000_0001_5555_aaab);
+    assert_native_mask_roundtrip::<S, mask8x64<S>, __mmask64, 64>(simd, 0xffff_ffff_ffff_ffff);
+}
+
+#[simd_test]
+fn mask16x32_mmask32_roundtrip<S: Simd>(simd: S) {
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0x0000_0000);
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0x0000_0001);
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0x8000_0000);
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0x0000_ffff);
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0xffff_0000);
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0x5555_5555);
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0xaaaa_aaaa);
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0x8000_aa55);
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0xffff_ffff);
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0xffff_ffff_0000_0000);
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0xffff_ffff_8000_aa55);
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0xffff_ffff_ffff_ffff);
+}
+
+#[simd_test]
+fn mask32x16_mmask16_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_mask_roundtrip::<S, mask32x16<S>, __mmask16, 16>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask64x8_mmask8_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_mask_roundtrip::<S, mask64x8<S>, __mmask8, 8>(simd, bits);
+    }
+}
diff --git a/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs b/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs
index 2d01f40e9..e82ac078e 100644
--- a/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs
@@ -65,6 +65,30 @@ fn sqrt_f32x8<S: Simd>(simd: S) {
     );
 }
 
+#[simd_test]
+fn approximate_recip_f32x8<S: Simd>(simd: S) {
+    let a = f32x8::from_slice(simd, &[1.0, -2.0, 23.0, 9.0, 3.5, -7.25, 13.0, 0.25]);
+    let result = a.approximate_recip();
+    let expected = [
+        1.0,
+        -0.5,
+        1. / 23.,
+        1. / 9.,
+        1. / 3.5,
+        1. / -7.25,
+        1. / 13.,
+        4.0,
+    ];
+    for i in 0..8 {
+        let rel_error = ((result[i] - expected[i]) / expected[i]).abs();
+        assert!(
+            rel_error < 0.005,
+            "approximate_recip({}) rel_error = {rel_error}",
+            a[i]
+        );
+    }
+}
+
 #[simd_test]
 fn div_f32x8<S: Simd>(simd: S) {
     let a = f32x8::from_slice(simd, &[4.0, 2.0, 1.0, 0.0, 10.0, 12.0, 15.0, 20.0]);
@@ -235,6 +259,44 @@ fn min_precise_f32x8_with_nan<S: Simd>(simd: S) {
     assert_eq!(result[7], 5.0);
 }
 
+#[simd_test]
+fn max_precise_f64x4<S: Simd>(simd: S) { // finite inputs: each lane holds the larger operand
+    let a = f64x4::from_slice(simd, &[2.0, -3.0, 0.0, 0.5]);
+    let b = f64x4::from_slice(simd, &[1.0, -2.0, 7.0, 3.0]);
+    assert_eq!(*a.max_precise(b), [2.0, -2.0, 7.0, 3.0]);
+}
+
+#[simd_test]
+fn min_precise_f64x4<S: Simd>(simd: S) { // finite inputs: each lane holds the smaller operand
+    let a = f64x4::from_slice(simd, &[2.0, -3.0, 0.0, 0.5]);
+    let b = f64x4::from_slice(simd, &[1.0, -2.0, 7.0, 3.0]);
+    assert_eq!(*a.min_precise(b), [1.0, -3.0, 0.0, 0.5]);
+}
+
+#[simd_test]
+fn max_precise_f64x4_with_nan<S: Simd>(simd: S) {
+    let a = f64x4::from_slice(simd, &[f64::NAN, -3.0, f64::INFINITY, 0.5]);
+    let b = f64x4::from_slice(simd, &[1.0, f64::NAN, 7.0, f64::NEG_INFINITY]);
+    let result = a.max_precise(b);
+
+    assert_eq!(result[0], 1.0); // `a` is NaN in this lane, so `b`'s value is expected
+    assert_eq!(result[1], -3.0); // `b` is NaN in this lane, so `a`'s value is expected
+    assert_eq!(result[2], f64::INFINITY); // +inf beats any finite value
+    assert_eq!(result[3], 0.5); // any finite value beats -inf
+}
+
+#[simd_test]
+fn min_precise_f64x4_with_nan<S: Simd>(simd: S) {
+    let a = f64x4::from_slice(simd, &[f64::NAN, -3.0, f64::INFINITY, 0.5]);
+    let b = f64x4::from_slice(simd, &[1.0, f64::NAN, 7.0, f64::NEG_INFINITY]);
+    let result = a.min_precise(b);
+
+    assert_eq!(result[0], 1.0); // `a` is NaN in this lane, so `b`'s value is expected
+    assert_eq!(result[1], -3.0); // `b` is NaN in this lane, so `a`'s value is expected
+    assert_eq!(result[2], 7.0); // any finite value is below +inf
+    assert_eq!(result[3], f64::NEG_INFINITY); // -inf is below any finite value
+}
+
 #[simd_test]
 fn floor_f32x8<S: Simd>(simd: S) {
     let a = f32x8::from_slice(simd, &[2.0, -3.2, 0.0, 0.5, 1.7, -2.8, 3.1, -4.9]);
@@ -315,6 +377,55 @@ fn trunc_f32x8_special_values<S: Simd>(simd: S) {
     );
 }
 
+#[simd_test]
+fn cvt_u32_f32x8<S: Simd>(simd: S) { // fractions are dropped; 3e9 is above i32::MAX yet fits u32
+    let a = f32x8::from_slice(simd, &[1.0, 42.7, 3e9, -0.3, 0.0, 17.9, 255.99, 1024.1]);
+    assert_eq!(
+        *a.to_int::<u32x8<_>>(),
+        [1, 42, 3000000000, 0, 0, 17, 255, 1024]
+    );
+}
+
+#[simd_test]
+fn cvt_u32_precise_f32x8<S: Simd>(simd: S) { // expects: negatives/NaN -> 0, too large -> u32::MAX
+    let a = f32x8::from_slice(
+        simd,
+        &[-1.0, 42.7, 5e9, f32::NAN, 0.0, 1.9, 3000000000.0, -5e9],
+    );
+    assert_eq!(
+        *a.to_int_precise::<u32x8<_>>(),
+        [0, 42, u32::MAX, 0, 0, 1, 3000000000, 0]
+    );
+}
+
+#[simd_test]
+fn cvt_u32_f32x8_rounding<S: Simd>(simd: S) { // expects rounding toward zero (0.99 -> 0, 2.5 -> 2)
+    let a = f32x8::from_slice(simd, &[0.0, 0.49, 0.51, 0.99, 1.01, 1.99, 2.5, 3.75]);
+    assert_eq!(*a.to_int::<u32x8<_>>(), [0, 0, 0, 0, 1, 1, 2, 3]);
+}
+
+#[simd_test]
+fn cvt_u32_precise_f32x8_inf<S: Simd>(simd: S) { // edge cases: NaN, both infinities, u32 limits
+    let a = f32x8::from_slice(
+        simd,
+        &[
+            -10.3,
+            f32::NAN,
+            f32::INFINITY,
+            f32::NEG_INFINITY,
+            u32::MAX as f32, // not representable in f32; the cast rounds up to 2^32
+            4294967040.0, // 2^32 - 256, the largest f32 below 2^32
+            4294967296.0, // exactly 2^32, one past the u32 range
+            -0.5,
+        ],
+    );
+
+    assert_eq!(
+        *a.to_int_precise::<u32x8<_>>(),
+        [0, 0, u32::MAX, u32::MIN, u32::MAX, 4294967040, u32::MAX, 0]
+    );
+}
+
 #[simd_test]
 fn select_f32x8<S: Simd>(simd: S) {
     let a = f32x8::from_slice(simd, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs
index 727b5b078..d75ec80af 100644
--- a/fearless_simd_tests/tests/harness/mod.rs
+++ b/fearless_simd_tests/tests/harness/mod.rs
@@ -204,6 +204,40 @@ fn min_precise_f32x4_with_nan<S: Simd>(simd: S) {
     assert_eq!(result[3], f32::NEG_INFINITY);
 }
 
+#[simd_test]
+fn max_precise_f64x2<S: Simd>(simd: S) { // finite inputs: each lane holds the larger operand
+    let a = f64x2::from_slice(simd, &[2.0, -3.0]);
+    let b = f64x2::from_slice(simd, &[1.0, -2.0]);
+    assert_eq!(*a.max_precise(b), [2.0, -2.0]);
+}
+
+#[simd_test]
+fn min_precise_f64x2<S: Simd>(simd: S) { // finite inputs: each lane holds the smaller operand
+    let a = f64x2::from_slice(simd, &[2.0, -3.0]);
+    let b = f64x2::from_slice(simd, &[1.0, -2.0]);
+    assert_eq!(*a.min_precise(b), [1.0, -3.0]);
+}
+
+#[simd_test]
+fn max_precise_f64x2_with_nan<S: Simd>(simd: S) {
+    let a = f64x2::from_slice(simd, &[f64::NAN, -3.0]);
+    let b = f64x2::from_slice(simd, &[1.0, f64::NAN]);
+    let result = a.max_precise(b);
+
+    assert_eq!(result[0], 1.0); // `a` is NaN in this lane, so `b`'s value is expected
+    assert_eq!(result[1], -3.0); // `b` is NaN in this lane, so `a`'s value is expected
+}
+
+#[simd_test]
+fn min_precise_f64x2_with_nan<S: Simd>(simd: S) {
+    let a = f64x2::from_slice(simd, &[f64::NAN, -3.0]);
+    let b = f64x2::from_slice(simd, &[1.0, f64::NAN]);
+    let result = a.min_precise(b);
+
+    assert_eq!(result[0], 1.0); // `a` is NaN in this lane, so `b`'s value is expected
+    assert_eq!(result[1], -3.0); // `b` is NaN in this lane, so `a`'s value is expected
+}
+
 #[simd_test]
 fn floor_f32x4<S: Simd>(simd: S) {
     let a = f32x4::from_slice(simd, &[2.0, -3.2, 0.0, 0.5]);
@@ -694,47 +728,63 @@ fn combine_u8x16<S: Simd>(simd: S) {
 
 #[simd_test]
 fn and_mask8x16<S: Simd>(simd: S) {
-    let a = mask8x16::from_slice(simd, &[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);
+    let a = mask8x16::from_slice(
+        simd,
+        &[-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0], // lanes are only -1 or 0
+    );
     let b = mask8x16::from_slice(
         simd,
         &[
-            85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // all lanes -1
         ],
     );
     assert_eq!(
         <[i8; 16]>::from(a & b),
-        [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
+        [-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0]
     );
 }
 
 #[simd_test]
 fn or_mask8x16<S: Simd>(simd: S) {
-    let a = mask8x16::from_slice(simd, &[0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8]);
-    let b = mask8x16::from_slice(simd, &[1, 1, 1, 1, 2, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0]);
+    let a = mask8x16::from_slice(
+        simd,
+        &[0, -1, 0, -1, 0, -1, 0, -1, -1, 0, -1, 0, -1, 0, -1, 0], // lanes are only -1 or 0
+    );
+    let b = mask8x16::from_slice(
+        simd,
+        &[0, 0, -1, -1, 0, 0, -1, -1, 0, -1, 0, -1, 0, -1, 0, -1],
+    );
     assert_eq!(
         <[i8; 16]>::from(a | b),
-        [1, 1, 3, 3, 6, 7, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8]
+        [0, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1] // 0 only where both are 0
     );
 }
 
 #[simd_test]
 fn xor_mask8x16<S: Simd>(simd: S) {
-    let a = mask8x16::from_slice(simd, &[0, 1, 2, 3, 4, 5, 6, 7, 1, 1, 1, 1, 0, 0, 0, 0]);
-    let b = mask8x16::from_slice(simd, &[1, 1, 0, 0, 5, 4, 7, 6, 1, 0, 1, 0, 1, 0, 1, 0]);
+    let a = mask8x16::from_slice(
+        simd,
+        &[0, -1, -1, 0, -1, 0, 0, -1, -1, -1, 0, 0, -1, -1, 0, 0], // lanes are only -1 or 0
+    );
+    let b = mask8x16::from_slice(
+        simd,
+        &[-1, -1, 0, 0, -1, -1, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0],
+    );
     assert_eq!(
         <[i8; 16]>::from(a ^ b),
-        [1, 0, 2, 3, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0]
+        [-1, 0, -1, 0, 0, -1, 0, -1, 0, -1, -1, 0, 0, -1, -1, 0] // -1 where `a` and `b` differ
     );
 }
 
 #[simd_test]
 fn not_mask8x16<S: Simd>(simd: S) {
-    let a = mask8x16::from_slice(simd, &[0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8]);
+    let a = mask8x16::from_slice(
+        simd,
+        &[0, -1, -1, 0, -1, 0, 0, -1, -1, 0, -1, 0, 0, -1, 0, -1], // lanes are only -1 or 0
+    );
     assert_eq!(
         <[i8; 16]>::from(!a),
-        [
-            -1, -2, -3, -4, -5, -6, -7, -8, -2, -3, -4, -5, -6, -7, -8, -9
-        ]
+        [-1, 0, 0, -1, 0, -1, -1, 0, 0, -1, 0, -1, -1, 0, -1, 0] // every lane flipped
     );
 }
 
@@ -816,6 +866,51 @@ fn all_false_mask8x16<S: Simd>(simd: S) {
     assert!(!simd.all_false_mask8x16(one_neg));
 }
 
+#[simd_test]
+fn load_interleaved_128_f32x16<S: Simd>(simd: S) {
+    let data: [f32; 16] = [
+        0.0,
+        f32::NAN,
+        f32::INFINITY,
+        -3.0,
+        4.0,
+        -0.0,
+        6.0,
+        f32::NEG_INFINITY,
+        8.0,
+        9.0,
+        -10.0,
+        11.0,
+        f32::MIN,
+        13.0,
+        f32::MAX,
+        15.0,
+    ];
+    let result = simd.load_interleaved_128_f32x16(&data); // special values must pass through intact
+
+    let expected = [ // `data` read as a 4x4 row-major matrix and transposed
+        0.0,
+        4.0,
+        8.0,
+        f32::MIN,
+        f32::NAN,
+        -0.0,
+        9.0,
+        13.0,
+        f32::INFINITY,
+        6.0,
+        -10.0,
+        f32::MAX,
+        -3.0,
+        f32::NEG_INFINITY,
+        11.0,
+        15.0,
+    ];
+
+    // Note: f32::NAN != f32::NAN hence we compare the bit pattern.
+    assert_eq!((*result).map(f32::to_bits), expected.map(f32::to_bits));
+}
+
 #[simd_test]
 fn load_interleaved_128_u32x16<S: Simd>(simd: S) {
     #[rustfmt::skip]
@@ -2419,6 +2514,139 @@ fn shlv_u32x4_varied<S: Simd>(simd: S) {
     );
 }
 
+#[simd_test]
+fn shlv_i8x16<S: Simd>(simd: S) { // per-lane `<<`; results wrap within 8 bits (64 << 1 == -128)
+    let a = i8x16::from_slice(
+        simd,
+        &[64, 65, -64, -65, 1, 2, 3, 4, -1, -2, -3, -4, 15, 16, 31, 32],
+    );
+    let shifts = i8x16::from_slice(simd, &[1, 2, 1, 2, 0, 1, 2, 3, 1, 2, 3, 4, 3, 2, 1, 0]);
+    assert_eq!(
+        *(a << shifts),
+        [
+            -128, 4, -128, -4, 1, 4, 12, 32, -2, -8, -24, -64, 120, 64, 62, 32
+        ]
+    );
+}
+
+#[simd_test]
+fn shrv_i8x16<S: Simd>(simd: S) { // signed per-lane `>>` keeps the sign (-128 >> 1 == -64)
+    let a = i8x16::from_slice(
+        simd,
+        &[
+            -128, -64, -33, -1, 127, 64, 33, 1, -2, -4, -8, -16, 0, 2, 4, 8,
+        ],
+    );
+    let shifts = i8x16::from_slice(simd, &[1, 2, 3, 7, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 2, 3]);
+    assert_eq!(
+        *(a >> shifts),
+        [-64, -16, -5, -1, 63, 16, 4, 1, -1, -1, -1, -1, 0, 1, 1, 1]
+    );
+}
+
+#[simd_test]
+fn shlv_u8x16<S: Simd>(simd: S) { // per-lane `<<`; overflowing bits are dropped (128 << 1 == 0)
+    let a = u8x16::from_slice(
+        simd,
+        &[255, 128, 64, 32, 16, 8, 4, 2, 1, 3, 5, 7, 15, 31, 63, 127],
+    );
+    let shifts = u8x16::from_slice(simd, &[4, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 3, 2, 1]);
+    assert_eq!(
+        *(a << shifts),
+        [240, 0, 0, 0, 0, 0, 0, 0, 1, 6, 20, 56, 240, 248, 252, 254]
+    );
+}
+
+#[simd_test]
+fn shrv_u8x16<S: Simd>(simd: S) { // unsigned per-lane `>>` shifts in zeros (255 >> 1 == 127)
+    let a = u8x16::from_slice(
+        simd,
+        &[255, 128, 64, 32, 16, 8, 4, 2, 1, 3, 5, 7, 15, 31, 63, 127],
+    );
+    let shifts = u8x16::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 1, 0, 1, 2, 3, 4, 3, 2, 1]);
+    assert_eq!(
+        *(a >> shifts),
+        [127, 32, 8, 2, 0, 0, 0, 1, 1, 1, 1, 0, 0, 3, 15, 63]
+    );
+}
+
+#[simd_test]
+fn shlv_i16x8<S: Simd>(simd: S) { // per-lane `<<`; results wrap in 16 bits (1 << 15 == -32768)
+    let a = i16x8::from_slice(simd, &[16384, 8192, -16384, -8192, 1, -1, 255, -256]);
+    let shifts = i16x8::from_slice(simd, &[1, 2, 1, 2, 15, 1, 4, 3]);
+    assert_eq!(
+        *(a << shifts),
+        [-32768, -32768, -32768, -32768, -32768, -2, 4080, -2048]
+    );
+}
+
+#[simd_test]
+fn shrv_i16x8<S: Simd>(simd: S) { // signed per-lane `>>` keeps the sign (-1 >> 15 == -1)
+    let a = i16x8::from_slice(simd, &[-32768, -16384, -1025, -1, 32767, 16384, 1025, 1]);
+    let shifts = i16x8::from_slice(simd, &[1, 2, 3, 15, 1, 2, 3, 0]);
+    assert_eq!(
+        *(a >> shifts),
+        [-16384, -4096, -129, -1, 16383, 4096, 128, 1]
+    );
+}
+
+#[simd_test]
+fn shlv_u16x8<S: Simd>(simd: S) { // per-lane `<<`; overflowing bits are dropped (32768 << 1 == 0)
+    let a = u16x8::from_slice(simd, &[65535, 32768, 16384, 8192, 1, 255, 1024, 4096]);
+    let shifts = u16x8::from_slice(simd, &[4, 1, 2, 3, 15, 4, 5, 0]);
+    assert_eq!(*(a << shifts), [65520, 0, 0, 0, 32768, 4080, 32768, 4096]);
+}
+
+#[simd_test]
+fn shrv_u16x8<S: Simd>(simd: S) { // unsigned per-lane `>>` shifts in zeros (4096 >> 12 == 1)
+    let a = u16x8::from_slice(simd, &[65535, 32768, 16384, 8192, 1, 255, 1024, 4096]);
+    let shifts = u16x8::from_slice(simd, &[1, 2, 3, 4, 0, 4, 5, 12]);
+    assert_eq!(*(a >> shifts), [32767, 8192, 2048, 512, 1, 15, 32, 1]);
+}
+
+#[simd_test]
+fn shlv_u8x32<S: Simd>(simd: S) { // one 16-lane pattern repeated, so both halves must agree
+    let a = u8x32::from_slice(
+        simd,
+        &[
+            255, 128, 64, 32, 16, 8, 4, 2, 1, 3, 5, 7, 15, 31, 63, 127, 255, 128, 64, 32, 16, 8, 4,
+            2, 1, 3, 5, 7, 15, 31, 63, 127,
+        ],
+    );
+    let shifts = u8x32::from_slice(
+        simd,
+        &[
+            4, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 3, 2, 1, 4, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4,
+            3, 2, 1,
+        ],
+    );
+    assert_eq!(
+        *(a << shifts),
+        [
+            240, 0, 0, 0, 0, 0, 0, 0, 1, 6, 20, 56, 240, 248, 252, 254, 240, 0, 0, 0, 0, 0, 0, 0,
+            1, 6, 20, 56, 240, 248, 252, 254
+        ]
+    );
+}
+
+#[simd_test]
+fn shlv_u16x16<S: Simd>(simd: S) { // one 8-lane pattern repeated, so both halves must agree
+    let a = u16x16::from_slice(
+        simd,
+        &[
+            65535, 32768, 16384, 8192, 1, 255, 1024, 4096, 65535, 32768, 16384, 8192, 1, 255, 1024,
+            4096,
+        ],
+    );
+    let shifts = u16x16::from_slice(simd, &[4, 1, 2, 3, 15, 4, 5, 0, 4, 1, 2, 3, 15, 4, 5, 0]);
+    assert_eq!(
+        *(a << shifts),
+        [
+            65520, 0, 0, 0, 32768, 4080, 32768, 4096, 65520, 0, 0, 0, 32768, 4080, 32768, 4096
+        ]
+    );
+}
+
 #[simd_test]
 fn add_i16x8<S: Simd>(simd: S) {
     let a = i16x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]);
@@ -3201,10 +3429,25 @@ fn sqrt_f64x2<S: Simd>(simd: S) {
 
 #[simd_test]
 fn approximate_recip_f64x2<S: Simd>(simd: S) {
+    let input = f64x2::from_slice(simd, &[1.0, -2.0]);
+    let approx = input.approximate_recip();
+    let exact = [1.0, -0.5];
+    for (lane, &want) in exact.iter().enumerate() { // each lane must be within 0.5% of 1/x
+        let rel_error = ((approx[lane] - want) / want).abs();
+        assert!(
+            rel_error < 0.005,
+            "approximate_recip({}) rel_error = {rel_error}",
+            input[lane]
+        );
+    }
+}
+
+#[simd_test]
+fn approximate_recip_f64x4<S: Simd>(simd: S) {
     let a = f64x4::from_slice(simd, &[1.0, -2.0, 23.0, 9.0]);
     let result = a.approximate_recip();
     let expected = [1.0, -0.5, 1. / 23., 1. / 9.];
-    for i in 0..2 {
+    for i in 0..4 {
         let rel_error = ((result[i] - expected[i]) / expected[i]).abs();
         assert!(
             rel_error < 0.005,
diff --git a/fearless_simd_tests/tests/harness/slide_exhaustive.rs b/fearless_simd_tests/tests/harness/slide_exhaustive.rs
index 1b82d4548..f41752646 100644
--- a/fearless_simd_tests/tests/harness/slide_exhaustive.rs
+++ b/fearless_simd_tests/tests/harness/slide_exhaustive.rs
@@ -251,42 +251,3 @@ test_slide_exhaustive!(slide_exhaustive_i16x32, i16x32, i16, 32, vec32, block8);
 test_slide_exhaustive!(slide_exhaustive_u16x32, u16x32, u16, 32, vec32, block8);
 test_slide_exhaustive!(slide_exhaustive_i32x16, i32x16, i32, 16, vec16, block4);
 test_slide_exhaustive!(slide_exhaustive_u32x16, u32x16, u32, 16, vec16, block4);
-
-// Mask types (128-bit)
-test_slide_exhaustive!(slide_exhaustive_mask8x16, mask8x16, i8, 16, vec16, block16);
-test_slide_exhaustive!(slide_exhaustive_mask16x8, mask16x8, i16, 8, vec8, block8);
-test_slide_exhaustive!(slide_exhaustive_mask32x4, mask32x4, i32, 4, vec4, block4);
-test_slide_exhaustive!(slide_exhaustive_mask64x2, mask64x2, i64, 2, vec2, block2);
-
-// Mask types (256-bit)
-test_slide_exhaustive!(slide_exhaustive_mask8x32, mask8x32, i8, 32, vec32, block16);
-test_slide_exhaustive!(
-    slide_exhaustive_mask16x16,
-    mask16x16,
-    i16,
-    16,
-    vec16,
-    block8
-);
-test_slide_exhaustive!(slide_exhaustive_mask32x8, mask32x8, i32, 8, vec8, block4);
-test_slide_exhaustive!(slide_exhaustive_mask64x4, mask64x4, i64, 4, vec4, block2);
-
-// Mask types (512-bit)
-test_slide_exhaustive!(slide_exhaustive_mask8x64, mask8x64, i8, 64, vec64, block16);
-test_slide_exhaustive!(
-    slide_exhaustive_mask16x32,
-    mask16x32,
-    i16,
-    32,
-    vec32,
-    block8
-);
-test_slide_exhaustive!(
-    slide_exhaustive_mask32x16,
-    mask32x16,
-    i32,
-    16,
-    vec16,
-    block4
-);
-test_slide_exhaustive!(slide_exhaustive_mask64x8, mask64x8, i64, 8, vec8, block2);
diff --git a/fearless_simd_tests/tests/mod.rs b/fearless_simd_tests/tests/mod.rs
index 4d2f053d8..bd64c14c9 100644
--- a/fearless_simd_tests/tests/mod.rs
+++ b/fearless_simd_tests/tests/mod.rs
@@ -5,13 +5,53 @@
     missing_docs,
     reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
 )]
+#![allow(
+    clippy::disallowed_methods,
+    reason = "fearless_simd_tests has test-only transmute helpers that should not be forced through the library's private checked transmute machinery"
+)]
 
 use fearless_simd::*;
 use fearless_simd_dev_macros::simd_test;
 
 mod harness;
+#[cfg(not(miri))] // too slow
 mod soundness;
 
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+fn x86_detects_icelake_avx512() -> bool { // true iff every feature below is detected at runtime
+    std::arch::is_x86_feature_detected!("adx")
+        && std::arch::is_x86_feature_detected!("aes")
+        && std::arch::is_x86_feature_detected!("avx512bitalg")
+        && std::arch::is_x86_feature_detected!("avx512bw")
+        && std::arch::is_x86_feature_detected!("avx512cd")
+        && std::arch::is_x86_feature_detected!("avx512dq")
+        && std::arch::is_x86_feature_detected!("avx512f")
+        && std::arch::is_x86_feature_detected!("avx512ifma")
+        && std::arch::is_x86_feature_detected!("avx512vbmi")
+        && std::arch::is_x86_feature_detected!("avx512vbmi2")
+        && std::arch::is_x86_feature_detected!("avx512vl")
+        && std::arch::is_x86_feature_detected!("avx512vnni")
+        && std::arch::is_x86_feature_detected!("avx512vpopcntdq")
+        && std::arch::is_x86_feature_detected!("bmi1")
+        && std::arch::is_x86_feature_detected!("bmi2")
+        && std::arch::is_x86_feature_detected!("cmpxchg16b")
+        && std::arch::is_x86_feature_detected!("fma")
+        && std::arch::is_x86_feature_detected!("gfni")
+        && std::arch::is_x86_feature_detected!("lzcnt")
+        && std::arch::is_x86_feature_detected!("movbe")
+        && std::arch::is_x86_feature_detected!("pclmulqdq")
+        && std::arch::is_x86_feature_detected!("popcnt")
+        && std::arch::is_x86_feature_detected!("rdrand")
+        && std::arch::is_x86_feature_detected!("rdseed")
+        && std::arch::is_x86_feature_detected!("sha")
+        && std::arch::is_x86_feature_detected!("vaes")
+        && std::arch::is_x86_feature_detected!("vpclmulqdq")
+        && std::arch::is_x86_feature_detected!("xsave")
+        && std::arch::is_x86_feature_detected!("xsavec")
+        && std::arch::is_x86_feature_detected!("xsaveopt")
+        && std::arch::is_x86_feature_detected!("xsaves")
+}
+
 // Ensure that we can cast between generic native-width vectors
 #[expect(dead_code, reason = "Compile only test")]
 fn generic_cast<S: Simd>(x: S::f32s) -> S::u32s {
@@ -45,7 +85,7 @@ fn supports_highest_level() {
         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
         assert!(
             level.as_avx2().is_some(),
-            "This machine does not support every `Level` supported by Fearless SIMD (currently AVX2 and below).\n{UNSUPPORTED_LEVEL_MESSAGE}",
+            "This machine does not support every routinely local-tested x86 `Level` supported by Fearless SIMD (currently AVX2 and below; AVX-512 is covered by the SDE CI job).\n{UNSUPPORTED_LEVEL_MESSAGE}",
         );
 
         #[cfg(target_arch = "aarch64")]
@@ -62,6 +102,53 @@ fn supports_highest_level() {
     );
 }
 
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[test]
+fn detects_avx512_when_available() {
+    if !x86_detects_icelake_avx512() {
+        return; // this CPU lacks part of the feature list, so there is nothing to assert
+    }
+
+    let level = Level::new();
+    assert!(
+        level.as_avx512().is_some(),
+        "Ice Lake AVX-512 should be selected when all required features are available"
+    );
+    assert!(
+        level.as_avx2().is_some(), // the same level must also hand out the AVX2 token
+        "AVX-512 should downgrade to an AVX2 proof"
+    );
+    assert!(
+        level.as_sse4_2().is_some(), // ... and the SSE4.2 token
+        "AVX-512 should downgrade to an SSE4.2 proof"
+    );
+}
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[test]
+fn avx512_masks_are_compact() { // each mask type must match its `__mmaskN` counterpart in size
+    #[cfg(target_arch = "x86")]
+    use core::arch::x86::*;
+    #[cfg(target_arch = "x86_64")]
+    use core::arch::x86_64::*;
+    use std::mem::size_of;
+
+    type A = Avx512; // shorthand to keep the assertions below on one line each
+
+    assert_eq!(size_of::<mask8x16<A>>(), size_of::<__mmask16>());
+    assert_eq!(size_of::<mask16x8<A>>(), size_of::<__mmask8>());
+    assert_eq!(size_of::<mask32x4<A>>(), size_of::<__mmask8>()); // fewer than 8 lanes still use __mmask8
+    assert_eq!(size_of::<mask64x2<A>>(), size_of::<__mmask8>());
+    assert_eq!(size_of::<mask8x32<A>>(), size_of::<__mmask32>());
+    assert_eq!(size_of::<mask16x16<A>>(), size_of::<__mmask16>());
+    assert_eq!(size_of::<mask32x8<A>>(), size_of::<__mmask8>());
+    assert_eq!(size_of::<mask64x4<A>>(), size_of::<__mmask8>());
+    assert_eq!(size_of::<mask8x64<A>>(), size_of::<__mmask64>());
+    assert_eq!(size_of::<mask16x32<A>>(), size_of::<__mmask32>());
+    assert_eq!(size_of::<mask32x16<A>>(), size_of::<__mmask16>());
+    assert_eq!(size_of::<mask64x8<A>>(), size_of::<__mmask8>());
+}
+
 #[simd_test]
 #[ignore]
 fn test_f32_to_i32_precise_exhaustive<S: Simd>(simd: S) {