From e80c786ca7a7f77f44263932a94d37ee9cbaedbc Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Wed, 27 May 2026 21:00:15 +0100
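Subject: [PATCH] Replace remaining NEON unsafe loads with
 checked_transmute_copy

The NEON bitmask helpers built their constant shift/weight vectors by
calling vld1q_* on a pointer to a temporary array. Construct them with
crate::transmute::checked_transmute_copy instead, in both the generator
(mk_neon.rs) and the regenerated output (generated/neon.rs).

uint64x2_t is added to the SimdPod impls so that the [u64; 2] weights
used by to_bitmask_mask64x2 can go through the same helper.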
---
 fearless_simd/src/generated/neon.rs | 27 ++++++++++------
 fearless_simd/src/transmute.rs      |  2 ++
 fearless_simd_gen/src/mk_neon.rs    | 48 +++++++++++++++++++++--------
 3 files changed, 55 insertions(+), 22 deletions(-)
diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs
index b848f69cd..f89f760f7 100644
--- a/fearless_simd/src/generated/neon.rs
+++ b/fearless_simd/src/generated/neon.rs
@@ -780,7 +780,9 @@ impl Simd for Neon {
     #[inline(always)]
     fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16<Self> {
         unsafe {
-            let shifts = vld1q_s16([15, 14, 13, 12, 11, 10, 9, 8].as_ptr());
+            let shifts = crate::transmute::checked_transmute_copy::<[i16; 8], int16x8_t>(&[
+                15, 14, 13, 12, 11, 10, 9, 8,
+            ]);
             let lo = vshlq_u16(vdupq_n_u16(bits as u16), shifts);
             let hi = vshlq_u16(vdupq_n_u16((bits >> 8) as u16), shifts);
             let lo = vcltq_s16(vreinterpretq_s16_u16(lo), vdupq_n_s16(0));
@@ -795,8 +797,9 @@ impl Simd for Neon {
     #[inline(always)]
     fn to_bitmask_mask8x16(self, a: mask8x16<Self>) -> u64 {
         unsafe {
-            let weights =
-                vld1q_u8([1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128].as_ptr());
+            let weights = crate::transmute::checked_transmute_copy::<[u8; 16], uint8x16_t>(&[
+                1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128,
+            ]);
             let bits = vandq_u8(vreinterpretq_u8_s8(a.into()), weights);
             let lo = vaddv_u8(vget_low_u8(bits)) as u64;
             let hi = vaddv_u8(vget_high_u8(bits)) as u64;
@@ -1284,7 +1287,9 @@ impl Simd for Neon {
     #[inline(always)]
     fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8<Self> {
         unsafe {
-            let shifts = vld1q_s16([15, 14, 13, 12, 11, 10, 9, 8].as_ptr());
+            let shifts = crate::transmute::checked_transmute_copy::<[i16; 8], int16x8_t>(&[
+                15, 14, 13, 12, 11, 10, 9, 8,
+            ]);
             let shifted = vshlq_u16(vdupq_n_u16(bits as u16), shifts);
             let mask = vcltq_s16(vreinterpretq_s16_u16(shifted), vdupq_n_s16(0));
             vreinterpretq_s16_u16(mask).simd_into(self)
@@ -1293,7 +1298,9 @@ impl Simd for Neon {
     #[inline(always)]
     fn to_bitmask_mask16x8(self, a: mask16x8<Self>) -> u64 {
         unsafe {
-            let weights = vld1q_u16([1, 2, 4, 8, 16, 32, 64, 128].as_ptr());
+            let weights = crate::transmute::checked_transmute_copy::<[u16; 8], uint16x8_t>(&[
+                1, 2, 4, 8, 16, 32, 64, 128,
+            ]);
             let bits = vandq_u16(vreinterpretq_u16_s16(a.into()), weights);
             vaddvq_u16(bits) as u64
         }
@@ -1783,7 +1790,8 @@ impl Simd for Neon {
     #[inline(always)]
     fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4<Self> {
         unsafe {
-            let shifts = vld1q_s32([31, 30, 29, 28].as_ptr());
+            let shifts =
+                crate::transmute::checked_transmute_copy::<[i32; 4], int32x4_t>(&[31, 30, 29, 28]);
             let shifted = vshlq_u32(vdupq_n_u32(bits as u32), shifts);
             let mask = vcltq_s32(vreinterpretq_s32_u32(shifted), vdupq_n_s32(0));
             vreinterpretq_s32_u32(mask).simd_into(self)
@@ -1792,7 +1800,8 @@ impl Simd for Neon {
     #[inline(always)]
     fn to_bitmask_mask32x4(self, a: mask32x4<Self>) -> u64 {
         unsafe {
-            let weights = vld1q_u32([1, 2, 4, 8].as_ptr());
+            let weights =
+                crate::transmute::checked_transmute_copy::<[u32; 4], uint32x4_t>(&[1, 2, 4, 8]);
             let bits = vandq_u32(vreinterpretq_u32_s32(a.into()), weights);
             vaddvq_u32(bits) as u64
         }
@@ -2103,7 +2112,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
         unsafe {
-            let shifts = vld1q_s64([63, 62].as_ptr());
+            let shifts = crate::transmute::checked_transmute_copy::<[i64; 2], int64x2_t>(&[63, 62]);
             let shifted = vshlq_u64(vdupq_n_u64(bits), shifts);
             let mask = vcltq_s64(vreinterpretq_s64_u64(shifted), vdupq_n_s64(0));
             vreinterpretq_s64_u64(mask).simd_into(self)
@@ -2112,7 +2121,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
         unsafe {
-            let weights = vld1q_u64([1, 2].as_ptr());
+            let weights = crate::transmute::checked_transmute_copy::<[u64; 2], uint64x2_t>(&[1, 2]);
             let bits = vandq_u64(vreinterpretq_u64_s64(a.into()), weights);
             vaddvq_u64(bits)
         }
diff --git a/fearless_simd/src/transmute.rs b/fearless_simd/src/transmute.rs
index 8f8315283..894d672d7 100644
--- a/fearless_simd/src/transmute.rs
+++ b/fearless_simd/src/transmute.rs
@@ -16,6 +16,7 @@ use core::arch::aarch64::{
     int8x16_t, int8x16x2_t, int8x16x4_t, int16x8_t, int16x8x2_t, int16x8x4_t, int32x4_t,
     int32x4x2_t, int32x4x4_t, int64x2_t, int64x2x2_t, int64x2x4_t, uint8x16_t, uint8x16x2_t,
     uint8x16x4_t, uint16x8_t, uint16x8x2_t, uint16x8x4_t, uint32x4_t, uint32x4x2_t, uint32x4x4_t,
+    uint64x2_t,
 };
 #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
 use core::arch::wasm32::v128;
@@ -160,6 +161,7 @@ const _: () = {
     unsafe impl SimdPod for uint32x4_t {}
     unsafe impl SimdPod for uint32x4x2_t {}
     unsafe impl SimdPod for uint32x4x4_t {}
+    unsafe impl SimdPod for uint64x2_t {}
 };
 
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
diff --git a/fearless_simd_gen/src/mk_neon.rs b/fearless_simd_gen/src/mk_neon.rs
index 9765c06df..129d4052f 100644
--- a/fearless_simd_gen/src/mk_neon.rs
+++ b/fearless_simd_gen/src/mk_neon.rs
@@ -580,7 +580,10 @@ impl Neon {
             8 => quote! {
                 #method_sig {
                     unsafe {
-                        let shifts = vld1q_s16([15, 14, 13, 12, 11, 10, 9, 8].as_ptr());
+                        let shifts =
+                            crate::transmute::checked_transmute_copy::<[i16; 8], int16x8_t>(
+                                &[15, 14, 13, 12, 11, 10, 9, 8],
+                            );
                         let lo = vshlq_u16(vdupq_n_u16(bits as u16), shifts);
                         let hi = vshlq_u16(vdupq_n_u16((bits >> 8) as u16), shifts);
                         let lo = vcltq_s16(vreinterpretq_s16_u16(lo), vdupq_n_s16(0));
@@ -595,7 +598,10 @@ impl Neon {
             16 => quote! {
                 #method_sig {
                     unsafe {
-                        let shifts = vld1q_s16([15, 14, 13, 12, 11, 10, 9, 8].as_ptr());
+                        let shifts =
+                            crate::transmute::checked_transmute_copy::<[i16; 8], int16x8_t>(
+                                &[15, 14, 13, 12, 11, 10, 9, 8],
+                            );
                         let shifted = vshlq_u16(vdupq_n_u16(bits as u16), shifts);
                         let mask = vcltq_s16(vreinterpretq_s16_u16(shifted), vdupq_n_s16(0));
                         vreinterpretq_s16_u16(mask).simd_into(self)
@@ -605,7 +611,10 @@ impl Neon {
             32 => quote! {
                 #method_sig {
                     unsafe {
-                        let shifts = vld1q_s32([31, 30, 29, 28].as_ptr());
+                        let shifts =
+                            crate::transmute::checked_transmute_copy::<[i32; 4], int32x4_t>(
+                                &[31, 30, 29, 28],
+                            );
                         let shifted = vshlq_u32(vdupq_n_u32(bits as u32), shifts);
                         let mask = vcltq_s32(vreinterpretq_s32_u32(shifted), vdupq_n_s32(0));
                         vreinterpretq_s32_u32(mask).simd_into(self)
@@ -615,7 +624,10 @@ impl Neon {
             64 => quote! {
                 #method_sig {
                     unsafe {
-                        let shifts = vld1q_s64([63, 62].as_ptr());
+                        let shifts =
+                            crate::transmute::checked_transmute_copy::<[i64; 2], int64x2_t>(
+                                &[63, 62],
+                            );
                         let shifted = vshlq_u64(vdupq_n_u64(bits), shifts);
                         let mask = vcltq_s64(vreinterpretq_s64_u64(shifted), vdupq_n_s64(0));
                         vreinterpretq_s64_u64(mask).simd_into(self)
@@ -642,10 +654,13 @@ impl Neon {
             8 => quote! {
                 #method_sig {
                     unsafe {
-                        let weights = vld1q_u8([
-                            1, 2, 4, 8, 16, 32, 64, 128,
-                            1, 2, 4, 8, 16, 32, 64, 128,
-                        ].as_ptr());
+                        let weights =
+                            crate::transmute::checked_transmute_copy::<[u8; 16], uint8x16_t>(
+                                &[
+                                    1, 2, 4, 8, 16, 32, 64, 128,
+                                    1, 2, 4, 8, 16, 32, 64, 128,
+                                ],
+                            );
                         let bits = vandq_u8(vreinterpretq_u8_s8(a.into()), weights);
                         let lo = vaddv_u8(vget_low_u8(bits)) as u64;
                         let hi = vaddv_u8(vget_high_u8(bits)) as u64;
@@ -656,9 +671,10 @@ impl Neon {
             16 => quote! {
                 #method_sig {
                     unsafe {
-                        let weights = vld1q_u16([
-                            1, 2, 4, 8, 16, 32, 64, 128,
-                        ].as_ptr());
+                        let weights =
+                            crate::transmute::checked_transmute_copy::<[u16; 8], uint16x8_t>(
+                                &[1, 2, 4, 8, 16, 32, 64, 128],
+                            );
                         let bits = vandq_u16(vreinterpretq_u16_s16(a.into()), weights);
                         vaddvq_u16(bits) as u64
                     }
@@ -667,7 +683,10 @@ impl Neon {
             32 => quote! {
                 #method_sig {
                     unsafe {
-                        let weights = vld1q_u32([1, 2, 4, 8].as_ptr());
+                        let weights =
+                            crate::transmute::checked_transmute_copy::<[u32; 4], uint32x4_t>(
+                                &[1, 2, 4, 8],
+                            );
                         let bits = vandq_u32(vreinterpretq_u32_s32(a.into()), weights);
                         vaddvq_u32(bits) as u64
                     }
@@ -676,7 +695,10 @@ impl Neon {
             64 => quote! {
                 #method_sig {
                     unsafe {
-                        let weights = vld1q_u64([1, 2].as_ptr());
+                        let weights =
+                            crate::transmute::checked_transmute_copy::<[u64; 2], uint64x2_t>(
+                                &[1, 2],
+                            );
                         let bits = vandq_u64(vreinterpretq_u64_s64(a.into()), weights);
                         vaddvq_u64(bits)
                     }