From d240cefc2e55533ab368387eda420716b2c96184 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 5 May 2026 14:06:57 -0400 Subject: [PATCH 1/2] feat: dynamic batch_bool masks + avx_128 / avx2_128 mask overloads

Adds runtime batch_bool mask overloads of xsimd::load_masked and xsimd::store_masked across AVX, AVX2, AVX-512, SSE, SVE, RVV, and NEON; architectures without a hardware predicated path go through the generic common-path fallback, which stays scalar so that only active lanes are read or written. SVE compile-time masked load/store forwarded through the runtime path so the per-lane predicate is correct on SVE wider than 128 bits.

Adds arch-specific runtime-mask overloads of load_masked / store_masked for the avx_128 and avx2_128 arches so they use the hardware predicated load/store path on x86.

Squashed from: b57a7667 feat: add runtime batch_bool mask overloads for load_masked/store_masked d5f21c70 feat: add runtime batch_bool mask overloads for avx_128 / avx2_128

--- docs/source/api/data_transfer.rst | 12 +- .../xsimd/arch/common/xsimd_common_memory.hpp | 36 +++++ include/xsimd/arch/xsimd_avx.hpp | 33 +++++ include/xsimd/arch/xsimd_avx2.hpp | 46 +++++-- include/xsimd/arch/xsimd_avx2_128.hpp | 71 ++++++++-- include/xsimd/arch/xsimd_avx_128.hpp | 31 +++++ include/xsimd/arch/xsimd_common_fwd.hpp | 4 + include/xsimd/arch/xsimd_rvv.hpp | 24 ++++ include/xsimd/arch/xsimd_sve.hpp | 43 +++++- include/xsimd/types/xsimd_api.hpp | 64 +++++++++ include/xsimd/types/xsimd_batch.hpp | 28 +++- include/xsimd/types/xsimd_utils.hpp | 10 ++ test/test_load_store.cpp | 123 ++++++++++++------ 13 files changed, 452 insertions(+), 73 deletions(-)

diff --git a/docs/source/api/data_transfer.rst b/docs/source/api/data_transfer.rst index 815f56293..b389a48ad 100644 --- a/docs/source/api/data_transfer.rst +++ b/docs/source/api/data_transfer.rst @@ -12,7 +12,7 @@ Data Transfers From memory: +---------------------------------------+----------------------------------------------------+ -| :cpp:func:`load` | load values from memory (optionally masked) | +| :cpp:func:`load` | load values from memory (optionally masked) [#m]_ | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`load_aligned` | load values from aligned memory | +---------------------------------------+----------------------------------------------------+ @@ -32,7 +32,7 @@ From a scalar: To memory: +---------------------------------------+----------------------------------------------------+ -| :cpp:func:`store` | store values to memory (optionally masked) | +| :cpp:func:`store` | store values to memory (optionally masked) [#m]_ | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`store_aligned` | store values to aligned memory | +---------------------------------------+----------------------------------------------------+ @@ -84,3 +84,11 @@ The following empty types are used for tag dispatching: .. doxygenstruct:: xsimd::unaligned_mode :project: xsimd + +.. rubric:: Footnotes + +.. [#m] Masked ``load`` / ``store`` come in two flavours. The + :cpp:class:`batch_bool_constant` overload encodes the mask in the type and + is resolved at compile time. The runtime :cpp:class:`batch_bool` overload + accepts a mask computed at runtime. Prefer the compile-time mask whenever + the selection is known at compile time. 
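To make the two flavours concrete, here is a minimal usage sketch. The generator struct first_two and the template-argument order of make_batch_bool_constant are illustrative assumptions; the runtime flavour relies only on batch_bool::from_mask, which the test changes below also use.

#include <xsimd/xsimd.hpp>
#include <cstddef>
#include <cstdint>

// Hypothetical compile-time mask generator: lanes 0 and 1 active.
struct first_two
{
    static constexpr bool get(std::size_t i, std::size_t) noexcept { return i < 2; }
};

void demo(const float* src, float* dst, uint64_t runtime_bits)
{
    using batch = xsimd::batch<float>;

    // Compile-time flavour: the mask is encoded in the type.
    constexpr auto cmask = xsimd::make_batch_bool_constant<float, first_two>();
    batch a = batch::load(src, cmask, xsimd::unaligned_mode {});

    // Runtime flavour: bit i of the integer activates lane i.
    auto rmask = xsimd::batch_bool<float>::from_mask(runtime_bits);
    batch b = batch::load(src, rmask, xsimd::unaligned_mode {});

    (a + b).store(dst, rmask, xsimd::unaligned_mode {});
}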
diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index c8038334a..0639f5168 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include "../../types/xsimd_batch_constant.hpp" #include "./xsimd_common_details.hpp" @@ -374,6 +375,23 @@ namespace xsimd return batch::load(buffer.data(), aligned_mode {}); } + template + XSIMD_INLINE batch + load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + // Per-lane validity contract: only active lanes of ``mem`` are + // required to be addressable. An unconditional whole-vector load + // would touch inactive lanes and trip ASan/Valgrind on partial + // buffers, so stay scalar. Arches with hardware predicated loads + // (AVX2 32/64-bit, AVX-512, SVE, RVV) override this with a single + // intrinsic that suppresses inactive-lane reads in hardware. + constexpr std::size_t size = batch::size; + alignas(A::alignment()) std::array buffer; + for (std::size_t i = 0; i < size; ++i) + buffer[i] = mask.get(i) ? mem[i] : T(0); + return batch::load_aligned(buffer.data()); + } + template XSIMD_INLINE void store_masked(T_out* mem, batch const& src, batch_bool_constant, alignment, requires_arch) noexcept @@ -388,6 +406,24 @@ namespace xsimd } } + template + XSIMD_INLINE void + store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + // Per-lane validity contract (matches native masked-store APIs): + // only active lanes of ``mem`` are touched. A load+select+store + // RMW would both read and write inactive bytes, breaking that + // contract — stay scalar. Arches with hardware predicated stores + // override this with a single intrinsic that suppresses inactive + // lanes in hardware. + constexpr std::size_t size = batch::size; + alignas(A::alignment()) std::array src_buf; + src.store_aligned(src_buf.data()); + for (std::size_t i = 0; i < size; ++i) + if (mask.get(i)) + mem[i] = src_buf[i]; + } + template XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 305041f11..429637784 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -1015,6 +1015,23 @@ namespace xsimd } } + // Runtime-mask load for float/double on AVX. Both aligned_mode and + // unaligned_mode map to _mm256_maskload_* — the intrinsic does not fault + // on masked-off lanes, so partial loads across page boundaries are safe. + template + XSIMD_INLINE batch + load_masked(float const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm256_maskload_ps(mem, _mm256_castps_si256(mask)); + } + + template + XSIMD_INLINE batch + load_masked(double const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm256_maskload_pd(mem, _mm256_castpd_si256(mask)); + } + // store_masked namespace detail { @@ -1031,6 +1048,22 @@ namespace xsimd } } + // Runtime-mask store for float/double on AVX. Same fault-suppression + // semantics as the masked loads above; alignment mode is irrelevant. 
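As a concrete picture of that guarantee, a standalone sketch against the raw intrinsic (not library code; assumes an AVX-capable target). Only the sign bit of each 32-bit mask lane is inspected, and masked-off lanes are neither read nor written, so the destination needs to be valid only for the active lanes.

#include <immintrin.h>

// Store only lanes 0..2 of an 8-float vector; dst needs to be valid for just
// three floats, lanes 3..7 are left untouched and cannot fault.
void store_first_three(float* dst, __m256 v)
{
    const __m256i mask = _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0);
    _mm256_maskstore_ps(dst, mask, v);
}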
+ template + XSIMD_INLINE void + store_masked(float* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + _mm256_maskstore_ps(mem, _mm256_castps_si256(mask), src); + } + + template + XSIMD_INLINE void + store_masked(double* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + _mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), src); + } + template XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index ebffd910b..f2666018b 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -118,10 +118,15 @@ namespace xsimd } } - // load_masked - // AVX2 low-level helpers (operate on raw SIMD registers) + // load_masked. AVX2 mask{load,store}_epi{32,64} take int*/long long*; + // these helpers exist so the dispatch picks the right intrinsic by + // pointer width. Static_asserts pin sizeof(int)/(long long) to the + // values the intrinsics expect — the dispatcher relies on it. namespace detail { + static_assert(sizeof(int) == 4, "AVX2 maskload/maskstore expects sizeof(int) == 4"); + static_assert(sizeof(long long) == 8, "AVX2 maskload/maskstore expects sizeof(long long) == 8"); + XSIMD_INLINE __m256i maskload(const int32_t* mem, __m256i mask) noexcept { return _mm256_maskload_epi32(mem, mask); @@ -129,7 +134,7 @@ namespace xsimd XSIMD_INLINE __m256i maskload(const long long* mem, __m256i mask) noexcept { - return _mm256_maskload_epi64(reinterpret_cast(mem), mask); + return _mm256_maskload_epi64(mem, mask); } XSIMD_INLINE __m256i zero_extend(__m128i hi) noexcept @@ -138,14 +143,12 @@ namespace xsimd } } - // single templated implementation for integer masked loads (32/64-bit) template XSIMD_INLINE std::enable_if_t::value && (sizeof(T) >= 4), batch> load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { static_assert(sizeof(T) == 4 || sizeof(T) == 8, "load_masked supports only 32/64-bit integers on AVX2"); using int_t = std::conditional_t; - // Use the raw register-level maskload helpers for the remaining cases. return detail::maskload(reinterpret_cast(mem), mask.as_batch()); } @@ -175,19 +178,31 @@ namespace xsimd return bitwise_cast(r); } + // Runtime-mask load for 32/64-bit integers on AVX2. 8/16-bit integers + // fall back to the scalar common path: AVX2 has no native maskload for + // those widths, and a load-then-blend would break fault-suppression at + // page boundaries (the main reason callers ask for a masked load). + // Both aligned_mode and unaligned_mode route to the same intrinsic — + // masked-off lanes do not fault regardless of alignment. 
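For example, loading a five-element tail of an int32 buffer then goes through a single maskload. A sketch, assuming xsimd::avx2 is available on the target:

#include <xsimd/xsimd.hpp>
#include <cstdint>

// Lanes 0..4 active; lanes 5..7 are never dereferenced and read back as zero.
xsimd::batch<int32_t, xsimd::avx2> load_tail5(const int32_t* p)
{
    auto m = xsimd::batch_bool<int32_t, xsimd::avx2>::from_mask(0x1Fu);
    return xsimd::batch<int32_t, xsimd::avx2>::load(p, m, xsimd::unaligned_mode {});
}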
+ template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), batch> + load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + using int_t = std::conditional_t; + return detail::maskload(reinterpret_cast(mem), __m256i(mask)); + } + // store_masked namespace detail { - template XSIMD_INLINE void maskstore(int32_t* mem, __m256i mask, __m256i src) noexcept { _mm256_maskstore_epi32(reinterpret_cast(mem), mask, src); } - template - XSIMD_INLINE void maskstore(int64_t* mem, __m256i mask, __m256i src) noexcept + XSIMD_INLINE void maskstore(long long* mem, __m256i mask, __m256i src) noexcept { - _mm256_maskstore_epi64(reinterpret_cast(mem), mask, src); + _mm256_maskstore_epi64(mem, mask, src); } } @@ -195,15 +210,14 @@ namespace xsimd XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { constexpr size_t lanes_per_half = batch::size / 2; + using int_t = std::conditional_t; - // confined to lower 128-bit half → forward to SSE XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half) { constexpr auto mlo = ::xsimd::detail::lower_half(mask); const auto lo = detail::lower_half(src); store_masked(mem, lo, mlo, Mode {}, sse4_2 {}); } - // confined to upper 128-bit half → forward to SSE else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= lanes_per_half) { constexpr auto mhi = ::xsimd::detail::upper_half(mask); @@ -212,7 +226,7 @@ namespace xsimd } else { - detail::maskstore(mem, mask.as_batch(), src); + detail::maskstore(reinterpret_cast(mem), mask.as_batch(), src); } } @@ -230,6 +244,14 @@ namespace xsimd store_masked(reinterpret_cast(mem), s64, batch_bool_constant {}, Mode {}, avx2 {}); } + template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), void> + store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + using int_t = std::conditional_t; + detail::maskstore(reinterpret_cast(mem), __m256i(mask), __m256i(src)); + } + // load_stream template ::value, void>> XSIMD_INLINE batch load_stream(T const* mem, convert, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx2_128.hpp b/include/xsimd/arch/xsimd_avx2_128.hpp index 7e8b0d05a..1f2de84b3 100644 --- a/include/xsimd/arch/xsimd_avx2_128.hpp +++ b/include/xsimd/arch/xsimd_avx2_128.hpp @@ -89,48 +89,91 @@ namespace xsimd } } - // load_masked + namespace detail + { + XSIMD_INLINE __m128i maskload_128(int32_t const* mem, __m128i mask) noexcept + { + return _mm_maskload_epi32(mem, mask); + } + XSIMD_INLINE __m128i maskload_128(long long const* mem, __m128i mask) noexcept + { + return _mm_maskload_epi64(mem, mask); + } + XSIMD_INLINE void maskstore_128(int32_t* mem, __m128i mask, __m128i src) noexcept + { + _mm_maskstore_epi32(mem, mask, src); + } + XSIMD_INLINE void maskstore_128(long long* mem, __m128i mask, __m128i src) noexcept + { + _mm_maskstore_epi64(mem, mask, src); + } + } + + // load_masked / store_masked. AVX2-128 has _mm_maskload/store for + // 32/64-bit integers only; 8/16-bit fall back to the common scalar + // path. Aligned and unaligned go through the same intrinsic — masked- + // off lanes do not fault regardless of alignment. 
template XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - return _mm_maskload_epi32(mem, mask.as_batch()); + return detail::maskload_128(mem, mask.as_batch()); } template XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - return _mm_maskload_epi32((int32_t*)mem, mask.as_batch()); + return detail::maskload_128(reinterpret_cast(mem), mask.as_batch()); } template - XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - return _mm_maskload_epi64(mem, mask.as_batch()); + return detail::maskload_128(reinterpret_cast(mem), mask.as_batch()); } template - XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - return _mm_maskload_epi64((int64_t*)mem, mask.as_batch()); + return detail::maskload_128(reinterpret_cast(mem), mask.as_batch()); } - // store_masked template XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - return _mm_maskstore_epi32(mem, mask.as_batch(), src); + detail::maskstore_128(mem, mask.as_batch(), src); } template XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - return _mm_maskstore_epi32((int32_t*)mem, mask.as_batch(), src); + detail::maskstore_128(reinterpret_cast(mem), mask.as_batch(), src); } template - XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - return _mm_maskstore_epi64(mem, mask.as_batch(), src); + detail::maskstore_128(reinterpret_cast(mem), mask.as_batch(), src); } template - XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + detail::maskstore_128(reinterpret_cast(mem), mask.as_batch(), src); + } + + // Runtime-mask path: route through the same helpers as the constant- + // mask overloads. Templated on T so the dispatcher does not need to + // see four near-identical functions; the runtime-mask overloads of + // the common arch take ``batch_bool``, not the four signed/ + // unsigned int variants, so there is no ambiguity here. 
+ template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), batch> + load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + using int_t = std::conditional_t; + return detail::maskload_128(reinterpret_cast(mem), __m128i(mask)); + } + + template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), void> + store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept { - return _mm_maskstore_epi64((int64_t*)mem, mask.as_batch(), src); + using int_t = std::conditional_t; + detail::maskstore_128(reinterpret_cast(mem), __m128i(mask), __m128i(src)); } // gather diff --git a/include/xsimd/arch/xsimd_avx_128.hpp b/include/xsimd/arch/xsimd_avx_128.hpp index af17568e1..98642dac6 100644 --- a/include/xsimd/arch/xsimd_avx_128.hpp +++ b/include/xsimd/arch/xsimd_avx_128.hpp @@ -115,6 +115,22 @@ namespace xsimd return _mm_maskload_pd(mem, mask.as_batch()); } + // Runtime-mask load for float/double on AVX-128. Both aligned_mode and + // unaligned_mode map to _mm_maskload_* — the intrinsic does not fault + // on masked-off lanes, so partial loads across page boundaries are safe. + template + XSIMD_INLINE batch + load_masked(float const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm_maskload_ps(mem, _mm_castps_si128(mask)); + } + template + XSIMD_INLINE batch + load_masked(double const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm_maskload_pd(mem, _mm_castpd_si128(mask)); + } + // store_masked template XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept @@ -128,6 +144,21 @@ namespace xsimd return _mm_maskstore_pd(mem, mask.as_batch(), src); } + // Runtime-mask store for float/double on AVX-128. Same fault-suppression + // semantics as the masked loads above; alignment mode is irrelevant. 
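The other half of the contract is visible from user code: destination elements under inactive lanes keep whatever they held before the store. A small sketch with the default architecture (values purely illustrative):

#include <xsimd/xsimd.hpp>
#include <cstdint>

void merge_store_demo()
{
    double out[xsimd::batch<double>::size];
    for (double& x : out)
        x = -1.0;
    auto v = xsimd::batch<double>(1.0);                   // broadcast 1.0
    auto m = xsimd::batch_bool<double>::from_mask(0x1u);  // lane 0 only
    v.store(out, m, xsimd::unaligned_mode {});
    // out[0] == 1.0; every other element still holds -1.0.
}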
+ template + XSIMD_INLINE void + store_masked(float* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + _mm_maskstore_ps(mem, _mm_castps_si128(mask), src); + } + template + XSIMD_INLINE void + store_masked(double* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + _mm_maskstore_pd(mem, _mm_castpd_si128(mask), src); + } + // swizzle (dynamic mask) template ::value && sizeof(T) == sizeof(ITy)>> XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp index e78864f6e..1474aeeb8 100644 --- a/include/xsimd/arch/xsimd_common_fwd.hpp +++ b/include/xsimd/arch/xsimd_common_fwd.hpp @@ -79,8 +79,12 @@ namespace xsimd XSIMD_INLINE batch load(T const* mem, unaligned_mode, requires_arch) noexcept; template XSIMD_INLINE batch load_masked(T_in const* mem, batch_bool_constant mask, convert, alignment, requires_arch) noexcept; + template + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept; template XSIMD_INLINE void store_masked(T_out* mem, batch const& src, batch_bool_constant mask, alignment, requires_arch) noexcept; + template + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept; template XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept; template diff --git a/include/xsimd/arch/xsimd_rvv.hpp b/include/xsimd/arch/xsimd_rvv.hpp index 3ae649fdb..df64d3c1b 100644 --- a/include/xsimd/arch/xsimd_rvv.hpp +++ b/include/xsimd/arch/xsimd_rvv.hpp @@ -409,6 +409,11 @@ namespace xsimd { XSIMD_RVV_OVERLOAD(rvvle, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , vec(T const*)) XSIMD_RVV_OVERLOAD(rvvse, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , void(T*, vec)) + // Masked load (mask-undisturbed with zero passthrough): inactive lanes read as 0, + // no memory access is performed for inactive lanes (page-fault safe). + XSIMD_RVV_OVERLOAD(rvvle_mu, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM _mu), , vec(bvec, vec, T const*)) + // Masked store: inactive lanes are not written. + XSIMD_RVV_OVERLOAD(rvvse_m, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM _m), , void(bvec, T*, vec)) } template = 0> @@ -423,6 +428,16 @@ namespace xsimd return load_aligned(src, convert(), rvv {}); } + // load_masked (runtime mask): native vle*.v vd, (rs1), v0.t with zero-init + // passthrough so inactive lanes read as 0, matching xsimd's contract. + template = 0> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + using proj_t = map_to_sized_type_t; + const auto zero = detail_rvv::rvvmv_splat(proj_t {}); + return detail_rvv::rvvle_mu(mask, zero, reinterpret_cast(mem)); + } + // load_complex namespace detail_rvv { @@ -500,6 +515,15 @@ namespace xsimd store_aligned(dst, src, rvv {}); } + // store_masked (runtime mask): native vse*.v vd, (rs1), v0.t — inactive lanes + // are not written (page-fault safe). 
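That page-fault-safe behaviour is what makes the usual remainder pattern safe without over-allocating buffers. An architecture-agnostic sketch using the new runtime-mask overloads (helper name and use of the default arch are illustrative):

#include <xsimd/xsimd.hpp>
#include <cstddef>
#include <cstdint>

// Scale n elements; the final partial iteration touches only the first
// (n - i) elements of src and dst.
template <class T>
void scale(T* dst, const T* src, std::size_t n, T factor)
{
    using b = xsimd::batch<T>;
    constexpr std::size_t N = b::size;
    std::size_t i = 0;
    for (; i + N <= n; i += N)
        (b::load_unaligned(src + i) * factor).store_unaligned(dst + i);
    if (i < n)
    {
        // n - i is strictly smaller than N here, so the shift is well defined.
        auto m = xsimd::batch_bool<T>::from_mask((uint64_t(1) << (n - i)) - 1);
        auto v = b::load(src + i, m, xsimd::unaligned_mode {});
        (v * factor).store(dst + i, m, xsimd::unaligned_mode {});
    }
}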
+ template = 0> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + using proj_t = map_to_sized_type_t; + detail_rvv::rvvse_m(mask, reinterpret_cast(mem), src); + } + /****************** * scatter/gather * ******************/ diff --git a/include/xsimd/arch/xsimd_sve.hpp b/include/xsimd/arch/xsimd_sve.hpp index c15404dd9..e5729e65e 100644 --- a/include/xsimd/arch/xsimd_sve.hpp +++ b/include/xsimd/arch/xsimd_sve.hpp @@ -101,11 +101,28 @@ namespace xsimd return load_aligned(src, convert(), sve {}); } - // load_masked - template = 0> - XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant, Mode, requires_arch) noexcept + // load_masked (compile-time mask): build a runtime predicate from + // the constant mask and reuse the runtime-mask path. ``pmask`` only + // constructs a 128-bit chunk predicate (svdupq_b{8,16,32,64}), which + // is replication-based and does not correctly express a per-lane + // mask on SVE wider than 128 bits — going through ``as_batch_bool`` + // gives the right predicate for every vector width. ``int32``/ + // ``int64``/``uint32``/``uint64`` are excluded so the common-arch + // dispatchers that reinterpret to ``float``/``double`` win partial + // ordering (otherwise we'd be ambiguous with ``requires_arch``). + template = 0, + std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8)), int> = 0> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant mask, convert, Mode m, requires_arch) noexcept { - return svld1(detail_sve::pmask(), reinterpret_cast const*>(mem)); + return load_masked(mem, mask.as_batch_bool(), convert {}, m, sve {}); + } + + // load_masked (runtime mask) + template = 0> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return svld1(mask, reinterpret_cast const*>(mem)); } // load_complex @@ -141,6 +158,24 @@ namespace xsimd store_aligned(dst, src, sve {}); } + // store_masked (compile-time mask): forward to the runtime-mask + // path for the same reason as load_masked above; same exclusion of + // 32/64-bit integers to defer to the common dispatchers. + template = 0, + std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8)), int> = 0> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode m, requires_arch) noexcept + { + store_masked(mem, src, mask.as_batch_bool(), m, sve {}); + } + + // store_masked (runtime mask) + template = 0> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + svst1(mask, reinterpret_cast*>(mem), src); + } + // store_complex template = 0> XSIMD_INLINE void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp index 6a7206116..341339abe 100644 --- a/include/xsimd/types/xsimd_api.hpp +++ b/include/xsimd/types/xsimd_api.hpp @@ -1550,6 +1550,27 @@ namespace xsimd return batch::load(ptr, mask, aligned_mode {}); } + /** + * @ingroup batch_data_transfer + * + * Creates a batch from the buffer \c ptr using a runtime mask. Elements + * corresponding to \c false in the mask are not accessed in memory and are + * zero-initialized in the resulting batch. No type conversion is performed: + * \c ptr must point to \c T. Prefer the \c batch_bool_constant overload + * whenever the mask is known at compile time. 
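 * For instance, given a suitably aligned \c ptr, the following illustrative
 * snippet reads only the first three lanes and leaves the remaining lanes zero:
 * \code
 * auto m = xsimd::batch_bool<float>::from_mask(0x7u);
 * auto v = xsimd::load(ptr, m, xsimd::aligned_mode {});
 * \endcode
 *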
+ * @param ptr the memory buffer to read + * @param mask runtime selection mask for the elements to load + * @return a new batch instance + */ + template + XSIMD_INLINE batch load(T const* ptr, + batch_bool mask, + aligned_mode = {}) noexcept + { + detail::static_check_supported_config(); + return batch::load(ptr, mask, aligned_mode {}); + } + /** * @ingroup batch_data_transfer * @@ -1570,6 +1591,16 @@ namespace xsimd return batch::load(ptr, mask, unaligned_mode {}); } + /// \overload + template + XSIMD_INLINE batch load(T const* ptr, + batch_bool mask, + unaligned_mode) noexcept + { + detail::static_check_supported_config(); + return batch::load(ptr, mask, unaligned_mode {}); + } + /** * @ingroup batch_data_transfer * @@ -2663,6 +2694,28 @@ namespace xsimd val.store(mem, mask, aligned_mode {}); } + /** + * @ingroup batch_data_transfer + * + * Copy selected elements of batch \c val to the buffer \c mem using a + * runtime mask. Elements corresponding to \c false in the mask are not + * written and leave the contents of \c mem untouched. No type conversion + * is performed: \c mem must point to \c T. Prefer the \c + * batch_bool_constant overload whenever the mask is known at compile time. + * @param mem the memory buffer to write to + * @param val the batch to copy from + * @param mask runtime selection mask for the elements to store + */ + template + XSIMD_INLINE void store(T* mem, + batch const& val, + batch_bool mask, + aligned_mode = {}) noexcept + { + detail::static_check_supported_config(); + val.store(mem, mask, aligned_mode {}); + } + /** * @ingroup batch_data_transfer * @@ -2683,6 +2736,17 @@ namespace xsimd val.store(mem, mask, unaligned_mode {}); } + /// \overload + template + XSIMD_INLINE void store(T* mem, + batch const& val, + batch_bool mask, + unaligned_mode) noexcept + { + detail::static_check_supported_config(); + val.store(mem, mask, unaligned_mode {}); + } + /** * @ingroup batch_data_transfer * diff --git a/include/xsimd/types/xsimd_batch.hpp b/include/xsimd/types/xsimd_batch.hpp index b584a2d81..063186150 100644 --- a/include/xsimd/types/xsimd_batch.hpp +++ b/include/xsimd/types/xsimd_batch.hpp @@ -144,9 +144,12 @@ namespace xsimd template XSIMD_INLINE void store(U* mem, stream_mode) const noexcept; - // Compile-time mask overloads + // Masked overloads template XSIMD_INLINE void store(U* mem, batch_bool_constant mask, Mode) const noexcept; + /** \brief Runtime-mask store; see xsimd::store(T*, batch const&, batch_bool, Mode). */ + template + XSIMD_INLINE void store(T* mem, batch_bool mask, Mode = {}) const noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load_aligned(U const* mem) noexcept; @@ -156,9 +159,12 @@ namespace xsimd XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, aligned_mode) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, unaligned_mode) noexcept; - // Compile-time mask overloads + // Masked overloads template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, batch_bool_constant mask, Mode = {}) noexcept; + /** \brief Runtime-mask load; see xsimd::load(T const*, batch_bool, Mode). 
*/ + template + XSIMD_NO_DISCARD static XSIMD_INLINE batch load(T const* mem, batch_bool mask, Mode = {}) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, stream_mode) noexcept; @@ -722,6 +728,14 @@ namespace xsimd } } + template + template + XSIMD_INLINE batch batch::load(T const* mem, batch_bool mask, Mode mode) noexcept + { + detail::static_check_supported_config(); + return kernel::load_masked(mem, mask, kernel::convert {}, mode, A {}); + } + template template XSIMD_INLINE void batch::store(U* mem, @@ -745,6 +759,16 @@ namespace xsimd } } + template + template + XSIMD_INLINE void batch::store(T* mem, + batch_bool mask, + Mode mode) const noexcept + { + detail::static_check_supported_config(); + kernel::store_masked(mem, *this, mask, mode, A {}); + } + template template XSIMD_INLINE batch batch::load(U const* mem, stream_mode) noexcept diff --git a/include/xsimd/types/xsimd_utils.hpp b/include/xsimd/types/xsimd_utils.hpp index 6af62c1a0..940ee084f 100644 --- a/include/xsimd/types/xsimd_utils.hpp +++ b/include/xsimd/types/xsimd_utils.hpp @@ -457,6 +457,16 @@ namespace xsimd template using complex_batch_type_t = typename complex_batch_type::type; + + namespace details + { + // Returns a bitmask with the lowest \c size bits set. Used by masked + // load/store fast paths to detect "all lanes active". + inline constexpr uint64_t full_mask(std::size_t size) noexcept + { + return size >= 64 ? ~uint64_t(0) : ((uint64_t(1) << size) - 1); + } + } } #endif diff --git a/test/test_load_store.cpp b/test/test_load_store.cpp index a5266eeb3..48e5ac0d5 100644 --- a/test/test_load_store.cpp +++ b/test/test_load_store.cpp @@ -66,22 +66,6 @@ struct load_store_test static constexpr bool get(std::size_t index, std::size_t size) noexcept { return index >= (size / 2); } }; - struct mask_first_n - { - static constexpr bool get(std::size_t index, std::size_t size) noexcept - { - return index < (size > 2 ? size / 3 : std::size_t(1)); - } - }; - - struct mask_last_n - { - static constexpr bool get(std::size_t index, std::size_t size) noexcept - { - return index >= size - (size > 2 ? 
size / 3 : std::size_t(1)); - } - }; - struct mask_even { static constexpr bool get(std::size_t index, std::size_t) noexcept { return (index % 2) == 0; } @@ -105,6 +89,41 @@ struct load_store_test static constexpr bool get(std::size_t, std::size_t) noexcept { return true; } }; + template + static batch_bool_type make_runtime_mask() noexcept + { + uint64_t bits = 0; + for (std::size_t i = 0; i < size; ++i) + { + if (Generator::get(i, size)) + { + bits |= uint64_t(1) << i; + } + } + return batch_bool_type::from_mask(bits); + } + + struct compile_time_mask + { + static constexpr const char* tag = ""; + template + static auto make() noexcept + -> decltype(xsimd::make_batch_bool_constant()) + { + return xsimd::make_batch_bool_constant(); + } + }; + + struct runtime_mask + { + static constexpr const char* tag = " runtime"; + template + static batch_bool_type make() noexcept + { + return make_runtime_mask(); + } + }; + int8_vector_type i8_vec; uint8_vector_type ui8_vec; int16_vector_type i16_vec; @@ -367,16 +386,22 @@ struct load_store_test template void run_mask_tests(const V& v, const std::string& name, batch_type& b, const array_type& expected, std::true_type) { - run_load_mask_pattern(v, name, b, expected, " masked none"); - run_load_mask_pattern(v, name, b, expected, " masked first element"); - run_load_mask_pattern(v, name, b, expected, " masked first half"); - run_load_mask_pattern(v, name, b, expected, " masked last half"); - run_load_mask_pattern(v, name, b, expected, " masked first N"); - run_load_mask_pattern(v, name, b, expected, " masked last N"); - run_load_mask_pattern(v, name, b, expected, " masked even elements"); - run_load_mask_pattern(v, name, b, expected, " masked odd elements"); - run_load_mask_pattern(v, name, b, expected, " masked pseudo random"); - run_load_mask_pattern(v, name, b, expected, " masked all elements"); + run_load_mask_patterns(v, name, b, expected); + run_load_mask_patterns(v, name, b, expected); + } + + template + void run_load_mask_patterns(const V& v, const std::string& name, batch_type& b, const array_type& expected) + { + const std::string p = std::string(MaskKind::tag) + " masked"; + run_load_mask_pattern(v, name, b, expected, p + " none"); + run_load_mask_pattern(v, name, b, expected, p + " first element"); + run_load_mask_pattern(v, name, b, expected, p + " first half"); + run_load_mask_pattern(v, name, b, expected, p + " last half"); + run_load_mask_pattern(v, name, b, expected, p + " even elements"); + run_load_mask_pattern(v, name, b, expected, p + " odd elements"); + run_load_mask_pattern(v, name, b, expected, p + " pseudo random"); + run_load_mask_pattern(v, name, b, expected, p + " all elements"); } template @@ -384,10 +409,10 @@ struct load_store_test { } - template + template void run_load_mask_pattern(const V& v, const std::string& name, batch_type& b, const array_type& expected, const std::string& label) { - constexpr auto mask = xsimd::make_batch_bool_constant(); + const auto mask = MaskKind::template make(); array_type expected_masked { 0 }; for (std::size_t i = 0; i < size; ++i) @@ -404,10 +429,10 @@ struct load_store_test CHECK_BATCH_EQ(b, expected_masked); } - template + template void run_store_mask_pattern(const V& v, const std::string& name, batch_type& b, V& res, V& expected_masked, const std::string& label) { - auto mask = xsimd::make_batch_bool_constant(); + const auto mask = MaskKind::template make(); for (std::size_t i = 0; i < size; ++i) { expected_masked[i] = Generator::get(i, size) ? 
v[i] : value_type(); @@ -425,15 +450,21 @@ struct load_store_test template void run_store_mask_tests(const V& v, const std::string& name, batch_type& b, V& res, V& expected_masked, std::true_type) { - run_store_mask_pattern(v, name, b, res, expected_masked, " masked first element"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked first half"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked last half"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked first N"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked last N"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked even elements"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked odd elements"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked pseudo random"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked all elements"); + run_store_mask_patterns(v, name, b, res, expected_masked); + run_store_mask_patterns(v, name, b, res, expected_masked); + } + + template + void run_store_mask_patterns(const V& v, const std::string& name, batch_type& b, V& res, V& expected_masked) + { + const std::string p = std::string(MaskKind::tag) + " masked"; + run_store_mask_pattern(v, name, b, res, expected_masked, p + " first element"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " first half"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " last half"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " even elements"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " odd elements"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " pseudo random"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " all elements"); } template @@ -453,6 +484,7 @@ struct load_store_test V sentinel_expected(size, sentinel); auto zero_mask = xsimd::make_batch_bool_constant(); + auto runtime_zero_mask = make_runtime_mask(); std::fill(res.begin(), res.end(), sentinel); b.store(res.data(), zero_mask, xsimd::aligned_mode()); INFO(name, " masked none aligned store"); @@ -470,6 +502,19 @@ struct load_store_test CHECK(std::all_of(scratch.begin(), scratch.end(), [](const value_type v) { return v == sentinel; })); + std::fill(res.begin(), res.end(), sentinel); + xsimd::store(res.data(), b, runtime_zero_mask, xsimd::aligned_mode()); + INFO(name, " runtime masked none aligned store"); + CHECK_VECTOR_EQ(res, sentinel_expected); + + std::fill(scratch.begin(), scratch.end(), sentinel); + xsimd::store(scratch_ptr, b, runtime_zero_mask, xsimd::unaligned_mode()); + INFO(name, " runtime masked none unaligned store"); + std::copy(scratch_ptr, scratch_ptr + scratch_slice.size(), scratch_slice.begin()); + CHECK_VECTOR_EQ(scratch_slice, sentinel_expected); + CHECK(std::all_of(scratch.begin(), scratch.end(), [](const value_type v) + { return v == sentinel; })); + run_store_mask_tests(v, name, b, res, expected_masked, std::true_type {}); } From 0325fb4c0ba3693d38b6a8dd4f3b86b2894de193 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 5 May 2026 14:42:57 -0400 Subject: [PATCH 2/2] refactor: trim masked load/store comments and consolidate AVX2-128 helpers Shorten verbose comments around masked load/store paths, drop the sizeof(int)/sizeof(long long) static_asserts (intrinsic boundaries now reinterpret_cast at the call site), and collapse the four maskload_128/maskstore_128 detail overloads into two XSIMD_IF_CONSTEXPR- dispatched templates. 
Public surface unchanged. --- .../xsimd/arch/common/xsimd_common_memory.hpp | 16 ++--- include/xsimd/arch/xsimd_avx.hpp | 7 +- include/xsimd/arch/xsimd_avx2.hpp | 34 ++++------ include/xsimd/arch/xsimd_avx2_128.hpp | 66 +++++++++---------- include/xsimd/arch/xsimd_avx_128.hpp | 7 +- 5 files changed, 53 insertions(+), 77 deletions(-) diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index 0639f5168..9d582943c 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -379,12 +379,8 @@ namespace xsimd XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept { - // Per-lane validity contract: only active lanes of ``mem`` are - // required to be addressable. An unconditional whole-vector load - // would touch inactive lanes and trip ASan/Valgrind on partial - // buffers, so stay scalar. Arches with hardware predicated loads - // (AVX2 32/64-bit, AVX-512, SVE, RVV) override this with a single - // intrinsic that suppresses inactive-lane reads in hardware. + // Scalar fallback: only active lanes are touched. Arches with + // hardware predicated loads override this. constexpr std::size_t size = batch::size; alignas(A::alignment()) std::array buffer; for (std::size_t i = 0; i < size; ++i) @@ -410,12 +406,8 @@ namespace xsimd XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept { - // Per-lane validity contract (matches native masked-store APIs): - // only active lanes of ``mem`` are touched. A load+select+store - // RMW would both read and write inactive bytes, breaking that - // contract — stay scalar. Arches with hardware predicated stores - // override this with a single intrinsic that suppresses inactive - // lanes in hardware. + // Scalar fallback: only active lanes are touched. Arches with + // hardware predicated stores override this. constexpr std::size_t size = batch::size; alignas(A::alignment()) std::array src_buf; src.store_aligned(src_buf.data()); diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 429637784..a477e30b3 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -1015,9 +1015,7 @@ namespace xsimd } } - // Runtime-mask load for float/double on AVX. Both aligned_mode and - // unaligned_mode map to _mm256_maskload_* — the intrinsic does not fault - // on masked-off lanes, so partial loads across page boundaries are safe. + // Runtime-mask load (float/double). template XSIMD_INLINE batch load_masked(float const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept @@ -1048,8 +1046,7 @@ namespace xsimd } } - // Runtime-mask store for float/double on AVX. Same fault-suppression - // semantics as the masked loads above; alignment mode is irrelevant. + // Runtime-mask store (float/double). template XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index f2666018b..9f3415cf2 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -118,23 +118,18 @@ namespace xsimd } } - // load_masked. AVX2 mask{load,store}_epi{32,64} take int*/long long*; - // these helpers exist so the dispatch picks the right intrinsic by - // pointer width. 
Static_asserts pin sizeof(int)/(long long) to the - // values the intrinsics expect — the dispatcher relies on it. + // load_masked: AVX2 has _mm256_maskload_epi{32,64} for 32/64-bit + // integers; 8/16-bit fall back to the common scalar path. namespace detail { - static_assert(sizeof(int) == 4, "AVX2 maskload/maskstore expects sizeof(int) == 4"); - static_assert(sizeof(long long) == 8, "AVX2 maskload/maskstore expects sizeof(long long) == 8"); - XSIMD_INLINE __m256i maskload(const int32_t* mem, __m256i mask) noexcept { - return _mm256_maskload_epi32(mem, mask); + return _mm256_maskload_epi32(reinterpret_cast(mem), mask); } - XSIMD_INLINE __m256i maskload(const long long* mem, __m256i mask) noexcept + XSIMD_INLINE __m256i maskload(const int64_t* mem, __m256i mask) noexcept { - return _mm256_maskload_epi64(mem, mask); + return _mm256_maskload_epi64(reinterpret_cast(mem), mask); } XSIMD_INLINE __m256i zero_extend(__m128i hi) noexcept @@ -148,7 +143,7 @@ namespace xsimd load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { static_assert(sizeof(T) == 4 || sizeof(T) == 8, "load_masked supports only 32/64-bit integers on AVX2"); - using int_t = std::conditional_t; + using int_t = std::conditional_t; return detail::maskload(reinterpret_cast(mem), mask.as_batch()); } @@ -178,17 +173,12 @@ namespace xsimd return bitwise_cast(r); } - // Runtime-mask load for 32/64-bit integers on AVX2. 8/16-bit integers - // fall back to the scalar common path: AVX2 has no native maskload for - // those widths, and a load-then-blend would break fault-suppression at - // page boundaries (the main reason callers ask for a masked load). - // Both aligned_mode and unaligned_mode route to the same intrinsic — - // masked-off lanes do not fault regardless of alignment. + // Runtime-mask load (32/64-bit integers); 8/16-bit fall back to common. 
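The fall-back is transparent to callers: the same load expression also compiles for 8/16-bit element types, it simply routes through the scalar common path. A sketch, assuming an AVX2 target:

#include <xsimd/xsimd.hpp>
#include <cstdint>

// First 16 of the 32 uint8_t lanes active. There is no AVX2 maskload at this
// width, so the generic per-lane fallback services the call with the same
// "inactive lanes are not read" contract.
xsimd::batch<uint8_t, xsimd::avx2> load_low_half(const uint8_t* p)
{
    auto m = xsimd::batch_bool<uint8_t, xsimd::avx2>::from_mask(0xFFFFu);
    return xsimd::batch<uint8_t, xsimd::avx2>::load(p, m, xsimd::unaligned_mode {});
}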
template XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), batch> load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept { - using int_t = std::conditional_t; + using int_t = std::conditional_t; return detail::maskload(reinterpret_cast(mem), __m256i(mask)); } @@ -200,9 +190,9 @@ namespace xsimd _mm256_maskstore_epi32(reinterpret_cast(mem), mask, src); } - XSIMD_INLINE void maskstore(long long* mem, __m256i mask, __m256i src) noexcept + XSIMD_INLINE void maskstore(int64_t* mem, __m256i mask, __m256i src) noexcept { - _mm256_maskstore_epi64(mem, mask, src); + _mm256_maskstore_epi64(reinterpret_cast(mem), mask, src); } } @@ -210,7 +200,7 @@ namespace xsimd XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { constexpr size_t lanes_per_half = batch::size / 2; - using int_t = std::conditional_t; + using int_t = std::conditional_t; XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half) { @@ -248,7 +238,7 @@ namespace xsimd XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), void> store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept { - using int_t = std::conditional_t; + using int_t = std::conditional_t; detail::maskstore(reinterpret_cast(mem), __m256i(mask), __m256i(src)); } diff --git a/include/xsimd/arch/xsimd_avx2_128.hpp b/include/xsimd/arch/xsimd_avx2_128.hpp index 1f2de84b3..d4c29b211 100644 --- a/include/xsimd/arch/xsimd_avx2_128.hpp +++ b/include/xsimd/arch/xsimd_avx2_128.hpp @@ -89,91 +89,91 @@ namespace xsimd } } + // load_masked / store_masked: AVX2-128 has _mm_maskload/store for + // 32/64-bit integers; 8/16-bit fall back to the common scalar path. namespace detail { - XSIMD_INLINE __m128i maskload_128(int32_t const* mem, __m128i mask) noexcept + template + XSIMD_INLINE __m128i maskload_avx2_128(T const* mem, __m128i mask) noexcept { - return _mm_maskload_epi32(mem, mask); - } - XSIMD_INLINE __m128i maskload_128(long long const* mem, __m128i mask) noexcept - { - return _mm_maskload_epi64(mem, mask); - } - XSIMD_INLINE void maskstore_128(int32_t* mem, __m128i mask, __m128i src) noexcept - { - _mm_maskstore_epi32(mem, mask, src); + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_maskload_epi32(reinterpret_cast(mem), mask); + } + else + { + return _mm_maskload_epi64(reinterpret_cast(mem), mask); + } } - XSIMD_INLINE void maskstore_128(long long* mem, __m128i mask, __m128i src) noexcept + + template + XSIMD_INLINE void maskstore_avx2_128(T* mem, __m128i mask, __m128i src) noexcept { - _mm_maskstore_epi64(mem, mask, src); + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + _mm_maskstore_epi32(reinterpret_cast(mem), mask, src); + } + else + { + _mm_maskstore_epi64(reinterpret_cast(mem), mask, src); + } } } - // load_masked / store_masked. AVX2-128 has _mm_maskload/store for - // 32/64-bit integers only; 8/16-bit fall back to the common scalar - // path. Aligned and unaligned go through the same intrinsic — masked- - // off lanes do not fault regardless of alignment. 
template XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - return detail::maskload_128(mem, mask.as_batch()); + return detail::maskload_avx2_128(mem, mask.as_batch()); } template XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - return detail::maskload_128(reinterpret_cast(mem), mask.as_batch()); + return detail::maskload_avx2_128(mem, mask.as_batch()); } template XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - return detail::maskload_128(reinterpret_cast(mem), mask.as_batch()); + return detail::maskload_avx2_128(mem, mask.as_batch()); } template XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - return detail::maskload_128(reinterpret_cast(mem), mask.as_batch()); + return detail::maskload_avx2_128(mem, mask.as_batch()); } template XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - detail::maskstore_128(mem, mask.as_batch(), src); + detail::maskstore_avx2_128(mem, mask.as_batch(), src); } template XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - detail::maskstore_128(reinterpret_cast(mem), mask.as_batch(), src); + detail::maskstore_avx2_128(mem, mask.as_batch(), __m128i(src)); } template XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - detail::maskstore_128(reinterpret_cast(mem), mask.as_batch(), src); + detail::maskstore_avx2_128(mem, mask.as_batch(), src); } template XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - detail::maskstore_128(reinterpret_cast(mem), mask.as_batch(), src); + detail::maskstore_avx2_128(mem, mask.as_batch(), __m128i(src)); } - // Runtime-mask path: route through the same helpers as the constant- - // mask overloads. Templated on T so the dispatcher does not need to - // see four near-identical functions; the runtime-mask overloads of - // the common arch take ``batch_bool``, not the four signed/ - // unsigned int variants, so there is no ambiguity here. template XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), batch> load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept { - using int_t = std::conditional_t; - return detail::maskload_128(reinterpret_cast(mem), __m128i(mask)); + return detail::maskload_avx2_128(mem, __m128i(mask)); } template XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), void> store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept { - using int_t = std::conditional_t; - detail::maskstore_128(reinterpret_cast(mem), __m128i(mask), __m128i(src)); + detail::maskstore_avx2_128(mem, __m128i(mask), __m128i(src)); } // gather diff --git a/include/xsimd/arch/xsimd_avx_128.hpp b/include/xsimd/arch/xsimd_avx_128.hpp index 98642dac6..e282010f2 100644 --- a/include/xsimd/arch/xsimd_avx_128.hpp +++ b/include/xsimd/arch/xsimd_avx_128.hpp @@ -115,9 +115,7 @@ namespace xsimd return _mm_maskload_pd(mem, mask.as_batch()); } - // Runtime-mask load for float/double on AVX-128. 
Both aligned_mode and - // unaligned_mode map to _mm_maskload_* — the intrinsic does not fault - // on masked-off lanes, so partial loads across page boundaries are safe. + // Runtime-mask load (float/double). template XSIMD_INLINE batch load_masked(float const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept @@ -144,8 +142,7 @@ namespace xsimd return _mm_maskstore_pd(mem, mask.as_batch(), src); } - // Runtime-mask store for float/double on AVX-128. Same fault-suppression - // semantics as the masked loads above; alignment mode is irrelevant. + // Runtime-mask store (float/double). template XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept