12 changes: 10 additions & 2 deletions docs/source/api/data_transfer.rst
@@ -12,7 +12,7 @@ Data Transfers
From memory:

+---------------------------------------+----------------------------------------------------+
| :cpp:func:`load` | load values from memory (optionally masked) |
| :cpp:func:`load` | load values from memory (optionally masked) [#m]_ |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`load_aligned` | load values from aligned memory |
+---------------------------------------+----------------------------------------------------+
@@ -32,7 +32,7 @@ From a scalar:
To memory:

+---------------------------------------+----------------------------------------------------+
| :cpp:func:`store` | store values to memory (optionally masked) |
| :cpp:func:`store` | store values to memory (optionally masked) [#m]_ |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`store_aligned` | store values to aligned memory |
+---------------------------------------+----------------------------------------------------+
@@ -84,3 +84,11 @@ The following empty types are used for tag dispatching:

.. doxygenstruct:: xsimd::unaligned_mode
:project: xsimd

.. rubric:: Footnotes

.. [#m] Masked ``load`` / ``store`` come in two flavours. The
   :cpp:class:`batch_bool_constant` overload encodes the mask in the type
   and is resolved at compile time; the :cpp:class:`batch_bool` overload
   takes a mask computed at run time. Prefer the compile-time mask whenever
   the lane selection is known at compile time.
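
A quick sketch of the two flavours for reviewers. Note that the exact public spellings (`xsimd::load` / `xsimd::store` taking a trailing mask) are my assumption; the diff below only shows the kernel-level entry points.

```cpp
// Sketch only: assumes the masked overloads keep the usual free-function
// spelling (xsimd::load / xsimd::store with a trailing mask argument) and
// an 8-lane AVX float batch.
#include <xsimd/xsimd.hpp>

using batch_t = xsimd::batch<float, xsimd::avx>; // 8 lanes on AVX

void example(float const* in, float* out)
{
    // Compile-time mask: the lane pattern lives in the type, so the
    // batch_bool_constant overload resolves at compile time.
    xsimd::batch_bool_constant<float, xsimd::avx,
        true, true, true, true, false, false, false, false> lo_half {};
    batch_t head = xsimd::load(in, lo_half);            // assumed spelling

    // Runtime mask: a batch_bool computed from the data itself.
    xsimd::batch_bool<float, xsimd::avx> m = head < batch_t(1.0f);
    xsimd::store(out, head, m);                         // assumed spelling
}
```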
28 changes: 28 additions & 0 deletions include/xsimd/arch/common/xsimd_common_memory.hpp
@@ -15,6 +15,7 @@
#include <algorithm>
#include <array>
#include <complex>
#include <cstdint>

#include "../../types/xsimd_batch_constant.hpp"
#include "./xsimd_common_details.hpp"
@@ -374,6 +375,19 @@ namespace xsimd
return batch<T_out, A>::load(buffer.data(), aligned_mode {});
}

template <class A, class T, class Mode>
XSIMD_INLINE batch<T, A>
load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<common>) noexcept
{
// Scalar fallback: only active lanes are touched. Arches with
// hardware predicated loads override this.
constexpr std::size_t size = batch<T, A>::size;
alignas(A::alignment()) std::array<T, size> buffer;
for (std::size_t i = 0; i < size; ++i)
buffer[i] = mask.get(i) ? mem[i] : T(0);
return batch<T, A>::load_aligned(buffer.data());
}

template <class A, class T_in, class T_out, bool... Values, class alignment>
XSIMD_INLINE void
store_masked(T_out* mem, batch<T_in, A> const& src, batch_bool_constant<T_in, A, Values...>, alignment, requires_arch<common>) noexcept
@@ -388,6 +402,20 @@
}
}

template <class A, class T, class Mode>
XSIMD_INLINE void
store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<common>) noexcept
{
// Scalar fallback: only active lanes are touched. Arches with
// hardware predicated stores override this.
constexpr std::size_t size = batch<T, A>::size;
alignas(A::alignment()) std::array<T, size> src_buf;
src.store_aligned(src_buf.data());
for (std::size_t i = 0; i < size; ++i)
if (mask.get(i))
mem[i] = src_buf[i];
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept
{
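
The "only active lanes are touched" guarantee in the fallbacks above is what makes masked tail handling safe: the final partial batch can be processed without reading or writing past the end of the array, even on arches that hit this scalar path. A sketch, again assuming the public masked `load`/`store` spellings; the lane-index comparison is ordinary documented xsimd.

```cpp
// Masked tail loop: full batches go through the normal path, the remainder
// through one masked load/store. The masked xsimd::load / xsimd::store
// spellings are assumed, as noted above.
#include <xsimd/xsimd.hpp>
#include <cstddef>

void scale(float* data, std::size_t n, float k)
{
    using batch_t = xsimd::batch<float>;
    constexpr std::size_t lanes = batch_t::size;

    // lane_id = {0, 1, 2, ...} so that (lane_id < r) selects the first r lanes
    alignas(xsimd::default_arch::alignment()) float idx[lanes];
    for (std::size_t i = 0; i < lanes; ++i)
        idx[i] = static_cast<float>(i);
    batch_t const lane_id = batch_t::load_aligned(idx);

    std::size_t i = 0;
    for (; i + lanes <= n; i += lanes) // full batches
        (batch_t(k) * batch_t::load_unaligned(data + i)).store_unaligned(data + i);

    if (i < n) // partial tail: lanes past n - i are neither read nor written
    {
        auto m = lane_id < batch_t(static_cast<float>(n - i));
        batch_t v = xsimd::load(data + i, m);  // assumed masked overload
        xsimd::store(data + i, batch_t(k) * v, m);
    }
}
```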
30 changes: 30 additions & 0 deletions include/xsimd/arch/xsimd_avx.hpp
@@ -1015,6 +1015,21 @@ namespace xsimd
}
}

// Runtime-mask load (float/double).
template <class A, class Mode>
XSIMD_INLINE batch<float, A>
load_masked(float const* mem, batch_bool<float, A> mask, convert<float>, Mode, requires_arch<avx>) noexcept
{
return _mm256_maskload_ps(mem, _mm256_castps_si256(mask));
}

template <class A, class Mode>
XSIMD_INLINE batch<double, A>
load_masked(double const* mem, batch_bool<double, A> mask, convert<double>, Mode, requires_arch<avx>) noexcept
{
return _mm256_maskload_pd(mem, _mm256_castpd_si256(mask));
}

// store_masked
namespace detail
{
@@ -1031,6 +1046,21 @@
}
}

// Runtime-mask store (float/double).
template <class A, class Mode>
XSIMD_INLINE void
store_masked(float* mem, batch<float, A> const& src, batch_bool<float, A> mask, Mode, requires_arch<avx>) noexcept
{
_mm256_maskstore_ps(mem, _mm256_castps_si256(mask), src);
}

template <class A, class Mode>
XSIMD_INLINE void
store_masked(double* mem, batch<double, A> const& src, batch_bool<double, A> mask, Mode, requires_arch<avx>) noexcept
{
_mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), src);
}

template <class A, class T, bool... Values, class Mode>
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx>) noexcept
{
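
For context on the AVX overloads above: `vmaskmov` selects lanes by the sign bit of each 32/64-bit mask element, masked-out load lanes come back as zero and cannot fault, and masked-out store lanes leave memory untouched. A standalone illustration in raw intrinsics (plain Intel API, not xsimd):

```cpp
// Semantics of _mm256_maskload_ps / _mm256_maskstore_ps in isolation.
#include <immintrin.h>

__m256 masked_roundtrip(float const* in, float* out)
{
    // Sign bit set = active lane; here lanes 0..3 are on, lanes 4..7 off.
    __m256i m = _mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0);
    __m256 v = _mm256_maskload_ps(in, m); // lanes 4..7 produce 0.0f, cannot fault
    _mm256_maskstore_ps(out, m, v);       // lanes 4..7 of out are left untouched
    return v;
}
```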
38 changes: 25 additions & 13 deletions include/xsimd/arch/xsimd_avx2.hpp
@@ -118,18 +118,18 @@ namespace xsimd
}
}

// load_masked
// AVX2 low-level helpers (operate on raw SIMD registers)
// load_masked: AVX2 has _mm256_maskload_epi{32,64} for 32/64-bit
// integers; 8/16-bit fall back to the common scalar path.
namespace detail
{
XSIMD_INLINE __m256i maskload(const int32_t* mem, __m256i mask) noexcept
{
return _mm256_maskload_epi32(mem, mask);
return _mm256_maskload_epi32(reinterpret_cast<const int*>(mem), mask);
}

XSIMD_INLINE __m256i maskload(const long long* mem, __m256i mask) noexcept
XSIMD_INLINE __m256i maskload(const int64_t* mem, __m256i mask) noexcept
{
return _mm256_maskload_epi64(reinterpret_cast<long long const*>(mem), mask);
return _mm256_maskload_epi64(reinterpret_cast<const long long*>(mem), mask);
}

XSIMD_INLINE __m256i zero_extend(__m128i hi) noexcept
@@ -138,14 +138,12 @@
}
}

// single templated implementation for integer masked loads (32/64-bit)
template <class A, class T, bool... Values, class Mode>
XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) >= 4), batch<T, A>>
load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, convert<T>, Mode, requires_arch<avx2>) noexcept
{
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "load_masked supports only 32/64-bit integers on AVX2");
using int_t = std::conditional_t<sizeof(T) == 4, int32_t, long long>;
// Use the raw register-level maskload helpers for the remaining cases.
using int_t = std::conditional_t<sizeof(T) == 4, int32_t, int64_t>;
return detail::maskload(reinterpret_cast<const int_t*>(mem), mask.as_batch());
}

@@ -175,16 +173,23 @@
return bitwise_cast<uint64_t>(r);
}

// Runtime-mask load (32/64-bit integers); 8/16-bit fall back to common.
template <class A, class T, class Mode>
XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), batch<T, A>>
load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<avx2>) noexcept
{
using int_t = std::conditional_t<sizeof(T) == 4, int32_t, int64_t>;
return detail::maskload(reinterpret_cast<const int_t*>(mem), __m256i(mask));
}

// store_masked
namespace detail
{
template <class T, class A>
XSIMD_INLINE void maskstore(int32_t* mem, __m256i mask, __m256i src) noexcept
{
_mm256_maskstore_epi32(reinterpret_cast<int*>(mem), mask, src);
}

template <class T, class A>
XSIMD_INLINE void maskstore(int64_t* mem, __m256i mask, __m256i src) noexcept
{
_mm256_maskstore_epi64(reinterpret_cast<long long*>(mem), mask, src);
@@ -195,15 +200,14 @@
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx2>) noexcept
{
constexpr size_t lanes_per_half = batch<T, A>::size / 2;
using int_t = std::conditional_t<sizeof(T) == 4, int32_t, int64_t>;

// confined to lower 128-bit half → forward to SSE
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half)
{
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
const auto lo = detail::lower_half(src);
store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
}
// confined to upper 128-bit half → forward to SSE
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= lanes_per_half)
{
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
@@ -212,7 +216,7 @@
}
else
{
detail::maskstore<T, A>(mem, mask.as_batch(), src);
detail::maskstore(reinterpret_cast<int_t*>(mem), mask.as_batch(), src);
}
}

@@ -230,6 +234,14 @@
store_masked<A>(reinterpret_cast<int64_t*>(mem), s64, batch_bool_constant<int64_t, A, Values...> {}, Mode {}, avx2 {});
}

template <class A, class T, class Mode>
XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), void>
store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<avx2>) noexcept
{
using int_t = std::conditional_t<sizeof(T) == 4, int32_t, int64_t>;
detail::maskstore(reinterpret_cast<int_t*>(mem), __m256i(mask), __m256i(src));
}

// load_stream
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value, void>>
XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<avx2>) noexcept
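
The lower/upper-half forwarding in `store_masked` is easy to sanity-check in isolation: for an 8-lane batch, `countr_zero()` / `countl_zero()` on the constant mask say whether every active lane sits inside a single 128-bit half, in which case one SSE maskstore suffices. A minimal model with a hypothetical `const_mask` stand-in for `batch_bool_constant` (C++14 or later):

```cpp
// Minimal model of the compile-time half-dispatch; const_mask is a
// hypothetical stand-in, not xsimd code.
#include <cstddef>

template <bool... Values>
struct const_mask
{
    static constexpr bool bits[] = { Values... };
    static constexpr std::size_t size = sizeof...(Values);

    static constexpr std::size_t countr_zero() // inactive low lanes
    {
        std::size_t n = 0;
        while (n < size && !bits[n])
            ++n;
        return n;
    }
    static constexpr std::size_t countl_zero() // inactive high lanes
    {
        std::size_t n = 0;
        while (n < size && !bits[size - 1 - n])
            ++n;
        return n;
    }
};

// 8 lanes, lanes_per_half == 4: the first mask is served by one SSE
// maskstore on the low half, the second by one on the high half.
static_assert(const_mask<1, 1, 0, 0, 0, 0, 0, 0>::countl_zero() >= 4, "low half");
static_assert(const_mask<0, 0, 0, 0, 0, 1, 1, 0>::countr_zero() >= 4, "high half");
```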
71 changes: 57 additions & 14 deletions include/xsimd/arch/xsimd_avx2_128.hpp
@@ -89,48 +89,91 @@ namespace xsimd
}
}

// load_masked
// load_masked / store_masked: AVX2-128 has _mm_maskload/store for
// 32/64-bit integers; 8/16-bit fall back to the common scalar path.
namespace detail
{
template <class T>
XSIMD_INLINE __m128i maskload_avx2_128(T const* mem, __m128i mask) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm_maskload_epi32(reinterpret_cast<int const*>(mem), mask);
}
else
{
return _mm_maskload_epi64(reinterpret_cast<long long const*>(mem), mask);
}
}

template <class T>
XSIMD_INLINE void maskstore_avx2_128(T* mem, __m128i mask, __m128i src) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
_mm_maskstore_epi32(reinterpret_cast<int*>(mem), mask, src);
}
else
{
_mm_maskstore_epi64(reinterpret_cast<long long*>(mem), mask, src);
}
}
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...> mask, convert<int32_t>, Mode, requires_arch<avx2_128>) noexcept
{
return _mm_maskload_epi32(mem, mask.as_batch());
return detail::maskload_avx2_128(mem, mask.as_batch());
}
template <class A, bool... Values, class Mode>
XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...> mask, convert<uint32_t>, Mode, requires_arch<avx2_128>) noexcept
{
return _mm_maskload_epi32((int32_t*)mem, mask.as_batch());
return detail::maskload_avx2_128(mem, mask.as_batch());
}
template <class A, bool... Values, class Mode>
XSIMD_INLINE batch<int64_t, A> load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...> mask, convert<double>, Mode, requires_arch<avx_128>) noexcept
XSIMD_INLINE batch<int64_t, A> load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...> mask, convert<int64_t>, Mode, requires_arch<avx2_128>) noexcept
{
return _mm_maskload_epi64(mem, mask.as_batch());
return detail::maskload_avx2_128(mem, mask.as_batch());
}
template <class A, bool... Values, class Mode>
XSIMD_INLINE batch<uint64_t, A> load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...> mask, convert<double>, Mode, requires_arch<avx_128>) noexcept
XSIMD_INLINE batch<uint64_t, A> load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...> mask, convert<uint64_t>, Mode, requires_arch<avx2_128>) noexcept
{
return _mm_maskload_epi64((int64_t*)mem, mask.as_batch());
return detail::maskload_avx2_128(mem, mask.as_batch());
}

// store_masked
template <class A, bool... Values, class Mode>
XSIMD_INLINE void store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...> mask, Mode, requires_arch<avx2_128>) noexcept
{
return _mm_maskstore_epi32(mem, mask.as_batch(), src);
detail::maskstore_avx2_128(mem, mask.as_batch(), src);
}
template <class A, bool... Values, class Mode>
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...> mask, Mode, requires_arch<avx2_128>) noexcept
{
return _mm_maskstore_epi32((int32_t*)mem, mask.as_batch(), src);
detail::maskstore_avx2_128(mem, mask.as_batch(), __m128i(src));
}
template <class A, bool... Values, class Mode>
XSIMD_INLINE void store_masked(int64_t* mem, batch<int64_t, A> const& src, batch_bool_constant<int64_t, A, Values...> mask, Mode, requires_arch<avx_128>) noexcept
XSIMD_INLINE void store_masked(int64_t* mem, batch<int64_t, A> const& src, batch_bool_constant<int64_t, A, Values...> mask, Mode, requires_arch<avx2_128>) noexcept
{
return _mm_maskstore_epi64(mem, mask.as_batch(), src);
detail::maskstore_avx2_128(mem, mask.as_batch(), src);
}
template <class A, bool... Values, class Mode>
XSIMD_INLINE void store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...> mask, Mode, requires_arch<avx_128>) noexcept
XSIMD_INLINE void store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...> mask, Mode, requires_arch<avx2_128>) noexcept
{
detail::maskstore_avx2_128(mem, mask.as_batch(), __m128i(src));
}

template <class A, class T, class Mode>
XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), batch<T, A>>
load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<avx2_128>) noexcept
{
return detail::maskload_avx2_128(mem, __m128i(mask));
}

template <class A, class T, class Mode>
XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), void>
store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<avx2_128>) noexcept
{
return _mm_maskstore_epi64((int64_t*)mem, mask.as_batch(), src);
detail::maskstore_avx2_128(mem, __m128i(mask), __m128i(src));
}

// gather
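
The `maskload_avx2_128` / `maskstore_avx2_128` helpers rely on a small idiom worth calling out: one template branches on `sizeof(T)` with `XSIMD_IF_CONSTEXPR`, so all four public integer overloads collapse onto two intrinsics. A standalone model using plain C++17 `if constexpr` and a hypothetical name:

```cpp
// Standalone model of the sizeof-based dispatch; maskload128 is a
// hypothetical name, and plain if constexpr stands in for XSIMD_IF_CONSTEXPR.
#include <immintrin.h>

template <class T>
__m128i maskload128(T const* mem, __m128i mask) noexcept
{
    static_assert(sizeof(T) == 4 || sizeof(T) == 8, "32/64-bit elements only");
    if constexpr (sizeof(T) == 4)
        return _mm_maskload_epi32(reinterpret_cast<int const*>(mem), mask);
    else
        return _mm_maskload_epi64(reinterpret_cast<long long const*>(mem), mask);
}
```

Signed and unsigned flavours both route through the same helper, since only the element width matters to the intrinsic.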
28 changes: 28 additions & 0 deletions include/xsimd/arch/xsimd_avx_128.hpp
@@ -115,6 +115,20 @@ namespace xsimd
return _mm_maskload_pd(mem, mask.as_batch());
}

// Runtime-mask load (float/double).
template <class A, class Mode>
XSIMD_INLINE batch<float, A>
load_masked(float const* mem, batch_bool<float, A> mask, convert<float>, Mode, requires_arch<avx_128>) noexcept
{
return _mm_maskload_ps(mem, _mm_castps_si128(mask));
}
template <class A, class Mode>
XSIMD_INLINE batch<double, A>
load_masked(double const* mem, batch_bool<double, A> mask, convert<double>, Mode, requires_arch<avx_128>) noexcept
{
return _mm_maskload_pd(mem, _mm_castpd_si128(mask));
}

// store_masked
template <class A, bool... Values, class Mode>
XSIMD_INLINE void store_masked(float* mem, batch<float, A> const& src, batch_bool_constant<float, A, Values...> mask, Mode, requires_arch<avx_128>) noexcept
@@ -128,6 +142,20 @@
return _mm_maskstore_pd(mem, mask.as_batch(), src);
}

// Runtime-mask store (float/double).
template <class A, class Mode>
XSIMD_INLINE void
store_masked(float* mem, batch<float, A> const& src, batch_bool<float, A> mask, Mode, requires_arch<avx_128>) noexcept
{
_mm_maskstore_ps(mem, _mm_castps_si128(mask), src);
}
template <class A, class Mode>
XSIMD_INLINE void
store_masked(double* mem, batch<double, A> const& src, batch_bool<double, A> mask, Mode, requires_arch<avx_128>) noexcept
{
_mm_maskstore_pd(mem, _mm_castpd_si128(mask), src);
}

// swizzle (dynamic mask)
template <class A, class T, class ITy, class = std::enable_if_t<std::is_floating_point<T>::value && sizeof(T) == sizeof(ITy)>>
XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<ITy, A> mask, requires_arch<avx_128>) noexcept
4 changes: 4 additions & 0 deletions include/xsimd/arch/xsimd_common_fwd.hpp
@@ -79,8 +79,12 @@ namespace xsimd
XSIMD_INLINE batch<T, A> load(T const* mem, unaligned_mode, requires_arch<A>) noexcept;
template <class A, class T_in, class T_out, bool... Values, class alignment>
XSIMD_INLINE batch<T_out, A> load_masked(T_in const* mem, batch_bool_constant<T_out, A, Values...> mask, convert<T_out>, alignment, requires_arch<common>) noexcept;
template <class A, class T, class Mode>
XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<common>) noexcept;
template <class A, class T_in, class T_out, bool... Values, class alignment>
XSIMD_INLINE void store_masked(T_out* mem, batch<T_in, A> const& src, batch_bool_constant<T_in, A, Values...> mask, alignment, requires_arch<common>) noexcept;
template <class A, class T, class Mode>
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<common>) noexcept;
template <class A, bool... Values, class Mode>
XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...> mask, convert<int32_t>, Mode, requires_arch<A>) noexcept;
template <class A, bool... Values, class Mode>