Skip to content

Commit da5848a

Browse files
1 parent db292cf commit da5848a

2 files changed

Lines changed: 109 additions & 13 deletions

File tree

stl/inc/xutility

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ _STL_DISABLE_CLANG_WARNINGS
7474
#endif // ^^^ unknown architecture ^^^
7575

7676
#define _VECTORIZED_ADJACENT_FIND _VECTORIZED_FOR_X64_X86_ARM64_ARM64EC
77-
#define _VECTORIZED_BITSET_FROM_STRING _VECTORIZED_FOR_X64_X86
77+
#define _VECTORIZED_BITSET_FROM_STRING _VECTORIZED_FOR_X64_X86_ARM64_ARM64EC
7878
#define _VECTORIZED_BITSET_TO_STRING _VECTORIZED_FOR_X64_X86_ARM64_ARM64EC
7979
#define _VECTORIZED_COUNT _VECTORIZED_FOR_X64_X86_ARM64_ARM64EC
8080
#define _VECTORIZED_FIND _VECTORIZED_FOR_X64_X86_ARM64_ARM64EC

stl/src/vector_algorithms.cpp

Lines changed: 108 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11319,16 +11319,106 @@ __declspec(noalias) void __stdcall __std_bitset_to_string_2(wchar_t* const _Dest
1131911319
}
1132011320

1132111321
} // extern "C"
11322+
#endif // ^^^ !defined(_M_ARM64) ^^^
1132211323

1132311324
#ifndef _M_ARM64
1132411325
namespace {
1132511326
namespace _Bitset_from_string {
11326-
#ifdef _M_ARM64EC
11327-
using _Traits_1_avx = void;
11328-
using _Traits_1_sse = void;
11329-
using _Traits_2_avx = void;
11330-
using _Traits_2_sse = void;
11331-
#else // ^^^ defined(_M_ARM64EC) / !defined(_M_ARM64EC) vvv
11327+
#if defined(_M_ARM64) || defined(_M_ARM64EC)
11328+
struct _Traits_1_neon {
11329+
using _Guard = char;
11330+
using _Vec = uint8x16_t;
11331+
11332+
static _Vec _Load(const void* const _Src) noexcept {
11333+
return vld1q_u8(static_cast<const uint8_t*>(_Src));
11334+
}
11335+
11336+
static void _Store(void* const _Dest, const _Vec _Val) noexcept {
11337+
return vst1q_u8(static_cast<uint8_t*>(_Dest), _Val);
11338+
}
11339+
11340+
static _Vec _Set(const uint8_t _Val) noexcept {
11341+
return vdupq_n_u8(_Val);
11342+
}
11343+
11344+
static _Vec _Cmp(const _Vec _Val1, const _Vec _Val2) noexcept {
11345+
return vceqq_u8(_Val1, _Val2);
11346+
}
11347+
11348+
static bool _Check(const _Vec _Val, const _Vec _Ex1, const _Vec _Dx0) noexcept {
11349+
const auto _Ex0 = _Cmp(_Val, _Dx0);
11350+
const auto _Ex01 = vorrq_u8(_Ex0, _Ex1);
11351+
const auto _Msk = vgetq_lane_u64(vreinterpretq_u64_u8(vpminq_u8(_Ex01, _Ex01)), 0);
11352+
return _Msk == 0xFFFF'FFFF'FFFF'FFFF;
11353+
}
11354+
11355+
static uint64_t _Movemask(const _Vec _Val) noexcept {
11356+
uint64_t _Val0 = vgetq_lane_u64(vreinterpretq_u64_u8(_Val), 0);
11357+
uint64_t _Val1 = vgetq_lane_u64(vreinterpretq_u64_u8(_Val), 1);
11358+
11359+
_Val0 &= 0x8080808080808080ull;
11360+
_Val1 &= 0x8080808080808080ull;
11361+
_Val0 *= 0x02040810204081ull;
11362+
_Val1 *= 0x02040810204081ull;
11363+
11364+
return (_Val0 >> 56) | ((_Val1 >> 56) << 8);
11365+
}
11366+
11367+
static uint16_t _To_bits(const _Vec _Ex1) noexcept {
11368+
// We do not omit static here, despite DevCom-11055227, because codegen is worse - see DevCom-11056805.
11369+
static constexpr uint8_t _Idx_arr[16] = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
11370+
const auto _Idx = vld1q_u8(_Idx_arr);
11371+
11372+
const auto _Ex2 = vqtbl1q_u8(_Ex1, _Idx);
11373+
return static_cast<uint16_t>(_Movemask(_Ex2));
11374+
}
11375+
};
11376+
11377+
struct _Traits_2_neon {
11378+
using _Guard = char;
11379+
using _Vec = uint16x8_t;
11380+
11381+
static _Vec _Load(const void* const _Src) noexcept {
11382+
return vld1q_u16(static_cast<const uint16_t*>(_Src));
11383+
}
11384+
11385+
static void _Store(void* const _Dest, const _Vec _Val) noexcept {
11386+
return vst1q_u16(static_cast<uint16_t*>(_Dest), _Val);
11387+
}
11388+
11389+
static _Vec _Set(const uint16_t _Val) noexcept {
11390+
return vdupq_n_u16(_Val);
11391+
}
11392+
11393+
static _Vec _Cmp(const _Vec _Val1, const _Vec _Val2) noexcept {
11394+
return vceqq_u16(_Val1, _Val2);
11395+
}
11396+
11397+
static bool _Check(const _Vec _Val, const _Vec _Ex1, const _Vec _Dx0) noexcept {
11398+
const auto _Ex0 = _Cmp(_Val, _Dx0);
11399+
const auto _Ex01 = vorrq_u16(_Ex0, _Ex1);
11400+
const auto _Msk = vgetq_lane_u64(vreinterpretq_u64_u16(vpminq_u16(_Ex01, _Ex01)), 0);
11401+
return _Msk == 0xFFFF'FFFF'FFFF'FFFF;
11402+
}
11403+
11404+
static uint64_t _Movemask(const _Traits_1_neon::_Vec _Val) noexcept {
11405+
uint64_t _Val0 = vgetq_lane_u64(vreinterpretq_u64_u8(_Val), 0);
11406+
_Val0 &= 0x8080808080808080ull;
11407+
_Val0 *= 0x02040810204081ull;
11408+
return _Val0 >> 56;
11409+
}
11410+
11411+
static uint8_t _To_bits(const _Vec _Ex1) noexcept {
11412+
// We do not omit static here, despite DevCom-11055227, because codegen is worse - see DevCom-11056805.
11413+
static constexpr uint8_t _Idx_arr[16] = {
11414+
14, 12, 10, 8, 6, 4, 2, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
11415+
const auto _Idx = vld1q_u8(_Idx_arr);
11416+
11417+
const auto _Ex2 = vqtbl1q_u8(vreinterpretq_u8_u16(_Ex1), _Idx);
11418+
return static_cast<uint8_t>(_Movemask(_Ex2));
11419+
}
11420+
};
11421+
#else // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) / !defined(_M_ARM64) && !defined(_M_ARM64EC) vvv
1133211422
struct _Traits_avx {
1133311423
using _Guard = _Zeroupper_on_exit;
1133411424
using _Vec = __m256i;
@@ -11432,6 +11522,7 @@ namespace {
1143211522
return _mm_cmpeq_epi16(_Val, _Dx1);
1143311523
}
1143411524
};
11525+
#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^
1143511526

1143611527
template <class _Traits, class _Elem, class _OutFn>
1143711528
bool _Loop(const _Elem* const _Src, const _Elem* _Src_end, const typename _Traits::_Vec _Dx0,
@@ -11500,7 +11591,6 @@ namespace {
1150011591

1150111592
return true;
1150211593
}
11503-
#endif // ^^^ !defined(_M_ARM64EC) ^^^
1150411594

1150511595
template <class _Elem>
1150611596
bool _Fallback(void* const _Dest, const _Elem* const _Src, const size_t _Size_bytes, const size_t _Size_bits,
@@ -11533,20 +11623,19 @@ namespace {
1153311623
return true;
1153411624
}
1153511625

11626+
#if !defined(_M_ARM64) && !defined(_M_ARM64EC)
1153611627
template <class _Avx, class _Sse, class _Elem>
1153711628
bool _Dispatch(void* _Dest, const _Elem* _Src, size_t _Size_bytes, size_t _Size_bits, size_t _Size_chars,
1153811629
_Elem _Elem0, _Elem _Elem1) noexcept {
11539-
#ifndef _M_ARM64EC
1154011630
if (_Use_avx2() && _Size_bits >= 256) {
1154111631
return _Impl<_Avx>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1);
1154211632
} else if (_Use_sse42()) {
1154311633
return _Impl<_Sse>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1);
11544-
} else
11545-
#endif // ^^^ !defined(_M_ARM64EC) ^^^
11546-
{
11634+
} else {
1154711635
return _Fallback(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1);
1154811636
}
1154911637
}
11638+
#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^
1155011639
} // namespace _Bitset_from_string
1155111640
} // unnamed namespace
1155211641

@@ -11557,16 +11646,23 @@ __declspec(noalias) bool __stdcall __std_bitset_from_string_1(void* const _Dest,
1155711646
const char _Elem1) noexcept {
1155811647
using namespace _Bitset_from_string;
1155911648

11649+
#if defined(_M_ARM64) || defined(_M_ARM64EC)
11650+
return _Impl<_Traits_1_neon>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1);
11651+
#else // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) / !defined(_M_ARM64) && !defined(_M_ARM64EC) vvv
1156011652
return _Dispatch<_Traits_1_avx, _Traits_1_sse>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1);
11653+
#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^
1156111654
}
1156211655

1156311656
__declspec(noalias) bool __stdcall __std_bitset_from_string_2(void* const _Dest, const wchar_t* const _Src,
1156411657
const size_t _Size_bytes, const size_t _Size_bits, const size_t _Size_chars, const wchar_t _Elem0,
1156511658
const wchar_t _Elem1) noexcept {
1156611659
using namespace _Bitset_from_string;
1156711660

11661+
#if defined(_M_ARM64) || defined(_M_ARM64EC)
11662+
return _Impl<_Traits_2_neon>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1);
11663+
#else // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) / !defined(_M_ARM64) && !defined(_M_ARM64EC) vvv
1156811664
return _Dispatch<_Traits_2_avx, _Traits_2_sse>(_Dest, _Src, _Size_bytes, _Size_bits, _Size_chars, _Elem0, _Elem1);
11665+
#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^
1156911666
}
1157011667

1157111668
} // extern "C"
11572-
#endif // ^^^ !defined(_M_ARM64) ^^^

0 commit comments

Comments
 (0)