Skip to content

Commit db292cf

Browse files
1 parent dd33861 commit db292cf

2 files changed

Lines changed: 98 additions & 19 deletions

File tree

stl/inc/xutility

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ _STL_DISABLE_CLANG_WARNINGS
7575

7676
#define _VECTORIZED_ADJACENT_FIND _VECTORIZED_FOR_X64_X86_ARM64_ARM64EC
7777
#define _VECTORIZED_BITSET_FROM_STRING _VECTORIZED_FOR_X64_X86
78-
#define _VECTORIZED_BITSET_TO_STRING _VECTORIZED_FOR_X64_X86
78+
#define _VECTORIZED_BITSET_TO_STRING _VECTORIZED_FOR_X64_X86_ARM64_ARM64EC
7979
#define _VECTORIZED_COUNT _VECTORIZED_FOR_X64_X86_ARM64_ARM64EC
8080
#define _VECTORIZED_FIND _VECTORIZED_FOR_X64_X86_ARM64_ARM64EC
8181
#define _VECTORIZED_FIND_END _VECTORIZED_FOR_X64_X86_ARM64_ARM64EC

stl/src/vector_algorithms.cpp

Lines changed: 97 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11069,15 +11069,70 @@ __declspec(noalias) bool __stdcall __std_includes_less_8u(
1106911069

1107011070
} // extern "C"
1107111071

11072-
#ifndef _M_ARM64
1107311072
namespace {
1107411073
namespace _Bitset_to_string {
11075-
#ifdef _M_ARM64EC
11076-
using _Traits_1_avx = void;
11077-
using _Traits_1_sse = void;
11078-
using _Traits_2_avx = void;
11079-
using _Traits_2_sse = void;
11080-
#else // ^^^ defined(_M_ARM64EC) / !defined(_M_ARM64EC) vvv
11074+
#if defined(_M_ARM64) || defined(_M_ARM64EC)
11075+
struct _Traits_1_neon {
11076+
using _Value_type = uint16_t;
11077+
using _Vec_t = uint8x16_t;
11078+
static void _Exit_vectorized() noexcept {}
11079+
11080+
static void _Out(void* const _Dest, const _Vec_t _Elems) noexcept {
11081+
vst1q_u8(static_cast<uint8_t*>(_Dest), _Elems);
11082+
}
11083+
11084+
static _Vec_t _Set(const uint8_t _Val) noexcept {
11085+
return vdupq_n_u8(_Val);
11086+
}
11087+
11088+
static _Vec_t _Load_constant() noexcept {
11089+
// We do not omit static here, despite DevCom-11055227, because codegen is worse - see DevCom-11056805.
11090+
static constexpr uint32_t _Idx_arr[4] = {0x01010101, 0x01010101, 0x00000000, 0x00000000};
11091+
const auto _Idx = vld1q_u8(reinterpret_cast<const uint8_t*>(_Idx_arr));
11092+
return _Idx;
11093+
}
11094+
11095+
static _Vec_t __forceinline _Step(
11096+
const uint16_t _Val, const _Vec_t _Px0, const _Vec_t _Px1, const _Vec_t _Idx) noexcept {
11097+
const auto _Vx0 = vdupq_n_u16(_Val);
11098+
const auto _Vx1 = vqtbl1q_u8(vreinterpretq_u8_u16(_Vx0), _Idx);
11099+
const auto _Msk = vandq_u8(_Vx1, vreinterpretq_u8_u64(vdupq_n_u64(0x0102040810204080)));
11100+
const auto _Ex0 = vceqq_u8(_Msk, vdupq_n_u8(0));
11101+
const auto _Ex1 = vbslq_u8(_Ex0, _Px0, _Px1);
11102+
return _Ex1;
11103+
}
11104+
};
11105+
11106+
struct _Traits_2_neon {
11107+
using _Value_type = uint8_t;
11108+
using _Vec_t = uint16x8_t;
11109+
static void _Exit_vectorized() noexcept {}
11110+
11111+
static void _Out(void* const _Dest, const _Vec_t _Elems) noexcept {
11112+
vst1q_u16(static_cast<uint16_t*>(_Dest), _Elems);
11113+
}
11114+
11115+
static _Vec_t _Set(const uint16_t _Val) noexcept {
11116+
return vdupq_n_u16(_Val);
11117+
}
11118+
11119+
static _Vec_t _Load_constant() noexcept {
11120+
// We do not omit static here, despite DevCom-11055227, because codegen is worse - see DevCom-11056805.
11121+
static constexpr uint64_t _Wx_arr[2] = {0x0010002000400080, 0x0001000200040008};
11122+
const auto _Wx = vld1q_u64(_Wx_arr);
11123+
return vreinterpretq_u16_u64(_Wx);
11124+
}
11125+
11126+
static _Vec_t __forceinline _Step(
11127+
const uint8_t _Val, const _Vec_t _Px0, const _Vec_t _Px1, const _Vec_t _Wx) noexcept {
11128+
const auto _Vx0 = vdupq_n_u16(static_cast<uint16_t>(_Val));
11129+
const auto _Msk = vandq_u16(_Vx0, _Wx);
11130+
const auto _Ex0 = vceqq_u16(_Msk, vdupq_n_u16(0));
11131+
const auto _Ex1 = vbslq_u16(_Ex0, _Px0, _Px1);
11132+
return _Ex1;
11133+
}
11134+
};
11135+
#else // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) / !defined(_M_ARM64) && !defined(_M_ARM64EC) vvv
1108111136
struct _Traits_avx {
1108211137
static void _Out(void* const _Dest, const __m256i _Elems) noexcept {
1108311138
_mm256_storeu_si256(static_cast<__m256i*>(_Dest), _Elems);
@@ -11086,6 +11141,10 @@ namespace {
1108611141
static void _Exit_vectorized() noexcept {
1108711142
_mm256_zeroupper();
1108811143
}
11144+
11145+
static int _Load_constant() noexcept {
11146+
return 0;
11147+
}
1108911148
};
1109011149

1109111150
struct _Traits_sse {
@@ -11094,6 +11153,10 @@ namespace {
1109411153
}
1109511154

1109611155
static void _Exit_vectorized() noexcept {}
11156+
11157+
static int _Load_constant() noexcept {
11158+
return 0;
11159+
}
1109711160
};
1109811161

1109911162
struct _Traits_1_avx : _Traits_avx {
@@ -11103,7 +11166,8 @@ namespace {
1110311166
return _mm256_broadcastb_epi8(_mm_cvtsi32_si128(_Val));
1110411167
}
1110511168

11106-
static __m256i __forceinline _Step(const uint32_t _Val, const __m256i _Px0, const __m256i _Px1) noexcept {
11169+
static __m256i __forceinline _Step(
11170+
const uint32_t _Val, const __m256i _Px0, const __m256i _Px1, int) noexcept {
1110711171
const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
1110811172
const __m128i _Vx1 =
1110911173
_mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x01010101, 0x02020202, 0x03030303));
@@ -11123,7 +11187,8 @@ namespace {
1112311187
return _mm_shuffle_epi8(_mm_cvtsi32_si128(_Val), _mm_setzero_si128());
1112411188
}
1112511189

11126-
static __m128i __forceinline _Step(const uint16_t _Val, const __m128i _Px0, const __m128i _Px1) noexcept {
11190+
static __m128i __forceinline _Step(
11191+
const uint16_t _Val, const __m128i _Px0, const __m128i _Px1, int) noexcept {
1112711192
const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
1112811193
const __m128i _Vx1 =
1112911194
_mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x00000000, 0x01010101, 0x01010101));
@@ -11141,7 +11206,8 @@ namespace {
1114111206
return _mm256_broadcastw_epi16(_mm_cvtsi32_si128(_Val));
1114211207
}
1114311208

11144-
static __m256i __forceinline _Step(const uint16_t _Val, const __m256i _Px0, const __m256i _Px1) noexcept {
11209+
static __m256i __forceinline _Step(
11210+
const uint16_t _Val, const __m256i _Px0, const __m256i _Px1, int) noexcept {
1114511211
const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
1114611212
const __m128i _Vx1 =
1114711213
_mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x00000000, 0x01010101, 0x01010101));
@@ -11162,14 +11228,16 @@ namespace {
1116211228
return _mm_set1_epi16(_Val);
1116311229
}
1116411230

11165-
static __m128i __forceinline _Step(const uint8_t _Val, const __m128i _Px0, const __m128i _Px1) noexcept {
11231+
static __m128i __forceinline _Step(
11232+
const uint8_t _Val, const __m128i _Px0, const __m128i _Px1, int) noexcept {
1116611233
const __m128i _Vx = _mm_set1_epi16(_Val);
1116711234
const __m128i _Msk = _mm_and_si128(_Vx, _mm_set_epi64x(0x0001000200040008, 0x0010002000400080));
1116811235
const __m128i _Ex0 = _mm_cmpeq_epi16(_Msk, _mm_setzero_si128());
1116911236
const __m128i _Ex1 = _mm_blendv_epi8(_Px1, _Px0, _Ex0);
1117011237
return _Ex1;
1117111238
}
1117211239
};
11240+
#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^
1117311241

1117411242
template <class _Traits, class _Elem>
1117511243
void __stdcall _Impl(
@@ -11178,14 +11246,18 @@ namespace {
1117811246

1117911247
const auto _Px0 = _Traits::_Set(_Elem0);
1118011248
const auto _Px1 = _Traits::_Set(_Elem1);
11249+
11250+
// TRANSITION: DevCom-11056873
11251+
const auto _Constant = _Traits::_Load_constant();
11252+
1118111253
if (_Size_bits >= _Step_size_bits) {
1118211254
_Elem* _Pos = _Dest + _Size_bits;
1118311255
_Size_bits &= _Step_size_bits - 1;
1118411256
_Elem* const _Stop_at = _Dest + _Size_bits;
1118511257
do {
1118611258
typename _Traits::_Value_type _Val;
1118711259
memcpy(&_Val, _Src, sizeof(_Val));
11188-
const auto _Elems = _Traits::_Step(_Val, _Px0, _Px1);
11260+
const auto _Elems = _Traits::_Step(_Val, _Px0, _Px1, _Constant);
1118911261
_Pos -= _Step_size_bits;
1119011262
_Traits::_Out(_Pos, _Elems);
1119111263
_Advance_bytes(_Src, sizeof(_Val));
@@ -11195,7 +11267,7 @@ namespace {
1119511267
if (_Size_bits > 0) {
1119611268
typename _Traits::_Value_type _Val;
1119711269
memcpy(&_Val, _Src, sizeof(_Val));
11198-
const auto _Elems = _Traits::_Step(_Val, _Px0, _Px1);
11270+
const auto _Elems = _Traits::_Step(_Val, _Px0, _Px1, _Constant);
1119911271
_Elem _Tmp[_Step_size_bits];
1120011272
_Traits::_Out(_Tmp, _Elems);
1120111273
const _Elem* const _Tmpd = _Tmp + (_Step_size_bits - _Size_bits);
@@ -11204,25 +11276,23 @@ namespace {
1120411276

1120511277
_Traits::_Exit_vectorized(); // TRANSITION, DevCom-10331414
1120611278
}
11207-
#endif // ^^^ !defined(_M_ARM64EC) ^^^
1120811279

11280+
#if !defined(_M_ARM64) && !defined(_M_ARM64EC)
1120911281
template <class _Avx_traits, class _Sse_traits, class _Elem>
1121011282
void __stdcall _Dispatch(_Elem* const _Dest, const void* const _Src, const size_t _Size_bits,
1121111283
const _Elem _Elem0, const _Elem _Elem1) noexcept {
11212-
#ifndef _M_ARM64EC
1121311284
if (_Use_avx2() && _Size_bits >= 256) {
1121411285
_Impl<_Avx_traits>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
1121511286
} else if (_Use_sse42()) {
1121611287
_Impl<_Sse_traits>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
11217-
} else
11218-
#endif // ^^^ !defined(_M_ARM64EC) ^^^
11219-
{
11288+
} else {
1122011289
const auto _Arr = reinterpret_cast<const uint8_t*>(_Src);
1122111290
for (size_t _Ix = 0; _Ix < _Size_bits; ++_Ix) {
1122211291
_Dest[_Size_bits - 1 - _Ix] = ((_Arr[_Ix >> 3] >> (_Ix & 7)) & 1) != 0 ? _Elem1 : _Elem0;
1122311292
}
1122411293
}
1122511294
}
11295+
#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^
1122611296
} // namespace _Bitset_to_string
1122711297
} // unnamed namespace
1122811298

@@ -11231,17 +11301,26 @@ extern "C" {
1123111301
__declspec(noalias) void __stdcall __std_bitset_to_string_1(
1123211302
char* const _Dest, const void* const _Src, const size_t _Size_bits, const char _Elem0, const char _Elem1) noexcept {
1123311303
using namespace _Bitset_to_string;
11304+
#if defined(_M_ARM64) || defined(_M_ARM64EC)
11305+
_Impl<_Traits_1_neon>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
11306+
#else // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) / !defined(_M_ARM64) && !defined(_M_ARM64EC) vvv
1123411307
_Dispatch<_Traits_1_avx, _Traits_1_sse>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
11308+
#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^
1123511309
}
1123611310

1123711311
__declspec(noalias) void __stdcall __std_bitset_to_string_2(wchar_t* const _Dest, const void* const _Src,
1123811312
const size_t _Size_bits, const wchar_t _Elem0, const wchar_t _Elem1) noexcept {
1123911313
using namespace _Bitset_to_string;
11314+
#if defined(_M_ARM64) || defined(_M_ARM64EC)
11315+
_Impl<_Traits_2_neon>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
11316+
#else // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) / !defined(_M_ARM64) && !defined(_M_ARM64EC) vvv
1124011317
_Dispatch<_Traits_2_avx, _Traits_2_sse>(_Dest, _Src, _Size_bits, _Elem0, _Elem1);
11318+
#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^
1124111319
}
1124211320

1124311321
} // extern "C"
1124411322

11323+
#ifndef _M_ARM64
1124511324
namespace {
1124611325
namespace _Bitset_from_string {
1124711326
#ifdef _M_ARM64EC

0 commit comments

Comments
 (0)