diff --git a/Inc/DirectXMathVector.inl b/Inc/DirectXMathVector.inl index 2b0ce98..20bbab1 100644 --- a/Inc/DirectXMathVector.inl +++ b/Inc/DirectXMathVector.inl @@ -1239,18 +1239,23 @@ inline XMVECTOR XM_CALLCONV XMVectorSwizzle unsigned int elem[4] = { E0, E1, E2, E3 }; __m128i vControl = _mm_loadu_si128(reinterpret_cast(&elem[0])); return _mm_permutevar_ps(V, vControl); +#else +#if defined(__GNUC__) && !defined(__clang__) + // workaround some GCC optimization behavior that breaks this function + XMVECTORU32 T; + T.v = V; + auto aPtr = reinterpret_cast(&T); #else auto aPtr = reinterpret_cast(&V); +#endif - XMVECTOR Result; - auto pWork = reinterpret_cast(&Result); - - pWork[0] = aPtr[E0]; - pWork[1] = aPtr[E1]; - pWork[2] = aPtr[E2]; - pWork[3] = aPtr[E3]; + XMVECTORU32 vResult; + vResult.u[0] = aPtr[E0]; + vResult.u[1] = aPtr[E1]; + vResult.u[2] = aPtr[E2]; + vResult.u[3] = aPtr[E3]; - return Result; + return vResult.v; #endif } @@ -1313,29 +1318,38 @@ inline XMVECTOR XM_CALLCONV XMVectorPermute #else const uint32_t* aPtr[2]; + +#if defined(__GNUC__) && !defined(__clang__) + // workaround some GCC optimization behavior that breaks this function + XMVECTORU32 T1; + T1.v = V1; + XMVECTORU32 T2; + T2.v = V2; + aPtr[0] = reinterpret_cast(&T1); + aPtr[1] = reinterpret_cast(&T2); +#else aPtr[0] = reinterpret_cast(&V1); aPtr[1] = reinterpret_cast(&V2); +#endif - XMVECTOR Result; - auto pWork = reinterpret_cast(&Result); - + XMVECTORU32 vResult; const uint32_t i0 = PermuteX & 3; const uint32_t vi0 = PermuteX >> 2; - pWork[0] = aPtr[vi0][i0]; + vResult.u[0] = aPtr[vi0][i0]; const uint32_t i1 = PermuteY & 3; const uint32_t vi1 = PermuteY >> 2; - pWork[1] = aPtr[vi1][i1]; + vResult.u[1] = aPtr[vi1][i1]; const uint32_t i2 = PermuteZ & 3; const uint32_t vi2 = PermuteZ >> 2; - pWork[2] = aPtr[vi2][i2]; + vResult.u[2] = aPtr[vi2][i2]; const uint32_t i3 = PermuteW & 3; const uint32_t vi3 = PermuteW >> 2; - pWork[3] = aPtr[vi3][i3]; + vResult.u[3] = aPtr[vi3][i3]; - return Result; + return vResult.v; #endif }