@@ -36,6 +36,7 @@ public static void FourChannels(ReadOnlySpan<sbyte> input, Span<sbyte> output)
3636 sbyte * finishPtr = inputP + input . Length ;
3737 while ( inputPtr < finishPtr )
3838 {
39+ // The input is loaded non-temporally because it is never read again; the output is written with regular aligned stores.
3940 var loaded1 = Vector256 . LoadAlignedNonTemporal ( inputPtr ) ;
4041 var loaded2 = Vector256 . LoadAlignedNonTemporal ( inputPtr + Vector256 < sbyte > . Count ) ;
4142 var loaded3 = Vector256 . LoadAlignedNonTemporal ( inputPtr + Vector256 < sbyte > . Count * 2 ) ;
@@ -56,15 +57,14 @@ public static void FourChannels(ReadOnlySpan<sbyte> input, Span<sbyte> output)
5657 var channel2 = Avx2 . PermuteVar8x32 ( unpackHigh3 , permuteMask ) . AsSByte ( ) ;
5758 var channel3 = Avx2 . PermuteVar8x32 ( unpackLow4 , permuteMask ) . AsSByte ( ) ;
5859 var channel4 = Avx2 . PermuteVar8x32 ( unpackHigh4 , permuteMask ) . AsSByte ( ) ;
59- Vector256 . StoreAlignedNonTemporal ( channel1 , outputPtr ) ;
60- Vector256 . StoreAlignedNonTemporal ( channel2 , outputPtr + ch2Offset ) ;
61- Vector256 . StoreAlignedNonTemporal ( channel3 , outputPtr + ch3Offset ) ;
62- Vector256 . StoreAlignedNonTemporal ( channel4 , outputPtr + ch4Offset ) ;
60+ Vector256 . StoreAligned ( channel1 , outputPtr ) ;
61+ Vector256 . StoreAligned ( channel2 , outputPtr + ch2Offset ) ;
62+ Vector256 . StoreAligned ( channel3 , outputPtr + ch3Offset ) ;
63+ Vector256 . StoreAligned ( channel4 , outputPtr + ch4Offset ) ;
6364 inputPtr += processingLength ;
6465 outputPtr += Vector256 < sbyte > . Count ;
6566 }
6667 }
67- Sse2 . MemoryFence ( ) ;
6868 }
6969 }
7070 else if ( Ssse3 . IsSupported )
@@ -86,6 +86,7 @@ public static void FourChannels(ReadOnlySpan<sbyte> input, Span<sbyte> output)
8686 sbyte * finishPtr = inputP + input . Length ;
8787 while ( inputPtr < finishPtr )
8888 {
89+ // The input is loaded non-temporally because it is never read again; the output is written with regular aligned stores.
8990 var loaded1 = Vector128 . LoadAlignedNonTemporal ( inputPtr ) ;
9091 var loaded2 = Vector128 . LoadAlignedNonTemporal ( inputPtr + Vector128 < sbyte > . Count ) ;
9192 var loaded3 = Vector128 . LoadAlignedNonTemporal ( inputPtr + Vector128 < sbyte > . Count * 2 ) ;
@@ -102,15 +103,14 @@ public static void FourChannels(ReadOnlySpan<sbyte> input, Span<sbyte> output)
102103 var unpackHigh2 = Sse2 . UnpackHigh ( shuffle3 . AsUInt32 ( ) , shuffle4 . AsUInt32 ( ) ) . AsUInt64 ( ) ;
103104 var channel3 = Sse2 . UnpackLow ( unpackHigh . AsUInt64 ( ) , unpackHigh2 . AsUInt64 ( ) ) . AsSByte ( ) ;
104105 var channel4 = Sse2 . UnpackHigh ( unpackHigh . AsUInt64 ( ) , unpackHigh2 . AsUInt64 ( ) ) . AsSByte ( ) ;
105- Vector128 . StoreAlignedNonTemporal ( channel1 , outputPtr ) ;
106- Vector128 . StoreAlignedNonTemporal ( channel2 , outputPtr + ch2Offset ) ;
107- Vector128 . StoreAlignedNonTemporal ( channel3 , outputPtr + ch3Offset ) ;
108- Vector128 . StoreAlignedNonTemporal ( channel4 , outputPtr + ch4Offset ) ;
106+ Vector128 . StoreAligned ( channel1 , outputPtr ) ;
107+ Vector128 . StoreAligned ( channel2 , outputPtr + ch2Offset ) ;
108+ Vector128 . StoreAligned ( channel3 , outputPtr + ch3Offset ) ;
109+ Vector128 . StoreAligned ( channel4 , outputPtr + ch4Offset ) ;
109110 inputPtr += processingLength ;
110111 outputPtr += Vector128 < sbyte > . Count ;
111112 }
112113 }
113- Sse2 . MemoryFence ( ) ;
114114 }
115115 }
116116 else if ( AdvSimd . Arm64 . IsSupported )
@@ -207,6 +207,7 @@ public static void TwoChannels(ReadOnlySpan<sbyte> input, Span<sbyte> output)
207207 sbyte * finishPtr = inputP + input . Length ;
208208 while ( inputPtr < finishPtr )
209209 {
210+ // The input is loaded non-temporally because it is never read again; the output is written with regular aligned stores.
210211 var loaded1 = Vector256 . LoadAlignedNonTemporal ( inputPtr ) ;
211212 var loaded2 = Vector256 . LoadAlignedNonTemporal ( inputPtr + Vector256 < sbyte > . Count ) ;
212213 var shuffle1 = Avx2 . Shuffle ( loaded1 , shuffleMask ) ;
@@ -215,13 +216,12 @@ public static void TwoChannels(ReadOnlySpan<sbyte> input, Span<sbyte> output)
215216 var permuted2 = Avx2 . PermuteVar8x32 ( shuffle2 . AsInt32 ( ) , permuteMask ) ;
216217 var channel1 = Avx2 . Permute2x128 ( permuted1 , permuted2 , 0x20 ) . AsSByte ( ) ;
217218 var channel2 = Avx2 . Permute2x128 ( permuted1 , permuted2 , 0x31 ) . AsSByte ( ) ;
218- Vector256 . StoreAlignedNonTemporal ( channel1 , outputPtr ) ;
219- Vector256 . StoreAlignedNonTemporal ( channel2 , outputPtr + ch2Offset ) ;
219+ Vector256 . StoreAligned ( channel1 , outputPtr ) ;
220+ Vector256 . StoreAligned ( channel2 , outputPtr + ch2Offset ) ;
220221 inputPtr += processingLength ;
221222 outputPtr += Vector256 < sbyte > . Count ;
222223 }
223224 }
224- Sse2 . MemoryFence ( ) ;
225225 }
226226 }
227227 else if ( Ssse3 . IsSupported )
@@ -242,19 +242,19 @@ public static void TwoChannels(ReadOnlySpan<sbyte> input, Span<sbyte> output)
242242 sbyte * finishPtr = inputP + input . Length ;
243243 while ( inputPtr < finishPtr )
244244 {
245+ // The input is loaded non-temporally because it is never read again; the output is written with regular aligned stores.
245246 var loaded1 = Vector128 . LoadAlignedNonTemporal ( inputPtr ) ;
246247 var loaded2 = Vector128 . LoadAlignedNonTemporal ( inputPtr + Vector128 < sbyte > . Count ) ;
247248 var shuffle1 = Ssse3 . Shuffle ( loaded1 , shuffleMask ) ;
248249 var shuffle2 = Ssse3 . Shuffle ( loaded2 , shuffleMask ) ;
249250 var channel1 = Sse2 . UnpackLow ( shuffle1 . AsUInt64 ( ) , shuffle2 . AsUInt64 ( ) ) . AsSByte ( ) ;
250251 var channel2 = Sse2 . UnpackHigh ( shuffle1 . AsUInt64 ( ) , shuffle2 . AsUInt64 ( ) ) . AsSByte ( ) ;
251- Vector128 . StoreAlignedNonTemporal ( channel1 , outputPtr ) ;
252- Vector128 . StoreAlignedNonTemporal ( channel2 , outputPtr + ch2Offset ) ;
252+ Vector128 . StoreAligned ( channel1 , outputPtr ) ;
253+ Vector128 . StoreAligned ( channel2 , outputPtr + ch2Offset ) ;
253254 inputPtr += processingLength ;
254255 outputPtr += Vector128 < sbyte > . Count ;
255256 }
256257 }
257- Sse2 . MemoryFence ( ) ;
258258 }
259259 }
260260 else if ( AdvSimd . Arm64 . IsSupported )
0 commit comments