Skip to content

Commit a744028

Browse files
committed
Tweaks
1 parent e89075a commit a744028

2 files changed

Lines changed: 24 additions & 24 deletions

File tree

source/TS.NET/Processing/ShuffleI16.cs

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@ public static void FourChannels(ReadOnlySpan<short> input, Span<short> output)
3535
short* finishPtr = inputP + input.Length;
3636
while (inputPtr < finishPtr)
3737
{
38+
// Loads are non-temporal because the data is never used again. Store as normal.
3839
var loaded1 = Vector256.LoadAlignedNonTemporal(inputPtr);
3940
var loaded2 = Vector256.LoadAlignedNonTemporal(inputPtr + Vector256<short>.Count);
4041
var loaded3 = Vector256.LoadAlignedNonTemporal(inputPtr + Vector256<short>.Count * 2);
@@ -55,15 +56,14 @@ public static void FourChannels(ReadOnlySpan<short> input, Span<short> output)
5556
var channel2 = Avx2.PermuteVar8x32(unpackHigh3, permuteMask).AsInt16();
5657
var channel3 = Avx2.PermuteVar8x32(unpackLow4, permuteMask).AsInt16();
5758
var channel4 = Avx2.PermuteVar8x32(unpackHigh4, permuteMask).AsInt16();
58-
Vector256.StoreAlignedNonTemporal(channel1, outputPtr);
59-
Vector256.StoreAlignedNonTemporal(channel2, outputPtr + ch2Offset);
60-
Vector256.StoreAlignedNonTemporal(channel3, outputPtr + ch3Offset);
61-
Vector256.StoreAlignedNonTemporal(channel4, outputPtr + ch4Offset);
59+
Vector256.StoreAligned(channel1, outputPtr);
60+
Vector256.StoreAligned(channel2, outputPtr + ch2Offset);
61+
Vector256.StoreAligned(channel3, outputPtr + ch3Offset);
62+
Vector256.StoreAligned(channel4, outputPtr + ch4Offset);
6263
inputPtr += processingLength;
6364
outputPtr += Vector256<short>.Count;
6465
}
6566
}
66-
Sse2.MemoryFence();
6767
}
6868
}
6969
else
@@ -126,6 +126,7 @@ public static void TwoChannels(ReadOnlySpan<short> input, Span<short> output)
126126
short* finishPtr = inputP + input.Length;
127127
while (inputPtr < finishPtr)
128128
{
129+
// Loads are non-temporal because the data is never used again. Store as normal.
129130
var loaded1 = Vector256.LoadAlignedNonTemporal(inputPtr);
130131
var loaded2 = Vector256.LoadAlignedNonTemporal(inputPtr + Vector256<short>.Count);
131132
var shuffle1 = Avx2.Shuffle(loaded1.AsSByte(), shuffleMask);
@@ -134,13 +135,12 @@ public static void TwoChannels(ReadOnlySpan<short> input, Span<short> output)
134135
var permuted2 = Avx2.PermuteVar8x32(shuffle2.AsInt32(), permuteMask);
135136
var channel1 = Avx2.Permute2x128(permuted1, permuted2, 0x20).AsInt16();
136137
var channel2 = Avx2.Permute2x128(permuted1, permuted2, 0x31).AsInt16();
137-
Vector256.StoreAlignedNonTemporal(channel1, outputPtr);
138-
Vector256.StoreAlignedNonTemporal(channel2, outputPtr + ch2Offset);
138+
Vector256.StoreAligned(channel1, outputPtr);
139+
Vector256.StoreAligned(channel2, outputPtr + ch2Offset);
139140
inputPtr += processingLength;
140141
outputPtr += Vector256<short>.Count;
141142
}
142143
}
143-
Sse2.MemoryFence();
144144
}
145145
}
146146
else

source/TS.NET/Processing/ShuffleI8.cs

Lines changed: 16 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@ public static void FourChannels(ReadOnlySpan<sbyte> input, Span<sbyte> output)
3636
sbyte* finishPtr = inputP + input.Length;
3737
while (inputPtr < finishPtr)
3838
{
39+
// Loads are non-temporal because the data is never used again. Store as normal.
3940
var loaded1 = Vector256.LoadAlignedNonTemporal(inputPtr);
4041
var loaded2 = Vector256.LoadAlignedNonTemporal(inputPtr + Vector256<sbyte>.Count);
4142
var loaded3 = Vector256.LoadAlignedNonTemporal(inputPtr + Vector256<sbyte>.Count * 2);
@@ -56,15 +57,14 @@ public static void FourChannels(ReadOnlySpan<sbyte> input, Span<sbyte> output)
5657
var channel2 = Avx2.PermuteVar8x32(unpackHigh3, permuteMask).AsSByte();
5758
var channel3 = Avx2.PermuteVar8x32(unpackLow4, permuteMask).AsSByte();
5859
var channel4 = Avx2.PermuteVar8x32(unpackHigh4, permuteMask).AsSByte();
59-
Vector256.StoreAlignedNonTemporal(channel1, outputPtr);
60-
Vector256.StoreAlignedNonTemporal(channel2, outputPtr + ch2Offset);
61-
Vector256.StoreAlignedNonTemporal(channel3, outputPtr + ch3Offset);
62-
Vector256.StoreAlignedNonTemporal(channel4, outputPtr + ch4Offset);
60+
Vector256.StoreAligned(channel1, outputPtr);
61+
Vector256.StoreAligned(channel2, outputPtr + ch2Offset);
62+
Vector256.StoreAligned(channel3, outputPtr + ch3Offset);
63+
Vector256.StoreAligned(channel4, outputPtr + ch4Offset);
6364
inputPtr += processingLength;
6465
outputPtr += Vector256<sbyte>.Count;
6566
}
6667
}
67-
Sse2.MemoryFence();
6868
}
6969
}
7070
else if (Ssse3.IsSupported)
@@ -86,6 +86,7 @@ public static void FourChannels(ReadOnlySpan<sbyte> input, Span<sbyte> output)
8686
sbyte* finishPtr = inputP + input.Length;
8787
while (inputPtr < finishPtr)
8888
{
89+
// Loads are non-temporal because the data is never used again. Store as normal.
8990
var loaded1 = Vector128.LoadAlignedNonTemporal(inputPtr);
9091
var loaded2 = Vector128.LoadAlignedNonTemporal(inputPtr + Vector128<sbyte>.Count);
9192
var loaded3 = Vector128.LoadAlignedNonTemporal(inputPtr + Vector128<sbyte>.Count * 2);
@@ -102,15 +103,14 @@ public static void FourChannels(ReadOnlySpan<sbyte> input, Span<sbyte> output)
102103
var unpackHigh2 = Sse2.UnpackHigh(shuffle3.AsUInt32(), shuffle4.AsUInt32()).AsUInt64();
103104
var channel3 = Sse2.UnpackLow(unpackHigh.AsUInt64(), unpackHigh2.AsUInt64()).AsSByte();
104105
var channel4 = Sse2.UnpackHigh(unpackHigh.AsUInt64(), unpackHigh2.AsUInt64()).AsSByte();
105-
Vector128.StoreAlignedNonTemporal(channel1, outputPtr);
106-
Vector128.StoreAlignedNonTemporal(channel2, outputPtr + ch2Offset);
107-
Vector128.StoreAlignedNonTemporal(channel3, outputPtr + ch3Offset);
108-
Vector128.StoreAlignedNonTemporal(channel4, outputPtr + ch4Offset);
106+
Vector128.StoreAligned(channel1, outputPtr);
107+
Vector128.StoreAligned(channel2, outputPtr + ch2Offset);
108+
Vector128.StoreAligned(channel3, outputPtr + ch3Offset);
109+
Vector128.StoreAligned(channel4, outputPtr + ch4Offset);
109110
inputPtr += processingLength;
110111
outputPtr += Vector128<sbyte>.Count;
111112
}
112113
}
113-
Sse2.MemoryFence();
114114
}
115115
}
116116
else if (AdvSimd.Arm64.IsSupported)
@@ -207,6 +207,7 @@ public static void TwoChannels(ReadOnlySpan<sbyte> input, Span<sbyte> output)
207207
sbyte* finishPtr = inputP + input.Length;
208208
while (inputPtr < finishPtr)
209209
{
210+
// Loads are non-temporal because the data is never used again. Store as normal.
210211
var loaded1 = Vector256.LoadAlignedNonTemporal(inputPtr);
211212
var loaded2 = Vector256.LoadAlignedNonTemporal(inputPtr + Vector256<sbyte>.Count);
212213
var shuffle1 = Avx2.Shuffle(loaded1, shuffleMask);
@@ -215,13 +216,12 @@ public static void TwoChannels(ReadOnlySpan<sbyte> input, Span<sbyte> output)
215216
var permuted2 = Avx2.PermuteVar8x32(shuffle2.AsInt32(), permuteMask);
216217
var channel1 = Avx2.Permute2x128(permuted1, permuted2, 0x20).AsSByte();
217218
var channel2 = Avx2.Permute2x128(permuted1, permuted2, 0x31).AsSByte();
218-
Vector256.StoreAlignedNonTemporal(channel1, outputPtr);
219-
Vector256.StoreAlignedNonTemporal(channel2, outputPtr + ch2Offset);
219+
Vector256.StoreAligned(channel1, outputPtr);
220+
Vector256.StoreAligned(channel2, outputPtr + ch2Offset);
220221
inputPtr += processingLength;
221222
outputPtr += Vector256<sbyte>.Count;
222223
}
223224
}
224-
Sse2.MemoryFence();
225225
}
226226
}
227227
else if (Ssse3.IsSupported)
@@ -242,19 +242,19 @@ public static void TwoChannels(ReadOnlySpan<sbyte> input, Span<sbyte> output)
242242
sbyte* finishPtr = inputP + input.Length;
243243
while (inputPtr < finishPtr)
244244
{
245+
// Loads are non-temporal because the data is never used again. Store as normal.
245246
var loaded1 = Vector128.LoadAlignedNonTemporal(inputPtr);
246247
var loaded2 = Vector128.LoadAlignedNonTemporal(inputPtr + Vector128<sbyte>.Count);
247248
var shuffle1 = Ssse3.Shuffle(loaded1, shuffleMask);
248249
var shuffle2 = Ssse3.Shuffle(loaded2, shuffleMask);
249250
var channel1 = Sse2.UnpackLow(shuffle1.AsUInt64(), shuffle2.AsUInt64()).AsSByte();
250251
var channel2 = Sse2.UnpackHigh(shuffle1.AsUInt64(), shuffle2.AsUInt64()).AsSByte();
251-
Vector128.StoreAlignedNonTemporal(channel1, outputPtr);
252-
Vector128.StoreAlignedNonTemporal(channel2, outputPtr + ch2Offset);
252+
Vector128.StoreAligned(channel1, outputPtr);
253+
Vector128.StoreAligned(channel2, outputPtr + ch2Offset);
253254
inputPtr += processingLength;
254255
outputPtr += Vector128<sbyte>.Count;
255256
}
256257
}
257-
Sse2.MemoryFence();
258258
}
259259
}
260260
else if (AdvSimd.Arm64.IsSupported)

0 commit comments

Comments
 (0)