Skip to content

Commit 8e7c879

Browse files
committed
AVX2 16-bit shuffle paths
1 parent 1ff1b0f commit 8e7c879

5 files changed

Lines changed: 239 additions & 59 deletions

File tree

source/TS.NET.Engine/EngineManager.cs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -146,7 +146,7 @@ public bool TryStart(string configurationFile, string calibrationFile, string de
146146
{
147147
AdcChannelMode = AdcChannelMode.Single,
148148
EnabledChannels = 0x01,
149-
SampleRateHz = 1000000000,
149+
SampleRateHz = 1_000_000_000,
150150
Resolution = AdcResolution.EightBit
151151
};
152152
initialHardwareConfiguration.Frontend[0] = ThunderscopeChannelFrontend.Default();

source/TS.NET.Engine/Threads/ProcessingThread.cs

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -102,7 +102,7 @@ private static unsafe void Loop(
102102
};
103103
var processingConfig = new ThunderscopeProcessingConfig
104104
{
105-
ChannelDataLength = 1000,
105+
ChannelDataLength = 1_000_000,
106106
ChannelDataType = initialChannelDataType,
107107
Mode = Mode.Normal, // Temporary, change back to AUTO when NotImplementedException fixed
108108
TriggerChannel = TriggerChannel.Channel1,
@@ -590,7 +590,6 @@ private static unsafe void Loop(
590590
periodicReadBytes += preShuffleMemory.LengthBytes;
591591
periodicReadSamplesPerChannel += sampleLengthPerChannel;
592592

593-
// To do: decide if the "Shuffle" and "Write to acquisition buffers" regions should move inside the `if (runMode) { }`
594593
// Shuffle
595594
switch (currentHardwareConfig.Acquisition.AdcChannelMode)
596595
{
@@ -607,7 +606,7 @@ private static unsafe void Loop(
607606
if (!optimisationWarning)
608607
{
609608
optimisationWarning = true;
610-
logger.LogWarning("Unoptimised ShuffleI16.TwoChannels");
609+
logger.LogWarning("Unoptimised ShuffleI16.TwoChannels, missing Ssse3 & Arm64 paths");
611610
}
612611
ShuffleI16.TwoChannels(input: preShuffleMemory.DataSpanI16, output: postShuffleMemory.DataSpanI16);
613612
break;
@@ -623,7 +622,7 @@ private static unsafe void Loop(
623622
if (!optimisationWarning)
624623
{
625624
optimisationWarning = true;
626-
logger.LogWarning("Unoptimised ShuffleI16.FourChannels");
625+
logger.LogWarning("Unoptimised ShuffleI16.FourChannels, missing Ssse3 & Arm64 paths");
627626
}
628627
ShuffleI16.FourChannels(input: preShuffleMemory.DataSpanI16, output: postShuffleMemory.DataSpanI16);
629628
break;
Lines changed: 74 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,74 @@
1+
using System;
2+
using System.Runtime.InteropServices;
3+
using Xunit;
4+
5+
namespace TS.NET.Tests;
6+
7+
public class ShuffleI16Tests
8+
{
9+
[Fact]
10+
public unsafe void ShuffleI16_FourChannels_Samples128()
11+
{
12+
const int length = 128;
13+
var inputP = NativeMemory.AlignedAlloc(length * sizeof(short), 32);
14+
var input = new Span<short>((short*)inputP, length);
15+
var outputP = NativeMemory.AlignedAlloc(length * sizeof(short), 32);
16+
var output = new Span<short>((short*)outputP, length);
17+
18+
for (int i = 0; i < length; i += 4)
19+
{
20+
input[i] = 1;
21+
input[i + 1] = 2;
22+
input[i + 2] = 3;
23+
input[i + 3] = 4;
24+
}
25+
26+
ShuffleI16.FourChannels(input, output);
27+
28+
Span<short> expectedOutput = new short[length];
29+
var runLength = length / 4;
30+
expectedOutput.Slice(runLength * 0, runLength).Fill(1);
31+
expectedOutput.Slice(runLength * 1, runLength).Fill(2);
32+
expectedOutput.Slice(runLength * 2, runLength).Fill(3);
33+
expectedOutput.Slice(runLength * 3, runLength).Fill(4);
34+
35+
for (int i = 0; i < length; i++)
36+
{
37+
Assert.Equal(expectedOutput[i], output[i]);
38+
}
39+
40+
NativeMemory.AlignedFree(inputP);
41+
NativeMemory.AlignedFree(outputP);
42+
}
43+
44+
[Fact]
45+
public unsafe void ShuffleI16_TwoChannels_Samples128()
46+
{
47+
const int length = 128;
48+
var inputP = NativeMemory.AlignedAlloc(length * sizeof(short), 32);
49+
var input = new Span<short>((short*)inputP, length);
50+
var outputP = NativeMemory.AlignedAlloc(length * sizeof(short), 32);
51+
var output = new Span<short>((short*)outputP, length);
52+
53+
for (int i = 0; i < length; i += 2)
54+
{
55+
input[i] = 1;
56+
input[i + 1] = 2;
57+
}
58+
59+
ShuffleI16.TwoChannels(input, output);
60+
61+
Span<short> expectedOutput = new short[length];
62+
var runLength = length / 2;
63+
expectedOutput.Slice(runLength * 0, runLength).Fill(1);
64+
expectedOutput.Slice(runLength * 1, runLength).Fill(2);
65+
66+
for (int i = 0; i < length; i++)
67+
{
68+
Assert.Equal(expectedOutput[i], output[i]);
69+
}
70+
71+
NativeMemory.AlignedFree(inputP);
72+
NativeMemory.AlignedFree(outputP);
73+
}
74+
}

source/TS.NET/Processing/ShuffleI16.cs

Lines changed: 136 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,6 @@
1+
using System.Runtime.Intrinsics;
2+
using System.Runtime.Intrinsics.X86;
3+
14
namespace TS.NET;
25

36
public static class ShuffleI16
@@ -9,29 +12,87 @@ public static void FourChannels(ReadOnlySpan<short> input, Span<short> output)
912

1013
int channelBlockSize = output.Length / 4;
1114

12-
var processingLength = 4;
13-
if (input.Length % processingLength != 0)
14-
throw new ArgumentException($"Input length must be multiple of {processingLength}");
15+
if (Avx2.IsSupported) // Const after JIT/AOT
16+
{
17+
int processingLength = Vector256<short>.Count * 4;
18+
if (input.Length % processingLength != 0)
19+
throw new ArgumentException($"Input length must be multiple of {processingLength}");
1520

16-
int ch2Offset = channelBlockSize;
17-
int ch3Offset = channelBlockSize * 2;
18-
int ch4Offset = channelBlockSize * 3;
19-
unsafe
21+
int ch2Offset = channelBlockSize;
22+
int ch3Offset = channelBlockSize * 2;
23+
int ch4Offset = channelBlockSize * 3;
24+
Vector256<sbyte> shuffleMask = Vector256.Create(
25+
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, // 128-bit lane
26+
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); // 128-bit lane
27+
Vector256<int> permuteMask = Vector256.Create(0, 4, 2, 6, 1, 5, 3, 7);
28+
unsafe
29+
{
30+
fixed (short* inputP = input)
31+
fixed (short* outputP = output)
32+
{
33+
short* inputPtr = inputP;
34+
short* outputPtr = outputP;
35+
short* finishPtr = inputP + input.Length;
36+
while (inputPtr < finishPtr)
37+
{
38+
var loaded1 = Vector256.LoadAlignedNonTemporal(inputPtr);
39+
var loaded2 = Vector256.LoadAlignedNonTemporal(inputPtr + Vector256<short>.Count);
40+
var loaded3 = Vector256.LoadAlignedNonTemporal(inputPtr + Vector256<short>.Count * 2);
41+
var loaded4 = Vector256.LoadAlignedNonTemporal(inputPtr + Vector256<short>.Count * 3);
42+
var shuffle1 = Avx2.Shuffle(loaded1.AsSByte(), shuffleMask).AsInt32();
43+
var shuffle2 = Avx2.Shuffle(loaded2.AsSByte(), shuffleMask).AsInt32();
44+
var shuffle3 = Avx2.Shuffle(loaded3.AsSByte(), shuffleMask).AsInt32();
45+
var shuffle4 = Avx2.Shuffle(loaded4.AsSByte(), shuffleMask).AsInt32();
46+
var unpackLow1 = Avx2.UnpackLow(shuffle1, shuffle2);
47+
var unpackHigh1 = Avx2.UnpackHigh(shuffle1, shuffle2);
48+
var unpackLow2 = Avx2.UnpackLow(shuffle3, shuffle4);
49+
var unpackHigh2 = Avx2.UnpackHigh(shuffle3, shuffle4);
50+
var unpackLow3 = Avx2.UnpackLow(unpackLow1, unpackLow2);
51+
var unpackHigh3 = Avx2.UnpackHigh(unpackLow1, unpackLow2);
52+
var unpackLow4 = Avx2.UnpackLow(unpackHigh1, unpackHigh2);
53+
var unpackHigh4 = Avx2.UnpackHigh(unpackHigh1, unpackHigh2);
54+
var channel1 = Avx2.PermuteVar8x32(unpackLow3, permuteMask).AsInt16();
55+
var channel2 = Avx2.PermuteVar8x32(unpackHigh3, permuteMask).AsInt16();
56+
var channel3 = Avx2.PermuteVar8x32(unpackLow4, permuteMask).AsInt16();
57+
var channel4 = Avx2.PermuteVar8x32(unpackHigh4, permuteMask).AsInt16();
58+
Vector256.StoreAlignedNonTemporal(channel1, outputPtr);
59+
Vector256.StoreAlignedNonTemporal(channel2, outputPtr + ch2Offset);
60+
Vector256.StoreAlignedNonTemporal(channel3, outputPtr + ch3Offset);
61+
Vector256.StoreAlignedNonTemporal(channel4, outputPtr + ch4Offset);
62+
inputPtr += processingLength;
63+
outputPtr += Vector256<short>.Count;
64+
}
65+
}
66+
Sse2.MemoryFence();
67+
}
68+
}
69+
else
2070
{
21-
fixed (short* inputP = input)
22-
fixed (short* outputP = output)
71+
var processingLength = 4;
72+
if (input.Length % processingLength != 0)
73+
throw new ArgumentException($"Input length must be multiple of {processingLength}");
74+
75+
int ch2Offset = channelBlockSize;
76+
int ch3Offset = channelBlockSize * 2;
77+
int ch4Offset = channelBlockSize * 3;
78+
79+
unsafe
2380
{
24-
short* inputPtr = inputP;
25-
short* outputPtr = outputP;
26-
short* finishPtr = inputP + input.Length;
27-
while (inputPtr < finishPtr)
81+
fixed (short* inputP = input)
82+
fixed (short* outputP = output)
2883
{
29-
outputPtr[0] = inputPtr[0];
30-
outputPtr[0 + channelBlockSize] = inputPtr[1];
31-
outputPtr[0 + channelBlockSize] = inputPtr[2];
32-
outputPtr[0 + channelBlockSize] = inputPtr[3];
33-
inputPtr += processingLength;
34-
outputPtr += 1;
84+
short* inputPtr = inputP;
85+
short* outputPtr = outputP;
86+
short* finishPtr = inputP + input.Length;
87+
while (inputPtr < finishPtr)
88+
{
89+
outputPtr[0] = inputPtr[0];
90+
outputPtr[0 + ch2Offset] = inputPtr[1];
91+
outputPtr[0 + ch3Offset] = inputPtr[2];
92+
outputPtr[0 + ch4Offset] = inputPtr[3];
93+
inputPtr += processingLength;
94+
outputPtr += 1;
95+
}
3596
}
3697
}
3798
}
@@ -44,25 +105,66 @@ public static void TwoChannels(ReadOnlySpan<short> input, Span<short> output)
44105

45106
int channelBlockSize = output.Length / 2;
46107

47-
var processingLength = 2;
48-
if (input.Length % processingLength != 0)
49-
throw new ArgumentException($"Input length must be multiple of {processingLength}");
108+
if (Avx2.IsSupported) // Const after JIT/AOT
109+
{
110+
var processingLength = Vector256<short>.Count * 2;
111+
if (input.Length % processingLength != 0)
112+
throw new ArgumentException($"Input length must be multiple of {processingLength}");
50113

51-
int ch2Offset = channelBlockSize;
52-
unsafe
114+
int ch2Offset = channelBlockSize;
115+
Vector256<sbyte> shuffleMask = Vector256.Create(
116+
0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, // 128-bit lane
117+
0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); // 128-bit lane
118+
Vector256<int> permuteMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7);
119+
unsafe
120+
{
121+
fixed (short* inputP = input)
122+
fixed (short* outputP = output)
123+
{
124+
short* inputPtr = inputP;
125+
short* outputPtr = outputP;
126+
short* finishPtr = inputP + input.Length;
127+
while (inputPtr < finishPtr)
128+
{
129+
var loaded1 = Vector256.LoadAlignedNonTemporal(inputPtr);
130+
var loaded2 = Vector256.LoadAlignedNonTemporal(inputPtr + Vector256<short>.Count);
131+
var shuffle1 = Avx2.Shuffle(loaded1.AsSByte(), shuffleMask);
132+
var shuffle2 = Avx2.Shuffle(loaded2.AsSByte(), shuffleMask);
133+
var permuted1 = Avx2.PermuteVar8x32(shuffle1.AsInt32(), permuteMask);
134+
var permuted2 = Avx2.PermuteVar8x32(shuffle2.AsInt32(), permuteMask);
135+
var channel1 = Avx2.Permute2x128(permuted1, permuted2, 0x20).AsInt16();
136+
var channel2 = Avx2.Permute2x128(permuted1, permuted2, 0x31).AsInt16();
137+
Vector256.StoreAlignedNonTemporal(channel1, outputPtr);
138+
Vector256.StoreAlignedNonTemporal(channel2, outputPtr + ch2Offset);
139+
inputPtr += processingLength;
140+
outputPtr += Vector256<short>.Count;
141+
}
142+
}
143+
Sse2.MemoryFence();
144+
}
145+
}
146+
else
53147
{
54-
fixed (short* inputP = input)
55-
fixed (short* outputP = output)
148+
var processingLength = 2;
149+
if (input.Length % processingLength != 0)
150+
throw new ArgumentException($"Input length must be multiple of {processingLength}");
151+
152+
int ch2Offset = channelBlockSize;
153+
unsafe
56154
{
57-
short* inputPtr = inputP;
58-
short* outputPtr = outputP;
59-
short* finishPTr = inputP + input.Length;
60-
while (inputPtr < finishPTr)
155+
fixed (short* inputP = input)
156+
fixed (short* outputP = output)
61157
{
62-
outputPtr[0] = inputPtr[0];
63-
outputPtr[0 + ch2Offset] = inputPtr[1];
64-
inputPtr += processingLength;
65-
outputPtr += 1;
158+
short* inputPtr = inputP;
159+
short* outputPtr = outputP;
160+
short* finishPTr = inputP + input.Length;
161+
while (inputPtr < finishPTr)
162+
{
163+
outputPtr[0] = inputPtr[0];
164+
outputPtr[0 + ch2Offset] = inputPtr[1];
165+
inputPtr += processingLength;
166+
outputPtr += 1;
167+
}
66168
}
67169
}
68170
}

0 commit comments

Comments
 (0)