1+ using System . Runtime . Intrinsics ;
2+ using System . Runtime . Intrinsics . X86 ;
3+
14namespace TS . NET ;
25
36public static class ShuffleI16
@@ -9,29 +12,87 @@ public static void FourChannels(ReadOnlySpan<short> input, Span<short> output)
912
// De-interleaves four-channel data. The input holds samples in the order
// ch1, ch2, ch3, ch4, ch1, ... and the output receives four consecutive channel
// blocks, each channelBlockSize elements long.
// NOTE(review): the method signature and any earlier argument checks sit above this
// span and are not visible here; the checks below are safe to keep even if duplicated.
int channelBlockSize = output.Length / 4;

int processingLength = 4;
if (input.Length % processingLength != 0)
    throw new ArgumentException($"Input length must be multiple of {processingLength}");
// The loops below write through raw pointers without bounds checks, so the output
// must be able to hold every input sample.
if (output.Length < input.Length)
    throw new ArgumentException("Output length must be at least the input length");

int ch2Offset = channelBlockSize;
int ch3Offset = channelBlockSize * 2;
int ch4Offset = channelBlockSize * 3;

unsafe
{
    fixed (short* inputP = input)
    fixed (short* outputP = output)
    {
        short* inputPtr = inputP;
        short* outputPtr = outputP;
        short* finishPtr = inputP + input.Length;

        if (Avx2.IsSupported)    // Const after JIT/AOT
        {
            // One vector iteration consumes four 256-bit vectors (16 samples per channel).
            int vectorLength = Vector256<short>.Count * 4;
            short* vectorFinishPtr = inputP + (input.Length - (input.Length % vectorLength));

            // The aligned non-temporal load/store intrinsics fault on addresses that are not
            // 32-byte aligned. Pinning a span gives no alignment guarantee, so verify the base
            // pointers and the per-channel offsets before taking the vector path.
            ulong alignmentMask = 31;
            bool aligned = ((ulong)inputP & alignmentMask) == 0
                && ((ulong)outputP & alignmentMask) == 0
                && channelBlockSize % Vector256<short>.Count == 0;

            if (aligned && inputPtr < vectorFinishPtr)
            {
                // Within each 128-bit lane, pair up the two samples of each channel so that
                // every 32-bit element holds two consecutive samples of one channel.
                Vector256<sbyte> shuffleMask = Vector256.Create(
                    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,     // 128-bit lane
                    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);    // 128-bit lane
                // Restores sample order after the two per-lane unpack stages.
                Vector256<int> permuteMask = Vector256.Create(0, 4, 2, 6, 1, 5, 3, 7);

                while (inputPtr < vectorFinishPtr)
                {
                    var loaded1 = Vector256.LoadAlignedNonTemporal(inputPtr);
                    var loaded2 = Vector256.LoadAlignedNonTemporal(inputPtr + Vector256<short>.Count);
                    var loaded3 = Vector256.LoadAlignedNonTemporal(inputPtr + Vector256<short>.Count * 2);
                    var loaded4 = Vector256.LoadAlignedNonTemporal(inputPtr + Vector256<short>.Count * 3);
                    var shuffle1 = Avx2.Shuffle(loaded1.AsSByte(), shuffleMask).AsInt32();
                    var shuffle2 = Avx2.Shuffle(loaded2.AsSByte(), shuffleMask).AsInt32();
                    var shuffle3 = Avx2.Shuffle(loaded3.AsSByte(), shuffleMask).AsInt32();
                    var shuffle4 = Avx2.Shuffle(loaded4.AsSByte(), shuffleMask).AsInt32();
                    // First unpack stage: low halves gather channels 1/2, high halves channels 3/4.
                    var unpackLow1 = Avx2.UnpackLow(shuffle1, shuffle2);
                    var unpackHigh1 = Avx2.UnpackHigh(shuffle1, shuffle2);
                    var unpackLow2 = Avx2.UnpackLow(shuffle3, shuffle4);
                    var unpackHigh2 = Avx2.UnpackHigh(shuffle3, shuffle4);
                    // Second unpack stage: each result now holds a single channel (out of order).
                    var unpackLow3 = Avx2.UnpackLow(unpackLow1, unpackLow2);
                    var unpackHigh3 = Avx2.UnpackHigh(unpackLow1, unpackLow2);
                    var unpackLow4 = Avx2.UnpackLow(unpackHigh1, unpackHigh2);
                    var unpackHigh4 = Avx2.UnpackHigh(unpackHigh1, unpackHigh2);
                    var channel1 = Avx2.PermuteVar8x32(unpackLow3, permuteMask).AsInt16();
                    var channel2 = Avx2.PermuteVar8x32(unpackHigh3, permuteMask).AsInt16();
                    var channel3 = Avx2.PermuteVar8x32(unpackLow4, permuteMask).AsInt16();
                    var channel4 = Avx2.PermuteVar8x32(unpackHigh4, permuteMask).AsInt16();
                    Vector256.StoreAlignedNonTemporal(channel1, outputPtr);
                    Vector256.StoreAlignedNonTemporal(channel2, outputPtr + ch2Offset);
                    Vector256.StoreAlignedNonTemporal(channel3, outputPtr + ch3Offset);
                    Vector256.StoreAlignedNonTemporal(channel4, outputPtr + ch4Offset);
                    inputPtr += vectorLength;
                    outputPtr += Vector256<short>.Count;
                }

                // Fence issued after the non-temporal stores, before any further writes.
                Sse2.MemoryFence();
            }
        }

        // Scalar path: handles the whole input when the vector path was not taken,
        // otherwise only the tail that does not fill a complete vector iteration.
        while (inputPtr < finishPtr)
        {
            outputPtr[0] = inputPtr[0];
            outputPtr[0 + ch2Offset] = inputPtr[1];
            outputPtr[0 + ch3Offset] = inputPtr[2];
            outputPtr[0 + ch4Offset] = inputPtr[3];
            inputPtr += processingLength;
            outputPtr += 1;
        }
    }
}
@@ -44,25 +105,66 @@ public static void TwoChannels(ReadOnlySpan<short> input, Span<short> output)
44105
// De-interleaves two-channel data. The input holds samples in the order
// ch1, ch2, ch1, ch2, ... and the output receives two consecutive channel blocks,
// each channelBlockSize elements long.
// NOTE(review): the method signature and any earlier argument checks sit above this
// span and are not visible here; the checks below are safe to keep even if duplicated.
int channelBlockSize = output.Length / 2;

int processingLength = 2;
if (input.Length % processingLength != 0)
    throw new ArgumentException($"Input length must be multiple of {processingLength}");
// The loops below write through raw pointers without bounds checks, so the output
// must be able to hold every input sample.
if (output.Length < input.Length)
    throw new ArgumentException("Output length must be at least the input length");

int ch2Offset = channelBlockSize;

unsafe
{
    fixed (short* inputP = input)
    fixed (short* outputP = output)
    {
        short* inputPtr = inputP;
        short* outputPtr = outputP;
        short* finishPtr = inputP + input.Length;

        if (Avx2.IsSupported)    // Const after JIT/AOT
        {
            // One vector iteration consumes two 256-bit vectors (16 samples per channel).
            int vectorLength = Vector256<short>.Count * 2;
            short* vectorFinishPtr = inputP + (input.Length - (input.Length % vectorLength));

            // The aligned non-temporal load/store intrinsics fault on addresses that are not
            // 32-byte aligned. Pinning a span gives no alignment guarantee, so verify the base
            // pointers and the second channel's offset before taking the vector path.
            ulong alignmentMask = 31;
            bool aligned = ((ulong)inputP & alignmentMask) == 0
                && ((ulong)outputP & alignmentMask) == 0
                && channelBlockSize % Vector256<short>.Count == 0;

            if (aligned && inputPtr < vectorFinishPtr)
            {
                // Within each 128-bit lane, move the four channel-1 samples to the low
                // 64 bits and the four channel-2 samples to the high 64 bits.
                Vector256<sbyte> shuffleMask = Vector256.Create(
                    0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,     // 128-bit lane
                    0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);    // 128-bit lane
                // Gathers channel 1 into the low 128 bits and channel 2 into the high 128 bits.
                Vector256<int> permuteMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7);

                while (inputPtr < vectorFinishPtr)
                {
                    var loaded1 = Vector256.LoadAlignedNonTemporal(inputPtr);
                    var loaded2 = Vector256.LoadAlignedNonTemporal(inputPtr + Vector256<short>.Count);
                    var shuffle1 = Avx2.Shuffle(loaded1.AsSByte(), shuffleMask);
                    var shuffle2 = Avx2.Shuffle(loaded2.AsSByte(), shuffleMask);
                    var permuted1 = Avx2.PermuteVar8x32(shuffle1.AsInt32(), permuteMask);
                    var permuted2 = Avx2.PermuteVar8x32(shuffle2.AsInt32(), permuteMask);
                    // 0x20 joins the low halves (channel 1); 0x31 joins the high halves (channel 2).
                    var channel1 = Avx2.Permute2x128(permuted1, permuted2, 0x20).AsInt16();
                    var channel2 = Avx2.Permute2x128(permuted1, permuted2, 0x31).AsInt16();
                    Vector256.StoreAlignedNonTemporal(channel1, outputPtr);
                    Vector256.StoreAlignedNonTemporal(channel2, outputPtr + ch2Offset);
                    inputPtr += vectorLength;
                    outputPtr += Vector256<short>.Count;
                }

                // Fence issued after the non-temporal stores, before any further writes.
                Sse2.MemoryFence();
            }
        }

        // Scalar path: handles the whole input when the vector path was not taken,
        // otherwise only the tail that does not fill a complete vector iteration.
        while (inputPtr < finishPtr)
        {
            outputPtr[0] = inputPtr[0];
            outputPtr[0 + ch2Offset] = inputPtr[1];
            inputPtr += processingLength;
            outputPtr += 1;
        }
    }
}
0 commit comments