44using System . Numerics ;
55using System . Runtime . CompilerServices ;
66using System . Runtime . InteropServices ;
7+ using System . Runtime . Intrinsics ;
8+ using System . Runtime . Intrinsics . X86 ;
79using SixLabors . ImageSharp . ColorProfiles . WorkingSpaces ;
810
911namespace SixLabors . ImageSharp . ColorProfiles ;
public static void ToScaledVector4(ReadOnlySpan<Rgb> source, Span<Vector4> destination)
{
    Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination));

    int pixelCount = source.Length;
    if (pixelCount == 0)
    {
        return;
    }

    ref Rgb sourceBase = ref MemoryMarshal.GetReference(source);
    ref Vector4 destinationBase = ref MemoryMarshal.GetReference(destination);

    // View both buffers as flat float streams:
    //   source:      r0 g0 b0 r1 g1 b1 ...
    //   destination: r0 g0 b0 a0 r1 g1 b1 a1 ...
    ref float srcFloat = ref Unsafe.As<Rgb, float>(ref sourceBase);
    ref float dstFloat = ref Unsafe.As<Vector4, float>(ref destinationBase);

    int converted = 0;

    if (Avx512F.IsSupported)
    {
        // 4 pixels per iteration via overlapped 16-float loads (only 12 floats consumed each step).
        Vector512<int> perm = Vector512.Create(0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 9, 10, 11, 0);
        Vector512<float> ones = Vector512.Create(1F);

        // BlendVariable selects from 'ones' where the mask lane's sign bit is set.
        // -0F sets only the sign bit, yielding a cheap per-lane "select alpha" mask.
        Vector512<float> alphaSelect = Vector512.Create(0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F);

        // Reserve the final quad (4 pixels) for the scalar tail so the overlapped
        // load never reads past the end of the source buffer.
        int simdQuads = (pixelCount >> 2) - 1;

        for (int n = 0; n < simdQuads; n++)
        {
            Vector512<float> raw = ReadVector512(ref srcFloat);
            Vector512<float> shuffled = Avx512F.PermuteVar16x32(raw, perm);
            WriteVector512(ref dstFloat, Avx512F.BlendVariable(shuffled, ones, alphaSelect));

            srcFloat = ref Unsafe.Add(ref srcFloat, 12);
            dstFloat = ref Unsafe.Add(ref dstFloat, 16);
            converted += 4;
        }
    }
    else if (Avx2.IsSupported)
    {
        // 2 pixels per iteration via overlapped 8-float loads (only 6 floats consumed each step).
        Vector256<int> perm = Vector256.Create(0, 1, 2, 0, 3, 4, 5, 0);
        Vector256<float> ones = Vector256.Create(1F);

        // vblendps immediate: bit i selects lane i from 'ones'.
        // We want the alpha lanes 3 and 7 -> 0b1000_1000 = 0x88.
        const byte alphaMask = 0x88;

        // Reserve the final pair (2 pixels) for the scalar tail so the overlapped
        // load never reads past the end of the source buffer.
        int simdPairs = (pixelCount >> 1) - 1;

        for (int n = 0; n < simdPairs; n++)
        {
            Vector256<float> raw = ReadVector256(ref srcFloat);
            Vector256<float> shuffled = Avx2.PermuteVar8x32(raw, perm);
            WriteVector256(ref dstFloat, Avx.Blend(shuffled, ones, alphaMask));

            srcFloat = ref Unsafe.Add(ref srcFloat, 6);
            dstFloat = ref Unsafe.Add(ref dstFloat, 8);
            converted += 2;
        }
    }

    // Scalar tail; also handles the full span when neither AVX path is available.
    for (int i = converted; i < pixelCount; i++)
    {
        Unsafe.Add(ref destinationBase, i) = Unsafe.Add(ref sourceBase, i).ToScaledVector4();
    }
}
114193
public static void FromScaledVector4(ReadOnlySpan<Vector4> source, Span<Rgb> destination)
{
    Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination));

    int pixelCount = source.Length;
    if (pixelCount == 0)
    {
        return;
    }

    ref Vector4 sourceBase = ref MemoryMarshal.GetReference(source);
    ref Rgb destinationBase = ref MemoryMarshal.GetReference(destination);

    // View both buffers as flat float streams:
    //   source:      r0 g0 b0 a0 r1 g1 b1 a1 ...
    //   destination: r0 g0 b0 r1 g1 b1 ...
    ref float srcFloat = ref Unsafe.As<Vector4, float>(ref sourceBase);
    ref float dstFloat = ref Unsafe.As<Rgb, float>(ref destinationBase);

    int converted = 0;

    if (Avx512F.IsSupported)
    {
        // 4 pixels per iteration: gather the rgb triplets into the low 12 lanes
        // (alpha lanes pushed to the top 4), then do an overlapped 16-float store.
        // The 4 extra floats are overwritten by the next iteration or the tail.
        Vector512<int> idx = Vector512.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15);

        // Reserve the final quad (4 pixels) for the scalar tail so the overlapped
        // store never writes past the end of the destination buffer.
        int simdQuads = (pixelCount >> 2) - 1;

        for (int n = 0; n < simdQuads; n++)
        {
            Vector512<float> raw = ReadVector512(ref srcFloat);
            WriteVector512(ref dstFloat, Avx512F.PermuteVar16x32(raw, idx));

            srcFloat = ref Unsafe.Add(ref srcFloat, 16);
            dstFloat = ref Unsafe.Add(ref dstFloat, 12);
            converted += 4;
        }
    }
    else if (Avx2.IsSupported)
    {
        // 2 pixels per iteration with overlapped 8-float stores (6 floats consumed each step).
        Vector256<int> idx = Vector256.Create(0, 1, 2, 4, 5, 6, 0, 0);

        // Reserve the final pair (2 pixels) for the scalar tail so the overlapped
        // store never writes past the end of the destination buffer.
        int simdPairs = (pixelCount >> 1) - 1;

        for (int n = 0; n < simdPairs; n++)
        {
            Vector256<float> raw = ReadVector256(ref srcFloat);
            WriteVector256(ref dstFloat, Avx2.PermuteVar8x32(raw, idx));

            srcFloat = ref Unsafe.Add(ref srcFloat, 8);
            dstFloat = ref Unsafe.Add(ref dstFloat, 6);
            converted += 2;
        }
    }

    // Scalar tail; also handles the full span when neither AVX path is available.
    for (int i = converted; i < pixelCount; i++)
    {
        Unsafe.Add(ref destinationBase, i) = FromScaledVector4(Unsafe.Add(ref sourceBase, i));
    }
}
126270
@@ -288,4 +432,32 @@ private static Matrix4x4 GetRgbToCieXyzMatrix(RgbWorkingSpace workingSpace)
288432 M44 = 1F
289433 } ;
290434 }
435+
/// <summary>Loads 16 consecutive floats starting at <paramref name="src"/>; no alignment required.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector512<float> ReadVector512(ref float src)
    => Unsafe.ReadUnaligned<Vector512<float>>(ref Unsafe.As<float, byte>(ref src));
442+
/// <summary>Loads 8 consecutive floats starting at <paramref name="src"/>; no alignment required.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<float> ReadVector256(ref float src)
    => Unsafe.ReadUnaligned<Vector256<float>>(ref Unsafe.As<float, byte>(ref src));
449+
/// <summary>Stores 16 floats to the location of <paramref name="dst"/>; no alignment required.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void WriteVector512(ref float dst, Vector512<float> value)
    => Unsafe.WriteUnaligned(ref Unsafe.As<float, byte>(ref dst), value);
456+
/// <summary>Stores 8 floats to the location of <paramref name="dst"/>; no alignment required.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void WriteVector256(ref float dst, Vector256<float> value)
    => Unsafe.WriteUnaligned(ref Unsafe.As<float, byte>(ref dst), value);
291463}
0 commit comments