Skip to content

Commit da6d56b

Browse files
Merge pull request #3051 from SixLabors/js/webp-icc
Add ICC profile conversion support to WEBP
2 parents ce7185e + 08dd24f commit da6d56b

9 files changed

Lines changed: 331 additions & 7 deletions

File tree

src/ImageSharp/ColorProfiles/ColorProfileConverterExtensionsPixelCompatible.cs

Lines changed: 121 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
using System.Numerics;
66
using System.Runtime.CompilerServices;
77
using System.Runtime.InteropServices;
8+
using System.Runtime.Intrinsics;
9+
using System.Runtime.Intrinsics.X86;
810
using SixLabors.ImageSharp.PixelFormats;
911
using SixLabors.ImageSharp.Processing;
1012

@@ -60,8 +62,126 @@ public static void Convert<TPixel>(this ColorProfileConverter converter, Image<T
6062
converter.ConvertUsingIccProfile<Rgb, Rgb>(rgbSpan, rgbSpan);
6163

6264
// Copy the converted Rgb pixels back to the row as TPixel.
65+
// Important: Preserve alpha from the existing row Vector4 values.
66+
// We merge RGB from rgbSpan into row, leaving W untouched.
67+
ref float srcRgb = ref Unsafe.As<Rgb, float>(ref MemoryMarshal.GetReference(rgbSpan));
68+
ref float dstRow = ref Unsafe.As<Vector4, float>(ref MemoryMarshal.GetReference(row));
69+
70+
int count = rgbSpan.Length;
71+
int i = 0;
72+
73+
// Loads a Vector512<float> (16 consecutive floats) starting at 'f'.
// The load goes through Unsafe.ReadUnaligned, so no alignment is required and no bounds
// check is performed; the caller must guarantee that 16 floats are readable from 'f'.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
74+
static Vector512<float> ReadVector512(ref float f)
75+
{
76+
ref byte b = ref Unsafe.As<float, byte>(ref f);
77+
return Unsafe.ReadUnaligned<Vector512<float>>(ref b);
78+
}
79+
80+
// Stores the Vector512<float> 'v' (16 consecutive floats) starting at 'f'.
// The store goes through Unsafe.WriteUnaligned, so no alignment is required and no bounds
// check is performed; the caller must guarantee that 16 floats are writable from 'f'.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
81+
static void WriteVector512(ref float f, Vector512<float> v)
82+
{
83+
ref byte b = ref Unsafe.As<float, byte>(ref f);
84+
Unsafe.WriteUnaligned(ref b, v);
85+
}
86+
87+
// Loads a Vector256<float> (8 consecutive floats) starting at 'f'.
// The load goes through Unsafe.ReadUnaligned, so no alignment is required and no bounds
// check is performed; the caller must guarantee that 8 floats are readable from 'f'.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
88+
static Vector256<float> ReadVector256(ref float f)
89+
{
90+
ref byte b = ref Unsafe.As<float, byte>(ref f);
91+
return Unsafe.ReadUnaligned<Vector256<float>>(ref b);
92+
}
93+
94+
// Stores the Vector256<float> 'v' (8 consecutive floats) starting at 'f'.
// The store goes through Unsafe.WriteUnaligned, so no alignment is required and no bounds
// check is performed; the caller must guarantee that 8 floats are writable from 'f'.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
95+
static void WriteVector256(ref float f, Vector256<float> v)
96+
{
97+
ref byte b = ref Unsafe.As<float, byte>(ref f);
98+
Unsafe.WriteUnaligned(ref b, v);
99+
}
100+
101+
if (Avx512F.IsSupported)
102+
{
103+
// 4 pixels per iteration.
104+
//
105+
// Source layout (Rgb float stream, 12 floats):
106+
// [r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3]
107+
//
108+
// Destination layout (row Vector4 float stream, 16 floats):
109+
// [r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3]
110+
//
111+
// We use an overlapped load (16 floats) from the 3-float stride source.
112+
// The permute selects the RGB we need and inserts placeholders for alpha lanes.
113+
//
114+
// Then we blend RGB lanes into the existing destination, preserving alpha lanes.
115+
Vector512<int> rgbPerm = Vector512.Create(0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 9, 10, 11, 0);
116+
117+
// BlendVariable selects from the second operand where the sign bit of the mask lane is set.
118+
// We want to overwrite lanes 0,1,2 then 4,5,6 then 8,9,10 then 12,13,14, and preserve lanes 3,7,11,15 (alpha).
119+
Vector512<float> rgbSelect = Vector512.Create(-0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F);
120+
121+
int quads = count >> 2;
122+
int simdQuads = quads - 1; // Leave the last quad for the scalar tail to avoid the final overlapped load reading past the end.
123+
124+
for (int q = 0; q < simdQuads; q++)
125+
{
126+
Vector512<float> dst = ReadVector512(ref dstRow);
127+
Vector512<float> src = ReadVector512(ref srcRgb);
128+
129+
Vector512<float> rgbx = Avx512F.PermuteVar16x32(src, rgbPerm);
130+
Vector512<float> merged = Avx512F.BlendVariable(dst, rgbx, rgbSelect);
131+
132+
WriteVector512(ref dstRow, merged);
133+
134+
// Advance input by 4 pixels (4 * 3 = 12 floats)
135+
srcRgb = ref Unsafe.Add(ref srcRgb, 12);
136+
137+
// Advance output by 4 pixels (4 * 4 = 16 floats)
138+
dstRow = ref Unsafe.Add(ref dstRow, 16);
139+
140+
i += 4;
141+
}
142+
}
143+
else if (Avx2.IsSupported)
144+
{
145+
// 2 pixels per iteration.
146+
//
147+
// Same idea as AVX-512, but on 256-bit vectors.
148+
// We permute packed RGB into rgbx layout and blend into the existing destination,
149+
// preserving alpha lanes.
150+
Vector256<int> rgbPerm = Vector256.Create(0, 1, 2, 0, 3, 4, 5, 0);
151+
152+
Vector256<float> rgbSelect = Vector256.Create(-0F, -0F, -0F, 0F, -0F, -0F, -0F, 0F);
153+
154+
int pairs = count >> 1;
155+
int simdPairs = pairs - 1; // Leave the last pair for the scalar tail to avoid the final overlapped load reading past the end.
156+
157+
for (int p = 0; p < simdPairs; p++)
158+
{
159+
Vector256<float> dst = ReadVector256(ref dstRow);
160+
Vector256<float> src = ReadVector256(ref srcRgb);
161+
162+
Vector256<float> rgbx = Avx2.PermuteVar8x32(src, rgbPerm);
163+
Vector256<float> merged = Avx.BlendVariable(dst, rgbx, rgbSelect);
164+
165+
WriteVector256(ref dstRow, merged);
166+
167+
// Advance input by 2 pixels (2 * 3 = 6 floats)
168+
srcRgb = ref Unsafe.Add(ref srcRgb, 6);
169+
170+
// Advance output by 2 pixels (2 * 4 = 8 floats)
171+
dstRow = ref Unsafe.Add(ref dstRow, 8);
172+
173+
i += 2;
174+
}
175+
}
176+
177+
// Scalar tail.
178+
// Handles:
179+
// - the last skipped SIMD block (quad or pair)
180+
// - any remainder, and every pixel when no SIMD loop ran (no AVX-512F/AVX2 support, or too few pixels)
181+
//
182+
// Preserve alpha by writing Vector3 into the Vector4 storage.
63183
ref Vector4 rowRef = ref MemoryMarshal.GetReference(row);
64-
for (int i = 0; i < rgbSpan.Length; i++)
184+
for (; i < count; i++)
65185
{
66186
Vector3 rgb = rgbSpan[i].AsVector3Unsafe();
67187
Unsafe.As<Vector4, Vector3>(ref Unsafe.Add(ref rowRef, (uint)i)) = rgb;

src/ImageSharp/ColorProfiles/Rgb.cs

Lines changed: 178 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
using System.Numerics;
55
using System.Runtime.CompilerServices;
66
using System.Runtime.InteropServices;
7+
using System.Runtime.Intrinsics;
8+
using System.Runtime.Intrinsics.X86;
79
using SixLabors.ImageSharp.ColorProfiles.WorkingSpaces;
810

911
namespace SixLabors.ImageSharp.ColorProfiles;
@@ -105,10 +107,87 @@ public static void ToScaledVector4(ReadOnlySpan<Rgb> source, Span<Vector4> desti
105107
{
106108
Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination));
107109

108-
// TODO: Optimize via SIMD
109-
for (int i = 0; i < source.Length; i++)
110+
int length = source.Length;
111+
if (length == 0)
110112
{
111-
destination[i] = source[i].ToScaledVector4();
113+
return;
114+
}
115+
116+
ref Rgb srcRgb = ref MemoryMarshal.GetReference(source);
117+
ref Vector4 dstV4 = ref MemoryMarshal.GetReference(destination);
118+
119+
// Float streams:
120+
// src: r0 g0 b0 r1 g1 b1 ...
121+
// dst: r0 g0 b0 a0 r1 g1 b1 a1 ...
122+
ref float src = ref Unsafe.As<Rgb, float>(ref srcRgb);
123+
ref float dst = ref Unsafe.As<Vector4, float>(ref dstV4);
124+
125+
int i = 0;
126+
127+
if (Avx512F.IsSupported)
128+
{
129+
// 4 pixels per iteration. Using overlapped 16-float loads.
130+
Vector512<int> perm = Vector512.Create(0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 9, 10, 11, 0);
131+
Vector512<float> ones = Vector512.Create(1F);
132+
133+
// BlendVariable selects from 'ones' where the sign-bit of mask lane is set.
134+
// Using -0f sets only the sign bit, producing an efficient "select lane" mask.
135+
Vector512<float> alphaSelect = Vector512.Create(0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F, 0F, 0F, 0F, -0F);
136+
137+
int quads = length >> 2;
138+
139+
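// Leave the last quad (4 pixels) for the scalar tail so the final 16-float load, which covers 12 floats of this quad plus 4 of the next, never reads past the end of the source.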
140+
int simdQuads = quads - 1;
141+
142+
for (int q = 0; q < simdQuads; q++)
143+
{
144+
Vector512<float> v = ReadVector512(ref src);
145+
Vector512<float> rgbx = Avx512F.PermuteVar16x32(v, perm);
146+
Vector512<float> rgba = Avx512F.BlendVariable(rgbx, ones, alphaSelect);
147+
148+
WriteVector512(ref dst, rgba);
149+
150+
src = ref Unsafe.Add(ref src, 12);
151+
dst = ref Unsafe.Add(ref dst, 16);
152+
153+
i += 4;
154+
}
155+
}
156+
else if (Avx2.IsSupported)
157+
{
158+
// 2 pixels per iteration. Using overlapped 8-float loads.
159+
Vector256<int> perm = Vector256.Create(0, 1, 2, 0, 3, 4, 5, 0);
160+
161+
Vector256<float> ones = Vector256.Create(1F);
162+
163+
// vblendps mask: bit i selects lane i from 'ones' when set.
164+
// We want lanes 3 and 7 -> 0b10001000 = 0x88.
165+
const byte alphaMask = 0x88;
166+
167+
int pairs = length >> 1;
168+
169+
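// Leave the last pair (2 pixels) for the scalar tail so the final 8-float load, which covers 6 floats of this pair plus 2 of the next, never reads past the end of the source.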
170+
int simdPairs = pairs - 1;
171+
172+
for (int p = 0; p < simdPairs; p++)
173+
{
174+
Vector256<float> v = ReadVector256(ref src);
175+
Vector256<float> rgbx = Avx2.PermuteVar8x32(v, perm);
176+
Vector256<float> rgba = Avx.Blend(rgbx, ones, alphaMask);
177+
178+
WriteVector256(ref dst, rgba);
179+
180+
src = ref Unsafe.Add(ref src, 6);
181+
dst = ref Unsafe.Add(ref dst, 8);
182+
183+
i += 2;
184+
}
185+
}
186+
187+
// Tail (and non-AVX paths)
188+
for (; i < length; i++)
189+
{
190+
Unsafe.Add(ref dstV4, i) = Unsafe.Add(ref srcRgb, i).ToScaledVector4();
112191
}
113192
}
114193

@@ -117,10 +196,75 @@ public static void FromScaledVector4(ReadOnlySpan<Vector4> source, Span<Rgb> des
117196
{
118197
Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination));
119198

120-
// TODO: Optimize via SIMD
121-
for (int i = 0; i < source.Length; i++)
199+
int length = source.Length;
200+
if (length == 0)
122201
{
123-
destination[i] = FromScaledVector4(source[i]);
202+
return;
203+
}
204+
205+
ref Vector4 srcV4 = ref MemoryMarshal.GetReference(source);
206+
ref Rgb dstRgb = ref MemoryMarshal.GetReference(destination);
207+
208+
// Float streams:
209+
// src: r0 g0 b0 a0 r1 g1 b1 a1 ...
210+
// dst: r0 g0 b0 r1 g1 b1 ...
211+
ref float src = ref Unsafe.As<Vector4, float>(ref srcV4);
212+
ref float dst = ref Unsafe.As<Rgb, float>(ref dstRgb);
213+
214+
int i = 0;
215+
216+
if (Avx512F.IsSupported)
217+
{
218+
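// 4 pixels per iteration, using overlapped 16-float stores: lanes 0-11 receive the packed RGB values and lanes 12-15 receive the four alpha values, which the next store or the scalar tail overwrites.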
219+
Vector512<int> idx = Vector512.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15);
220+
221+
// Number of 4-pixel groups in the input.
222+
int quads = length >> 2;
223+
224+
// Leave the last quad (4 pixels) for the scalar tail so the 4 spill-over floats of the final 16-float store always land inside the destination.
225+
int simdQuads = quads - 1;
226+
227+
for (int q = 0; q < simdQuads; q++)
228+
{
229+
Vector512<float> v = ReadVector512(ref src);
230+
Vector512<float> packed = Avx512F.PermuteVar16x32(v, idx);
231+
232+
WriteVector512(ref dst, packed);
233+
234+
src = ref Unsafe.Add(ref src, 16);
235+
dst = ref Unsafe.Add(ref dst, 12);
236+
i += 4;
237+
}
238+
}
239+
else if (Avx2.IsSupported)
240+
{
241+
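// 2 pixels per iteration, using overlapped 8-float stores: lanes 0-5 receive the packed RGB values and lanes 6-7 receive copies of lane 0, which the next store or the scalar tail overwrites.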
242+
Vector256<int> idx = Vector256.Create(0, 1, 2, 4, 5, 6, 0, 0);
243+
244+
int pairs = length >> 1;
245+
246+
// Leave the last pair (2 pixels) for the scalar tail so the 2 spill-over floats of the final 8-float store always land inside the destination.
247+
int simdPairs = pairs - 1;
248+
249+
int pairIndex = 0;
250+
for (; pairIndex < simdPairs; pairIndex++)
251+
{
252+
Vector256<float> v = ReadVector256(ref src);
253+
Vector256<float> packed = Avx2.PermuteVar8x32(v, idx);
254+
255+
WriteVector256(ref dst, packed);
256+
257+
src = ref Unsafe.Add(ref src, 8);
258+
dst = ref Unsafe.Add(ref dst, 6);
259+
i += 2;
260+
}
261+
}
262+
263+
// Tail (and non-AVX paths)
264+
for (; i < length; i++)
265+
{
266+
Vector4 v = Unsafe.Add(ref srcV4, i);
267+
Unsafe.Add(ref dstRgb, i) = FromScaledVector4(v);
124268
}
125269
}
126270

@@ -288,4 +432,32 @@ private static Matrix4x4 GetRgbToCieXyzMatrix(RgbWorkingSpace workingSpace)
288432
M44 = 1F
289433
};
290434
}
435+
436+
/// <summary>
/// Loads 16 consecutive floats starting at <paramref name="src"/> as a <see cref="Vector512{T}"/>.
/// The load is unaligned and not bounds checked.
/// </summary>
/// <param name="src">Reference to the first float to read; 16 floats must be readable from here.</param>
/// <returns>The loaded vector.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
437+
private static Vector512<float> ReadVector512(ref float src)
438+
{
439+
ref byte b = ref Unsafe.As<float, byte>(ref src);
440+
return Unsafe.ReadUnaligned<Vector512<float>>(ref b);
441+
}
442+
443+
/// <summary>
/// Loads 8 consecutive floats starting at <paramref name="src"/> as a <see cref="Vector256{T}"/>.
/// The load is unaligned and not bounds checked.
/// </summary>
/// <param name="src">Reference to the first float to read; 8 floats must be readable from here.</param>
/// <returns>The loaded vector.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
444+
private static Vector256<float> ReadVector256(ref float src)
445+
{
446+
ref byte b = ref Unsafe.As<float, byte>(ref src);
447+
return Unsafe.ReadUnaligned<Vector256<float>>(ref b);
448+
}
449+
450+
/// <summary>
/// Stores <paramref name="value"/> as 16 consecutive floats starting at <paramref name="dst"/>.
/// The store is unaligned and not bounds checked.
/// </summary>
/// <param name="dst">Reference to the first float to write; 16 floats must be writable from here.</param>
/// <param name="value">The vector to store.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
451+
private static void WriteVector512(ref float dst, Vector512<float> value)
452+
{
453+
ref byte b = ref Unsafe.As<float, byte>(ref dst);
454+
Unsafe.WriteUnaligned(ref b, value);
455+
}
456+
457+
/// <summary>
/// Stores <paramref name="value"/> as 8 consecutive floats starting at <paramref name="dst"/>.
/// The store is unaligned and not bounds checked.
/// </summary>
/// <param name="dst">Reference to the first float to write; 8 floats must be writable from here.</param>
/// <param name="value">The vector to store.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
458+
private static void WriteVector256(ref float dst, Vector256<float> value)
459+
{
460+
ref byte b = ref Unsafe.As<float, byte>(ref dst);
461+
Unsafe.WriteUnaligned(ref b, value);
462+
}
291463
}

src/ImageSharp/Formats/Webp/WebpDecoderCore.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ protected override Image<TPixel> Decode<TPixel>(BufferedReadStream stream, Cance
122122
this.ParseOptionalChunks(stream, metadata, this.webImageInfo.Features, buffer);
123123
}
124124

125+
_ = this.TryConvertIccProfile(image);
125126
return image;
126127
}
127128
}

tests/ImageSharp.Tests/Formats/WebP/WebpDecoderTests.cs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -608,4 +608,17 @@ public void WebpDecoder_CanDecode_Issue2906<TPixel>(TestImageProvider<TPixel> pr
608608
image.DebugSave(provider);
609609
image.CompareToOriginal(provider, ReferenceDecoder);
610610
}
611+
612+
// Decodes each listed WebP test file with ColorProfileHandling.Convert, compares the result
// against the stored reference output, and asserts that the decoded image's metadata no
// longer carries an ICC profile.
[Theory]
613+
[WithFile(Icc.Perceptual, PixelTypes.Rgba32)]
614+
[WithFile(Icc.PerceptualcLUTOnly, PixelTypes.Rgba32)]
615+
public void Decode_WhenColorProfileHandlingIsConvert_ApplyIccProfile<TPixel>(TestImageProvider<TPixel> provider)
616+
where TPixel : unmanaged, IPixel<TPixel>
617+
{
618+
using Image<TPixel> image = provider.GetImage(WebpDecoder.Instance, new DecoderOptions { ColorProfileHandling = ColorProfileHandling.Convert });
619+
620+
image.DebugSave(provider);
621+
image.CompareToReferenceOutput(provider);
622+
Assert.Null(image.Metadata.IccProfile);
623+
}
611624
}

tests/ImageSharp.Tests/TestImages.cs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -901,6 +901,12 @@ public static class Lossy
901901
public const string AlphaBlend2 = "Webp/alpha-blend-2.webp";
902902
public const string AlphaBlend3 = "Webp/alpha-blend-3.webp";
903903
public const string AlphaBlend4 = "Webp/alpha-blend-4.webp";
904+
905+
// Relative paths of the WebP test images stored under "Webp/icc-profiles".
// NOTE(review): presumably each file embeds an ICC profile for the conversion tests — confirm against the test assets.
public static class Icc
906+
{
907+
public const string Perceptual = "Webp/icc-profiles/Perceptual.webp";
908+
public const string PerceptualcLUTOnly = "Webp/icc-profiles/Perceptual-cLUT-only.webp";
909+
}
904910
}
905911

906912
public static class Tiff
Lines changed: 3 additions & 0 deletions
Loading
Lines changed: 3 additions & 0 deletions
Loading
Lines changed: 3 additions & 0 deletions
Loading

0 commit comments

Comments
 (0)