SixLabors
diff --git a/‎src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
Copy file name to clipboardExpand all lines: src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+62-62Lines changed: 62 additions & 62 deletions b/‎src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
Copy file name to clipboardExpand all lines: src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+62-62Lines changed: 62 additions & 62 deletions
diff --git a/‎src/ImageSharp/Common/Helpers/Vector128Utilities.cs
Copy file name to clipboardExpand all lines: src/ImageSharp/Common/Helpers/Vector128Utilities.cs
+10-5Lines changed: 10 additions & 5 deletions b/‎src/ImageSharp/Common/Helpers/Vector128Utilities.cs
Copy file name to clipboardExpand all lines: src/ImageSharp/Common/Helpers/Vector128Utilities.cs
+10-5Lines changed: 10 additions & 5 deletions
diff --git a/‎src/ImageSharp/Common/Helpers/Vector256Utilities.cs
Copy file name to clipboardExpand all lines: src/ImageSharp/Common/Helpers/Vector256Utilities.cs
+40-18Lines changed: 40 additions & 18 deletions b/‎src/ImageSharp/Common/Helpers/Vector256Utilities.cs
Copy file name to clipboardExpand all lines: src/ImageSharp/Common/Helpers/Vector256Utilities.cs
+40-18Lines changed: 40 additions & 18 deletions
diff --git a/‎src/ImageSharp/Common/Helpers/Vector512Utilities.cs
Copy file name to clipboardExpand all lines: src/ImageSharp/Common/Helpers/Vector512Utilities.cs
+5-12Lines changed: 5 additions & 12 deletions b/‎src/ImageSharp/Common/Helpers/Vector512Utilities.cs
Copy file name to clipboardExpand all lines: src/ImageSharp/Common/Helpers/Vector512Utilities.cs
+5-12Lines changed: 5 additions & 12 deletions
diff --git a/‎src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs
Copy file name to clipboardExpand all lines: src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs
+39-73Lines changed: 39 additions & 73 deletions b/‎src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs
Copy file name to clipboardExpand all lines: src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs
+39-73Lines changed: 39 additions & 73 deletions
@@ -26,7 +26,7 @@ internal static class Vector128_
     /// <summary>
     /// Gets a value indicating whether shuffle float operations are supported.
     /// </summary>
-    public static bool SupportsShuffleFloat
+    public static bool SupportsShuffleNativeFloat
     {
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         get => Sse.IsSupported;
@@ -35,10 +35,10 @@ public static bool SupportsShuffleFloat
     /// <summary>
     /// Gets a value indicating whether shuffle byte operations are supported.
     /// </summary>
-    public static bool SupportsShuffleByte
+    public static bool SupportsShuffleNativeByte
     {
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        get => Ssse3.IsSupported || AdvSimd.Arm64.IsSupported;
+        get => Ssse3.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported;
     }
 
     /// <summary>
@@ -66,7 +66,7 @@ public static bool SupportsShiftByte
     /// <param name="control">The shuffle control byte.</param>
     /// <returns>The <see cref="Vector128{Single}"/>.</returns>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector128<float> Shuffle(Vector128<float> vector, [ConstantExpected] byte control)
+    public static Vector128<float> ShuffleNative(Vector128<float> vector, [ConstantExpected] byte control)
     {
         if (Sse.IsSupported)
         {
@@ -89,7 +89,7 @@ public static Vector128<float> Shuffle(Vector128<float> vector, [ConstantExpecte
     /// A new vector containing the values from <paramref name="vector" /> selected by the given <paramref name="indices" />.
     /// </returns>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector128<byte> Shuffle(Vector128<byte> vector, Vector128<byte> indices)
+    public static Vector128<byte> ShuffleNative(Vector128<byte> vector, Vector128<byte> indices)
     {
         if (Ssse3.IsSupported)
         {
@@ -101,6 +101,11 @@ public static Vector128<byte> Shuffle(Vector128<byte> vector, Vector128<byte> in
             return AdvSimd.Arm64.VectorTableLookup(vector, indices);
         }
 
+        if (PackedSimd.IsSupported)
+        {
+            return PackedSimd.Swizzle(vector, indices);
+        }
+
         ThrowUnreachableException();
         return default;
     }
 
@@ -24,10 +24,10 @@ internal static class Vector256_
     /// <summary>
     /// Gets a value indicating whether shuffle float operations are supported.
     /// </summary>
-    public static bool SupportsShuffleFloat
+    public static bool SupportsShuffleNativeFloat
     {
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        get => Avx.IsSupported || Sse.IsSupported;
+        get => Avx.IsSupported;
     }
 
     /// <summary>
@@ -46,20 +46,13 @@ public static bool SupportsShuffleByte
     /// <param name="control">The shuffle control byte.</param>
     /// <returns>The <see cref="Vector256{Single}"/>.</returns>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector256<float> Shuffle(Vector256<float> vector, [ConstantExpected] byte control)
+    public static Vector256<float> ShuffleNative(Vector256<float> vector, [ConstantExpected] byte control)
     {
         if (Avx.IsSupported)
         {
             return Avx.Shuffle(vector, vector, control);
         }
 
-        if (Sse.IsSupported)
-        {
-            Vector128<float> lower = vector.GetLower();
-            Vector128<float> upper = vector.GetUpper();
-            return Vector256.Create(Sse.Shuffle(lower, lower, control), Sse.Shuffle(upper, upper, control));
-        }
-
         ThrowUnreachableException();
         return default;
     }
@@ -73,7 +66,7 @@ public static Vector256<float> Shuffle(Vector256<float> vector, [ConstantExpecte
     /// </param>
     /// <returns>The <see cref="Vector256{Single}"/>.</returns>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector256<byte> Shuffle(Vector256<byte> vector, Vector256<byte> indices)
+    public static Vector256<byte> ShuffleNative(Vector256<byte> vector, Vector256<byte> indices)
     {
         if (Avx2.IsSupported)
         {
@@ -98,13 +91,6 @@ public static Vector256<int> ConvertToInt32RoundToEven(Vector256<float> vector)
             return Avx.ConvertToVector256Int32(vector);
         }
 
-        if (Sse2.IsSupported)
-        {
-            Vector128<int> lower = Sse2.ConvertToVector128Int32(vector.GetLower());
-            Vector128<int> upper = Sse2.ConvertToVector128Int32(vector.GetUpper());
-            return Vector256.Create(lower, upper);
-        }
-
         Vector256<float> sign = vector & Vector256.Create(-0F);
         Vector256<float> val_2p23_f32 = sign | Vector256.Create(8388608F);
 
@@ -154,6 +140,27 @@ public static Vector256<float> MultiplyAdd(
         return va + (vm0 * vm1);
     }
 
+    /// <summary>
+    /// Packs signed 32-bit integers to signed 16-bit integers and saturates.
+    /// </summary>
+    /// <remarks>
+    /// The result uses the per 128-bit lane ordering of the AVX2 pack instruction on every code path:
+    /// <c>left[0..3], right[0..3], left[4..7], right[4..7]</c>.
+    /// Callers that need sequential ordering must permute the result afterwards.
+    /// </remarks>
+    /// <param name="left">The left hand source vector.</param>
+    /// <param name="right">The right hand source vector.</param>
+    /// <returns>The <see cref="Vector256{Int16}"/>.</returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<short> PackSignedSaturate(Vector256<int> left, Vector256<int> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.PackSignedSaturate(left, right);
+        }
+
+        // Saturate manually: clamp to the Int16 range so the narrowing below cannot wrap.
+        Vector256<int> min = Vector256.Create((int)short.MinValue);
+        Vector256<int> max = Vector256.Create((int)short.MaxValue);
+        Vector256<int> leftClamped = Clamp(left, min, max);
+        Vector256<int> rightClamped = Clamp(right, min, max);
+
+        // Narrow each 128-bit lane separately. A single Vector256.Narrow would yield
+        // left[0..7], right[0..7], which does not match the lane-interleaved AVX2 result above.
+        return Vector256.Create(
+            Vector128.Narrow(leftClamped.GetLower(), rightClamped.GetLower()),
+            Vector128.Narrow(leftClamped.GetUpper(), rightClamped.GetUpper()));
+    }
+
     /// <summary>
     /// Restricts a vector between a minimum and a maximum value.
     /// </summary>
@@ -166,6 +173,21 @@ public static Vector256<float> MultiplyAdd(
     public static Vector256<T> Clamp<T>(Vector256<T> value, Vector256<T> min, Vector256<T> max)
         => Vector256.Min(Vector256.Max(value, min), max);
 
+    /// <summary>
+    /// Widens a <see cref="Vector128{Int16}"/> to a <see cref="Vector256{Int32}"/>.
+    /// </summary>
+    /// <remarks>
+    /// Each of the eight 16-bit elements is sign-extended to 32 bits; element order is preserved.
+    /// </remarks>
+    /// <param name="value">The vector to widen.</param>
+    /// <returns>The widened <see cref="Vector256{Int32}"/>.</returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<int> Widen(Vector128<short> value)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.ConvertToVector256Int32(value);
+        }
+
+        // Place the source in the lower half of a 256-bit vector, then widen that half only.
+        return Vector256.WidenLower(value.ToVector256());
+    }
+
     [DoesNotReturn]
     private static void ThrowUnreachableException() => throw new UnreachableException();
 }
@@ -24,16 +24,16 @@ internal static class Vector512_
     /// <summary>
     /// Gets a value indicating whether shuffle float operations are supported.
     /// </summary>
-    public static bool SupportsShuffleFloat
+    public static bool SupportsShuffleNativeFloat
     {
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        get => Avx512F.IsSupported || Avx.IsSupported;
+        get => Avx512F.IsSupported;
     }
 
     /// <summary>
     /// Gets a value indicating whether shuffle byte operations are supported.
     /// </summary>
-    public static bool SupportsShuffleByte
+    public static bool SupportsShuffleNativeByte
     {
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         get => Avx512BW.IsSupported;
@@ -46,20 +46,13 @@ public static bool SupportsShuffleByte
     /// <param name="control">The shuffle control byte.</param>
     /// <returns>The <see cref="Vector512{Single}"/>.</returns>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector512<float> Shuffle(Vector512<float> vector, [ConstantExpected] byte control)
+    public static Vector512<float> ShuffleNative(Vector512<float> vector, [ConstantExpected] byte control)
     {
         if (Avx512F.IsSupported)
         {
             return Avx512F.Shuffle(vector, vector, control);
         }
 
-        if (Avx.IsSupported)
-        {
-            Vector256<float> lower = vector.GetLower();
-            Vector256<float> upper = vector.GetUpper();
-            return Vector512.Create(Avx.Shuffle(lower, lower, control), Avx.Shuffle(upper, upper, control));
-        }
-
         ThrowUnreachableException();
         return default;
     }
@@ -73,7 +66,7 @@ public static Vector512<float> Shuffle(Vector512<float> vector, [ConstantExpecte
     /// </param>
     /// <returns>The <see cref="Vector512{Byte}"/>.</returns>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector512<byte> Shuffle(Vector512<byte> vector, Vector512<byte> indices)
+    public static Vector512<byte> ShuffleNative(Vector512<byte> vector, Vector512<byte> indices)
     {
         if (Avx512BW.IsSupported)
         {
 
@@ -1,7 +1,6 @@
 // Copyright (c) Six Labors.
 // Licensed under the Six Labors Split License.
 
-using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
@@ -60,109 +59,76 @@ public void NormalizeColorsAndRoundInPlaceVector256(float maximum)
     }
 
     /// <summary>
-    /// Loads values from <paramref name="source"/> using extended AVX2 intrinsics.
+    /// Loads values from <paramref name="source"/> using <see cref="Vector256{T}"/> intrinsics.
     /// </summary>
     /// <param name="source">The source <see cref="Block8x8"/></param>
-    public void LoadFromInt16ExtendedAvx2(ref Block8x8 source)
+    public void LoadFromInt16ExtendedVector256(ref Block8x8 source)
     {
         DebugGuard.IsTrue(
-            Avx2.IsSupported,
-            "LoadFromUInt16ExtendedAvx2 only works on AVX2 compatible architecture!");
+            Vector256.IsHardwareAccelerated,
+            "LoadFromInt16ExtendedVector256 only works on Vector256 compatible architecture!");
 
         ref short sRef = ref Unsafe.As<Block8x8, short>(ref source);
         ref Vector256<float> dRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref this);
 
-        // Vector256<ushort>.Count == 16 on AVX2
+        // Vector256<ushort>.Count == 16
         // We can process 2 block rows in a single step
-        Vector256<int> top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef));
-        Vector256<int> bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256<int>.Count));
-        dRef = Avx.ConvertToVector256Single(top);
-        Unsafe.Add(ref dRef, 1) = Avx.ConvertToVector256Single(bottom);
-
-        top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 2)));
-        bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 3)));
-        Unsafe.Add(ref dRef, 2) = Avx.ConvertToVector256Single(top);
-        Unsafe.Add(ref dRef, 3) = Avx.ConvertToVector256Single(bottom);
-
-        top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 4)));
-        bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 5)));
-        Unsafe.Add(ref dRef, 4) = Avx.ConvertToVector256Single(top);
-        Unsafe.Add(ref dRef, 5) = Avx.ConvertToVector256Single(bottom);
-
-        top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 6)));
-        bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 7)));
-        Unsafe.Add(ref dRef, 6) = Avx.ConvertToVector256Single(top);
-        Unsafe.Add(ref dRef, 7) = Avx.ConvertToVector256Single(bottom);
+        Vector256<int> top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef));
+        Vector256<int> bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256<int>.Count));
+        dRef = Vector256.ConvertToSingle(top);
+        Unsafe.Add(ref dRef, 1) = Vector256.ConvertToSingle(bottom);
+
+        top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 2)));
+        bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 3)));
+        Unsafe.Add(ref dRef, 2) = Vector256.ConvertToSingle(top);
+        Unsafe.Add(ref dRef, 3) = Vector256.ConvertToSingle(bottom);
+
+        top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 4)));
+        bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 5)));
+        Unsafe.Add(ref dRef, 4) = Vector256.ConvertToSingle(top);
+        Unsafe.Add(ref dRef, 5) = Vector256.ConvertToSingle(bottom);
+
+        top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 6)));
+        bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 7)));
+        Unsafe.Add(ref dRef, 6) = Vector256.ConvertToSingle(top);
+        Unsafe.Add(ref dRef, 7) = Vector256.ConvertToSingle(bottom);
     }
 
     [MethodImpl(InliningOptions.ShortMethod)]
     private static Vector256<float> NormalizeAndRoundVector256(Vector256<float> value, Vector256<float> off, Vector256<float> max)
         => Vector256_.RoundToNearestInteger(Vector256_.Clamp(value + off, Vector256<float>.Zero, max));
 
-    private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
+    private static unsafe void MultiplyIntoInt16Vector256(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
     {
-        DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
+        DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to run this operation!");
 
         ref Vector256<float> aBase = ref a.V256_0;
         ref Vector256<float> bBase = ref b.V256_0;
-
         ref Vector256<short> destRef = ref dest.V01;
-        Vector256<int> multiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7);
 
         for (nuint i = 0; i < 8; i += 2)
         {
-            Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
-            Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
+            Vector256<int> row0 = Vector256_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 0) * Unsafe.Add(ref bBase, i + 0));
+            Vector256<int> row1 = Vector256_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 1) * Unsafe.Add(ref bBase, i + 1));
 
-            Vector256<short> row = Avx2.PackSignedSaturate(row0, row1);
-            row = Avx2.PermuteVar8x32(row.AsInt32(), multiplyIntoInt16ShuffleMask).AsInt16();
+            Vector256<short> row = Vector256_.PackSignedSaturate(row0, row1);
+            row = Vector256.Shuffle(row.AsInt32(), Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7)).AsInt16();
 
             Unsafe.Add(ref destRef, i / 2) = row;
         }
     }
 
-    private void TransposeInPlace_Avx()
+    private void TransposeInPlaceVector256()
     {
         // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
-        Vector256<float> r0 = Avx.InsertVector128(
-            this.V256_0,
-            Unsafe.As<Vector4, Vector128<float>>(ref this.V4L),
-            1);
-
-        Vector256<float> r1 = Avx.InsertVector128(
-           this.V256_1,
-           Unsafe.As<Vector4, Vector128<float>>(ref this.V5L),
-           1);
-
-        Vector256<float> r2 = Avx.InsertVector128(
-           this.V256_2,
-           Unsafe.As<Vector4, Vector128<float>>(ref this.V6L),
-           1);
-
-        Vector256<float> r3 = Avx.InsertVector128(
-           this.V256_3,
-           Unsafe.As<Vector4, Vector128<float>>(ref this.V7L),
-           1);
-
-        Vector256<float> r4 = Avx.InsertVector128(
-           Unsafe.As<Vector4, Vector128<float>>(ref this.V0R).ToVector256(),
-           Unsafe.As<Vector4, Vector128<float>>(ref this.V4R),
-           1);
-
-        Vector256<float> r5 = Avx.InsertVector128(
-           Unsafe.As<Vector4, Vector128<float>>(ref this.V1R).ToVector256(),
-           Unsafe.As<Vector4, Vector128<float>>(ref this.V5R),
-           1);
-
-        Vector256<float> r6 = Avx.InsertVector128(
-           Unsafe.As<Vector4, Vector128<float>>(ref this.V2R).ToVector256(),
-           Unsafe.As<Vector4, Vector128<float>>(ref this.V6R),
-           1);
-
-        Vector256<float> r7 = Avx.InsertVector128(
-           Unsafe.As<Vector4, Vector128<float>>(ref this.V3R).ToVector256(),
-           Unsafe.As<Vector4, Vector128<float>>(ref this.V7R),
-           1);
+        Vector256<float> r0 = this.V256_0.WithUpper(this.V4L.AsVector128());
+        Vector256<float> r1 = this.V256_1.WithUpper(this.V5L.AsVector128());
+        Vector256<float> r2 = this.V256_2.WithUpper(this.V6L.AsVector128());
+        Vector256<float> r3 = this.V256_3.WithUpper(this.V7L.AsVector128());
+        Vector256<float> r4 = this.V0R.AsVector128().ToVector256().WithUpper(this.V4R.AsVector128());
+        Vector256<float> r5 = this.V1R.AsVector128().ToVector256().WithUpper(this.V5R.AsVector128());
+        Vector256<float> r6 = this.V2R.AsVector128().ToVector256().WithUpper(this.V6R.AsVector128());
+        Vector256<float> r7 = this.V3R.AsVector128().ToVector256().WithUpper(this.V7R.AsVector128());
 
         Vector256<float> t0 = Avx.UnpackLow(r0, r1);
         Vector256<float> t2 = Avx.UnpackLow(r2, r3);