1
1
// Copyright (c) Six Labors.
2
2
// Licensed under the Six Labors Split License.
3
3
4
- using System . Numerics ;
5
4
using System . Runtime . CompilerServices ;
6
5
using System . Runtime . InteropServices ;
7
6
using System . Runtime . Intrinsics ;
@@ -60,109 +59,76 @@ public void NormalizeColorsAndRoundInPlaceVector256(float maximum)
60
59
}
61
60
62
61
/// <summary>
/// Loads the 16-bit values of <paramref name="source"/> into this block as single precision
/// floats using <see cref="Vector256{T}"/> intrinsics.
/// </summary>
/// <param name="source">The source <see cref="Block8x8"/></param>
public void LoadFromInt16ExtendedVector256(ref Block8x8 source)
{
    DebugGuard.IsTrue(
        Vector256.IsHardwareAccelerated,
        "LoadFromInt16ExtendedVector256 only works on Vector256 compatible architecture!");

    ref short sourceBase = ref Unsafe.As<Block8x8, short>(ref source);
    ref Vector256<float> destBase = ref Unsafe.As<Block8x8F, Vector256<float>>(ref this);

    // Distance, in 16-bit elements, between two consecutive Vector128<short> loads.
    nuint stride = (nuint)Vector256<int>.Count;

    // Every step reads two Vector128<short> chunks, widens both to Vector256<int>,
    // and only then writes the two converted Vector256<float> results.
    for (nuint i = 0; i < 8; i += 2)
    {
        Vector256<int> top = Vector256_.Widen(Vector128.LoadUnsafe(ref sourceBase, i * stride));
        Vector256<int> bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sourceBase, (i + 1) * stride));

        Unsafe.Add(ref destBase, i) = Vector256.ConvertToSingle(top);
        Unsafe.Add(ref destBase, i + 1) = Vector256.ConvertToSingle(bottom);
    }
}
97
96
98
97
/// <summary>
/// Adds <paramref name="off"/> to <paramref name="value"/>, clamps the sum between zero and
/// <paramref name="max"/>, and rounds the result with <c>Vector256_.RoundToNearestInteger</c>.
/// </summary>
/// <param name="value">The vector to normalize.</param>
/// <param name="off">The offset added to every element.</param>
/// <param name="max">The upper bound passed to the clamp.</param>
/// <returns>The offset, clamped and rounded vector.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static Vector256<float> NormalizeAndRoundVector256(Vector256<float> value, Vector256<float> off, Vector256<float> max)
{
    Vector256<float> shifted = value + off;
    Vector256<float> clamped = Vector256_.Clamp(shifted, Vector256<float>.Zero, max);
    return Vector256_.RoundToNearestInteger(clamped);
}
101
100
102
/// <summary>
/// Multiplies <paramref name="a"/> and <paramref name="b"/> element-wise, converts the products
/// to 32-bit integers, packs them to 16-bit values with signed saturation and stores the
/// result in <paramref name="dest"/>.
/// </summary>
/// <param name="a">The first factor block.</param>
/// <param name="b">The second factor block.</param>
/// <param name="dest">The block receiving the packed 16-bit products.</param>
private static unsafe void MultiplyIntoInt16Vector256(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
{
    DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to run this operation!");

    ref Vector256<float> lhs = ref a.V256_0;
    ref Vector256<float> rhs = ref b.V256_0;
    ref Vector256<short> output = ref dest.V01;

    // Index pattern applied after packing; it swaps the two middle pairs of 32-bit elements.
    // NOTE(review): presumably this undoes a per-128-bit-lane interleave left by
    // Vector256_.PackSignedSaturate - confirm against that helper.
    Vector256<int> reorder = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7);

    // Each pass consumes two Vector256<float> rows and emits one Vector256<short>.
    for (nuint pair = 0; pair < 4; pair++)
    {
        nuint first = pair * 2;
        nuint second = first + 1;

        Vector256<int> low = Vector256_.ConvertToInt32RoundToEven(Unsafe.Add(ref lhs, first) * Unsafe.Add(ref rhs, first));
        Vector256<int> high = Vector256_.ConvertToInt32RoundToEven(Unsafe.Add(ref lhs, second) * Unsafe.Add(ref rhs, second));

        Vector256<short> packed = Vector256_.PackSignedSaturate(low, high);
        Unsafe.Add(ref output, pair) = Vector256.Shuffle(packed.AsInt32(), reorder).AsInt16();
    }
}
123
120
124
- private void TransposeInPlace_Avx ( )
121
+ private void TransposeInPlaceVector256 ( )
125
122
{
126
123
// https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
127
- Vector256 < float > r0 = Avx . InsertVector128 (
128
- this . V256_0 ,
129
- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V4L ) ,
130
- 1 ) ;
131
-
132
- Vector256 < float > r1 = Avx . InsertVector128 (
133
- this . V256_1 ,
134
- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V5L ) ,
135
- 1 ) ;
136
-
137
- Vector256 < float > r2 = Avx . InsertVector128 (
138
- this . V256_2 ,
139
- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V6L ) ,
140
- 1 ) ;
141
-
142
- Vector256 < float > r3 = Avx . InsertVector128 (
143
- this . V256_3 ,
144
- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V7L ) ,
145
- 1 ) ;
146
-
147
- Vector256 < float > r4 = Avx . InsertVector128 (
148
- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V0R ) . ToVector256 ( ) ,
149
- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V4R ) ,
150
- 1 ) ;
151
-
152
- Vector256 < float > r5 = Avx . InsertVector128 (
153
- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V1R ) . ToVector256 ( ) ,
154
- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V5R ) ,
155
- 1 ) ;
156
-
157
- Vector256 < float > r6 = Avx . InsertVector128 (
158
- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V2R ) . ToVector256 ( ) ,
159
- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V6R ) ,
160
- 1 ) ;
161
-
162
- Vector256 < float > r7 = Avx . InsertVector128 (
163
- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V3R ) . ToVector256 ( ) ,
164
- Unsafe . As < Vector4 , Vector128 < float > > ( ref this . V7R ) ,
165
- 1 ) ;
124
+ Vector256 < float > r0 = this . V256_0 . WithUpper ( this . V4L . AsVector128 ( ) ) ;
125
+ Vector256 < float > r1 = this . V256_1 . WithUpper ( this . V5L . AsVector128 ( ) ) ;
126
+ Vector256 < float > r2 = this . V256_2 . WithUpper ( this . V6L . AsVector128 ( ) ) ;
127
+ Vector256 < float > r3 = this . V256_3 . WithUpper ( this . V7L . AsVector128 ( ) ) ;
128
+ Vector256 < float > r4 = this . V0R . AsVector128 ( ) . ToVector256 ( ) . WithUpper ( this . V4R . AsVector128 ( ) ) ;
129
+ Vector256 < float > r5 = this . V1R . AsVector128 ( ) . ToVector256 ( ) . WithUpper ( this . V5R . AsVector128 ( ) ) ;
130
+ Vector256 < float > r6 = this . V2R . AsVector128 ( ) . ToVector256 ( ) . WithUpper ( this . V6R . AsVector128 ( ) ) ;
131
+ Vector256 < float > r7 = this . V3R . AsVector128 ( ) . ToVector256 ( ) . WithUpper ( this . V7R . AsVector128 ( ) ) ;
166
132
167
133
Vector256 < float > t0 = Avx . UnpackLow ( r0 , r1 ) ;
168
134
Vector256 < float > t2 = Avx . UnpackLow ( r2 , r3 ) ;
0 commit comments