[AArch64] Improve bfcvtn2 and remove aarch64_neon_bfcvt intrinsics #120363
Conversation
@llvm/pr-subscribers-clang @llvm/pr-subscribers-clang-codegen @llvm/pr-subscribers-backend-aarch64

Author: David Green (davemgreen)

Changes

This started out as trying to combine bf16 fpround to BFCVTN2 instructions, but ended up removing the aarch64.neon.bfcvt intrinsics in favour of generating fpround instructions directly. This simplifies the patterns and can lead to other optimizations. The BFCVTN2 instruction is adjusted to make sure the types are valid, and a bfcvtn2 is now generated in more places. The old intrinsics are auto-upgraded to fptrunc instructions too.

Patch is 34.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/120363.diff

10 Files Affected:
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index ef89fa4358dfeb..ddc5391eb3fa23 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -259,11 +259,6 @@ def OP_VCVT_F32_BF16_LO
def OP_VCVT_F32_BF16_HI
: Op<(call "vcvt_f32_bf16", (call "vget_high", $p0))>;
-def OP_VCVT_BF16_F32_LO_A64
- : Op<(call "__a64_vcvtq_low_bf16", $p0)>;
-def OP_VCVT_BF16_F32_A64
- : Op<(call "vget_low", (call "__a64_vcvtq_low_bf16", $p0))>;
-
def OP_VCVT_BF16_F32_A32
: Op<(call "__a32_vcvt_bf16", $p0)>;
@@ -2061,10 +2056,9 @@ let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard =
}
let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16,neon" in {
- def VCVT_LOW_BF16_F32_A64_INTERNAL : WInst<"__a64_vcvtq_low_bf16", "BQ", "Hf">;
- def VCVT_LOW_BF16_F32_A64 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A64>;
+ def VCVT_LOW_BF16_F32_A64 : SInst<"vcvt_low_bf16", "BQ", "Qf">;
def VCVT_HIGH_BF16_F32_A64 : SInst<"vcvt_high_bf16", "BBQ", "Qf">;
- def VCVT_BF16_F32 : SOpInst<"vcvt_bf16", "BQ", "f", OP_VCVT_BF16_F32_A64>;
+ def VCVT_BF16_F32 : SInst<"vcvt_bf16", "BQ", "f">;
def COPY_LANE_BF16 : IOpInst<"vcopy_lane", "..I.I", "b", OP_COPY_LN>;
def COPYQ_LANE_BF16 : IOpInst<"vcopy_lane", "..IqI", "Qb", OP_COPY_LN>;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 4d4b7428abd505..47e4a10addc167 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -7277,7 +7277,6 @@ static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
};
static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
- NEONMAP1(__a64_vcvtq_low_bf16_f32, aarch64_neon_bfcvtn, 0),
NEONMAP0(splat_lane_v),
NEONMAP0(splat_laneq_v),
NEONMAP0(splatq_lane_v),
@@ -7377,7 +7376,8 @@ static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
NEONMAP0(vcvtq_f16_s16),
NEONMAP0(vcvtq_f16_u16),
NEONMAP0(vcvtq_f32_v),
- NEONMAP1(vcvtq_high_bf16_f32, aarch64_neon_bfcvtn2, 0),
+ NEONMAP0(vcvtq_high_bf16_f32),
+ NEONMAP0(vcvtq_low_bf16_f32),
NEONMAP1(vcvtq_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
NEONMAP1(vcvtq_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
@@ -7586,7 +7586,7 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
- NEONMAP1(vcvth_bf16_f32, aarch64_neon_bfcvt, 0),
+ NEONMAP0(vcvth_bf16_f32),
NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
@@ -12040,6 +12040,12 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
return ConstantInt::get(Builder.getInt32Ty(), 0);
}
+ if (BuiltinID == NEON::BI__builtin_neon_vcvth_bf16_f32)
+ return Builder.CreateFPTrunc(
+ Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)),
+ Builder.getFloatTy()),
+ Builder.getBFloatTy());
+
// Handle MSVC intrinsics before argument evaluation to prevent double
// evaluation.
if (std::optional<MSVCIntrin> MsvcIntId =
@@ -12765,6 +12771,35 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
"vgetq_lane");
}
+ case NEON::BI__builtin_neon_vcvt_bf16_f32: {
+ llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
+ llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
+ return Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16);
+ }
+ case NEON::BI__builtin_neon_vcvtq_low_bf16_f32: {
+ SmallVector<int, 16> ConcatMask(8);
+ std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+ llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
+ llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
+ llvm::Value *Trunc =
+ Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16);
+ return Builder.CreateShuffleVector(
+ Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask);
+ }
+ case NEON::BI__builtin_neon_vcvtq_high_bf16_f32: {
+ SmallVector<int, 16> ConcatMask(8);
+ std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+ SmallVector<int, 16> LoMask(4);
+ std::iota(LoMask.begin(), LoMask.end(), 0);
+ llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
+ llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
+ llvm::Type *V8BF16 = FixedVectorType::get(Builder.getBFloatTy(), 8);
+ llvm::Value *Inactive = Builder.CreateShuffleVector(
+ Builder.CreateBitCast(Ops[0], V8BF16), LoMask);
+ llvm::Value *Trunc =
+ Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[1], V4F32), V4BF16);
+ return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask);
+ }
case clang::AArch64::BI_InterlockedAdd:
case clang::AArch64::BI_InterlockedAdd64: {
diff --git a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
index 51aa5aa758f0c3..93f54c70c340d6 100644
--- a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
@@ -223,10 +223,8 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
// CHECK-A64-LABEL: @test_vcvt_bf16_f32(
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
-// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
-// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A64-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
+// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
+// CHECK-A64-NEXT: ret <4 x bfloat> [[TMP1]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
@@ -263,9 +261,9 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
// CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
-// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
-// CHECK-A64-NEXT: ret <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]]
+// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP2]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
@@ -323,9 +321,10 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8>
// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F322_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> [[INACTIVE]], <4 x float> [[A]])
-// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F323_I:%.*]] = bitcast <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]] to <16 x i8>
-// CHECK-A64-NEXT: ret <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]]
+// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <8 x bfloat> [[INACTIVE]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP4]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
@@ -404,8 +403,8 @@ bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) {
// CHECK-A64-LABEL: @test_vcvth_bf16_f32(
// CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[VCVTH_BF16_F32_I:%.*]] = call bfloat @llvm.aarch64.neon.bfcvt(float [[A:%.*]])
-// CHECK-A64-NEXT: ret bfloat [[VCVTH_BF16_F32_I]]
+// CHECK-A64-NEXT: [[TMP0:%.*]] = fptrunc float [[A:%.*]] to bfloat
+// CHECK-A64-NEXT: ret bfloat [[TMP0]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvth_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 53a66099a92bda..763bf31f378e98 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -538,17 +538,6 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
def int_aarch64_neon_bfmlalb : AdvSIMD_BF16FML_Intrinsic;
def int_aarch64_neon_bfmlalt : AdvSIMD_BF16FML_Intrinsic;
-
- // v8.6-A Bfloat Intrinsics
- def int_aarch64_neon_bfcvt
- : DefaultAttrsIntrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>;
- def int_aarch64_neon_bfcvtn
- : DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v4f32_ty], [IntrNoMem]>;
- def int_aarch64_neon_bfcvtn2
- : DefaultAttrsIntrinsic<[llvm_v8bf16_ty],
- [llvm_v8bf16_ty, llvm_v4f32_ty],
- [IntrNoMem]>;
-
// v8.2-A FP16 Fused Multiply-Add Long
def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic;
def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 06e62bf7f9f757..be67bed087b81e 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -45,6 +45,7 @@
#include "llvm/Support/Regex.h"
#include "llvm/TargetParser/Triple.h"
#include <cstring>
+#include <numeric>
using namespace llvm;
@@ -828,6 +829,13 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
return true;
}
}
+
+ // Changed in 20.0: bfcvt/bfcvtn/bcvtn2 have been replaced with fptrunc.
+ if (Name.starts_with("bfcvt")) {
+ NewFn = nullptr;
+ return true;
+ }
+
return false; // No other 'aarch64.neon.*'.
}
if (Name.consume_front("sve.")) {
@@ -4064,31 +4072,59 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
static Value *upgradeAArch64IntrinsicCall(StringRef Name, CallBase *CI,
Function *F, IRBuilder<> &Builder) {
- Intrinsic::ID NewID =
- StringSwitch<Intrinsic::ID>(Name)
- .Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2)
- .Case("sve.fcvtnt.bf16f32", Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2)
- .Default(Intrinsic::not_intrinsic);
- if (NewID == Intrinsic::not_intrinsic)
- llvm_unreachable("Unhandled Intrinsic!");
-
- SmallVector<Value *, 3> Args(CI->args());
-
- // The original intrinsics incorrectly used a predicate based on the smallest
- // element type rather than the largest.
- Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8);
- Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4);
-
- if (Args[1]->getType() != BadPredTy)
- llvm_unreachable("Unexpected predicate type!");
-
- Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
- BadPredTy, Args[1]);
- Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
- GoodPredTy, Args[1]);
-
- return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr,
- CI->getName());
+ if (Name.starts_with("neon.bfcvt")) {
+ if (Name.starts_with("neon.bfcvtn2")) {
+ SmallVector<int, 32> LoMask(4);
+ std::iota(LoMask.begin(), LoMask.end(), 0);
+ SmallVector<int, 32> ConcatMask(8);
+ std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+ Value *Inactive = Builder.CreateShuffleVector(CI->getOperand(0), LoMask);
+ Value *Trunc =
+ Builder.CreateFPTrunc(CI->getOperand(1), Inactive->getType());
+ return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask);
+ } else if (Name.starts_with("neon.bfcvtn")) {
+ SmallVector<int, 32> ConcatMask(8);
+ std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+ Type *V4BF16 =
+ FixedVectorType::get(Type::getBFloatTy(F->getContext()), 4);
+ Value *Trunc = Builder.CreateFPTrunc(CI->getOperand(0), V4BF16);
+ dbgs() << "Trunc: " << *Trunc << "\n";
+ return Builder.CreateShuffleVector(
+ Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask);
+ } else {
+ return Builder.CreateFPTrunc(CI->getOperand(0),
+ Type::getBFloatTy(F->getContext()));
+ }
+ } else if (Name.starts_with("sve.fcvt")) {
+ Intrinsic::ID NewID =
+ StringSwitch<Intrinsic::ID>(Name)
+ .Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2)
+ .Case("sve.fcvtnt.bf16f32",
+ Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2)
+ .Default(Intrinsic::not_intrinsic);
+ if (NewID == Intrinsic::not_intrinsic)
+ llvm_unreachable("Unhandled Intrinsic!");
+
+ SmallVector<Value *, 3> Args(CI->args());
+
+ // The original intrinsics incorrectly used a predicate based on the
+ // smallest element type rather than the largest.
+ Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8);
+ Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4);
+
+ if (Args[1]->getType() != BadPredTy)
+ llvm_unreachable("Unexpected predicate type!");
+
+ Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
+ BadPredTy, Args[1]);
+ Args[1] = Builder.CreateIntrinsic(
+ Intrinsic::aarch64_sve_convert_from_svbool, GoodPredTy, Args[1]);
+
+ return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr,
+ CI->getName());
+ }
+
+ llvm_unreachable("Unhandled Intrinsic!");
}
static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 56ff7b0d3a280d..a03d97cd81d0a0 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -9045,22 +9045,19 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm>
let mayRaiseFPException = 1, Uses = [FPCR] in
class SIMD_BFCVTN
- : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128,
+ : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V64,
"bfcvtn", ".4h", ".4s",
- [(set (v8bf16 V128:$Rd),
- (int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>;
+ [(set (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn)))]>;
let mayRaiseFPException = 1, Uses = [FPCR] in
class SIMD_BFCVTN2
: BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128,
- "bfcvtn2", ".8h", ".4s",
- [(set (v8bf16 V128:$dst),
- (int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>;
+ "bfcvtn2", ".8h", ".4s", []>;
let mayRaiseFPException = 1, Uses = [FPCR] in
class BF16ToSinglePrecision<string asm>
: I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "",
- [(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>,
+ [(set (bf16 FPR16:$Rd), (any_fpround (f32 FPR32:$Rn)))]>,
Sched<[WriteFCvt]> {
bits<5> Rd;
bits<5> Rn;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index d015cc15581ad0..825bf130e9baa7 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1446,8 +1446,8 @@ def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
def BFCVTN : SIMD_BFCVTN;
def BFCVTN2 : SIMD_BFCVTN2;
-def : Pat<(v4bf16 (any_fpround (v4f32 V128:$Rn))),
- (EXTRACT_SUBREG (BFCVTN V128:$Rn), dsub)>;
+def : Pat<(concat_vectors (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn))),
+ (BFCVTN2 (v8bf16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub)), V128:$Rn)>;
// Vector-scalar BFDOT:
// The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit
@@ -1469,8 +1469,6 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot
let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in {
def BFCVT : BF16ToSinglePrecision<"bfcvt">;
-// Round FP32 to BF16.
-def : Pat<(bf16 (any_fpround (f32 FPR32:$Rn))), (BFCVT $Rn)>;
}
// ARMv8.6A AArch64 matrix multiplication
@@ -10425,9 +10423,11 @@ multiclass PromoteUnaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst>
let Predicates = [HasBF16] in
def : Pat<(InOp (v8bf16 V128:$Rn)),
(v8bf16 (BFCVTN2
- (v8bf16 (BFCVTN
- (v4f32 (OutInst
- (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
+ (INSERT_SUBREG (IMPLICIT_DEF),
+ (v4bf16 (BFCVTN
+ (v4f32 (OutInst
+ (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
+ dsub),
(v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn))))))>;
let Predicates = [HasNoBF16] in
@@ -10462,10 +10462,12 @@ multiclass PromoteBinaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst
let Predicates = [HasBF16] in
def : Pat<(InOp (v8bf16 V128:$Rn), (v8bf16 V128:$Rm)),
(v8bf16 (BFCVTN2
- (v8bf16 (BFCVTN
- (v4f32 (OutInst
- (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
- (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
+ (INSERT_SUBREG (IMPLICIT_DEF),
+ (v4bf16 (BFCVTN
+ (v4f32 (OutInst
+ (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
+ (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
+ dsub),
(v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn)),
(v4f32 (SHLLv8i16 V128:$Rm))))))>;
diff --git a/llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll b/llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll
index 9d4e79d38d5d1a..64bc95f2f38906 100644
--- a/llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll
@@ -1,5 +1,8 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64 -mattr=+neon -mattr=+bf16 | FileCheck %s
+; This test acts to test the old neon.bfcvt intrinsics, which are now
+; autoupgraded to fptrunc operations.
+
declare bfloat @llvm.aarch64.neon.bfcvt(float)
declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float>)
declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat>, <4 x float>)
diff --git a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
index 9b6e19eba3f4e6..1cd0294b0083eb 100644
--- a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
@@ -22,7 +22,6 @@ define <4 x bfloat> @add_h(<4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16
; CHECK-BF16-NEXT: fadd v0.4s, v0.4s, v1.4s
; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
-; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-BF16-NEXT: ret
entry:
@@ -62,7 +61,6 @@ define <4 x bfloat> @sub_h(<4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-BF16-NEXT: ...
[truncated]
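As a rough sketch of the auto-upgrade described in the summary above (the function names below are illustrative, not taken from the patch, but the replacement sequences mirror the updated CHECK lines in arm-bf16-convert-intrinsics.c), a call to the old scalar bfcvt intrinsic now becomes a plain fptrunc, and the widening bfcvtn form becomes a fptrunc plus a shuffle that zero-fills the high half:

define bfloat @example_scalar(float %a) {
  ; before: %r = call bfloat @llvm.aarch64.neon.bfcvt(float %a)
  ; after the upgrade this is just a truncating conversion:
  %r = fptrunc float %a to bfloat
  ret bfloat %r
}

define <8 x bfloat> @example_bfcvtn(<4 x float> %a) {
  ; before: %r = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> %a)
  ; after the upgrade the low half is the truncate and the high half is zero:
  %t = fptrunc <4 x float> %a to <4 x bfloat>
  %r = shufflevector <4 x bfloat> %t, <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x bfloat> %r
}

The bfcvtn2 form is handled the same way, except that the low half of the result is taken from the intrinsic's first operand rather than from zeroinitializer.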
You can test this locally with the following command:

git-clang-format --diff edc02351dd11cc4a39b7c541b26b71c6f36c8e55 ff5b62875738cc89266aeec6f0b06f4b55d30a3a --extensions cpp,c -- clang/lib/CodeGen/CGBuiltin.cpp clang/test/CodeGen/arm-bf16-convert-intrinsics.c llvm/lib/IR/AutoUpgrade.cpp

View the diff from clang-format here.

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 063cd0cc09..59bac8e972 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -7307,517 +7307,607 @@ static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
};
static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
- NEONMAP0(splat_lane_v),
- NEONMAP0(splat_laneq_v),
- NEONMAP0(splatq_lane_v),
- NEONMAP0(splatq_laneq_v),
- NEONMAP1(vabs_v, aarch64_neon_abs, 0),
- NEONMAP1(vabsq_v, aarch64_neon_abs, 0),
- NEONMAP0(vadd_v),
- NEONMAP0(vaddhn_v),
- NEONMAP0(vaddq_p128),
- NEONMAP0(vaddq_v),
- NEONMAP1(vaesdq_u8, aarch64_crypto_aesd, 0),
- NEONMAP1(vaeseq_u8, aarch64_crypto_aese, 0),
- NEONMAP1(vaesimcq_u8, aarch64_crypto_aesimc, 0),
- NEONMAP1(vaesmcq_u8, aarch64_crypto_aesmc, 0),
- NEONMAP2(vbcaxq_s16, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
- NEONMAP2(vbcaxq_s32, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
- NEONMAP2(vbcaxq_s64, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
- NEONMAP2(vbcaxq_s8, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
- NEONMAP2(vbcaxq_u16, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
- NEONMAP2(vbcaxq_u32, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
- NEONMAP2(vbcaxq_u64, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
- NEONMAP2(vbcaxq_u8, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
- NEONMAP1(vbfdot_f32, aarch64_neon_bfdot, 0),
- NEONMAP1(vbfdotq_f32, aarch64_neon_bfdot, 0),
- NEONMAP1(vbfmlalbq_f32, aarch64_neon_bfmlalb, 0),
- NEONMAP1(vbfmlaltq_f32, aarch64_neon_bfmlalt, 0),
- NEONMAP1(vbfmmlaq_f32, aarch64_neon_bfmmla, 0),
- NEONMAP1(vcadd_rot270_f16, aarch64_neon_vcadd_rot270, Add1ArgType),
- NEONMAP1(vcadd_rot270_f32, aarch64_neon_vcadd_rot270, Add1ArgType),
- NEONMAP1(vcadd_rot90_f16, aarch64_neon_vcadd_rot90, Add1ArgType),
- NEONMAP1(vcadd_rot90_f32, aarch64_neon_vcadd_rot90, Add1ArgType),
- NEONMAP1(vcaddq_rot270_f16, aarch64_neon_vcadd_rot270, Add1ArgType),
- NEONMAP1(vcaddq_rot270_f32, aarch64_neon_vcadd_rot270, Add1ArgType),
- NEONMAP1(vcaddq_rot270_f64, aarch64_neon_vcadd_rot270, Add1ArgType),
- NEONMAP1(vcaddq_rot90_f16, aarch64_neon_vcadd_rot90, Add1ArgType),
- NEONMAP1(vcaddq_rot90_f32, aarch64_neon_vcadd_rot90, Add1ArgType),
- NEONMAP1(vcaddq_rot90_f64, aarch64_neon_vcadd_rot90, Add1ArgType),
- NEONMAP1(vcage_v, aarch64_neon_facge, 0),
- NEONMAP1(vcageq_v, aarch64_neon_facge, 0),
- NEONMAP1(vcagt_v, aarch64_neon_facgt, 0),
- NEONMAP1(vcagtq_v, aarch64_neon_facgt, 0),
- NEONMAP1(vcale_v, aarch64_neon_facge, 0),
- NEONMAP1(vcaleq_v, aarch64_neon_facge, 0),
- NEONMAP1(vcalt_v, aarch64_neon_facgt, 0),
- NEONMAP1(vcaltq_v, aarch64_neon_facgt, 0),
- NEONMAP0(vceqz_v),
- NEONMAP0(vceqzq_v),
- NEONMAP0(vcgez_v),
- NEONMAP0(vcgezq_v),
- NEONMAP0(vcgtz_v),
- NEONMAP0(vcgtzq_v),
- NEONMAP0(vclez_v),
- NEONMAP0(vclezq_v),
- NEONMAP1(vcls_v, aarch64_neon_cls, Add1ArgType),
- NEONMAP1(vclsq_v, aarch64_neon_cls, Add1ArgType),
- NEONMAP0(vcltz_v),
- NEONMAP0(vcltzq_v),
- NEONMAP1(vclz_v, ctlz, Add1ArgType),
- NEONMAP1(vclzq_v, ctlz, Add1ArgType),
- NEONMAP1(vcmla_f16, aarch64_neon_vcmla_rot0, Add1ArgType),
- NEONMAP1(vcmla_f32, aarch64_neon_vcmla_rot0, Add1ArgType),
- NEONMAP1(vcmla_rot180_f16, aarch64_neon_vcmla_rot180, Add1ArgType),
- NEONMAP1(vcmla_rot180_f32, aarch64_neon_vcmla_rot180, Add1ArgType),
- NEONMAP1(vcmla_rot270_f16, aarch64_neon_vcmla_rot270, Add1ArgType),
- NEONMAP1(vcmla_rot270_f32, aarch64_neon_vcmla_rot270, Add1ArgType),
- NEONMAP1(vcmla_rot90_f16, aarch64_neon_vcmla_rot90, Add1ArgType),
- NEONMAP1(vcmla_rot90_f32, aarch64_neon_vcmla_rot90, Add1ArgType),
- NEONMAP1(vcmlaq_f16, aarch64_neon_vcmla_rot0, Add1ArgType),
- NEONMAP1(vcmlaq_f32, aarch64_neon_vcmla_rot0, Add1ArgType),
- NEONMAP1(vcmlaq_f64, aarch64_neon_vcmla_rot0, Add1ArgType),
- NEONMAP1(vcmlaq_rot180_f16, aarch64_neon_vcmla_rot180, Add1ArgType),
- NEONMAP1(vcmlaq_rot180_f32, aarch64_neon_vcmla_rot180, Add1ArgType),
- NEONMAP1(vcmlaq_rot180_f64, aarch64_neon_vcmla_rot180, Add1ArgType),
- NEONMAP1(vcmlaq_rot270_f16, aarch64_neon_vcmla_rot270, Add1ArgType),
- NEONMAP1(vcmlaq_rot270_f32, aarch64_neon_vcmla_rot270, Add1ArgType),
- NEONMAP1(vcmlaq_rot270_f64, aarch64_neon_vcmla_rot270, Add1ArgType),
- NEONMAP1(vcmlaq_rot90_f16, aarch64_neon_vcmla_rot90, Add1ArgType),
- NEONMAP1(vcmlaq_rot90_f32, aarch64_neon_vcmla_rot90, Add1ArgType),
- NEONMAP1(vcmlaq_rot90_f64, aarch64_neon_vcmla_rot90, Add1ArgType),
- NEONMAP1(vcnt_v, ctpop, Add1ArgType),
- NEONMAP1(vcntq_v, ctpop, Add1ArgType),
- NEONMAP1(vcvt_f16_f32, aarch64_neon_vcvtfp2hf, 0),
- NEONMAP0(vcvt_f16_s16),
- NEONMAP0(vcvt_f16_u16),
- NEONMAP1(vcvt_f32_f16, aarch64_neon_vcvthf2fp, 0),
- NEONMAP0(vcvt_f32_v),
- NEONMAP1(vcvt_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
- NEONMAP1(vcvt_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
- NEONMAP2(vcvt_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
- NEONMAP2(vcvt_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
- NEONMAP1(vcvt_n_s16_f16, aarch64_neon_vcvtfp2fxs, 0),
- NEONMAP1(vcvt_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
- NEONMAP1(vcvt_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
- NEONMAP1(vcvt_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0),
- NEONMAP1(vcvt_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
- NEONMAP1(vcvt_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
- NEONMAP0(vcvtq_f16_s16),
- NEONMAP0(vcvtq_f16_u16),
- NEONMAP0(vcvtq_f32_v),
- NEONMAP0(vcvtq_high_bf16_f32),
- NEONMAP0(vcvtq_low_bf16_f32),
- NEONMAP1(vcvtq_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
- NEONMAP1(vcvtq_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
- NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
- NEONMAP2(vcvtq_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
- NEONMAP1(vcvtq_n_s16_f16, aarch64_neon_vcvtfp2fxs, 0),
- NEONMAP1(vcvtq_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
- NEONMAP1(vcvtq_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
- NEONMAP1(vcvtq_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0),
- NEONMAP1(vcvtq_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
- NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
- NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType),
- NEONMAP1(vdot_s32, aarch64_neon_sdot, 0),
- NEONMAP1(vdot_u32, aarch64_neon_udot, 0),
- NEONMAP1(vdotq_s32, aarch64_neon_sdot, 0),
- NEONMAP1(vdotq_u32, aarch64_neon_udot, 0),
- NEONMAP2(veor3q_s16, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
- NEONMAP2(veor3q_s32, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
- NEONMAP2(veor3q_s64, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
- NEONMAP2(veor3q_s8, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
- NEONMAP2(veor3q_u16, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
- NEONMAP2(veor3q_u32, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
- NEONMAP2(veor3q_u64, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
- NEONMAP2(veor3q_u8, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
- NEONMAP0(vext_v),
- NEONMAP0(vextq_v),
- NEONMAP0(vfma_v),
- NEONMAP0(vfmaq_v),
- NEONMAP1(vfmlal_high_f16, aarch64_neon_fmlal2, 0),
- NEONMAP1(vfmlal_low_f16, aarch64_neon_fmlal, 0),
- NEONMAP1(vfmlalq_high_f16, aarch64_neon_fmlal2, 0),
- NEONMAP1(vfmlalq_low_f16, aarch64_neon_fmlal, 0),
- NEONMAP1(vfmlsl_high_f16, aarch64_neon_fmlsl2, 0),
- NEONMAP1(vfmlsl_low_f16, aarch64_neon_fmlsl, 0),
- NEONMAP1(vfmlslq_high_f16, aarch64_neon_fmlsl2, 0),
- NEONMAP1(vfmlslq_low_f16, aarch64_neon_fmlsl, 0),
- NEONMAP2(vhadd_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
- NEONMAP2(vhaddq_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
- NEONMAP2(vhsub_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
- NEONMAP2(vhsubq_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
- NEONMAP1(vld1_x2_v, aarch64_neon_ld1x2, 0),
- NEONMAP1(vld1_x3_v, aarch64_neon_ld1x3, 0),
- NEONMAP1(vld1_x4_v, aarch64_neon_ld1x4, 0),
- NEONMAP1(vld1q_x2_v, aarch64_neon_ld1x2, 0),
- NEONMAP1(vld1q_x3_v, aarch64_neon_ld1x3, 0),
- NEONMAP1(vld1q_x4_v, aarch64_neon_ld1x4, 0),
- NEONMAP1(vmmlaq_s32, aarch64_neon_smmla, 0),
- NEONMAP1(vmmlaq_u32, aarch64_neon_ummla, 0),
- NEONMAP0(vmovl_v),
- NEONMAP0(vmovn_v),
- NEONMAP1(vmul_v, aarch64_neon_pmul, Add1ArgType),
- NEONMAP1(vmulq_v, aarch64_neon_pmul, Add1ArgType),
- NEONMAP1(vpadd_v, aarch64_neon_addp, Add1ArgType),
- NEONMAP2(vpaddl_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
- NEONMAP2(vpaddlq_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
- NEONMAP1(vpaddq_v, aarch64_neon_addp, Add1ArgType),
- NEONMAP1(vqabs_v, aarch64_neon_sqabs, Add1ArgType),
- NEONMAP1(vqabsq_v, aarch64_neon_sqabs, Add1ArgType),
- NEONMAP2(vqadd_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
- NEONMAP2(vqaddq_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
- NEONMAP2(vqdmlal_v, aarch64_neon_sqdmull, aarch64_neon_sqadd, 0),
- NEONMAP2(vqdmlsl_v, aarch64_neon_sqdmull, aarch64_neon_sqsub, 0),
- NEONMAP1(vqdmulh_lane_v, aarch64_neon_sqdmulh_lane, 0),
- NEONMAP1(vqdmulh_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
- NEONMAP1(vqdmulh_v, aarch64_neon_sqdmulh, Add1ArgType),
- NEONMAP1(vqdmulhq_lane_v, aarch64_neon_sqdmulh_lane, 0),
- NEONMAP1(vqdmulhq_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
- NEONMAP1(vqdmulhq_v, aarch64_neon_sqdmulh, Add1ArgType),
- NEONMAP1(vqdmull_v, aarch64_neon_sqdmull, Add1ArgType),
- NEONMAP2(vqmovn_v, aarch64_neon_uqxtn, aarch64_neon_sqxtn, Add1ArgType | UnsignedAlts),
- NEONMAP1(vqmovun_v, aarch64_neon_sqxtun, Add1ArgType),
- NEONMAP1(vqneg_v, aarch64_neon_sqneg, Add1ArgType),
- NEONMAP1(vqnegq_v, aarch64_neon_sqneg, Add1ArgType),
- NEONMAP1(vqrdmlah_s16, aarch64_neon_sqrdmlah, Add1ArgType),
- NEONMAP1(vqrdmlah_s32, aarch64_neon_sqrdmlah, Add1ArgType),
- NEONMAP1(vqrdmlahq_s16, aarch64_neon_sqrdmlah, Add1ArgType),
- NEONMAP1(vqrdmlahq_s32, aarch64_neon_sqrdmlah, Add1ArgType),
- NEONMAP1(vqrdmlsh_s16, aarch64_neon_sqrdmlsh, Add1ArgType),
- NEONMAP1(vqrdmlsh_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
- NEONMAP1(vqrdmlshq_s16, aarch64_neon_sqrdmlsh, Add1ArgType),
- NEONMAP1(vqrdmlshq_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
- NEONMAP1(vqrdmulh_lane_v, aarch64_neon_sqrdmulh_lane, 0),
- NEONMAP1(vqrdmulh_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
- NEONMAP1(vqrdmulh_v, aarch64_neon_sqrdmulh, Add1ArgType),
- NEONMAP1(vqrdmulhq_lane_v, aarch64_neon_sqrdmulh_lane, 0),
- NEONMAP1(vqrdmulhq_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
- NEONMAP1(vqrdmulhq_v, aarch64_neon_sqrdmulh, Add1ArgType),
- NEONMAP2(vqrshl_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
- NEONMAP2(vqrshlq_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
- NEONMAP2(vqshl_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl, UnsignedAlts),
- NEONMAP2(vqshl_v, aarch64_neon_uqshl, aarch64_neon_sqshl, Add1ArgType | UnsignedAlts),
- NEONMAP2(vqshlq_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl,UnsignedAlts),
- NEONMAP2(vqshlq_v, aarch64_neon_uqshl, aarch64_neon_sqshl, Add1ArgType | UnsignedAlts),
- NEONMAP1(vqshlu_n_v, aarch64_neon_sqshlu, 0),
- NEONMAP1(vqshluq_n_v, aarch64_neon_sqshlu, 0),
- NEONMAP2(vqsub_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts),
- NEONMAP2(vqsubq_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts),
- NEONMAP1(vraddhn_v, aarch64_neon_raddhn, Add1ArgType),
- NEONMAP1(vrax1q_u64, aarch64_crypto_rax1, 0),
- NEONMAP2(vrecpe_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
- NEONMAP2(vrecpeq_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
- NEONMAP1(vrecps_v, aarch64_neon_frecps, Add1ArgType),
- NEONMAP1(vrecpsq_v, aarch64_neon_frecps, Add1ArgType),
- NEONMAP2(vrhadd_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts),
- NEONMAP2(vrhaddq_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts),
- NEONMAP1(vrnd32x_f32, aarch64_neon_frint32x, Add1ArgType),
- NEONMAP1(vrnd32x_f64, aarch64_neon_frint32x, Add1ArgType),
- NEONMAP1(vrnd32xq_f32, aarch64_neon_frint32x, Add1ArgType),
- NEONMAP1(vrnd32xq_f64, aarch64_neon_frint32x, Add1ArgType),
- NEONMAP1(vrnd32z_f32, aarch64_neon_frint32z, Add1ArgType),
- NEONMAP1(vrnd32z_f64, aarch64_neon_frint32z, Add1ArgType),
- NEONMAP1(vrnd32zq_f32, aarch64_neon_frint32z, Add1ArgType),
- NEONMAP1(vrnd32zq_f64, aarch64_neon_frint32z, Add1ArgType),
- NEONMAP1(vrnd64x_f32, aarch64_neon_frint64x, Add1ArgType),
- NEONMAP1(vrnd64x_f64, aarch64_neon_frint64x, Add1ArgType),
- NEONMAP1(vrnd64xq_f32, aarch64_neon_frint64x, Add1ArgType),
- NEONMAP1(vrnd64xq_f64, aarch64_neon_frint64x, Add1ArgType),
- NEONMAP1(vrnd64z_f32, aarch64_neon_frint64z, Add1ArgType),
- NEONMAP1(vrnd64z_f64, aarch64_neon_frint64z, Add1ArgType),
- NEONMAP1(vrnd64zq_f32, aarch64_neon_frint64z, Add1ArgType),
- NEONMAP1(vrnd64zq_f64, aarch64_neon_frint64z, Add1ArgType),
- NEONMAP0(vrndi_v),
- NEONMAP0(vrndiq_v),
- NEONMAP2(vrshl_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts),
- NEONMAP2(vrshlq_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts),
- NEONMAP2(vrshr_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
- NEONMAP2(vrshrq_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
- NEONMAP2(vrsqrte_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
- NEONMAP2(vrsqrteq_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
- NEONMAP1(vrsqrts_v, aarch64_neon_frsqrts, Add1ArgType),
- NEONMAP1(vrsqrtsq_v, aarch64_neon_frsqrts, Add1ArgType),
- NEONMAP1(vrsubhn_v, aarch64_neon_rsubhn, Add1ArgType),
- NEONMAP1(vsha1su0q_u32, aarch64_crypto_sha1su0, 0),
- NEONMAP1(vsha1su1q_u32, aarch64_crypto_sha1su1, 0),
- NEONMAP1(vsha256h2q_u32, aarch64_crypto_sha256h2, 0),
- NEONMAP1(vsha256hq_u32, aarch64_crypto_sha256h, 0),
- NEONMAP1(vsha256su0q_u32, aarch64_crypto_sha256su0, 0),
- NEONMAP1(vsha256su1q_u32, aarch64_crypto_sha256su1, 0),
- NEONMAP1(vsha512h2q_u64, aarch64_crypto_sha512h2, 0),
- NEONMAP1(vsha512hq_u64, aarch64_crypto_sha512h, 0),
- NEONMAP1(vsha512su0q_u64, aarch64_crypto_sha512su0, 0),
- NEONMAP1(vsha512su1q_u64, aarch64_crypto_sha512su1, 0),
- NEONMAP0(vshl_n_v),
- NEONMAP2(vshl_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts),
- NEONMAP0(vshll_n_v),
- NEONMAP0(vshlq_n_v),
- NEONMAP2(vshlq_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts),
- NEONMAP0(vshr_n_v),
- NEONMAP0(vshrn_n_v),
- NEONMAP0(vshrq_n_v),
- NEONMAP1(vsm3partw1q_u32, aarch64_crypto_sm3partw1, 0),
- NEONMAP1(vsm3partw2q_u32, aarch64_crypto_sm3partw2, 0),
- NEONMAP1(vsm3ss1q_u32, aarch64_crypto_sm3ss1, 0),
- NEONMAP1(vsm3tt1aq_u32, aarch64_crypto_sm3tt1a, 0),
- NEONMAP1(vsm3tt1bq_u32, aarch64_crypto_sm3tt1b, 0),
- NEONMAP1(vsm3tt2aq_u32, aarch64_crypto_sm3tt2a, 0),
- NEONMAP1(vsm3tt2bq_u32, aarch64_crypto_sm3tt2b, 0),
- NEONMAP1(vsm4ekeyq_u32, aarch64_crypto_sm4ekey, 0),
- NEONMAP1(vsm4eq_u32, aarch64_crypto_sm4e, 0),
- NEONMAP1(vst1_x2_v, aarch64_neon_st1x2, 0),
- NEONMAP1(vst1_x3_v, aarch64_neon_st1x3, 0),
- NEONMAP1(vst1_x4_v, aarch64_neon_st1x4, 0),
- NEONMAP1(vst1q_x2_v, aarch64_neon_st1x2, 0),
- NEONMAP1(vst1q_x3_v, aarch64_neon_st1x3, 0),
- NEONMAP1(vst1q_x4_v, aarch64_neon_st1x4, 0),
- NEONMAP0(vsubhn_v),
- NEONMAP0(vtst_v),
- NEONMAP0(vtstq_v),
- NEONMAP1(vusdot_s32, aarch64_neon_usdot, 0),
- NEONMAP1(vusdotq_s32, aarch64_neon_usdot, 0),
- NEONMAP1(vusmmlaq_s32, aarch64_neon_usmmla, 0),
- NEONMAP1(vxarq_u64, aarch64_crypto_xar, 0),
+ NEONMAP0(splat_lane_v),
+ NEONMAP0(splat_laneq_v),
+ NEONMAP0(splatq_lane_v),
+ NEONMAP0(splatq_laneq_v),
+ NEONMAP1(vabs_v, aarch64_neon_abs, 0),
+ NEONMAP1(vabsq_v, aarch64_neon_abs, 0),
+ NEONMAP0(vadd_v),
+ NEONMAP0(vaddhn_v),
+ NEONMAP0(vaddq_p128),
+ NEONMAP0(vaddq_v),
+ NEONMAP1(vaesdq_u8, aarch64_crypto_aesd, 0),
+ NEONMAP1(vaeseq_u8, aarch64_crypto_aese, 0),
+ NEONMAP1(vaesimcq_u8, aarch64_crypto_aesimc, 0),
+ NEONMAP1(vaesmcq_u8, aarch64_crypto_aesmc, 0),
+ NEONMAP2(vbcaxq_s16, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(vbcaxq_s32, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(vbcaxq_s64, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(vbcaxq_s8, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(vbcaxq_u16, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(vbcaxq_u32, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(vbcaxq_u64, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(vbcaxq_u8, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP1(vbfdot_f32, aarch64_neon_bfdot, 0),
+ NEONMAP1(vbfdotq_f32, aarch64_neon_bfdot, 0),
+ NEONMAP1(vbfmlalbq_f32, aarch64_neon_bfmlalb, 0),
+ NEONMAP1(vbfmlaltq_f32, aarch64_neon_bfmlalt, 0),
+ NEONMAP1(vbfmmlaq_f32, aarch64_neon_bfmmla, 0),
+ NEONMAP1(vcadd_rot270_f16, aarch64_neon_vcadd_rot270, Add1ArgType),
+ NEONMAP1(vcadd_rot270_f32, aarch64_neon_vcadd_rot270, Add1ArgType),
+ NEONMAP1(vcadd_rot90_f16, aarch64_neon_vcadd_rot90, Add1ArgType),
+ NEONMAP1(vcadd_rot90_f32, aarch64_neon_vcadd_rot90, Add1ArgType),
+ NEONMAP1(vcaddq_rot270_f16, aarch64_neon_vcadd_rot270, Add1ArgType),
+ NEONMAP1(vcaddq_rot270_f32, aarch64_neon_vcadd_rot270, Add1ArgType),
+ NEONMAP1(vcaddq_rot270_f64, aarch64_neon_vcadd_rot270, Add1ArgType),
+ NEONMAP1(vcaddq_rot90_f16, aarch64_neon_vcadd_rot90, Add1ArgType),
+ NEONMAP1(vcaddq_rot90_f32, aarch64_neon_vcadd_rot90, Add1ArgType),
+ NEONMAP1(vcaddq_rot90_f64, aarch64_neon_vcadd_rot90, Add1ArgType),
+ NEONMAP1(vcage_v, aarch64_neon_facge, 0),
+ NEONMAP1(vcageq_v, aarch64_neon_facge, 0),
+ NEONMAP1(vcagt_v, aarch64_neon_facgt, 0),
+ NEONMAP1(vcagtq_v, aarch64_neon_facgt, 0),
+ NEONMAP1(vcale_v, aarch64_neon_facge, 0),
+ NEONMAP1(vcaleq_v, aarch64_neon_facge, 0),
+ NEONMAP1(vcalt_v, aarch64_neon_facgt, 0),
+ NEONMAP1(vcaltq_v, aarch64_neon_facgt, 0),
+ NEONMAP0(vceqz_v),
+ NEONMAP0(vceqzq_v),
+ NEONMAP0(vcgez_v),
+ NEONMAP0(vcgezq_v),
+ NEONMAP0(vcgtz_v),
+ NEONMAP0(vcgtzq_v),
+ NEONMAP0(vclez_v),
+ NEONMAP0(vclezq_v),
+ NEONMAP1(vcls_v, aarch64_neon_cls, Add1ArgType),
+ NEONMAP1(vclsq_v, aarch64_neon_cls, Add1ArgType),
+ NEONMAP0(vcltz_v),
+ NEONMAP0(vcltzq_v),
+ NEONMAP1(vclz_v, ctlz, Add1ArgType),
+ NEONMAP1(vclzq_v, ctlz, Add1ArgType),
+ NEONMAP1(vcmla_f16, aarch64_neon_vcmla_rot0, Add1ArgType),
+ NEONMAP1(vcmla_f32, aarch64_neon_vcmla_rot0, Add1ArgType),
+ NEONMAP1(vcmla_rot180_f16, aarch64_neon_vcmla_rot180, Add1ArgType),
+ NEONMAP1(vcmla_rot180_f32, aarch64_neon_vcmla_rot180, Add1ArgType),
+ NEONMAP1(vcmla_rot270_f16, aarch64_neon_vcmla_rot270, Add1ArgType),
+ NEONMAP1(vcmla_rot270_f32, aarch64_neon_vcmla_rot270, Add1ArgType),
+ NEONMAP1(vcmla_rot90_f16, aarch64_neon_vcmla_rot90, Add1ArgType),
+ NEONMAP1(vcmla_rot90_f32, aarch64_neon_vcmla_rot90, Add1ArgType),
+ NEONMAP1(vcmlaq_f16, aarch64_neon_vcmla_rot0, Add1ArgType),
+ NEONMAP1(vcmlaq_f32, aarch64_neon_vcmla_rot0, Add1ArgType),
+ NEONMAP1(vcmlaq_f64, aarch64_neon_vcmla_rot0, Add1ArgType),
+ NEONMAP1(vcmlaq_rot180_f16, aarch64_neon_vcmla_rot180, Add1ArgType),
+ NEONMAP1(vcmlaq_rot180_f32, aarch64_neon_vcmla_rot180, Add1ArgType),
+ NEONMAP1(vcmlaq_rot180_f64, aarch64_neon_vcmla_rot180, Add1ArgType),
+ NEONMAP1(vcmlaq_rot270_f16, aarch64_neon_vcmla_rot270, Add1ArgType),
+ NEONMAP1(vcmlaq_rot270_f32, aarch64_neon_vcmla_rot270, Add1ArgType),
+ NEONMAP1(vcmlaq_rot270_f64, aarch64_neon_vcmla_rot270, Add1ArgType),
+ NEONMAP1(vcmlaq_rot90_f16, aarch64_neon_vcmla_rot90, Add1ArgType),
+ NEONMAP1(vcmlaq_rot90_f32, aarch64_neon_vcmla_rot90, Add1ArgType),
+ NEONMAP1(vcmlaq_rot90_f64, aarch64_neon_vcmla_rot90, Add1ArgType),
+ NEONMAP1(vcnt_v, ctpop, Add1ArgType),
+ NEONMAP1(vcntq_v, ctpop, Add1ArgType),
+ NEONMAP1(vcvt_f16_f32, aarch64_neon_vcvtfp2hf, 0),
+ NEONMAP0(vcvt_f16_s16),
+ NEONMAP0(vcvt_f16_u16),
+ NEONMAP1(vcvt_f32_f16, aarch64_neon_vcvthf2fp, 0),
+ NEONMAP0(vcvt_f32_v),
+ NEONMAP1(vcvt_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
+ NEONMAP1(vcvt_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
+ NEONMAP2(vcvt_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
+ NEONMAP2(vcvt_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
+ NEONMAP1(vcvt_n_s16_f16, aarch64_neon_vcvtfp2fxs, 0),
+ NEONMAP1(vcvt_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
+ NEONMAP1(vcvt_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
+ NEONMAP1(vcvt_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0),
+ NEONMAP1(vcvt_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
+ NEONMAP1(vcvt_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
+ NEONMAP0(vcvtq_f16_s16),
+ NEONMAP0(vcvtq_f16_u16),
+ NEONMAP0(vcvtq_f32_v),
+ NEONMAP0(vcvtq_high_bf16_f32),
+ NEONMAP0(vcvtq_low_bf16_f32),
+ NEONMAP1(vcvtq_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
+ NEONMAP1(vcvtq_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
+ NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp,
+ 0),
+ NEONMAP2(vcvtq_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp,
+ 0),
+ NEONMAP1(vcvtq_n_s16_f16, aarch64_neon_vcvtfp2fxs, 0),
+ NEONMAP1(vcvtq_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
+ NEONMAP1(vcvtq_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
+ NEONMAP1(vcvtq_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0),
+ NEONMAP1(vcvtq_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
+ NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
+ NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType),
+ NEONMAP1(vdot_s32, aarch64_neon_sdot, 0),
+ NEONMAP1(vdot_u32, aarch64_neon_udot, 0),
+ NEONMAP1(vdotq_s32, aarch64_neon_sdot, 0),
+ NEONMAP1(vdotq_u32, aarch64_neon_udot, 0),
+ NEONMAP2(veor3q_s16, aarch64_crypto_eor3u, aarch64_crypto_eor3s,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(veor3q_s32, aarch64_crypto_eor3u, aarch64_crypto_eor3s,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(veor3q_s64, aarch64_crypto_eor3u, aarch64_crypto_eor3s,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(veor3q_s8, aarch64_crypto_eor3u, aarch64_crypto_eor3s,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(veor3q_u16, aarch64_crypto_eor3u, aarch64_crypto_eor3s,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(veor3q_u32, aarch64_crypto_eor3u, aarch64_crypto_eor3s,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(veor3q_u64, aarch64_crypto_eor3u, aarch64_crypto_eor3s,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(veor3q_u8, aarch64_crypto_eor3u, aarch64_crypto_eor3s,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP0(vext_v),
+ NEONMAP0(vextq_v),
+ NEONMAP0(vfma_v),
+ NEONMAP0(vfmaq_v),
+ NEONMAP1(vfmlal_high_f16, aarch64_neon_fmlal2, 0),
+ NEONMAP1(vfmlal_low_f16, aarch64_neon_fmlal, 0),
+ NEONMAP1(vfmlalq_high_f16, aarch64_neon_fmlal2, 0),
+ NEONMAP1(vfmlalq_low_f16, aarch64_neon_fmlal, 0),
+ NEONMAP1(vfmlsl_high_f16, aarch64_neon_fmlsl2, 0),
+ NEONMAP1(vfmlsl_low_f16, aarch64_neon_fmlsl, 0),
+ NEONMAP1(vfmlslq_high_f16, aarch64_neon_fmlsl2, 0),
+ NEONMAP1(vfmlslq_low_f16, aarch64_neon_fmlsl, 0),
+ NEONMAP2(vhadd_v, aarch64_neon_uhadd, aarch64_neon_shadd,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(vhaddq_v, aarch64_neon_uhadd, aarch64_neon_shadd,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(vhsub_v, aarch64_neon_uhsub, aarch64_neon_shsub,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(vhsubq_v, aarch64_neon_uhsub, aarch64_neon_shsub,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP1(vld1_x2_v, aarch64_neon_ld1x2, 0),
+ NEONMAP1(vld1_x3_v, aarch64_neon_ld1x3, 0),
+ NEONMAP1(vld1_x4_v, aarch64_neon_ld1x4, 0),
+ NEONMAP1(vld1q_x2_v, aarch64_neon_ld1x2, 0),
+ NEONMAP1(vld1q_x3_v, aarch64_neon_ld1x3, 0),
+ NEONMAP1(vld1q_x4_v, aarch64_neon_ld1x4, 0),
+ NEONMAP1(vmmlaq_s32, aarch64_neon_smmla, 0),
+ NEONMAP1(vmmlaq_u32, aarch64_neon_ummla, 0),
+ NEONMAP0(vmovl_v),
+ NEONMAP0(vmovn_v),
+ NEONMAP1(vmul_v, aarch64_neon_pmul, Add1ArgType),
+ NEONMAP1(vmulq_v, aarch64_neon_pmul, Add1ArgType),
+ NEONMAP1(vpadd_v, aarch64_neon_addp, Add1ArgType),
+ NEONMAP2(vpaddl_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
+ NEONMAP2(vpaddlq_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
+ NEONMAP1(vpaddq_v, aarch64_neon_addp, Add1ArgType),
+ NEONMAP1(vqabs_v, aarch64_neon_sqabs, Add1ArgType),
+ NEONMAP1(vqabsq_v, aarch64_neon_sqabs, Add1ArgType),
+ NEONMAP2(vqadd_v, aarch64_neon_uqadd, aarch64_neon_sqadd,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(vqaddq_v, aarch64_neon_uqadd, aarch64_neon_sqadd,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(vqdmlal_v, aarch64_neon_sqdmull, aarch64_neon_sqadd, 0),
+ NEONMAP2(vqdmlsl_v, aarch64_neon_sqdmull, aarch64_neon_sqsub, 0),
+ NEONMAP1(vqdmulh_lane_v, aarch64_neon_sqdmulh_lane, 0),
+ NEONMAP1(vqdmulh_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
+ NEONMAP1(vqdmulh_v, aarch64_neon_sqdmulh, Add1ArgType),
+ NEONMAP1(vqdmulhq_lane_v, aarch64_neon_sqdmulh_lane, 0),
+ NEONMAP1(vqdmulhq_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
+ NEONMAP1(vqdmulhq_v, aarch64_neon_sqdmulh, Add1ArgType),
+ NEONMAP1(vqdmull_v, aarch64_neon_sqdmull, Add1ArgType),
+ NEONMAP2(vqmovn_v, aarch64_neon_uqxtn, aarch64_neon_sqxtn,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP1(vqmovun_v, aarch64_neon_sqxtun, Add1ArgType),
+ NEONMAP1(vqneg_v, aarch64_neon_sqneg, Add1ArgType),
+ NEONMAP1(vqnegq_v, aarch64_neon_sqneg, Add1ArgType),
+ NEONMAP1(vqrdmlah_s16, aarch64_neon_sqrdmlah, Add1ArgType),
+ NEONMAP1(vqrdmlah_s32, aarch64_neon_sqrdmlah, Add1ArgType),
+ NEONMAP1(vqrdmlahq_s16, aarch64_neon_sqrdmlah, Add1ArgType),
+ NEONMAP1(vqrdmlahq_s32, aarch64_neon_sqrdmlah, Add1ArgType),
+ NEONMAP1(vqrdmlsh_s16, aarch64_neon_sqrdmlsh, Add1ArgType),
+ NEONMAP1(vqrdmlsh_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
+ NEONMAP1(vqrdmlshq_s16, aarch64_neon_sqrdmlsh, Add1ArgType),
+ NEONMAP1(vqrdmlshq_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
+ NEONMAP1(vqrdmulh_lane_v, aarch64_neon_sqrdmulh_lane, 0),
+ NEONMAP1(vqrdmulh_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
+ NEONMAP1(vqrdmulh_v, aarch64_neon_sqrdmulh, Add1ArgType),
+ NEONMAP1(vqrdmulhq_lane_v, aarch64_neon_sqrdmulh_lane, 0),
+ NEONMAP1(vqrdmulhq_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
+ NEONMAP1(vqrdmulhq_v, aarch64_neon_sqrdmulh, Add1ArgType),
+ NEONMAP2(vqrshl_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(vqrshlq_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(vqshl_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl, UnsignedAlts),
+ NEONMAP2(vqshl_v, aarch64_neon_uqshl, aarch64_neon_sqshl,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(vqshlq_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl, UnsignedAlts),
+ NEONMAP2(vqshlq_v, aarch64_neon_uqshl, aarch64_neon_sqshl,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP1(vqshlu_n_v, aarch64_neon_sqshlu, 0),
+ NEONMAP1(vqshluq_n_v, aarch64_neon_sqshlu, 0),
+ NEONMAP2(vqsub_v, aarch64_neon_uqsub, aarch64_neon_sqsub,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(vqsubq_v, aarch64_neon_uqsub, aarch64_neon_sqsub,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP1(vraddhn_v, aarch64_neon_raddhn, Add1ArgType),
+ NEONMAP1(vrax1q_u64, aarch64_crypto_rax1, 0),
+ NEONMAP2(vrecpe_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
+ NEONMAP2(vrecpeq_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
+ NEONMAP1(vrecps_v, aarch64_neon_frecps, Add1ArgType),
+ NEONMAP1(vrecpsq_v, aarch64_neon_frecps, Add1ArgType),
+ NEONMAP2(vrhadd_v, aarch64_neon_urhadd, aarch64_neon_srhadd,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(vrhaddq_v, aarch64_neon_urhadd, aarch64_neon_srhadd,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP1(vrnd32x_f32, aarch64_neon_frint32x, Add1ArgType),
+ NEONMAP1(vrnd32x_f64, aarch64_neon_frint32x, Add1ArgType),
+ NEONMAP1(vrnd32xq_f32, aarch64_neon_frint32x, Add1ArgType),
+ NEONMAP1(vrnd32xq_f64, aarch64_neon_frint32x, Add1ArgType),
+ NEONMAP1(vrnd32z_f32, aarch64_neon_frint32z, Add1ArgType),
+ NEONMAP1(vrnd32z_f64, aarch64_neon_frint32z, Add1ArgType),
+ NEONMAP1(vrnd32zq_f32, aarch64_neon_frint32z, Add1ArgType),
+ NEONMAP1(vrnd32zq_f64, aarch64_neon_frint32z, Add1ArgType),
+ NEONMAP1(vrnd64x_f32, aarch64_neon_frint64x, Add1ArgType),
+ NEONMAP1(vrnd64x_f64, aarch64_neon_frint64x, Add1ArgType),
+ NEONMAP1(vrnd64xq_f32, aarch64_neon_frint64x, Add1ArgType),
+ NEONMAP1(vrnd64xq_f64, aarch64_neon_frint64x, Add1ArgType),
+ NEONMAP1(vrnd64z_f32, aarch64_neon_frint64z, Add1ArgType),
+ NEONMAP1(vrnd64z_f64, aarch64_neon_frint64z, Add1ArgType),
+ NEONMAP1(vrnd64zq_f32, aarch64_neon_frint64z, Add1ArgType),
+ NEONMAP1(vrnd64zq_f64, aarch64_neon_frint64z, Add1ArgType),
+ NEONMAP0(vrndi_v),
+ NEONMAP0(vrndiq_v),
+ NEONMAP2(vrshl_v, aarch64_neon_urshl, aarch64_neon_srshl,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(vrshlq_v, aarch64_neon_urshl, aarch64_neon_srshl,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP2(vrshr_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
+ NEONMAP2(vrshrq_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
+ NEONMAP2(vrsqrte_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
+ NEONMAP2(vrsqrteq_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
+ NEONMAP1(vrsqrts_v, aarch64_neon_frsqrts, Add1ArgType),
+ NEONMAP1(vrsqrtsq_v, aarch64_neon_frsqrts, Add1ArgType),
+ NEONMAP1(vrsubhn_v, aarch64_neon_rsubhn, Add1ArgType),
+ NEONMAP1(vsha1su0q_u32, aarch64_crypto_sha1su0, 0),
+ NEONMAP1(vsha1su1q_u32, aarch64_crypto_sha1su1, 0),
+ NEONMAP1(vsha256h2q_u32, aarch64_crypto_sha256h2, 0),
+ NEONMAP1(vsha256hq_u32, aarch64_crypto_sha256h, 0),
+ NEONMAP1(vsha256su0q_u32, aarch64_crypto_sha256su0, 0),
+ NEONMAP1(vsha256su1q_u32, aarch64_crypto_sha256su1, 0),
+ NEONMAP1(vsha512h2q_u64, aarch64_crypto_sha512h2, 0),
+ NEONMAP1(vsha512hq_u64, aarch64_crypto_sha512h, 0),
+ NEONMAP1(vsha512su0q_u64, aarch64_crypto_sha512su0, 0),
+ NEONMAP1(vsha512su1q_u64, aarch64_crypto_sha512su1, 0),
+ NEONMAP0(vshl_n_v),
+ NEONMAP2(vshl_v, aarch64_neon_ushl, aarch64_neon_sshl,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP0(vshll_n_v),
+ NEONMAP0(vshlq_n_v),
+ NEONMAP2(vshlq_v, aarch64_neon_ushl, aarch64_neon_sshl,
+ Add1ArgType | UnsignedAlts),
+ NEONMAP0(vshr_n_v),
+ NEONMAP0(vshrn_n_v),
+ NEONMAP0(vshrq_n_v),
+ NEONMAP1(vsm3partw1q_u32, aarch64_crypto_sm3partw1, 0),
+ NEONMAP1(vsm3partw2q_u32, aarch64_crypto_sm3partw2, 0),
+ NEONMAP1(vsm3ss1q_u32, aarch64_crypto_sm3ss1, 0),
+ NEONMAP1(vsm3tt1aq_u32, aarch64_crypto_sm3tt1a, 0),
+ NEONMAP1(vsm3tt1bq_u32, aarch64_crypto_sm3tt1b, 0),
+ NEONMAP1(vsm3tt2aq_u32, aarch64_crypto_sm3tt2a, 0),
+ NEONMAP1(vsm3tt2bq_u32, aarch64_crypto_sm3tt2b, 0),
+ NEONMAP1(vsm4ekeyq_u32, aarch64_crypto_sm4ekey, 0),
+ NEONMAP1(vsm4eq_u32, aarch64_crypto_sm4e, 0),
+ NEONMAP1(vst1_x2_v, aarch64_neon_st1x2, 0),
+ NEONMAP1(vst1_x3_v, aarch64_neon_st1x3, 0),
+ NEONMAP1(vst1_x4_v, aarch64_neon_st1x4, 0),
+ NEONMAP1(vst1q_x2_v, aarch64_neon_st1x2, 0),
+ NEONMAP1(vst1q_x3_v, aarch64_neon_st1x3, 0),
+ NEONMAP1(vst1q_x4_v, aarch64_neon_st1x4, 0),
+ NEONMAP0(vsubhn_v),
+ NEONMAP0(vtst_v),
+ NEONMAP0(vtstq_v),
+ NEONMAP1(vusdot_s32, aarch64_neon_usdot, 0),
+ NEONMAP1(vusdotq_s32, aarch64_neon_usdot, 0),
+ NEONMAP1(vusmmlaq_s32, aarch64_neon_usmmla, 0),
+ NEONMAP1(vxarq_u64, aarch64_crypto_xar, 0),
};
static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
- NEONMAP1(vabdd_f64, aarch64_sisd_fabd, Add1ArgType),
- NEONMAP1(vabds_f32, aarch64_sisd_fabd, Add1ArgType),
- NEONMAP1(vabsd_s64, aarch64_neon_abs, Add1ArgType),
- NEONMAP1(vaddlv_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType),
- NEONMAP1(vaddlv_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType),
- NEONMAP1(vaddlvq_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType),
- NEONMAP1(vaddlvq_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType),
- NEONMAP1(vaddv_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
- NEONMAP1(vaddv_s32, aarch64_neon_saddv, AddRetType | Add1ArgType),
- NEONMAP1(vaddv_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType),
- NEONMAP1(vaddvq_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
- NEONMAP1(vaddvq_f64, aarch64_neon_faddv, AddRetType | Add1ArgType),
- NEONMAP1(vaddvq_s32, aarch64_neon_saddv, AddRetType | Add1ArgType),
- NEONMAP1(vaddvq_s64, aarch64_neon_saddv, AddRetType | Add1ArgType),
- NEONMAP1(vaddvq_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType),
- NEONMAP1(vaddvq_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
- NEONMAP1(vcaged_f64, aarch64_neon_facge, AddRetType | Add1ArgType),
- NEONMAP1(vcages_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
- NEONMAP1(vcagtd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
- NEONMAP1(vcagts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType),
- NEONMAP1(vcaled_f64, aarch64_neon_facge, AddRetType | Add1ArgType),
- NEONMAP1(vcales_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
- NEONMAP1(vcaltd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
- NEONMAP1(vcalts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType),
- NEONMAP1(vcvtad_s64_f64, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
- NEONMAP1(vcvtad_u64_f64, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
- NEONMAP1(vcvtas_s32_f32, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
- NEONMAP1(vcvtas_u32_f32, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
- NEONMAP1(vcvtd_n_f64_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
- NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
- NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
- NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
- NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
- NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
- NEONMAP0(vcvth_bf16_f32),
- NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
- NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
- NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
- NEONMAP1(vcvtms_u32_f32, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
- NEONMAP1(vcvtnd_s64_f64, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
- NEONMAP1(vcvtnd_u64_f64, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
- NEONMAP1(vcvtns_s32_f32, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
- NEONMAP1(vcvtns_u32_f32, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
- NEONMAP1(vcvtpd_s64_f64, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
- NEONMAP1(vcvtpd_u64_f64, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
- NEONMAP1(vcvtps_s32_f32, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
- NEONMAP1(vcvtps_u32_f32, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
- NEONMAP1(vcvts_n_f32_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
- NEONMAP1(vcvts_n_f32_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
- NEONMAP1(vcvts_n_s32_f32, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
- NEONMAP1(vcvts_n_u32_f32, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
- NEONMAP1(vcvts_s32_f32, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
- NEONMAP1(vcvts_u32_f32, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
- NEONMAP1(vcvtxd_f32_f64, aarch64_sisd_fcvtxn, 0),
- NEONMAP1(vmaxnmv_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
- NEONMAP1(vmaxnmvq_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
- NEONMAP1(vmaxnmvq_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
- NEONMAP1(vmaxv_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
- NEONMAP1(vmaxv_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType),
- NEONMAP1(vmaxv_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType),
- NEONMAP1(vmaxvq_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
- NEONMAP1(vmaxvq_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
- NEONMAP1(vmaxvq_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType),
- NEONMAP1(vmaxvq_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType),
- NEONMAP1(vminnmv_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
- NEONMAP1(vminnmvq_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
- NEONMAP1(vminnmvq_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
- NEONMAP1(vminv_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
- NEONMAP1(vminv_s32, aarch64_neon_sminv, AddRetType | Add1ArgType),
- NEONMAP1(vminv_u32, aarch64_neon_uminv, AddRetType | Add1ArgType),
- NEONMAP1(vminvq_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
- NEONMAP1(vminvq_f64, aarch64_neon_fminv, AddRetType | Add1ArgType),
- NEONMAP1(vminvq_s32, aarch64_neon_sminv, AddRetType | Add1ArgType),
- NEONMAP1(vminvq_u32, aarch64_neon_uminv, AddRetType | Add1ArgType),
- NEONMAP1(vmull_p64, aarch64_neon_pmull64, 0),
- NEONMAP1(vmulxd_f64, aarch64_neon_fmulx, Add1ArgType),
- NEONMAP1(vmulxs_f32, aarch64_neon_fmulx, Add1ArgType),
- NEONMAP1(vpaddd_s64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
- NEONMAP1(vpaddd_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
- NEONMAP1(vpmaxnmqd_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
- NEONMAP1(vpmaxnms_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
- NEONMAP1(vpmaxqd_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
- NEONMAP1(vpmaxs_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
- NEONMAP1(vpminnmqd_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
- NEONMAP1(vpminnms_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
- NEONMAP1(vpminqd_f64, aarch64_neon_fminv, AddRetType | Add1ArgType),
- NEONMAP1(vpmins_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
- NEONMAP1(vqabsb_s8, aarch64_neon_sqabs, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqabsd_s64, aarch64_neon_sqabs, Add1ArgType),
- NEONMAP1(vqabsh_s16, aarch64_neon_sqabs, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqabss_s32, aarch64_neon_sqabs, Add1ArgType),
- NEONMAP1(vqaddb_s8, aarch64_neon_sqadd, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqaddb_u8, aarch64_neon_uqadd, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqaddd_s64, aarch64_neon_sqadd, Add1ArgType),
- NEONMAP1(vqaddd_u64, aarch64_neon_uqadd, Add1ArgType),
- NEONMAP1(vqaddh_s16, aarch64_neon_sqadd, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqaddh_u16, aarch64_neon_uqadd, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqadds_s32, aarch64_neon_sqadd, Add1ArgType),
- NEONMAP1(vqadds_u32, aarch64_neon_uqadd, Add1ArgType),
- NEONMAP1(vqdmulhh_s16, aarch64_neon_sqdmulh, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqdmulhs_s32, aarch64_neon_sqdmulh, Add1ArgType),
- NEONMAP1(vqdmullh_s16, aarch64_neon_sqdmull, VectorRet | Use128BitVectors),
- NEONMAP1(vqdmulls_s32, aarch64_neon_sqdmulls_scalar, 0),
- NEONMAP1(vqmovnd_s64, aarch64_neon_scalar_sqxtn, AddRetType | Add1ArgType),
- NEONMAP1(vqmovnd_u64, aarch64_neon_scalar_uqxtn, AddRetType | Add1ArgType),
- NEONMAP1(vqmovnh_s16, aarch64_neon_sqxtn, VectorRet | Use64BitVectors),
- NEONMAP1(vqmovnh_u16, aarch64_neon_uqxtn, VectorRet | Use64BitVectors),
- NEONMAP1(vqmovns_s32, aarch64_neon_sqxtn, VectorRet | Use64BitVectors),
- NEONMAP1(vqmovns_u32, aarch64_neon_uqxtn, VectorRet | Use64BitVectors),
- NEONMAP1(vqmovund_s64, aarch64_neon_scalar_sqxtun, AddRetType | Add1ArgType),
- NEONMAP1(vqmovunh_s16, aarch64_neon_sqxtun, VectorRet | Use64BitVectors),
- NEONMAP1(vqmovuns_s32, aarch64_neon_sqxtun, VectorRet | Use64BitVectors),
- NEONMAP1(vqnegb_s8, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqnegd_s64, aarch64_neon_sqneg, Add1ArgType),
- NEONMAP1(vqnegh_s16, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqnegs_s32, aarch64_neon_sqneg, Add1ArgType),
- NEONMAP1(vqrdmlahh_s16, aarch64_neon_sqrdmlah, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqrdmlahs_s32, aarch64_neon_sqrdmlah, Add1ArgType),
- NEONMAP1(vqrdmlshh_s16, aarch64_neon_sqrdmlsh, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqrdmlshs_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
- NEONMAP1(vqrdmulhh_s16, aarch64_neon_sqrdmulh, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqrdmulhs_s32, aarch64_neon_sqrdmulh, Add1ArgType),
- NEONMAP1(vqrshlb_s8, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqrshlb_u8, aarch64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqrshld_s64, aarch64_neon_sqrshl, Add1ArgType),
- NEONMAP1(vqrshld_u64, aarch64_neon_uqrshl, Add1ArgType),
- NEONMAP1(vqrshlh_s16, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqrshlh_u16, aarch64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqrshls_s32, aarch64_neon_sqrshl, Add1ArgType),
- NEONMAP1(vqrshls_u32, aarch64_neon_uqrshl, Add1ArgType),
- NEONMAP1(vqrshrnd_n_s64, aarch64_neon_sqrshrn, AddRetType),
- NEONMAP1(vqrshrnd_n_u64, aarch64_neon_uqrshrn, AddRetType),
- NEONMAP1(vqrshrnh_n_s16, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors),
- NEONMAP1(vqrshrnh_n_u16, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors),
- NEONMAP1(vqrshrns_n_s32, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors),
- NEONMAP1(vqrshrns_n_u32, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors),
- NEONMAP1(vqrshrund_n_s64, aarch64_neon_sqrshrun, AddRetType),
- NEONMAP1(vqrshrunh_n_s16, aarch64_neon_sqrshrun, VectorRet | Use64BitVectors),
- NEONMAP1(vqrshruns_n_s32, aarch64_neon_sqrshrun, VectorRet | Use64BitVectors),
- NEONMAP1(vqshlb_n_s8, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqshlb_n_u8, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqshlb_s8, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqshlb_u8, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqshld_s64, aarch64_neon_sqshl, Add1ArgType),
- NEONMAP1(vqshld_u64, aarch64_neon_uqshl, Add1ArgType),
- NEONMAP1(vqshlh_n_s16, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqshlh_n_u16, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqshlh_s16, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqshlh_u16, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqshls_n_s32, aarch64_neon_sqshl, Add1ArgType),
- NEONMAP1(vqshls_n_u32, aarch64_neon_uqshl, Add1ArgType),
- NEONMAP1(vqshls_s32, aarch64_neon_sqshl, Add1ArgType),
- NEONMAP1(vqshls_u32, aarch64_neon_uqshl, Add1ArgType),
- NEONMAP1(vqshlub_n_s8, aarch64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqshluh_n_s16, aarch64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqshlus_n_s32, aarch64_neon_sqshlu, Add1ArgType),
- NEONMAP1(vqshrnd_n_s64, aarch64_neon_sqshrn, AddRetType),
- NEONMAP1(vqshrnd_n_u64, aarch64_neon_uqshrn, AddRetType),
- NEONMAP1(vqshrnh_n_s16, aarch64_neon_sqshrn, VectorRet | Use64BitVectors),
- NEONMAP1(vqshrnh_n_u16, aarch64_neon_uqshrn, VectorRet | Use64BitVectors),
- NEONMAP1(vqshrns_n_s32, aarch64_neon_sqshrn, VectorRet | Use64BitVectors),
- NEONMAP1(vqshrns_n_u32, aarch64_neon_uqshrn, VectorRet | Use64BitVectors),
- NEONMAP1(vqshrund_n_s64, aarch64_neon_sqshrun, AddRetType),
- NEONMAP1(vqshrunh_n_s16, aarch64_neon_sqshrun, VectorRet | Use64BitVectors),
- NEONMAP1(vqshruns_n_s32, aarch64_neon_sqshrun, VectorRet | Use64BitVectors),
- NEONMAP1(vqsubb_s8, aarch64_neon_sqsub, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqsubb_u8, aarch64_neon_uqsub, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqsubd_s64, aarch64_neon_sqsub, Add1ArgType),
- NEONMAP1(vqsubd_u64, aarch64_neon_uqsub, Add1ArgType),
- NEONMAP1(vqsubh_s16, aarch64_neon_sqsub, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqsubh_u16, aarch64_neon_uqsub, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vqsubs_s32, aarch64_neon_sqsub, Add1ArgType),
- NEONMAP1(vqsubs_u32, aarch64_neon_uqsub, Add1ArgType),
- NEONMAP1(vrecped_f64, aarch64_neon_frecpe, Add1ArgType),
- NEONMAP1(vrecpes_f32, aarch64_neon_frecpe, Add1ArgType),
- NEONMAP1(vrecpxd_f64, aarch64_neon_frecpx, Add1ArgType),
- NEONMAP1(vrecpxs_f32, aarch64_neon_frecpx, Add1ArgType),
- NEONMAP1(vrshld_s64, aarch64_neon_srshl, Add1ArgType),
- NEONMAP1(vrshld_u64, aarch64_neon_urshl, Add1ArgType),
- NEONMAP1(vrsqrted_f64, aarch64_neon_frsqrte, Add1ArgType),
- NEONMAP1(vrsqrtes_f32, aarch64_neon_frsqrte, Add1ArgType),
- NEONMAP1(vrsqrtsd_f64, aarch64_neon_frsqrts, Add1ArgType),
- NEONMAP1(vrsqrtss_f32, aarch64_neon_frsqrts, Add1ArgType),
- NEONMAP1(vsha1cq_u32, aarch64_crypto_sha1c, 0),
- NEONMAP1(vsha1h_u32, aarch64_crypto_sha1h, 0),
- NEONMAP1(vsha1mq_u32, aarch64_crypto_sha1m, 0),
- NEONMAP1(vsha1pq_u32, aarch64_crypto_sha1p, 0),
- NEONMAP1(vshld_s64, aarch64_neon_sshl, Add1ArgType),
- NEONMAP1(vshld_u64, aarch64_neon_ushl, Add1ArgType),
- NEONMAP1(vslid_n_s64, aarch64_neon_vsli, Vectorize1ArgType),
- NEONMAP1(vslid_n_u64, aarch64_neon_vsli, Vectorize1ArgType),
- NEONMAP1(vsqaddb_u8, aarch64_neon_usqadd, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vsqaddd_u64, aarch64_neon_usqadd, Add1ArgType),
- NEONMAP1(vsqaddh_u16, aarch64_neon_usqadd, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vsqadds_u32, aarch64_neon_usqadd, Add1ArgType),
- NEONMAP1(vsrid_n_s64, aarch64_neon_vsri, Vectorize1ArgType),
- NEONMAP1(vsrid_n_u64, aarch64_neon_vsri, Vectorize1ArgType),
- NEONMAP1(vuqaddb_s8, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vuqaddd_s64, aarch64_neon_suqadd, Add1ArgType),
- NEONMAP1(vuqaddh_s16, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
- NEONMAP1(vuqadds_s32, aarch64_neon_suqadd, Add1ArgType),
- // FP16 scalar intrinisics go here.
- NEONMAP1(vabdh_f16, aarch64_sisd_fabd, Add1ArgType),
- NEONMAP1(vcvtah_s32_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
- NEONMAP1(vcvtah_s64_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
- NEONMAP1(vcvtah_u32_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
- NEONMAP1(vcvtah_u64_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
- NEONMAP1(vcvth_n_f16_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
- NEONMAP1(vcvth_n_f16_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
- NEONMAP1(vcvth_n_f16_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
- NEONMAP1(vcvth_n_f16_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
- NEONMAP1(vcvth_n_s32_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
- NEONMAP1(vcvth_n_s64_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
- NEONMAP1(vcvth_n_u32_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
- NEONMAP1(vcvth_n_u64_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
- NEONMAP1(vcvth_s32_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
- NEONMAP1(vcvth_s64_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
- NEONMAP1(vcvth_u32_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
- NEONMAP1(vcvth_u64_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
- NEONMAP1(vcvtmh_s32_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
- NEONMAP1(vcvtmh_s64_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
- NEONMAP1(vcvtmh_u32_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
- NEONMAP1(vcvtmh_u64_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
- NEONMAP1(vcvtnh_s32_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
- NEONMAP1(vcvtnh_s64_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
- NEONMAP1(vcvtnh_u32_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
- NEONMAP1(vcvtnh_u64_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
- NEONMAP1(vcvtph_s32_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
- NEONMAP1(vcvtph_s64_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
- NEONMAP1(vcvtph_u32_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
- NEONMAP1(vcvtph_u64_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
- NEONMAP1(vmulxh_f16, aarch64_neon_fmulx, Add1ArgType),
- NEONMAP1(vrecpeh_f16, aarch64_neon_frecpe, Add1ArgType),
- NEONMAP1(vrecpxh_f16, aarch64_neon_frecpx, Add1ArgType),
- NEONMAP1(vrsqrteh_f16, aarch64_neon_frsqrte, Add1ArgType),
- NEONMAP1(vrsqrtsh_f16, aarch64_neon_frsqrts, Add1ArgType),
+ NEONMAP1(vabdd_f64, aarch64_sisd_fabd, Add1ArgType),
+ NEONMAP1(vabds_f32, aarch64_sisd_fabd, Add1ArgType),
+ NEONMAP1(vabsd_s64, aarch64_neon_abs, Add1ArgType),
+ NEONMAP1(vaddlv_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType),
+ NEONMAP1(vaddlv_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType),
+ NEONMAP1(vaddlvq_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType),
+ NEONMAP1(vaddlvq_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType),
+ NEONMAP1(vaddv_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
+ NEONMAP1(vaddv_s32, aarch64_neon_saddv, AddRetType | Add1ArgType),
+ NEONMAP1(vaddv_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType),
+ NEONMAP1(vaddvq_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
+ NEONMAP1(vaddvq_f64, aarch64_neon_faddv, AddRetType | Add1ArgType),
+ NEONMAP1(vaddvq_s32, aarch64_neon_saddv, AddRetType | Add1ArgType),
+ NEONMAP1(vaddvq_s64, aarch64_neon_saddv, AddRetType | Add1ArgType),
+ NEONMAP1(vaddvq_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType),
+ NEONMAP1(vaddvq_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
+ NEONMAP1(vcaged_f64, aarch64_neon_facge, AddRetType | Add1ArgType),
+ NEONMAP1(vcages_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
+ NEONMAP1(vcagtd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
+ NEONMAP1(vcagts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType),
+ NEONMAP1(vcaled_f64, aarch64_neon_facge, AddRetType | Add1ArgType),
+ NEONMAP1(vcales_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
+ NEONMAP1(vcaltd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
+ NEONMAP1(vcalts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtad_s64_f64, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtad_u64_f64, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtas_s32_f32, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtas_u32_f32, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtd_n_f64_s64, aarch64_neon_vcvtfxs2fp,
+ AddRetType | Add1ArgType),
+ NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp,
+ AddRetType | Add1ArgType),
+ NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs,
+ AddRetType | Add1ArgType),
+ NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu,
+ AddRetType | Add1ArgType),
+ NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
+ NEONMAP0(vcvth_bf16_f32),
+ NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtms_u32_f32, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtnd_s64_f64, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtnd_u64_f64, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtns_s32_f32, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtns_u32_f32, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtpd_s64_f64, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtpd_u64_f64, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtps_s32_f32, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtps_u32_f32, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
+ NEONMAP1(vcvts_n_f32_s32, aarch64_neon_vcvtfxs2fp,
+ AddRetType | Add1ArgType),
+ NEONMAP1(vcvts_n_f32_u32, aarch64_neon_vcvtfxu2fp,
+ AddRetType | Add1ArgType),
+ NEONMAP1(vcvts_n_s32_f32, aarch64_neon_vcvtfp2fxs,
+ AddRetType | Add1ArgType),
+ NEONMAP1(vcvts_n_u32_f32, aarch64_neon_vcvtfp2fxu,
+ AddRetType | Add1ArgType),
+ NEONMAP1(vcvts_s32_f32, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
+ NEONMAP1(vcvts_u32_f32, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtxd_f32_f64, aarch64_sisd_fcvtxn, 0),
+ NEONMAP1(vmaxnmv_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
+ NEONMAP1(vmaxnmvq_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
+ NEONMAP1(vmaxnmvq_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
+ NEONMAP1(vmaxv_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
+ NEONMAP1(vmaxv_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType),
+ NEONMAP1(vmaxv_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType),
+ NEONMAP1(vmaxvq_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
+ NEONMAP1(vmaxvq_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
+ NEONMAP1(vmaxvq_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType),
+ NEONMAP1(vmaxvq_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType),
+ NEONMAP1(vminnmv_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
+ NEONMAP1(vminnmvq_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
+ NEONMAP1(vminnmvq_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
+ NEONMAP1(vminv_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
+ NEONMAP1(vminv_s32, aarch64_neon_sminv, AddRetType | Add1ArgType),
+ NEONMAP1(vminv_u32, aarch64_neon_uminv, AddRetType | Add1ArgType),
+ NEONMAP1(vminvq_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
+ NEONMAP1(vminvq_f64, aarch64_neon_fminv, AddRetType | Add1ArgType),
+ NEONMAP1(vminvq_s32, aarch64_neon_sminv, AddRetType | Add1ArgType),
+ NEONMAP1(vminvq_u32, aarch64_neon_uminv, AddRetType | Add1ArgType),
+ NEONMAP1(vmull_p64, aarch64_neon_pmull64, 0),
+ NEONMAP1(vmulxd_f64, aarch64_neon_fmulx, Add1ArgType),
+ NEONMAP1(vmulxs_f32, aarch64_neon_fmulx, Add1ArgType),
+ NEONMAP1(vpaddd_s64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
+ NEONMAP1(vpaddd_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
+ NEONMAP1(vpmaxnmqd_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
+ NEONMAP1(vpmaxnms_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
+ NEONMAP1(vpmaxqd_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
+ NEONMAP1(vpmaxs_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
+ NEONMAP1(vpminnmqd_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
+ NEONMAP1(vpminnms_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
+ NEONMAP1(vpminqd_f64, aarch64_neon_fminv, AddRetType | Add1ArgType),
+ NEONMAP1(vpmins_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
+ NEONMAP1(vqabsb_s8, aarch64_neon_sqabs,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqabsd_s64, aarch64_neon_sqabs, Add1ArgType),
+ NEONMAP1(vqabsh_s16, aarch64_neon_sqabs,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqabss_s32, aarch64_neon_sqabs, Add1ArgType),
+ NEONMAP1(vqaddb_s8, aarch64_neon_sqadd,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqaddb_u8, aarch64_neon_uqadd,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqaddd_s64, aarch64_neon_sqadd, Add1ArgType),
+ NEONMAP1(vqaddd_u64, aarch64_neon_uqadd, Add1ArgType),
+ NEONMAP1(vqaddh_s16, aarch64_neon_sqadd,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqaddh_u16, aarch64_neon_uqadd,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqadds_s32, aarch64_neon_sqadd, Add1ArgType),
+ NEONMAP1(vqadds_u32, aarch64_neon_uqadd, Add1ArgType),
+ NEONMAP1(vqdmulhh_s16, aarch64_neon_sqdmulh,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqdmulhs_s32, aarch64_neon_sqdmulh, Add1ArgType),
+ NEONMAP1(vqdmullh_s16, aarch64_neon_sqdmull, VectorRet | Use128BitVectors),
+ NEONMAP1(vqdmulls_s32, aarch64_neon_sqdmulls_scalar, 0),
+ NEONMAP1(vqmovnd_s64, aarch64_neon_scalar_sqxtn, AddRetType | Add1ArgType),
+ NEONMAP1(vqmovnd_u64, aarch64_neon_scalar_uqxtn, AddRetType | Add1ArgType),
+ NEONMAP1(vqmovnh_s16, aarch64_neon_sqxtn, VectorRet | Use64BitVectors),
+ NEONMAP1(vqmovnh_u16, aarch64_neon_uqxtn, VectorRet | Use64BitVectors),
+ NEONMAP1(vqmovns_s32, aarch64_neon_sqxtn, VectorRet | Use64BitVectors),
+ NEONMAP1(vqmovns_u32, aarch64_neon_uqxtn, VectorRet | Use64BitVectors),
+ NEONMAP1(vqmovund_s64, aarch64_neon_scalar_sqxtun,
+ AddRetType | Add1ArgType),
+ NEONMAP1(vqmovunh_s16, aarch64_neon_sqxtun, VectorRet | Use64BitVectors),
+ NEONMAP1(vqmovuns_s32, aarch64_neon_sqxtun, VectorRet | Use64BitVectors),
+ NEONMAP1(vqnegb_s8, aarch64_neon_sqneg,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqnegd_s64, aarch64_neon_sqneg, Add1ArgType),
+ NEONMAP1(vqnegh_s16, aarch64_neon_sqneg,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqnegs_s32, aarch64_neon_sqneg, Add1ArgType),
+ NEONMAP1(vqrdmlahh_s16, aarch64_neon_sqrdmlah,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqrdmlahs_s32, aarch64_neon_sqrdmlah, Add1ArgType),
+ NEONMAP1(vqrdmlshh_s16, aarch64_neon_sqrdmlsh,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqrdmlshs_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
+ NEONMAP1(vqrdmulhh_s16, aarch64_neon_sqrdmulh,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqrdmulhs_s32, aarch64_neon_sqrdmulh, Add1ArgType),
+ NEONMAP1(vqrshlb_s8, aarch64_neon_sqrshl,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqrshlb_u8, aarch64_neon_uqrshl,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqrshld_s64, aarch64_neon_sqrshl, Add1ArgType),
+ NEONMAP1(vqrshld_u64, aarch64_neon_uqrshl, Add1ArgType),
+ NEONMAP1(vqrshlh_s16, aarch64_neon_sqrshl,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqrshlh_u16, aarch64_neon_uqrshl,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqrshls_s32, aarch64_neon_sqrshl, Add1ArgType),
+ NEONMAP1(vqrshls_u32, aarch64_neon_uqrshl, Add1ArgType),
+ NEONMAP1(vqrshrnd_n_s64, aarch64_neon_sqrshrn, AddRetType),
+ NEONMAP1(vqrshrnd_n_u64, aarch64_neon_uqrshrn, AddRetType),
+ NEONMAP1(vqrshrnh_n_s16, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors),
+ NEONMAP1(vqrshrnh_n_u16, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors),
+ NEONMAP1(vqrshrns_n_s32, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors),
+ NEONMAP1(vqrshrns_n_u32, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors),
+ NEONMAP1(vqrshrund_n_s64, aarch64_neon_sqrshrun, AddRetType),
+ NEONMAP1(vqrshrunh_n_s16, aarch64_neon_sqrshrun,
+ VectorRet | Use64BitVectors),
+ NEONMAP1(vqrshruns_n_s32, aarch64_neon_sqrshrun,
+ VectorRet | Use64BitVectors),
+ NEONMAP1(vqshlb_n_s8, aarch64_neon_sqshl,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqshlb_n_u8, aarch64_neon_uqshl,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqshlb_s8, aarch64_neon_sqshl,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqshlb_u8, aarch64_neon_uqshl,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqshld_s64, aarch64_neon_sqshl, Add1ArgType),
+ NEONMAP1(vqshld_u64, aarch64_neon_uqshl, Add1ArgType),
+ NEONMAP1(vqshlh_n_s16, aarch64_neon_sqshl,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqshlh_n_u16, aarch64_neon_uqshl,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqshlh_s16, aarch64_neon_sqshl,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqshlh_u16, aarch64_neon_uqshl,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqshls_n_s32, aarch64_neon_sqshl, Add1ArgType),
+ NEONMAP1(vqshls_n_u32, aarch64_neon_uqshl, Add1ArgType),
+ NEONMAP1(vqshls_s32, aarch64_neon_sqshl, Add1ArgType),
+ NEONMAP1(vqshls_u32, aarch64_neon_uqshl, Add1ArgType),
+ NEONMAP1(vqshlub_n_s8, aarch64_neon_sqshlu,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqshluh_n_s16, aarch64_neon_sqshlu,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqshlus_n_s32, aarch64_neon_sqshlu, Add1ArgType),
+ NEONMAP1(vqshrnd_n_s64, aarch64_neon_sqshrn, AddRetType),
+ NEONMAP1(vqshrnd_n_u64, aarch64_neon_uqshrn, AddRetType),
+ NEONMAP1(vqshrnh_n_s16, aarch64_neon_sqshrn, VectorRet | Use64BitVectors),
+ NEONMAP1(vqshrnh_n_u16, aarch64_neon_uqshrn, VectorRet | Use64BitVectors),
+ NEONMAP1(vqshrns_n_s32, aarch64_neon_sqshrn, VectorRet | Use64BitVectors),
+ NEONMAP1(vqshrns_n_u32, aarch64_neon_uqshrn, VectorRet | Use64BitVectors),
+ NEONMAP1(vqshrund_n_s64, aarch64_neon_sqshrun, AddRetType),
+ NEONMAP1(vqshrunh_n_s16, aarch64_neon_sqshrun, VectorRet | Use64BitVectors),
+ NEONMAP1(vqshruns_n_s32, aarch64_neon_sqshrun, VectorRet | Use64BitVectors),
+ NEONMAP1(vqsubb_s8, aarch64_neon_sqsub,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqsubb_u8, aarch64_neon_uqsub,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqsubd_s64, aarch64_neon_sqsub, Add1ArgType),
+ NEONMAP1(vqsubd_u64, aarch64_neon_uqsub, Add1ArgType),
+ NEONMAP1(vqsubh_s16, aarch64_neon_sqsub,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqsubh_u16, aarch64_neon_uqsub,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqsubs_s32, aarch64_neon_sqsub, Add1ArgType),
+ NEONMAP1(vqsubs_u32, aarch64_neon_uqsub, Add1ArgType),
+ NEONMAP1(vrecped_f64, aarch64_neon_frecpe, Add1ArgType),
+ NEONMAP1(vrecpes_f32, aarch64_neon_frecpe, Add1ArgType),
+ NEONMAP1(vrecpxd_f64, aarch64_neon_frecpx, Add1ArgType),
+ NEONMAP1(vrecpxs_f32, aarch64_neon_frecpx, Add1ArgType),
+ NEONMAP1(vrshld_s64, aarch64_neon_srshl, Add1ArgType),
+ NEONMAP1(vrshld_u64, aarch64_neon_urshl, Add1ArgType),
+ NEONMAP1(vrsqrted_f64, aarch64_neon_frsqrte, Add1ArgType),
+ NEONMAP1(vrsqrtes_f32, aarch64_neon_frsqrte, Add1ArgType),
+ NEONMAP1(vrsqrtsd_f64, aarch64_neon_frsqrts, Add1ArgType),
+ NEONMAP1(vrsqrtss_f32, aarch64_neon_frsqrts, Add1ArgType),
+ NEONMAP1(vsha1cq_u32, aarch64_crypto_sha1c, 0),
+ NEONMAP1(vsha1h_u32, aarch64_crypto_sha1h, 0),
+ NEONMAP1(vsha1mq_u32, aarch64_crypto_sha1m, 0),
+ NEONMAP1(vsha1pq_u32, aarch64_crypto_sha1p, 0),
+ NEONMAP1(vshld_s64, aarch64_neon_sshl, Add1ArgType),
+ NEONMAP1(vshld_u64, aarch64_neon_ushl, Add1ArgType),
+ NEONMAP1(vslid_n_s64, aarch64_neon_vsli, Vectorize1ArgType),
+ NEONMAP1(vslid_n_u64, aarch64_neon_vsli, Vectorize1ArgType),
+ NEONMAP1(vsqaddb_u8, aarch64_neon_usqadd,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vsqaddd_u64, aarch64_neon_usqadd, Add1ArgType),
+ NEONMAP1(vsqaddh_u16, aarch64_neon_usqadd,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vsqadds_u32, aarch64_neon_usqadd, Add1ArgType),
+ NEONMAP1(vsrid_n_s64, aarch64_neon_vsri, Vectorize1ArgType),
+ NEONMAP1(vsrid_n_u64, aarch64_neon_vsri, Vectorize1ArgType),
+ NEONMAP1(vuqaddb_s8, aarch64_neon_suqadd,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vuqaddd_s64, aarch64_neon_suqadd, Add1ArgType),
+ NEONMAP1(vuqaddh_s16, aarch64_neon_suqadd,
+ Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vuqadds_s32, aarch64_neon_suqadd, Add1ArgType),
+ // FP16 scalar intrinisics go here.
+ NEONMAP1(vabdh_f16, aarch64_sisd_fabd, Add1ArgType),
+ NEONMAP1(vcvtah_s32_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtah_s64_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtah_u32_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtah_u64_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
+ NEONMAP1(vcvth_n_f16_s32, aarch64_neon_vcvtfxs2fp,
+ AddRetType | Add1ArgType),
+ NEONMAP1(vcvth_n_f16_s64, aarch64_neon_vcvtfxs2fp,
+ AddRetType | Add1ArgType),
+ NEONMAP1(vcvth_n_f16_u32, aarch64_neon_vcvtfxu2fp,
+ AddRetType | Add1ArgType),
+ NEONMAP1(vcvth_n_f16_u64, aarch64_neon_vcvtfxu2fp,
+ AddRetType | Add1ArgType),
+ NEONMAP1(vcvth_n_s32_f16, aarch64_neon_vcvtfp2fxs,
+ AddRetType | Add1ArgType),
+ NEONMAP1(vcvth_n_s64_f16, aarch64_neon_vcvtfp2fxs,
+ AddRetType | Add1ArgType),
+ NEONMAP1(vcvth_n_u32_f16, aarch64_neon_vcvtfp2fxu,
+ AddRetType | Add1ArgType),
+ NEONMAP1(vcvth_n_u64_f16, aarch64_neon_vcvtfp2fxu,
+ AddRetType | Add1ArgType),
+ NEONMAP1(vcvth_s32_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
+ NEONMAP1(vcvth_s64_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
+ NEONMAP1(vcvth_u32_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
+ NEONMAP1(vcvth_u64_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtmh_s32_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtmh_s64_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtmh_u32_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtmh_u64_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtnh_s32_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtnh_s64_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtnh_u32_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtnh_u64_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtph_s32_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtph_s64_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtph_u32_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtph_u64_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
+ NEONMAP1(vmulxh_f16, aarch64_neon_fmulx, Add1ArgType),
+ NEONMAP1(vrecpeh_f16, aarch64_neon_frecpe, Add1ArgType),
+ NEONMAP1(vrecpxh_f16, aarch64_neon_frecpx, Add1ArgType),
+ NEONMAP1(vrsqrteh_f16, aarch64_neon_frsqrte, Add1ArgType),
+ NEONMAP1(vrsqrtsh_f16, aarch64_neon_frsqrts, Add1ArgType),
};
// Some intrinsics are equivalent for codegen.
Rebase and ping - thanks.
// CHECK-A64-NEXT: ret <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]]
// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <8 x bfloat> [[INACTIVE]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
// CHECK-A64-NEXT: [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
I am not sure I am following what the result TMP4 represents here: it is an 8 element vector, where the first 4 elements come from INACTIVE and the other 4 elements are the truncated floats. Is that right? How does that match up with "the BFCVTN2 instruction writes the results to the upper half of the destination vector without affecting the other bits in the register"?
Yep - the bfcvtn2 instruction takes the bottom half of the first input (INACTIVE, whose bottom half is TMP2) and fills the top half of the result with the truncation of A. From the compiler's point of view the first operand (the "destination" vector) is both an input and an output. TMP4 is the concat of TMP2 with TMP3 now forming the upper bits.
I gave it another test and compiling this test with clang still produces the same assembly as before, still producing bfcvtn2 v0.8h, v1.4s.
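To make the shape concrete, here is a minimal standalone IR sketch, hand-reduced from the CHECK lines above, so the function and value names are made up rather than taken from the test:
; Keep the low half of %inactive, truncate %a, and concatenate the halves.
define <8 x bfloat> @high_cvt(<8 x bfloat> %inactive, <4 x float> %a) {
  %lo = shufflevector <8 x bfloat> %inactive, <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = fptrunc <4 x float> %a to <4 x bfloat>
  %res = shufflevector <4 x bfloat> %lo, <4 x bfloat> %hi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x bfloat> %res
}
With +bf16 that whole concat-of-low-half-plus-truncated-high-half pattern is what is expected to select to the single bfcvtn2.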
return Builder.CreateFPTrunc(CI->getOperand(0),
Type::getBFloatTy(F->getContext()));
}
} else if (Name.starts_with("sve.fcvt")) { |
Last question about testing: if I'm not mistaken, I don't see SVE test changes. Is it expected not to change codegen for the SVE tests, or are we missing some coverage?
The sve.fcvt is what this function (upgradeAArch64IntrinsicCall) was previously handling, so it will be NFC in that regard. It gets here because of these lines from upgradeArmOrAarch64IntrinsicFunction.
// 'aarch64.sve.fcvt.bf16f32' || 'aarch64.sve.fcvtnt.bf16f32'
if (Name == "fcvt.bf16f32" || Name == "fcvtnt.bf16f32") {
NewFn = nullptr;
It's now just inside an if to be more clear. It has its own tests that are still doing OK.
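For the neon side, a minimal before/after sketch of what the auto-upgrade is meant to do (hand-written here, assuming the old scalar intrinsic had the bfloat(float) signature):
; Before: a call to the (now removed) intrinsic.
declare bfloat @llvm.aarch64.neon.bfcvt(float)
define bfloat @cvt_old(float %x) {
  %r = call bfloat @llvm.aarch64.neon.bfcvt(float %x)
  ret bfloat %r
}
; After the auto-upgrade, the same conversion is just a plain truncation:
define bfloat @cvt_new(float %x) {
  %r = fptrunc float %x to bfloat
  ret bfloat %r
}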
@@ -9053,22 +9053,19 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm>

let mayRaiseFPException = 1, Uses = [FPCR] in
class SIMD_BFCVTN
: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128,
: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V64,
I haven't looked at how this change is used in the base class to be honest, but is my guess correct that due to this change, we see this line disappearing in llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll:
kill: def $d0 killed $d0 killed $q0
Yes, that sounds right. It is needed to make the patterns match properly, as a bfcvtn will naturally produce a 64-bit vector, and v4bf16 is a 64-bit vector. Other instructions that use SIMDMixedTwoVector, like XTN, use the same type.
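As a hand-reduced sketch (not taken from the test file itself) of why the v4bf16 case now matches directly:
define <4 x bfloat> @trunc_v4(<4 x float> %a) {
  ; With +bf16 this single fptrunc should now select straight to
  ; bfcvtn, which naturally writes a 64-bit (v4bf16) result, so no
  ; extra q-to-d subregister copy (and no kill of $q0) is needed.
  %r = fptrunc <4 x float> %a to <4 x bfloat>
  ret <4 x bfloat> %r
}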
Thanks, LGTM
LLVM Buildbot has detected a new failure on builder
Full details are available at: https://lab.llvm.org/buildbot/#/builders/2/builds/15273
Here is the relevant piece of the build log for the reference
One more failure on a Windows host using MSVC.
Ah - thanks. It is hopefully fixed in 6dc356d?
Yes, https://lab.llvm.org/buildbot/#/builders/197/builds/927
Hey @davemgreen, we are looking at a runtime failure in a test from the GCC test-suite (vfmash_lane_f16_1.c). I think to reproduce this, this will work: clang vfmash_lane_f16_1.c -mcpu=neoverse-v2 -O0 -lm -o ./vfmash_lane_f16_1.exe and then running the resulting executable.
Seems to be fine when compiled with -O1 and up. There are some scary compiler warning messages, e.g.:
./arm-neon-ref.h:335:41: warning: value size does not match register size specified by the constraint and modifier [-Wasm-operand-widths]
But we haven't looked yet at whether this is important, and we also haven't looked at the root cause yet.
Forgot to add that a similar problem occurs for another test in that same directory:
Hi - that sounds like GISel might be mis-compiling it? It doesn't support bf16, so it shouldn't be trying to use those instructions for fp16. I can try and take a look.